To: vim_dev@googlegroups.com Subject: Patch 9.0.1485 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 9.0.1485 Problem: no functions for converting from/to UTF-16 index. Solution: Add UTF-16 flag to existing functions and add strutf16len() and utf16idx(). (Yegappan Lakshmanan, closes #12216) Files: runtime/doc/builtin.txt, runtime/doc/eval.txt, runtime/doc/usr_41.txt, src/evalfunc.c, src/strings.c, src/proto/strings.pro, src/testdir/test_functions.vim *** ../vim-9.0.1484/runtime/doc/builtin.txt 2023-03-16 21:43:07.339227880 +0000 --- runtime/doc/builtin.txt 2023-04-24 20:31:26.200724135 +0100 *************** *** 81,88 **** bufwinid({buf}) Number window ID of buffer {buf} bufwinnr({buf}) Number window number of buffer {buf} byte2line({byte}) Number line number at byte count {byte} ! byteidx({expr}, {nr}) Number byte index of {nr}'th char in {expr} ! byteidxcomp({expr}, {nr}) Number byte index of {nr}'th char in {expr} call({func}, {arglist} [, {dict}]) any call {func} with arguments {arglist} ceil({expr}) Float round {expr} up --- 81,90 ---- bufwinid({buf}) Number window ID of buffer {buf} bufwinnr({buf}) Number window number of buffer {buf} byte2line({byte}) Number line number at byte count {byte} ! byteidx({expr}, {nr} [, {utf16}]) ! Number byte index of {nr}'th char in {expr} ! byteidxcomp({expr}, {nr} [, {utf16}]) ! Number byte index of {nr}'th char in {expr} call({func}, {arglist} [, {dict}]) any call {func} with arguments {arglist} ceil({expr}) Float round {expr} up *************** *** 117,123 **** char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr} charclass({string}) Number character class of {string} charcol({expr} [, {winid}]) Number column number of cursor or mark ! 
charidx({string}, {idx} [, {countcc}]) Number char index of byte {idx} in {string} chdir({dir}) String change current working directory cindent({lnum}) Number C indent for line {lnum} --- 119,125 ---- char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr} charclass({string}) Number character class of {string} charcol({expr} [, {winid}]) Number column number of cursor or mark ! charidx({string}, {idx} [, {countcc} [, {utf16}]]) Number char index of byte {idx} in {string} chdir({dir}) String change current working directory cindent({lnum}) Number C indent for line {lnum} *************** *** 604,609 **** --- 606,613 ---- strridx({haystack}, {needle} [, {start}]) Number last index of {needle} in {haystack} strtrans({expr}) String translate string to make it printable + strutf16len({string} [, {countcc}]) + Number number of UTF-16 code units in {string} strwidth({expr}) Number display cell length of the String {expr} submatch({nr} [, {list}]) String or List specific match in ":s" or substitute() *************** *** 704,709 **** --- 708,715 ---- undotree() List undo file tree uniq({list} [, {func} [, {dict}]]) List remove adjacent duplicates from a list + utf16idx({string}, {idx} [, {countcc} [, {charidx}]]) + Number UTF-16 index of byte {idx} in {string} values({dict}) List values in {dict} virtcol({expr} [, {list}]) Number or List screen column of cursor or mark *************** *** 1363,1369 **** < {not available when compiled without the |+byte_offset| feature} ! byteidx({expr}, {nr}) *byteidx()* Return byte index of the {nr}'th character in the String {expr}. Use zero for the first character, it then returns zero. --- 1369,1375 ---- < {not available when compiled without the |+byte_offset| feature} ! byteidx({expr}, {nr} [, {utf16}]) *byteidx()* Return byte index of the {nr}'th character in the String {expr}. Use zero for the first character, it then returns zero. 
*************** *** 1373,1378 **** --- 1379,1391 ---- length is added to the preceding base character. See |byteidxcomp()| below for counting composing characters separately. + When {utf16} is present and TRUE, {nr} is used as the UTF-16 + index in the String {expr} instead of as the character index. + The UTF-16 index is the index in the string when it is encoded + with 16-bit words. If the specified UTF-16 index is in the + middle of a character (e.g. in a 4-byte character), then the + byte index of the first byte in the character is returned. + Refer to |string-offset-encoding| for more information. Example : > echo matchstr(str, ".", byteidx(str, 3)) < will display the fourth character. Another way to do the *************** *** 1384,1394 **** If there are less than {nr} characters -1 is returned. If there are exactly {nr} characters the length of the string in bytes is returned. ! Can also be used as a |method|: > GetName()->byteidx(idx) ! byteidxcomp({expr}, {nr}) *byteidxcomp()* Like byteidx(), except that a composing character is counted as a separate character. Example: > let s = 'e' .. nr2char(0x301) --- 1397,1413 ---- If there are less than {nr} characters -1 is returned. If there are exactly {nr} characters the length of the string in bytes is returned. ! See |charidx()| and |utf16idx()| for getting the character and ! UTF-16 index respectively from the byte index. ! Examples: > ! echo byteidx('a😊😊', 2) returns 5 ! echo byteidx('a😊😊', 2, 1) returns 1 ! echo byteidx('a😊😊', 3, 1) returns 5 ! < Can also be used as a |method|: > GetName()->byteidx(idx) ! byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()* Like byteidx(), except that a composing character is counted as a separate character. Example: > let s = 'e' .. nr2char(0x301) *************** *** 1493,1519 **** GetPos()->col() < *charidx()* ! charidx({string}, {idx} [, {countcc}]) Return the character index of the byte at {idx} in {string}. The index of the first character is zero. 
If there are no multibyte characters the returned value is equal to {idx}. When {countcc} is omitted or |FALSE|, then composing characters ! are not counted separately, their byte length is ! added to the preceding base character. When {countcc} is |TRUE|, then composing characters are counted as separate characters. Returns -1 if the arguments are invalid or if {idx} is greater than the index of the last byte in {string}. An error is given if the first argument is not a string, the second argument is not a number or when the third argument is present and is not zero or one. See |byteidx()| and |byteidxcomp()| for getting the byte index ! from the character index. Examples: > echo charidx('áb́ć', 3) returns 1 echo charidx('áb́ć', 6, 1) returns 4 echo charidx('áb́ć', 16) returns -1 < Can also be used as a |method|: > GetName()->charidx(idx) --- 1512,1547 ---- GetPos()->col() < *charidx()* ! charidx({string}, {idx} [, {countcc} [, {utf16}]]) Return the character index of the byte at {idx} in {string}. The index of the first character is zero. If there are no multibyte characters the returned value is equal to {idx}. + When {countcc} is omitted or |FALSE|, then composing characters ! are not counted separately, their byte length is added to the ! preceding base character. When {countcc} is |TRUE|, then composing characters are counted as separate characters. + + When {utf16} is present and TRUE, {idx} is used as the UTF-16 + index in the String {string} instead of as the byte index. + Returns -1 if the arguments are invalid or if {idx} is greater than the index of the last byte in {string}. An error is given if the first argument is not a string, the second argument is not a number or when the third argument is present and is not zero or one. + See |byteidx()| and |byteidxcomp()| for getting the byte index ! from the character index and |utf16idx()| for getting the ! UTF-16 index from the character index. ! Refer to |string-offset-encoding| for more information. 
Examples: > echo charidx('áb́ć', 3) returns 1 echo charidx('áb́ć', 6, 1) returns 4 echo charidx('áb́ć', 16) returns -1 + echo charidx('a😊😊', 4, 0, 1) returns 2 < Can also be used as a |method|: > GetName()->charidx(idx) *************** *** 9236,9241 **** --- 9272,9299 ---- Can also be used as a |method|: > GetString()->strtrans() + strutf16len({string} [, {countcc}]) *strutf16len()* + The result is a Number, which is the number of UTF-16 code + units in String {string} (after converting it to UTF-16). + + When {countcc} is TRUE, composing characters are counted + separately. + When {countcc} is omitted or FALSE, composing characters are + ignored. + + Returns zero on error. + + Also see |strlen()| and |strcharlen()|. + Examples: > + echo strutf16len('a') returns 1 + echo strutf16len('©') returns 1 + echo strutf16len('😊') returns 2 + echo strutf16len('ą́') returns 1 + echo strutf16len('ą́', v:true) returns 3 + + Can also be used as a |method|: > + GetText()->strutf16len() + < strwidth({string}) *strwidth()* The result is a Number, which is the number of display cells String {string} occupies. A Tab character is counted as one *************** *** 10049,10054 **** --- 10109,10142 ---- Can also be used as a |method|: > mylist->uniq() + < + *utf16idx()* + utf16idx({string}, {idx} [, {countcc} [, {charidx}]]) + Same as |charidx()| but returns the UTF-16 index of the byte + at {idx} in {string} (after converting it to UTF-16). + + When {charidx} is present and TRUE, {idx} is used as the + character index in the String {string} instead of as the byte + index. + An {idx} in the middle of a UTF-8 sequence is rounded upwards + to the end of that sequence. + + See |byteidx()| and |byteidxcomp()| for getting the byte index + from the UTF-16 index and |charidx()| for getting the + character index from the UTF-16 index. + Refer to |string-offset-encoding| for more information. 
+ Examples: > + echo utf16idx('a😊😊', 3) returns 2 + echo utf16idx('a😊😊', 7) returns 4 + echo utf16idx('a😊😊', 1, 0, 1) returns 2 + echo utf16idx('a😊😊', 2, 0, 1) returns 4 + echo utf16idx('aą́c', 6) returns 2 + echo utf16idx('aą́c', 6, 1) returns 4 + echo utf16idx('a😊😊', 9) returns -1 + < + Can also be used as a |method|: > + GetName()->utf16idx(idx) + values({dict}) *values()* Return a |List| with all the values of {dict}. The |List| is *** ../vim-9.0.1484/runtime/doc/eval.txt 2023-01-12 21:07:58.636905098 +0000 --- runtime/doc/eval.txt 2023-04-24 20:53:21.609044564 +0100 *************** *** 1574,1579 **** --- 1580,1612 ---- echo $"The square root of {{9}} is {sqrt(9)}" < The square root of {9} is 3.0 ~ + *string-offset-encoding* + A string consists of multiple characters. How the characters are stored + depends on 'encoding'. Most common is UTF-8, which uses one byte for ASCII + characters, two bytes for other latin characters and more bytes for other + characters. + + A string offset can count characters or bytes. Other programs may use + UTF-16 encoding (16-bit words) and an offset of UTF-16 words. Some functions + use byte offsets, usually for UTF-8 encoding. Other functions use character + offsets, in which case the encoding doesn't matter. + + The different offsets for the string "a©😊" are below: + + UTF-8 offsets: + [0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A + UTF-16 offsets: + [0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A + UTF-32 (character) offsets: + [0]: 00000061, [1]: 000000A9, [2]: 0001F60A + + You can use the "g8" and "ga" commands on a character to see the + decimal/hex/octal values. + + The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert + between these indices. The functions |strlen()|, |strutf16len()| and + |strcharlen()| return the number of bytes, UTF-16 code units and characters in + a string respectively. 
option *expr-option* *E112* *E113* ------ *** ../vim-9.0.1484/runtime/doc/usr_41.txt 2023-01-17 18:31:20.423373305 +0000 --- runtime/doc/usr_41.txt 2023-04-24 20:22:12.664400625 +0100 *************** *** 753,758 **** --- 754,760 ---- strlen() length of a string in bytes strcharlen() length of a string in characters strchars() number of characters in a string + strutf16len() number of UTF-16 code units in a string strwidth() size of string when displayed strdisplaywidth() size of string when displayed, deals with tabs setcellwidths() set character cell width overrides *************** *** 770,775 **** --- 772,778 ---- byteidx() byte index of a character in a string byteidxcomp() like byteidx() but count composing characters charidx() character index of a byte in a string + utf16idx() UTF-16 index of a byte in a string repeat() repeat a string multiple times eval() evaluate a string expression execute() execute an Ex command and get the output *** ../vim-9.0.1484/src/evalfunc.c 2023-04-13 19:15:50.023391985 +0100 --- src/evalfunc.c 2023-04-24 20:22:12.664400625 +0100 *************** *** 1751,1759 **** ret_number, f_bufwinnr}, {"byte2line", 1, 1, FEARG_1, arg1_number, ret_number, f_byte2line}, ! {"byteidx", 2, 2, FEARG_1, arg2_string_number, ret_number, f_byteidx}, ! {"byteidxcomp", 2, 2, FEARG_1, arg2_string_number, ret_number, f_byteidxcomp}, {"call", 2, 3, FEARG_1, arg3_any_list_dict, ret_any, f_call}, --- 1751,1759 ---- ret_number, f_bufwinnr}, {"byte2line", 1, 1, FEARG_1, arg1_number, ret_number, f_byte2line}, ! {"byteidx", 2, 3, FEARG_1, arg3_string_number_bool, ret_number, f_byteidx}, ! {"byteidxcomp", 2, 3, FEARG_1, arg3_string_number_bool, ret_number, f_byteidxcomp}, {"call", 2, 3, FEARG_1, arg3_any_list_dict, ret_any, f_call}, *************** *** 1803,1809 **** ret_number, f_charclass}, {"charcol", 1, 2, FEARG_1, arg2_string_or_list_number, ret_number, f_charcol}, ! 
{"charidx", 2, 3, FEARG_1, arg3_string_number_bool, ret_number, f_charidx}, {"chdir", 1, 1, FEARG_1, arg1_string, ret_string, f_chdir}, --- 1803,1809 ---- ret_number, f_charclass}, {"charcol", 1, 2, FEARG_1, arg2_string_or_list_number, ret_number, f_charcol}, ! {"charidx", 2, 4, FEARG_1, arg3_string_number_bool, ret_number, f_charidx}, {"chdir", 1, 1, FEARG_1, arg1_string, ret_string, f_chdir}, *************** *** 2601,2606 **** --- 2601,2608 ---- ret_number, f_strridx}, {"strtrans", 1, 1, FEARG_1, arg1_string, ret_string, f_strtrans}, + {"strutf16len", 1, 2, FEARG_1, arg2_string_bool, + ret_number, f_strutf16len}, {"strwidth", 1, 1, FEARG_1, arg1_string, ret_number, f_strwidth}, {"submatch", 1, 2, FEARG_1, arg2_number_bool, *************** *** 2785,2790 **** --- 2787,2794 ---- ret_dict_any, f_undotree}, {"uniq", 1, 3, FEARG_1, arg13_sortuniq, ret_first_arg, f_uniq}, + {"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool, + ret_number, f_utf16idx}, {"values", 1, 1, FEARG_1, arg1_dict_any, ret_list_member, f_values}, {"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool, *** ../vim-9.0.1484/src/strings.c 2023-04-16 20:53:50.189171575 +0100 --- src/strings.c 2023-04-24 21:00:25.749101228 +0100 *************** *** 1006,1015 **** static void byteidx(typval_T *argvars, typval_T *rettv, int comp UNUSED) { - char_u *t; - char_u *str; - varnumber_T idx; - rettv->vval.v_number = -1; if (in_vim9script() --- 1006,1011 ---- *************** *** 1017,1036 **** || check_for_number_arg(argvars, 1) == FAIL)) return; ! str = tv_get_string_chk(&argvars[0]); ! idx = tv_get_number_chk(&argvars[1], NULL); if (str == NULL || idx < 0) return; ! t = str; for ( ; idx > 0; idx--) { if (*t == NUL) // EOL reached return; ! if (enc_utf8 && comp) ! t += utf_ptr2len(t); ! else ! t += (*mb_ptr2len)(t); } rettv->vval.v_number = (varnumber_T)(t - str); } --- 1013,1054 ---- || check_for_number_arg(argvars, 1) == FAIL)) return; ! char_u *str = tv_get_string_chk(&argvars[0]); ! 
varnumber_T idx = tv_get_number_chk(&argvars[1], NULL); if (str == NULL || idx < 0) return; ! varnumber_T utf16idx = FALSE; ! if (argvars[2].v_type != VAR_UNKNOWN) ! { ! utf16idx = tv_get_bool(&argvars[2]); ! if (utf16idx < 0 || utf16idx > 1) ! { ! semsg(_(e_using_number_as_bool_nr), utf16idx); ! return; ! } ! } ! ! int (*ptr2len)(char_u *); ! if (enc_utf8 && comp) ! ptr2len = utf_ptr2len; ! else ! ptr2len = mb_ptr2len; ! ! char_u *t = str; for ( ; idx > 0; idx--) { if (*t == NUL) // EOL reached return; ! if (utf16idx) ! { ! int clen = ptr2len(t); ! int c = (clen > 1) ? utf_ptr2char(t) : *t; ! if (c > 0xFFFF) ! idx--; ! } ! if (idx > 0) ! t += ptr2len(t); } rettv->vval.v_number = (varnumber_T)(t - str); } *************** *** 1059,1100 **** void f_charidx(typval_T *argvars, typval_T *rettv) { - char_u *str; - varnumber_T idx; - varnumber_T countcc = FALSE; - char_u *p; - int len; - int (*ptr2len)(char_u *); - rettv->vval.v_number = -1; ! if ((check_for_string_arg(argvars, 0) == FAIL || check_for_number_arg(argvars, 1) == FAIL ! || check_for_opt_bool_arg(argvars, 2) == FAIL)) return; ! str = tv_get_string_chk(&argvars[0]); ! idx = tv_get_number_chk(&argvars[1], NULL); if (str == NULL || idx < 0) return; if (argvars[2].v_type != VAR_UNKNOWN) - countcc = tv_get_bool(&argvars[2]); - if (countcc < 0 || countcc > 1) { ! semsg(_(e_using_number_as_bool_nr), countcc); ! return; } if (enc_utf8 && countcc) ptr2len = utf_ptr2len; else ptr2len = mb_ptr2len; ! for (p = str, len = 0; p <= str + idx; len++) { if (*p == NUL) return; p += ptr2len(p); } --- 1077,1125 ---- void f_charidx(typval_T *argvars, typval_T *rettv) { rettv->vval.v_number = -1; ! if (check_for_string_arg(argvars, 0) == FAIL || check_for_number_arg(argvars, 1) == FAIL ! || check_for_opt_bool_arg(argvars, 2) == FAIL ! || (argvars[2].v_type != VAR_UNKNOWN ! && check_for_opt_bool_arg(argvars, 3) == FAIL)) return; ! char_u *str = tv_get_string_chk(&argvars[0]); ! 
varnumber_T idx = tv_get_number_chk(&argvars[1], NULL); if (str == NULL || idx < 0) return; + varnumber_T countcc = FALSE; + varnumber_T utf16idx = FALSE; if (argvars[2].v_type != VAR_UNKNOWN) { ! countcc = tv_get_bool(&argvars[2]); ! if (argvars[3].v_type != VAR_UNKNOWN) ! utf16idx = tv_get_bool(&argvars[3]); } + int (*ptr2len)(char_u *); if (enc_utf8 && countcc) ptr2len = utf_ptr2len; else ptr2len = mb_ptr2len; ! char_u *p; ! int len; ! for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++) { if (*p == NUL) return; + if (utf16idx) + { + idx--; + int clen = ptr2len(p); + int c = (clen > 1) ? utf_ptr2char(p) : *p; + if (c > 0xFFFF) + idx--; + } p += ptr2len(p); } *************** *** 1359,1364 **** --- 1384,1421 ---- } /* + * "strutf16len()" function + */ + void + f_strutf16len(typval_T *argvars, typval_T *rettv) + { + rettv->vval.v_number = -1; + + if (check_for_string_arg(argvars, 0) == FAIL + || check_for_opt_bool_arg(argvars, 1) == FAIL) + return; + + varnumber_T countcc = FALSE; + if (argvars[1].v_type != VAR_UNKNOWN) + countcc = tv_get_bool(&argvars[1]); + + char_u *s = tv_get_string(&argvars[0]); + varnumber_T len = 0; + int (*func_mb_ptr2char_adv)(char_u **pp); + int ch; + + func_mb_ptr2char_adv = countcc ? 
mb_cptr2char_adv : mb_ptr2char_adv; + while (*s != NUL) + { + ch = func_mb_ptr2char_adv(&s); + if (ch > 0xFFFF) + ++len; + ++len; + } + rettv->vval.v_number = len; + } + + /* * "strdisplaywidth()" function */ void *************** *** 1619,1624 **** --- 1676,1736 ---- rettv->vval.v_string = transstr(tv_get_string(&argvars[0])); } + + /* + * + * "utf16idx()" function + */ + void + f_utf16idx(typval_T *argvars, typval_T *rettv) + { + rettv->vval.v_number = -1; + + if (check_for_string_arg(argvars, 0) == FAIL + || check_for_opt_number_arg(argvars, 1) == FAIL + || check_for_opt_bool_arg(argvars, 2) == FAIL + || (argvars[2].v_type != VAR_UNKNOWN + && check_for_opt_bool_arg(argvars, 3) == FAIL)) + return; + + char_u *str = tv_get_string_chk(&argvars[0]); + varnumber_T idx = tv_get_number_chk(&argvars[1], NULL); + if (str == NULL || idx < 0) + return; + + varnumber_T countcc = FALSE; + varnumber_T charidx = FALSE; + if (argvars[2].v_type != VAR_UNKNOWN) + { + countcc = tv_get_bool(&argvars[2]); + if (argvars[3].v_type != VAR_UNKNOWN) + charidx = tv_get_bool(&argvars[3]); + } + + int (*ptr2len)(char_u *); + if (enc_utf8 && countcc) + ptr2len = utf_ptr2len; + else + ptr2len = mb_ptr2len; + + char_u *p; + int len; + for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++) + { + if (*p == NUL) + return; + int clen = ptr2len(p); + int c = (clen > 1) ? utf_ptr2char(p) : *p; + if (c > 0xFFFF) + len++; + p += ptr2len(p); + if (charidx) + idx--; + } + + rettv->vval.v_number = len > 0 ? 
len - 1 : 0; + } + /* * "tolower(string)" function */ *** ../vim-9.0.1484/src/proto/strings.pro 2023-01-04 15:56:47.868550539 +0000 --- src/proto/strings.pro 2023-04-24 20:22:12.664400625 +0100 *************** *** 36,47 **** --- 36,49 ---- void f_strlen(typval_T *argvars, typval_T *rettv); void f_strcharlen(typval_T *argvars, typval_T *rettv); void f_strchars(typval_T *argvars, typval_T *rettv); + void f_strutf16len(typval_T *argvars, typval_T *rettv); void f_strdisplaywidth(typval_T *argvars, typval_T *rettv); void f_strwidth(typval_T *argvars, typval_T *rettv); void f_strcharpart(typval_T *argvars, typval_T *rettv); void f_strpart(typval_T *argvars, typval_T *rettv); void f_strridx(typval_T *argvars, typval_T *rettv); void f_strtrans(typval_T *argvars, typval_T *rettv); + void f_utf16idx(typval_T *argvars, typval_T *rettv); void f_tolower(typval_T *argvars, typval_T *rettv); void f_toupper(typval_T *argvars, typval_T *rettv); void f_tr(typval_T *argvars, typval_T *rettv); *** ../vim-9.0.1484/src/testdir/test_functions.vim 2023-02-04 10:58:28.815703377 +0000 --- src/testdir/test_functions.vim 2023-04-24 20:22:12.668400624 +0100 *************** *** 1192,1210 **** bw! endfunc ! " Test for byteidx() and byteidxcomp() functions func Test_byteidx() let a = '.茅.' " one char of two bytes call assert_equal(0, byteidx(a, 0)) - call assert_equal(0, byteidxcomp(a, 0)) call assert_equal(1, byteidx(a, 1)) - call assert_equal(1, byteidxcomp(a, 1)) call assert_equal(3, byteidx(a, 2)) - call assert_equal(3, byteidxcomp(a, 2)) call assert_equal(4, byteidx(a, 3)) - call assert_equal(4, byteidxcomp(a, 3)) call assert_equal(-1, byteidx(a, 4)) - call assert_equal(-1, byteidxcomp(a, 4)) let b = '.e虂.' " normal e with composing char call assert_equal(0, b->byteidx(0)) --- 1192,1205 ---- bw! endfunc ! " Test for byteidx() using a character index func Test_byteidx() let a = '.茅.' 
" one char of two bytes call assert_equal(0, byteidx(a, 0)) call assert_equal(1, byteidx(a, 1)) call assert_equal(3, byteidx(a, 2)) call assert_equal(4, byteidx(a, 3)) call assert_equal(-1, byteidx(a, 4)) let b = '.e虂.' " normal e with composing char call assert_equal(0, b->byteidx(0)) *************** *** 1212,1229 **** call assert_equal(4, b->byteidx(2)) call assert_equal(5, b->byteidx(3)) call assert_equal(-1, b->byteidx(4)) call assert_fails("call byteidx([], 0)", 'E730:') call assert_equal(0, b->byteidxcomp(0)) call assert_equal(1, b->byteidxcomp(1)) call assert_equal(2, b->byteidxcomp(2)) call assert_equal(4, b->byteidxcomp(3)) call assert_equal(5, b->byteidxcomp(4)) call assert_equal(-1, b->byteidxcomp(5)) call assert_fails("call byteidxcomp([], 0)", 'E730:') endfunc ! " Test for charidx() func Test_charidx() let a = 'xa虂b虂y' call assert_equal(0, charidx(a, 0)) --- 1207,1390 ---- call assert_equal(4, b->byteidx(2)) call assert_equal(5, b->byteidx(3)) call assert_equal(-1, b->byteidx(4)) + + " string with multiple composing characters + let str = '-a台虂-a台虂' + call assert_equal(0, byteidx(str, 0)) + call assert_equal(1, byteidx(str, 1)) + call assert_equal(6, byteidx(str, 2)) + call assert_equal(7, byteidx(str, 3)) + call assert_equal(12, byteidx(str, 4)) + call assert_equal(-1, byteidx(str, 5)) + + " empty string + call assert_equal(0, byteidx('', 0)) + call assert_equal(-1, byteidx('', 1)) + + " error cases call assert_fails("call byteidx([], 0)", 'E730:') + call assert_fails("call byteidx('abc', [])", 'E745:') + endfunc + " Test for byteidxcomp() using a character index + func Test_byteidxcomp() + let a = '.茅.' " one char of two bytes + call assert_equal(0, byteidxcomp(a, 0)) + call assert_equal(1, byteidxcomp(a, 1)) + call assert_equal(3, byteidxcomp(a, 2)) + call assert_equal(4, byteidxcomp(a, 3)) + call assert_equal(-1, byteidxcomp(a, 4)) + + let b = '.e虂.' 
" normal e with composing char call assert_equal(0, b->byteidxcomp(0)) call assert_equal(1, b->byteidxcomp(1)) call assert_equal(2, b->byteidxcomp(2)) call assert_equal(4, b->byteidxcomp(3)) call assert_equal(5, b->byteidxcomp(4)) call assert_equal(-1, b->byteidxcomp(5)) + + " string with multiple composing characters + let str = '-a台虂-a台虂' + call assert_equal(0, byteidxcomp(str, 0)) + call assert_equal(1, byteidxcomp(str, 1)) + call assert_equal(2, byteidxcomp(str, 2)) + call assert_equal(4, byteidxcomp(str, 3)) + call assert_equal(6, byteidxcomp(str, 4)) + call assert_equal(7, byteidxcomp(str, 5)) + call assert_equal(8, byteidxcomp(str, 6)) + call assert_equal(10, byteidxcomp(str, 7)) + call assert_equal(12, byteidxcomp(str, 8)) + call assert_equal(-1, byteidxcomp(str, 9)) + + " empty string + call assert_equal(0, byteidxcomp('', 0)) + call assert_equal(-1, byteidxcomp('', 1)) + + " error cases call assert_fails("call byteidxcomp([], 0)", 'E730:') + call assert_fails("call byteidxcomp('abc', [])", 'E745:') endfunc ! " Test for byteidx() using a UTF-16 index ! func Test_byteidx_from_utf16_index() ! " string with single byte characters ! let str = "abc" ! for i in range(3) ! call assert_equal(i, byteidx(str, i, v:true)) ! endfor ! call assert_equal(3, byteidx(str, 3, v:true)) ! call assert_equal(-1, byteidx(str, 4, v:true)) ! ! " string with two byte characters ! let str = "a漏漏b" ! call assert_equal(0, byteidx(str, 0, v:true)) ! call assert_equal(1, byteidx(str, 1, v:true)) ! call assert_equal(3, byteidx(str, 2, v:true)) ! call assert_equal(5, byteidx(str, 3, v:true)) ! call assert_equal(6, byteidx(str, 4, v:true)) ! call assert_equal(-1, byteidx(str, 5, v:true)) ! ! " string with two byte characters ! let str = "a馃槉馃槉b" ! call assert_equal(0, byteidx(str, 0, v:true)) ! call assert_equal(1, byteidx(str, 1, v:true)) ! call assert_equal(1, byteidx(str, 2, v:true)) ! call assert_equal(5, byteidx(str, 3, v:true)) ! call assert_equal(5, byteidx(str, 4, v:true)) ! 
call assert_equal(9, byteidx(str, 5, v:true)) ! call assert_equal(10, byteidx(str, 6, v:true)) ! call assert_equal(-1, byteidx(str, 7, v:true)) ! ! " string with composing characters ! let str = '-á-b́' ! call assert_equal(0, byteidx(str, 0, v:true)) ! call assert_equal(1, byteidx(str, 1, v:true)) ! call assert_equal(4, byteidx(str, 2, v:true)) ! call assert_equal(5, byteidx(str, 3, v:true)) ! call assert_equal(8, byteidx(str, 4, v:true)) ! call assert_equal(-1, byteidx(str, 5, v:true)) ! ! " string with multiple composing characters ! let str = '-ą́-ą́' ! call assert_equal(0, byteidx(str, 0, v:true)) ! call assert_equal(1, byteidx(str, 1, v:true)) ! call assert_equal(6, byteidx(str, 2, v:true)) ! call assert_equal(7, byteidx(str, 3, v:true)) ! call assert_equal(12, byteidx(str, 4, v:true)) ! call assert_equal(-1, byteidx(str, 5, v:true)) ! ! " empty string ! call assert_equal(0, byteidx('', 0, v:true)) ! call assert_equal(-1, byteidx('', 1, v:true)) ! ! " error cases ! call assert_fails('call byteidx(str, 0, [])', 'E745:') ! endfunc ! ! " Test for byteidxcomp() using a UTF-16 index ! func Test_byteidxcomp_from_utf16_index() ! " string with single byte characters ! let str = "abc" ! for i in range(3) ! call assert_equal(i, byteidxcomp(str, i, v:true)) ! endfor ! call assert_equal(3, byteidxcomp(str, 3, v:true)) ! call assert_equal(-1, byteidxcomp(str, 4, v:true)) ! ! " string with two byte characters ! let str = "a©©b" ! call assert_equal(0, byteidxcomp(str, 0, v:true)) ! call assert_equal(1, byteidxcomp(str, 1, v:true)) ! call assert_equal(3, byteidxcomp(str, 2, v:true)) ! call assert_equal(5, byteidxcomp(str, 3, v:true)) ! call assert_equal(6, byteidxcomp(str, 4, v:true)) ! call assert_equal(-1, byteidxcomp(str, 5, v:true)) ! ! " string with four byte characters ! let str = "a😊😊b" ! call assert_equal(0, byteidxcomp(str, 0, v:true)) ! call assert_equal(1, byteidxcomp(str, 1, v:true)) ! call assert_equal(1, byteidxcomp(str, 2, v:true)) ! 
call assert_equal(5, byteidxcomp(str, 3, v:true)) ! call assert_equal(5, byteidxcomp(str, 4, v:true)) ! call assert_equal(9, byteidxcomp(str, 5, v:true)) ! call assert_equal(10, byteidxcomp(str, 6, v:true)) ! call assert_equal(-1, byteidxcomp(str, 7, v:true)) ! ! " string with composing characters ! let str = '-a虂-b虂' ! call assert_equal(0, byteidxcomp(str, 0, v:true)) ! call assert_equal(1, byteidxcomp(str, 1, v:true)) ! call assert_equal(2, byteidxcomp(str, 2, v:true)) ! call assert_equal(4, byteidxcomp(str, 3, v:true)) ! call assert_equal(5, byteidxcomp(str, 4, v:true)) ! call assert_equal(6, byteidxcomp(str, 5, v:true)) ! call assert_equal(8, byteidxcomp(str, 6, v:true)) ! call assert_equal(-1, byteidxcomp(str, 7, v:true)) ! call assert_fails('call byteidxcomp(str, 0, [])', 'E745:') ! ! " string with multiple composing characters ! let str = '-a台虂-a台虂' ! call assert_equal(0, byteidxcomp(str, 0, v:true)) ! call assert_equal(1, byteidxcomp(str, 1, v:true)) ! call assert_equal(2, byteidxcomp(str, 2, v:true)) ! call assert_equal(4, byteidxcomp(str, 3, v:true)) ! call assert_equal(6, byteidxcomp(str, 4, v:true)) ! call assert_equal(7, byteidxcomp(str, 5, v:true)) ! call assert_equal(8, byteidxcomp(str, 6, v:true)) ! call assert_equal(10, byteidxcomp(str, 7, v:true)) ! call assert_equal(12, byteidxcomp(str, 8, v:true)) ! call assert_equal(-1, byteidxcomp(str, 9, v:true)) ! ! " empty string ! call assert_equal(0, byteidxcomp('', 0, v:true)) ! call assert_equal(-1, byteidxcomp('', 1, v:true)) ! ! " error cases ! call assert_fails('call byteidxcomp(str, 0, [])', 'E745:') ! endfunc ! ! " Test for charidx() using a byte index func Test_charidx() let a = 'xa虂b虂y' call assert_equal(0, charidx(a, 0)) *************** *** 1232,1248 **** call assert_equal(3, charidx(a, 7)) call assert_equal(-1, charidx(a, 8)) call assert_equal(-1, charidx(a, -1)) - call assert_equal(-1, charidx('', 0)) - call assert_equal(-1, charidx(test_null_string(), 0)) " count composing characters ! 
call assert_equal(0, charidx(a, 0, 1)) ! call assert_equal(2, charidx(a, 2, 1)) ! call assert_equal(3, charidx(a, 4, 1)) ! call assert_equal(5, charidx(a, 7, 1)) ! call assert_equal(-1, charidx(a, 8, 1)) call assert_equal(-1, charidx('', 0, 1)) call assert_fails('let x = charidx([], 1)', 'E1174:') call assert_fails('let x = charidx("abc", [])', 'E1210:') call assert_fails('let x = charidx("abc", 1, [])', 'E1212:') --- 1393,1412 ---- call assert_equal(3, charidx(a, 7)) call assert_equal(-1, charidx(a, 8)) call assert_equal(-1, charidx(a, -1)) " count composing characters ! call assert_equal(0, a->charidx(0, 1)) ! call assert_equal(2, a->charidx(2, 1)) ! call assert_equal(3, a->charidx(4, 1)) ! call assert_equal(5, a->charidx(7, 1)) ! call assert_equal(-1, a->charidx(8, 1)) ! ! " empty string ! call assert_equal(-1, charidx('', 0)) call assert_equal(-1, charidx('', 0, 1)) + " error cases + call assert_equal(-1, charidx(test_null_string(), 0)) call assert_fails('let x = charidx([], 1)', 'E1174:') call assert_fails('let x = charidx("abc", [])', 'E1210:') call assert_fails('let x = charidx("abc", 1, [])', 'E1212:') *************** *** 1250,1255 **** --- 1414,1650 ---- call assert_fails('let x = charidx("abc", 1, 2)', 'E1212:') endfunc + " Test for charidx() using a UTF-16 index + func Test_charidx_from_utf16_index() + " string with single byte characters + let str = "abc" + for i in range(3) + call assert_equal(i, charidx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, charidx(str, 3, v:false, v:true)) + + " string with two byte characters + let str = "a漏漏b" + call assert_equal(0, charidx(str, 0, v:false, v:true)) + call assert_equal(1, charidx(str, 1, v:false, v:true)) + call assert_equal(2, charidx(str, 2, v:false, v:true)) + call assert_equal(3, charidx(str, 3, v:false, v:true)) + call assert_equal(-1, charidx(str, 4, v:false, v:true)) + + " string with four byte characters + let str = "a馃槉馃槉b" + call assert_equal(0, charidx(str, 0, v:false, v:true)) + 
call assert_equal(1, charidx(str, 1, v:false, v:true)) + call assert_equal(1, charidx(str, 2, v:false, v:true)) + call assert_equal(2, charidx(str, 3, v:false, v:true)) + call assert_equal(2, charidx(str, 4, v:false, v:true)) + call assert_equal(3, charidx(str, 5, v:false, v:true)) + call assert_equal(-1, charidx(str, 6, v:false, v:true)) + + " string with composing characters + let str = '-a虂-b虂' + for i in str->strcharlen()->range() + call assert_equal(i, charidx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, charidx(str, 4, v:false, v:true)) + for i in str->strchars()->range() + call assert_equal(i, charidx(str, i, v:true, v:true)) + endfor + call assert_equal(-1, charidx(str, 6, v:true, v:true)) + + " string with multiple composing characters + let str = '-a台虂-a台虂' + for i in str->strcharlen()->range() + call assert_equal(i, charidx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, charidx(str, 4, v:false, v:true)) + for i in str->strchars()->range() + call assert_equal(i, charidx(str, i, v:true, v:true)) + endfor + call assert_equal(-1, charidx(str, 8, v:true, v:true)) + + " empty string + call assert_equal(-1, charidx('', 0, v:false, v:true)) + call assert_equal(-1, charidx('', 0, v:true, v:true)) + + " error cases + call assert_equal(-1, charidx('', 0, v:false, v:true)) + call assert_equal(-1, charidx('', 0, v:true, v:true)) + call assert_equal(-1, charidx(test_null_string(), 0, v:false, v:true)) + call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:') + call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:') + endfunc + + " Test for utf16idx() using a byte index + func Test_utf16idx_from_byteidx() + " UTF-16 index of a string with single byte characters + let str = "abc" + for i in range(3) + call assert_equal(i, utf16idx(str, i)) + endfor + call assert_equal(-1, utf16idx(str, 3)) + + " UTF-16 index of a string with two byte characters + let str = 'a漏漏b' + call assert_equal(0, str->utf16idx(0)) + call 
assert_equal(1, str->utf16idx(1)) + call assert_equal(1, str->utf16idx(2)) + call assert_equal(2, str->utf16idx(3)) + call assert_equal(2, str->utf16idx(4)) + call assert_equal(3, str->utf16idx(5)) + call assert_equal(-1, str->utf16idx(6)) + + " UTF-16 index of a string with four byte characters + let str = 'a😊😊b' + call assert_equal(0, utf16idx(str, 0)) + call assert_equal(2, utf16idx(str, 1)) + call assert_equal(2, utf16idx(str, 2)) + call assert_equal(2, utf16idx(str, 3)) + call assert_equal(2, utf16idx(str, 4)) + call assert_equal(4, utf16idx(str, 5)) + call assert_equal(4, utf16idx(str, 6)) + call assert_equal(4, utf16idx(str, 7)) + call assert_equal(4, utf16idx(str, 8)) + call assert_equal(5, utf16idx(str, 9)) + call assert_equal(-1, utf16idx(str, 10)) + + " UTF-16 index of a string with composing characters + let str = '-á-b́' + call assert_equal(0, utf16idx(str, 0)) + call assert_equal(1, utf16idx(str, 1)) + call assert_equal(1, utf16idx(str, 2)) + call assert_equal(1, utf16idx(str, 3)) + call assert_equal(2, utf16idx(str, 4)) + call assert_equal(3, utf16idx(str, 5)) + call assert_equal(3, utf16idx(str, 6)) + call assert_equal(3, utf16idx(str, 7)) + call assert_equal(-1, utf16idx(str, 8)) + call assert_equal(0, utf16idx(str, 0, v:true)) + call assert_equal(1, utf16idx(str, 1, v:true)) + call assert_equal(2, utf16idx(str, 2, v:true)) + call assert_equal(2, utf16idx(str, 3, v:true)) + call assert_equal(3, utf16idx(str, 4, v:true)) + call assert_equal(4, utf16idx(str, 5, v:true)) + call assert_equal(5, utf16idx(str, 6, v:true)) + call assert_equal(5, utf16idx(str, 7, v:true)) + call assert_equal(-1, utf16idx(str, 8, v:true)) + + " string with multiple composing characters + let str = '-ą́-ą́' + call assert_equal(0, utf16idx(str, 0)) + call assert_equal(1, utf16idx(str, 1)) + call assert_equal(1, utf16idx(str, 2)) + call assert_equal(1, utf16idx(str, 3)) + call assert_equal(1, utf16idx(str, 4)) + call assert_equal(1, utf16idx(str, 5)) + call 
assert_equal(2, utf16idx(str, 6)) + call assert_equal(3, utf16idx(str, 7)) + call assert_equal(3, utf16idx(str, 8)) + call assert_equal(3, utf16idx(str, 9)) + call assert_equal(3, utf16idx(str, 10)) + call assert_equal(3, utf16idx(str, 11)) + call assert_equal(-1, utf16idx(str, 12)) + call assert_equal(0, utf16idx(str, 0, v:true)) + call assert_equal(1, utf16idx(str, 1, v:true)) + call assert_equal(2, utf16idx(str, 2, v:true)) + call assert_equal(2, utf16idx(str, 3, v:true)) + call assert_equal(3, utf16idx(str, 4, v:true)) + call assert_equal(3, utf16idx(str, 5, v:true)) + call assert_equal(4, utf16idx(str, 6, v:true)) + call assert_equal(5, utf16idx(str, 7, v:true)) + call assert_equal(6, utf16idx(str, 8, v:true)) + call assert_equal(6, utf16idx(str, 9, v:true)) + call assert_equal(7, utf16idx(str, 10, v:true)) + call assert_equal(7, utf16idx(str, 11, v:true)) + call assert_equal(-1, utf16idx(str, 12, v:true)) + + " empty string + call assert_equal(-1, utf16idx('', 0)) + call assert_equal(-1, utf16idx('', 0, v:true)) + + " error cases + call assert_equal(-1, utf16idx("", 0)) + call assert_equal(-1, utf16idx("abc", -1)) + call assert_equal(-1, utf16idx(test_null_string(), 0)) + call assert_fails('let l = utf16idx([], 0)', 'E1174:') + call assert_fails('let l = utf16idx("ab", [])', 'E1210:') + call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:') + endfunc + + " Test for utf16idx() using a character index + func Test_utf16idx_from_charidx() + let str = "abc" + for i in str->strcharlen()->range() + call assert_equal(i, utf16idx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 3, v:false, v:true)) + + " UTF-16 index of a string with two byte characters + let str = "a©©b" + for i in str->strcharlen()->range() + call assert_equal(i, utf16idx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) + + " UTF-16 index of a string with four byte characters + let str = "a😊😊b" + call assert_equal(0, 
utf16idx(str, 0, v:false, v:true)) + call assert_equal(2, utf16idx(str, 1, v:false, v:true)) + call assert_equal(4, utf16idx(str, 2, v:false, v:true)) + call assert_equal(5, utf16idx(str, 3, v:false, v:true)) + call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) + + " UTF-16 index of a string with composing characters + let str = '-á-b́' + for i in str->strcharlen()->range() + call assert_equal(i, utf16idx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) + for i in str->strchars()->range() + call assert_equal(i, utf16idx(str, i, v:true, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 6, v:true, v:true)) + + " string with multiple composing characters + let str = '-ą́-ą́' + for i in str->strcharlen()->range() + call assert_equal(i, utf16idx(str, i, v:false, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 4, v:false, v:true)) + for i in str->strchars()->range() + call assert_equal(i, utf16idx(str, i, v:true, v:true)) + endfor + call assert_equal(-1, utf16idx(str, 8, v:true, v:true)) + + " empty string + call assert_equal(-1, utf16idx('', 0, v:false, v:true)) + call assert_equal(-1, utf16idx('', 0, v:true, v:true)) + + " error cases + call assert_equal(-1, utf16idx(test_null_string(), 0, v:true, v:true)) + call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:') + endfunc + + " Test for strutf16len() + func Test_strutf16len() + call assert_equal(3, strutf16len('abc')) + call assert_equal(3, 'abc'->strutf16len(v:true)) + call assert_equal(4, strutf16len('a©©b')) + call assert_equal(4, strutf16len('a©©b', v:true)) + call assert_equal(6, strutf16len('a😊😊b')) + call assert_equal(6, strutf16len('a😊😊b', v:true)) + call assert_equal(4, strutf16len('-á-b́')) + call assert_equal(6, strutf16len('-á-b́', v:true)) + call assert_equal(4, strutf16len('-ą́-ą́')) + call assert_equal(8, strutf16len('-ą́-ą́', v:true)) + call assert_equal(0, strutf16len('')) + + " error cases + call 
assert_fails('let l = strutf16len([])', 'E1174:') + call assert_fails('let l = strutf16len("a", [])', 'E1212:') + call assert_equal(0, strutf16len(test_null_string())) + endfunc + func Test_count() let l = ['a', 'a', 'A', 'b'] call assert_equal(2, count(l, 'a')) *************** *** 3074,3078 **** call StopVimInTerminal(buf) endfunc - " vim: shiftwidth=2 sts=2 expandtab --- 3469,3472 ---- *** ../vim-9.0.1484/src/version.c 2023-04-24 18:11:32.156258651 +0100 --- src/version.c 2023-04-24 20:24:11.132470661 +0100 *************** *** 697,698 **** --- 697,700 ---- { /* Add new patch number below this line */ + /**/ + 1485, /**/ -- It might look like I'm doing nothing, but at the cellular level I'm really quite busy. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// \\\ \\\ sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///