readline/readline/mbutil.c

   1 /* mbutil.c -- readline multibyte character utility functions */
   2
   3 /* Copyright (C) 2001-2020 Free Software Foundation, Inc.
   4
   5    This file is part of the GNU Readline Library (Readline), a library
   6    for reading lines of text with interactive input and history editing.
   7
   8    Readline is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    Readline is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with Readline.  If not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22 #define READLINE_LIBRARY
  23
  24 #if defined (HAVE_CONFIG_H)
  25 #  include <config.h>
  26 #endif
  27
  28 #include <sys/types.h>
  29 #include <fcntl.h>
  30 #include "posixjmp.h"
  31
  32 #if defined (HAVE_UNISTD_H)
  33 #  include <unistd.h>      /* for _POSIX_VERSION */
  34 #endif /* HAVE_UNISTD_H */
  35
  36 #if defined (HAVE_STDLIB_H)
  37 #  include <stdlib.h>
  38 #else
  39 #  include "ansi_stdlib.h"
  40 #endif /* HAVE_STDLIB_H */
  41
  42 #include <stdio.h>
  43 #include <ctype.h>
  44
  45 /* System-specific feature definitions and include files. */
  46 #include "rldefs.h"
  47 #include "rlmbutil.h"
  48
  49 #if defined (TIOCSTAT_IN_SYS_IOCTL)
  50 #  include <sys/ioctl.h>
  51 #endif /* TIOCSTAT_IN_SYS_IOCTL */
  52
  53 /* Some standard library routines. */
  54 #include "readline.h"
  55
  56 #include "rlprivate.h"
  57 #include "xmalloc.h"
  58
  59 /* Declared here so it can be shared between the readline and history
  60    libraries. */
  61 #if defined (HANDLE_MULTIBYTE)
  62 int rl_byte_oriented = 0;
  63 #else
  64 int rl_byte_oriented = 1;
  65 #endif
  66
  67 /* Ditto */
  68 int _rl_utf8locale = 0;
  69
  70 /* **************************************************************** */
  71 /*                                                                  */
  72 /*              Multibyte Character Utility Functions               */
  73 /*                                                                  */
  74 /* **************************************************************** */
  75
  76 #if defined(HANDLE_MULTIBYTE)
  77
  78 /* **************************************************************** */
  79 /*                                                                  */
  80 /*              UTF-8 specific Character Utility Functions          */
  81 /*                                                                  */
  82 /* **************************************************************** */
  83
  84 /* Return the length in bytes of the possibly-multibyte character beginning
  85    at S. Encoding is UTF-8. */
  86 static int
  87 _rl_utf8_mblen (const char *s, size_t n)
  88 {
  89   unsigned char c, c1, c2, c3;
  90
  91   if (s == 0)
  92     return (0); /* no shift states */
  93   if (n <= 0)
  94     return (-1);
  95
  96   c = (unsigned char)*s;
  97   if (c < 0x80)
  98     return (c != 0);
  99   if (c >= 0xc2)
 100     {
 101       c1 = (unsigned char)s[1];
 102       if (c < 0xe0)
 103         {
 104           if (n == 1)
 105             return -2;
 106           if (n >= 2 && (c1 ^ 0x80) < 0x40)
 107             return 2;
 108         }
 109       else if (c < 0xf0)
 110         {
 111           if (n == 1)
 112             return -2;
 113           if ((c1 ^ 0x80) < 0x40
 114                 && (c >= 0xe1 || c1 >= 0xa0)
 115                 && (c != 0xed || c1 < 0xa0))
 116             {
 117               if (n == 2)
 118                 return -2;
 119               c2 = (unsigned char)s[2];
 120               if ((c2 ^ 0x80) < 0x40)
 121                 return 3;
 122             }
 123         }
 124       else if (c < 0xf4)
 125         {
 126           if (n == 1)
 127             return -2;
 128           if (((c1 ^ 0x80) < 0x40)
 129                 && (c >= 0xf1 || c1 >= 0x90)
 130                 && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
 131             {
 132               if (n == 2)
 133                 return -2;
 134               c2 = (unsigned char)s[2];
 135               if ((c2 ^ 0x80) < 0x40)
 136                 {
 137                   if (n == 3)
 138                     return -2;
 139                   c3 = (unsigned char)s[3];
 140                   if ((c3 ^ 0x80) < 0x40)
 141                     return 4;
 142                 }
 143             }
 144         }
 145     }
 146   /* invalid or incomplete multibyte character */
 147   return -1;
 148 }
 149
 150 static int
 151 _rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
 152 {
 153   size_t tmp, len;
 154   mbstate_t ps;
 155   int point;
 156   wchar_t wc;
 157
 158   tmp = 0;
 159
 160   memset(&ps, 0, sizeof (mbstate_t));
 161   if (seed < 0)
 162     seed = 0;
 163   if (count <= 0)
 164     return seed;
 165
 166   point = seed + _rl_adjust_point (string, seed, &ps);
 167   /* if _rl_adjust_point returns -1, the character or string is invalid.
 168      treat as a byte. */
 169   if (point == seed - 1)        /* invalid */
 170     return seed + 1;
 171
 172   /* if this is true, means that seed was not pointing to a byte indicating
 173      the beginning of a multibyte character.  Correct the point and consume
 174      one char. */
 175   if (seed < point)
 176     count--;
 177
 178   while (count > 0)
 179     {
 180       len = strlen (string + point);
 181       if (len == 0)
 182         break;
 183       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
 184         {
 185           tmp = 1;
 186           wc = (wchar_t) string[point];
 187           memset(&ps, 0, sizeof(mbstate_t));
 188         }
 189       else
 190         tmp = mbrtowc (&wc, string+point, len, &ps);
 191       if (MB_INVALIDCH ((size_t)tmp))
 192         {
 193           /* invalid bytes. assume a byte represents a character */
 194           point++;
 195           count--;
 196           /* reset states. */
 197           memset(&ps, 0, sizeof(mbstate_t));
 198         }
 199       else if (MB_NULLWCH (tmp))
 200         break;                  /* found wide '\0' */
 201       else
 202         {
 203           /* valid bytes */
 204           point += tmp;
 205           if (find_non_zero)
 206             {
 207               if (WCWIDTH (wc) == 0)
 208                 continue;
 209               else
 210                 count--;
 211             }
 212           else
 213             count--;
 214         }
 215     }
 216
 217   if (find_non_zero)
 218     {
 219       tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
 220       while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
 221         {
 222           point += tmp;
 223           tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
 224         }
 225     }
 226
 227   return point;
 228 }
 229
 230 static inline int
 231 _rl_test_nonzero (char *string, int ind, int len)
 232 {
 233   size_t tmp;
 234   wchar_t wc;
 235   mbstate_t ps;
 236
 237   memset (&ps, 0, sizeof (mbstate_t));
 238   tmp = mbrtowc (&wc, string + ind, len - ind, &ps);
 239   /* treat invalid multibyte sequences as non-zero-width */
 240   return (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp) || WCWIDTH (wc) > 0);
 241 }
 242
 243 /* experimental -- needs to handle zero-width characters better */
 244 static int
 245 _rl_find_prev_utf8char (char *string, int seed, int find_non_zero)
 246 {
 247   char *s;
 248   unsigned char b;
 249   int save, prev;
 250   size_t len;
 251
 252   if (find_non_zero)
 253     len = RL_STRLEN (string);
 254
 255   prev = seed - 1;
 256   while (prev >= 0)
 257    {
 258       b = (unsigned char)string[prev];
 259       if (UTF8_SINGLEBYTE (b))
 260         return (prev);
 261
 262       save = prev;
 263
 264       /* Move back until we're not in the middle of a multibyte char */
 265       if (UTF8_MBCHAR (b))
 266         {
 267           while (prev > 0 && (b = (unsigned char)string[--prev]) && UTF8_MBCHAR (b))
 268             ;
 269         }
 270
 271       if (UTF8_MBFIRSTCHAR (b))
 272         {
 273           if (find_non_zero)
 274             {
 275               if (_rl_test_nonzero (string, prev, len))
 276                 return (prev);
 277               else              /* valid but WCWIDTH (wc) == 0 */
 278                 prev = prev - 1;
 279             }
 280           else
 281             return (prev);
 282         }
 283       else
 284         return (save);                  /* invalid utf-8 multibyte sequence */
 285     }
 286
 287   return ((prev < 0) ? 0 : prev);
 288 }
 289
 290 /*static*/ int
 291 _rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
 292 {
 293   mbstate_t ps;
 294   int prev, non_zero_prev, point, length;
 295   size_t tmp;
 296   wchar_t wc;
 297
 298   if (_rl_utf8locale)
 299     return (_rl_find_prev_utf8char (string, seed, find_non_zero));
 300
 301   memset(&ps, 0, sizeof(mbstate_t));
 302   length = strlen(string);
 303
 304   if (seed < 0)
 305     return 0;
 306   else if (length < seed)
 307     return length;
 308
 309   prev = non_zero_prev = point = 0;
 310   while (point < seed)
 311     {
 312       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
 313         {
 314           tmp = 1;
 315           wc = (wchar_t) string[point];
 316           memset(&ps, 0, sizeof(mbstate_t));
 317         }
 318       else
 319         tmp = mbrtowc (&wc, string + point, length - point, &ps);
 320       if (MB_INVALIDCH ((size_t)tmp))
 321         {
 322           /* in this case, bytes are invalid or too short to compose
 323              multibyte char, so assume that the first byte represents
 324              a single character anyway. */
 325           tmp = 1;
 326           /* clear the state of the byte sequence, because
 327              in this case effect of mbstate is undefined  */
 328           memset(&ps, 0, sizeof (mbstate_t));
 329
 330           /* Since we're assuming that this byte represents a single
 331              non-zero-width character, don't forget about it. */
 332           prev = point;
 333         }
 334       else if (MB_NULLWCH (tmp))
 335         break;                  /* Found '\0' char.  Can this happen? */
 336       else
 337         {
 338           if (find_non_zero)
 339             {
 340               if (WCWIDTH (wc) != 0)
 341                 prev = point;
 342             }
 343           else
 344             prev = point;
 345         }
 346
 347       point += tmp;
 348     }
 349
 350   return prev;
 351 }
 352
 353 /* return the number of bytes parsed from the multibyte sequence starting
 354    at src, if a non-L'\0' wide character was recognized. It returns 0,
 355    if a L'\0' wide character was recognized. It  returns (size_t)(-1),
 356    if an invalid multibyte sequence was encountered. It returns (size_t)(-2)
 357    if it couldn't parse a complete  multibyte character.  */
 358 int
 359 _rl_get_char_len (char *src, mbstate_t *ps)
 360 {
 361   size_t tmp, l;
 362   int mb_cur_max;
 363
 364   /* Look at no more than MB_CUR_MAX characters */
 365   l = (size_t)strlen (src);
 366   if (_rl_utf8locale && l >= 0 && UTF8_SINGLEBYTE(*src))
 367     tmp = (*src != 0) ? 1 : 0;
 368   else
 369     {
 370       mb_cur_max = MB_CUR_MAX;
 371       tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
 372     }
 373   if (tmp == (size_t)(-2))
 374     {
 375       /* too short to compose multibyte char */
 376       if (ps)
 377         memset (ps, 0, sizeof(mbstate_t));
 378       return -2;
 379     }
 380   else if (tmp == (size_t)(-1))
 381     {
 382       /* invalid to compose multibyte char */
 383       /* initialize the conversion state */
 384       if (ps)
 385         memset (ps, 0, sizeof(mbstate_t));
 386       return -1;
 387     }
 388   else if (tmp == (size_t)0)
 389     return 0;
 390   else
 391     return (int)tmp;
 392 }
 393
 394 /* compare the specified two characters. If the characters matched,
 395    return 1. Otherwise return 0. */
 396 int
 397 _rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
 398 {
 399   int i, w1, w2;
 400
 401   if ((w1 = _rl_get_char_len (&buf1[pos1], ps1)) <= 0 ||
 402         (w2 = _rl_get_char_len (&buf2[pos2], ps2)) <= 0 ||
 403         (w1 != w2) ||
 404         (buf1[pos1] != buf2[pos2]))
 405     return 0;
 406
 407   for (i = 1; i < w1; i++)
 408     if (buf1[pos1+i] != buf2[pos2+i])
 409       return 0;
 410
 411   return 1;
 412 }
 413
 414 /* adjust pointed byte and find mbstate of the point of string.
 415    adjusted point will be point <= adjusted_point, and returns
 416    differences of the byte(adjusted_point - point).
 417    if point is invalid (point < 0 || more than string length),
 418    it returns -1 */
 419 int
 420 _rl_adjust_point (char *string, int point, mbstate_t *ps)
 421 {
 422   size_t tmp;
 423   int length, pos;
 424
 425   tmp = 0;
 426   pos = 0;
 427   length = strlen(string);
 428   if (point < 0)
 429     return -1;
 430   if (length < point)
 431     return -1;
 432
 433   while (pos < point)
 434     {
 435       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
 436         tmp = 1;
 437       else
 438         tmp = mbrlen (string + pos, length - pos, ps);
 439       if (MB_INVALIDCH ((size_t)tmp))
 440         {
 441           /* in this case, bytes are invalid or too short to compose
 442              multibyte char, so assume that the first byte represents
 443              a single character anyway. */
 444           pos++;
 445           /* clear the state of the byte sequence, because
 446              in this case effect of mbstate is undefined  */
 447           if (ps)
 448             memset (ps, 0, sizeof (mbstate_t));
 449         }
 450       else if (MB_NULLWCH (tmp))
 451         pos++;
 452       else
 453         pos += tmp;
 454     }
 455
 456   return (pos - point);
 457 }
 458
 459 int
 460 _rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
 461 {
 462   int i;
 463
 464   if ((end - seed) < length)
 465     return 0;
 466
 467   for (i = 0; i < length; i++)
 468     if (string[seed + i] != mbchar[i])
 469       return 0;
 470   return 1;
 471 }
 472
 473 wchar_t
 474 _rl_char_value (char *buf, int ind)
 475 {
 476   size_t tmp;
 477   wchar_t wc;
 478   mbstate_t ps;
 479   int l;
 480
 481   if (MB_LEN_MAX == 1 || rl_byte_oriented)
 482     return ((wchar_t) buf[ind]);
 483   if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
 484     return ((wchar_t) buf[ind]);
 485   l = strlen (buf);
 486   if (ind >= l - 1)
 487     return ((wchar_t) buf[ind]);
 488   if (l < ind)                  /* Sanity check */
 489     l = strlen (buf+ind);
 490   memset (&ps, 0, sizeof (mbstate_t));
 491   tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
 492   if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
 493     return ((wchar_t) buf[ind]);
 494   return wc;
 495 }
 496 #endif /* HANDLE_MULTIBYTE */
 497
 498 /* Find next `count' characters started byte point of the specified seed.
 499    If flags is MB_FIND_NONZERO, we look for non-zero-width multibyte
 500    characters. */
 501 #undef _rl_find_next_mbchar
 502 int
 503 _rl_find_next_mbchar (char *string, int seed, int count, int flags)
 504 {
 505 #if defined (HANDLE_MULTIBYTE)
 506   return _rl_find_next_mbchar_internal (string, seed, count, flags);
 507 #else
 508   return (seed + count);
 509 #endif
 510 }
 511
 512 /* Find previous character started byte point of the specified seed.
 513    Returned point will be point <= seed.  If flags is MB_FIND_NONZERO,
 514    we look for non-zero-width multibyte characters. */
 515 #undef _rl_find_prev_mbchar
 516 int
 517 _rl_find_prev_mbchar (char *string, int seed, int flags)
 518 {
 519 #if defined (HANDLE_MULTIBYTE)
 520   return _rl_find_prev_mbchar_internal (string, seed, flags);
 521 #else
 522   return ((seed == 0) ? seed : seed - 1);
 523 #endif
 524 }