gcc/cppcharset.c

   1 /* CPP Library - charsets
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
   3    Free Software Foundation, Inc.
   4
   5    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
   6
   7 This program is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 This program is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with this program; if not, write to the Free Software
  19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "cpplib.h"
  26 #include "cpphash.h"
  27 #include "cppucnid.h"
  28
  29 /* Character set handling for C-family languages.
  30
  31    Terminological note: In what follows, "charset" or "character set"
  32    will be taken to mean both an abstract set of characters and an
  33    encoding for that set.
  34
  35    The C99 standard discusses two character sets: source and execution.
  36    The source character set is used for internal processing in translation
  37    phases 1 through 4; the execution character set is used thereafter.
  38    Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
  39    character encodings (see 3.7.2, 3.7.3 for the standardese meanings
  40    of these terms).  Furthermore, the "basic character set" (listed in
  41    5.2.1p3) is to be encoded in each with values one byte wide, and is
  42    to appear in the initial shift state.
  43
  44    It is not explicitly mentioned, but there is also a "wide execution
  45    character set" used to encode wide character constants and wide
  46    string literals; this is supposed to be the result of applying the
  47    standard library function mbstowcs() to an equivalent narrow string
  48    (6.4.5p5).  However, the behavior of hexadecimal and octal
  49    \-escapes is at odds with this; they are supposed to be translated
  50    directly to wchar_t values (6.4.4.4p5,6).
  51
  52    The source character set is not necessarily the character set used
  53    to encode physical source files on disk; translation phase 1 converts
  54    from whatever that encoding is to the source character set.
  55
  56    The presence of universal character names in C99 (6.4.3 et seq.)
  57    forces the source character set to be isomorphic to ISO 10646,
  58    that is, Unicode.  There is no such constraint on the execution
  59    character set; note also that the conversion from source to
  60    execution character set does not occur for identifiers (5.1.1.2p1#5).
  61
  62    For convenience of implementation, the source character set's
  63    encoding of the basic character set should be identical to the
  64    execution character set OF THE HOST SYSTEM's encoding of the basic
  65    character set, and it should not be a state-dependent encoding.
  66
  67    cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
  68    depending on whether the host is based on ASCII or EBCDIC (see
  69    respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
  70    Technical Report #16).  It relies on the system library's iconv()
  71    primitive to do charset conversion (specified in SUSv2).  If this
  72    primitive is not present, the source and execution character sets
  73    must be identical and are limited to the basic ASCII or EBCDIC
  74    range, and wide characters are implemented by padding narrow
  75    characters to the size of wchar_t.  */
  76
  77 #if !HAVE_ICONV
  78 /* Make certain that the uses of iconv(), iconv_open(), iconv_close()
  79    below, which are guarded only by if statements with compile-time
  80    constant conditions, do not cause link errors.  */
  81 #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
  82 #define iconv(a,b,c,d,e) (errno = EINVAL, (iconv_t)-1)
  83 #define iconv_close(x)   0
  84 #endif
  85
  86 #if HOST_CHARSET == HOST_CHARSET_ASCII
  87 #define SOURCE_CHARSET "UTF-8"
  88 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
  89 #define SOURCE_CHARSET "UTF-EBCDIC"
  90 #else
  91 #error "Unrecognized basic host character set"
  92 #endif
  93
  94 /* This structure is used for a resizable string buffer, mostly by
  95    convert_cset and cpp_interpret_string.  */
  96 struct strbuf
  97 {
  98   uchar *text;
  99   size_t asize;
 100   size_t len;
 101 };
 102
 103 /* This is enough to hold any string that fits on a single 80-column
 104    line, even if iconv quadruples its size (e.g. conversion from
 105    ASCII to UCS-4) rounded up to a power of two.  */
 106 #define OUTBUF_BLOCK_SIZE 256
 107
 108 /* Subroutine of cpp_init_iconv: initialize and return an iconv
 109    descriptor for conversion from FROM to TO.  If iconv_open() fails,
 110    issue an error and return (iconv_t) -1.  Silently return
 111    (iconv_t) -1 if FROM and TO are identical.  */
 112 static iconv_t
 113 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 114 {
 115   iconv_t dsc;
 116
 117   if (!strcmp (to, from))
 118     return (iconv_t) -1;
 119
 120   dsc = iconv_open (to, from);
 121   if (dsc == (iconv_t) -1)
 122     {
 123       if (errno == EINVAL)
 124         cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
 125                    "conversion from %s to %s not supported by iconv",
 126                    from, to);
 127       else
 128         cpp_errno (pfile, DL_ERROR, "iconv_open");
 129     }
 130   return dsc;
 131 }
 132
 133 /* If charset conversion is requested, initialize iconv(3) descriptors
 134    for conversion from the source character set to the execution
 135    character sets.  If iconv is not present in the C library, and
 136    conversion is requested, issue an error.  */
 137
 138 void
 139 cpp_init_iconv (cpp_reader *pfile)
 140 {
 141   const char *ncset = CPP_OPTION (pfile, narrow_charset);
 142   const char *wcset = CPP_OPTION (pfile, wide_charset);
 143   const char *default_wcset;
 144
 145   bool be = CPP_OPTION (pfile, bytes_big_endian);
 146
 147   if (CPP_OPTION (pfile, wchar_precision) >= 32)
 148     default_wcset = be ? "UCS-4BE" : "UCS-4LE";
 149   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
 150     default_wcset = be ? "UCS-2BE" : "UCS-2LE";
 151   else
 152     /* This effectively means that wide strings are not supported,
 153        so don't do any conversion at all.  */
 154    default_wcset = SOURCE_CHARSET;
 155
 156   if (!HAVE_ICONV)
 157     {
 158       if (ncset && strcmp (ncset, SOURCE_CHARSET))
 159         cpp_error (pfile, DL_ERROR,  /* XXX should be DL_SORRY */
 160                    "no iconv implementation, cannot convert to %s", ncset);
 161
 162       if (wcset && strcmp (wcset, default_wcset))
 163         cpp_error (pfile, DL_ERROR,  /* XXX should be DL_SORRY */
 164                    "no iconv implementation, cannot convert to %s", wcset);
 165     }
 166   else
 167     {
 168       if (!ncset)
 169         ncset = SOURCE_CHARSET;
 170       if (!wcset)
 171         wcset = default_wcset;
 172
 173       pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
 174       pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
 175     }
 176 }
 177
 178 void
 179 _cpp_destroy_iconv (cpp_reader *pfile)
 180 {
 181   if (HAVE_ICONV)
 182     {
 183       if (pfile->narrow_cset_desc != (iconv_t) -1)
 184         iconv_close (pfile->narrow_cset_desc);
 185       if (pfile->wide_cset_desc != (iconv_t) -1)
 186         iconv_close (pfile->wide_cset_desc);
 187     }
 188 }
 189
 190 /* iconv(3) utility wrapper.  Convert the string FROM, of length FLEN,
 191    according to the iconv descriptor CD.  The result is appended to
 192    the string buffer TO.  If DESC is (iconv_t)-1 or iconv is not
 193    available, the string is simply copied into TO.
 194
 195    Returns true on success, false on error.  */
 196
 197 static bool
 198 convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
 199 {
 200   if (!HAVE_ICONV || cd == (iconv_t)-1)
 201     {
 202       if (to->len + flen > to->asize)
 203         {
 204           to->asize = to->len + flen;
 205           to->text = xrealloc (to->text, to->asize);
 206         }
 207       memcpy (to->text + to->len, from, flen);
 208       to->len += flen;
 209       return true;
 210     }
 211   else
 212     {
 213       char *inbuf, *outbuf;
 214       size_t inbytesleft, outbytesleft;
 215
 216       /* Reset conversion descriptor and check that it is valid.  */
 217       if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
 218         return false;
 219
 220       inbuf = (char *)from;
 221       inbytesleft = flen;
 222       outbuf = (char *)to->text + to->len;
 223       outbytesleft = to->asize - to->len;
 224
 225       for (;;)
 226         {
 227           iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 228           if (__builtin_expect (inbytesleft == 0, 1))
 229             {
 230               to->len = to->asize - outbytesleft;
 231               return true;
 232             }
 233           if (errno != E2BIG)
 234             return false;
 235
 236           outbytesleft += OUTBUF_BLOCK_SIZE;
 237           to->asize += OUTBUF_BLOCK_SIZE;
 238           to->text = xrealloc (to->text, to->asize);
 239           outbuf = (char *)to->text + to->asize - outbytesleft;
 240         }
 241     }
 242 }
 243
 244 /* Utility routine that computes a mask of the form 0000...111... with
 245    WIDTH 1-bits.  */
 246 static inline size_t
 247 width_to_mask (size_t width)
 248 {
 249   width = MIN (width, BITS_PER_CPPCHAR_T);
 250   if (width >= CHAR_BIT * sizeof (size_t))
 251     return ~(size_t) 0;
 252   else
 253     return ((size_t) 1 << width) - 1;
 254 }
 255
 256 \f
 257
 258 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
 259    the start of an identifier, and 0 if C is not valid in an
 260    identifier.  We assume C has already gone through the checks of
 261    _cpp_valid_ucn.  The algorithm is a simple binary search on the
 262    table defined in cppucnid.h.  */
 263
 264 static int
 265 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
 266 {
 267   int mn, mx, md;
 268
 269   mn = -1;
 270   mx = ARRAY_SIZE (ucnranges);
 271   while (mx - mn > 1)
 272     {
 273       md = (mn + mx) / 2;
 274       if (c < ucnranges[md].lo)
 275         mx = md;
 276       else if (c > ucnranges[md].hi)
 277         mn = md;
 278       else
 279         goto found;
 280     }
 281   return 0;
 282
 283  found:
 284   /* When -pedantic, we require the character to have been listed by
 285      the standard for the current language.  Otherwise, we accept the
 286      union of the acceptable sets for C++98 and C99.  */
 287   if (CPP_PEDANTIC (pfile)
 288       && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
 289           || (CPP_OPTION (pfile, cplusplus)
 290               && !(ucnranges[md].flags & CXX))))
 291     return 0;
 292
 293   /* In C99, UCN digits may not begin identifiers.  */
 294   if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
 295     return 2;
 296
 297   return 1;
 298 }
 299
 300 /* [lex.charset]: The character designated by the universal character
 301    name \UNNNNNNNN is that character whose character short name in
 302    ISO/IEC 10646 is NNNNNNNN; the character designated by the
 303    universal character name \uNNNN is that character whose character
 304    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
 305    for a universal character name is less than 0x20 or in the range
 306    0x7F-0x9F (inclusive), or if the universal character name
 307    designates a character in the basic source character set, then the
 308    program is ill-formed.
 309
 310    *PSTR must be preceded by "\u" or "\U"; it is assumed that the
 311    buffer end is delimited by a non-hex digit.  Returns zero if UCNs
 312    are not part of the relevant standard, or if the string beginning
 313    at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
 314
 315    Otherwise the nonzero value of the UCN, whether valid or invalid,
 316    is returned.  Diagnostics are emitted for invalid values.  PSTR
 317    is updated to point one beyond the UCN, or to the syntactically
 318    invalid character.
 319
 320    IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
 321    an identifier, or 2 otherwise.
 322 */
 323
 324 cppchar_t
 325 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 326                 const uchar *limit, int identifier_pos)
 327 {
 328   cppchar_t result, c;
 329   unsigned int length;
 330   const uchar *str = *pstr;
 331   const uchar *base = str - 2;
 332
 333   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
 334     cpp_error (pfile, DL_WARNING,
 335                "universal character names are only valid in C++ and C99");
 336   else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
 337     cpp_error (pfile, DL_WARNING,
 338                "the meaning of '\\%c' is different in traditional C",
 339                (int) str[-1]);
 340
 341   if (str[-1] == 'u')
 342     length = 4;
 343   else if (str[-1] == 'U')
 344     length = 8;
 345   else
 346     abort();
 347
 348   result = 0;
 349   do
 350     {
 351       c = *str;
 352       if (!ISXDIGIT (c))
 353         break;
 354       str++;
 355       result = (result << 4) + hex_value (c);
 356     }
 357   while (--length && str < limit);
 358
 359   *pstr = str;
 360   if (length)
 361     {
 362       /* We'll error when we try it out as the start of an identifier.  */
 363       cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
 364                  (int) (str - base), base);
 365       result = 1;
 366     }
 367   /* The standard permits $, @ and ` to be specified as UCNs.  We use
 368      hex escapes so that this also works with EBCDIC hosts.  */
 369   else if ((result < 0xa0
 370             && (result != 0x24 && result != 0x40 && result != 0x60))
 371            || (result & 0x80000000)
 372            || (result >= 0xD800 && result <= 0xDFFF))
 373     {
 374       cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
 375                  (int) (str - base), base);
 376       result = 1;
 377     }
 378   else if (identifier_pos)
 379     {
 380       int validity = ucn_valid_in_identifier (pfile, result);
 381
 382       if (validity == 0)
 383         cpp_error (pfile, DL_ERROR,
 384                    "universal character %.*s is not valid in an identifier",
 385                    (int) (str - base), base);
 386       else if (validity == 2 && identifier_pos == 1)
 387         cpp_error (pfile, DL_ERROR,
 388    "universal character %.*s is not valid at the start of an identifier",
 389                    (int) (str - base), base);
 390     }
 391   /* We don't accept UCNs if iconv is not available or will not
 392      convert to the target wide character set.  */
 393   else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
 394     {
 395       /* XXX should be DL_SORRY */
 396       cpp_error (pfile, DL_ERROR,
 397         "universal character names are not supported in this configuration");
 398     }
 399
 400
 401   if (result == 0)
 402     result = 1;
 403
 404   return result;
 405 }
 406
 407 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
 408    it to the execution character set and write the result into TBUF.
 409    An advanced pointer is returned.  Issues all relevant diagnostics.
 410
 411    UTF-8 encoding looks like this:
 412
 413    value range         encoded as
 414    00000000-0000007F   0xxxxxxx
 415    00000080-000007FF   110xxxxx 10xxxxxx
 416    00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
 417    00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 418    00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 419    04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 420
 421    Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
 422    which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
 423    never occur.  Note also that any value that can be encoded by a
 424    given row of the table can also be encoded by all successive rows,
 425    but this is not done; only the shortest possible encoding for any
 426    given value is valid.  For instance, the character 07C0 could be
 427    encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
 428    FC 80 80 80 9F 80.  Only the first is valid.  */
 429
 430 static const uchar *
 431 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 432              struct strbuf *tbuf, bool wide)
 433 {
 434   int nbytes;
 435   uchar buf[6], *p = &buf[6];
 436   static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 437   cppchar_t ucn;
 438
 439   from++; /* skip u/U */
 440   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
 441   if (!ucn)
 442     return from;
 443
 444   nbytes = 1;
 445   if (ucn < 0x80)
 446     *--p = ucn;
 447   else
 448     {
 449       do
 450         {
 451           *--p = ((ucn & 0x3F) | 0x80);
 452           ucn >>= 6;
 453           nbytes++;
 454         }
 455       while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
 456       *--p = (ucn | masks[nbytes-1]);
 457     }
 458
 459   if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
 460                      p, nbytes, tbuf))
 461     cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
 462
 463   return from;
 464 }
 465
 466 static void
 467 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
 468                      struct strbuf *tbuf, bool wide)
 469 {
 470   if (wide)
 471     {
 472       /* We have to render this into the target byte order, which may not
 473          be our byte order.  */
 474       bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 475       size_t width = CPP_OPTION (pfile, wchar_precision);
 476       size_t cwidth = CPP_OPTION (pfile, char_precision);
 477       size_t cmask = width_to_mask (cwidth);
 478       size_t nbwc = width / cwidth;
 479       size_t i;
 480       size_t off = tbuf->len;
 481       cppchar_t c;
 482
 483       if (tbuf->len + nbwc > tbuf->asize)
 484         {
 485           tbuf->asize += OUTBUF_BLOCK_SIZE;
 486           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 487         }
 488
 489       for (i = 0; i < nbwc; i++)
 490         {
 491           c = n & cmask;
 492           n >>= cwidth;
 493           tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
 494         }
 495       tbuf->len += nbwc;
 496     }
 497   else
 498     {
 499       if (tbuf->len + 1 > tbuf->asize)
 500         {
 501           tbuf->asize += OUTBUF_BLOCK_SIZE;
 502           tbuf->text = xrealloc (tbuf->text, tbuf->asize);
 503         }
 504       tbuf->text[tbuf->len++] = n;
 505     }
 506 }
 507
 508 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
 509    character set and write it into the string buffer TBUF.  Returns an
 510    advanced pointer, and issues diagnostics as necessary.
 511    No character set translation occurs; this routine always produces the
 512    execution-set character with numeric value equal to the given hex
 513    number.  You can, e.g. generate surrogate pairs this way.  */
 514 static const uchar *
 515 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
 516              struct strbuf *tbuf, bool wide)
 517 {
 518   cppchar_t c, n = 0, overflow = 0;
 519   int digits_found = 0;
 520   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
 521                   : CPP_OPTION (pfile, char_precision));
 522   size_t mask = width_to_mask (width);
 523
 524   if (CPP_WTRADITIONAL (pfile))
 525     cpp_error (pfile, DL_WARNING,
 526                "the meaning of '\\x' is different in traditional C");
 527
 528   from++;  /* skip 'x' */
 529   while (from < limit)
 530     {
 531       c = *from;
 532       if (! hex_p (c))
 533         break;
 534       from++;
 535       overflow |= n ^ (n << 4 >> 4);
 536       n = (n << 4) + hex_value (c);
 537       digits_found = 1;
 538     }
 539
 540   if (!digits_found)
 541     {
 542       cpp_error (pfile, DL_ERROR,
 543                  "\\x used with no following hex digits");
 544       return from;
 545     }
 546
 547   if (overflow | (n != (n & mask)))
 548     {
 549       cpp_error (pfile, DL_PEDWARN,
 550                  "hex escape sequence out of range");
 551       n &= mask;
 552     }
 553
 554   emit_numeric_escape (pfile, n, tbuf, wide);
 555
 556   return from;
 557 }
 558
 559 /* Convert an octal escape, pointed to by FROM, to the execution
 560    character set and write it into the string buffer TBUF.  Returns an
 561    advanced pointer, and issues diagnostics as necessary.
 562    No character set translation occurs; this routine always produces the
 563    execution-set character with numeric value equal to the given octal
 564    number.  */
 565 static const uchar *
 566 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
 567              struct strbuf *tbuf, bool wide)
 568 {
 569   size_t count = 0;
 570   cppchar_t c, n = 0;
 571   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
 572                   : CPP_OPTION (pfile, char_precision));
 573   size_t mask = width_to_mask (width);
 574   bool overflow = false;
 575
 576   while (from < limit && count++ < 3)
 577     {
 578       c = *from;
 579       if (c < '0' || c > '7')
 580         break;
 581       from++;
 582       overflow |= n ^ (n << 3 >> 3);
 583       n = (n << 3) + c - '0';
 584     }
 585
 586   if (n != (n & mask))
 587     {
 588       cpp_error (pfile, DL_PEDWARN,
 589                  "octal escape sequence out of range");
 590       n &= mask;
 591     }
 592
 593   emit_numeric_escape (pfile, n, tbuf, wide);
 594
 595   return from;
 596 }
 597
 598 /* Convert an escape sequence (pointed to by FROM) to its value on
 599    the target, and to the execution character set.  Do not scan past
 600    LIMIT.  Write the converted value into TBUF.  Returns an advanced
 601    pointer.  Handles all relevant diagnostics.  */
 602 static const uchar *
 603 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
 604                 struct strbuf *tbuf, bool wide)
 605 {
 606   /* Values of \a \b \e \f \n \r \t \v respectively.  */
 607 #if HOST_CHARSET == HOST_CHARSET_ASCII
 608   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
 609 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
 610   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
 611 #else
 612 #error "unknown host character set"
 613 #endif
 614
 615   uchar c;
 616
 617   c = *from;
 618   switch (c)
 619     {
 620       /* UCNs, hex escapes, and octal escapes are processed separately.  */
 621     case 'u': case 'U':
 622       return convert_ucn (pfile, from, limit, tbuf, wide);
 623
 624     case 'x':
 625       return convert_hex (pfile, from, limit, tbuf, wide);
 626       break;
 627
 628     case '0':  case '1':  case '2':  case '3':
 629     case '4':  case '5':  case '6':  case '7':
 630       return convert_oct (pfile, from, limit, tbuf, wide);
 631
 632       /* Various letter escapes.  Get the appropriate host-charset
 633          value into C.  */
 634     case '\\': case '\'': case '"': case '?': break;
 635
 636     case '(': case '{': case '[': case '%':
 637       /* '\(', etc, can be used at the beginning of a line in a long
 638          string split onto multiple lines with \-newline, to prevent
 639          Emacs or other text editors from getting confused.  '\%' can
 640          be used to prevent SCCS from mangling printf format strings.  */
 641       if (CPP_PEDANTIC (pfile))
 642         goto unknown;
 643       break;
 644
 645     case 'b': c = charconsts[1];  break;
 646     case 'f': c = charconsts[3];  break;
 647     case 'n': c = charconsts[4];  break;
 648     case 'r': c = charconsts[5];  break;
 649     case 't': c = charconsts[6];  break;
 650     case 'v': c = charconsts[7];  break;
 651
 652     case 'a':
 653       if (CPP_WTRADITIONAL (pfile))
 654         cpp_error (pfile, DL_WARNING,
 655                    "the meaning of '\\a' is different in traditional C");
 656       c = charconsts[0];
 657       break;
 658
 659     case 'e': case 'E':
 660       if (CPP_PEDANTIC (pfile))
 661         cpp_error (pfile, DL_PEDWARN,
 662                    "non-ISO-standard escape sequence, '\\%c'", (int) c);
 663       c = charconsts[2];
 664       break;
 665
 666     default:
 667     unknown:
 668       if (ISGRAPH (c))
 669         cpp_error (pfile, DL_PEDWARN,
 670                    "unknown escape sequence '\\%c'", (int) c);
 671       else
 672         cpp_error (pfile, DL_PEDWARN,
 673                    "unknown escape sequence: '\\%03o'", (int) c);
 674     }
 675
 676   /* Now convert what we have to the execution character set.  */
 677   if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
 678                      &c, 1, tbuf))
 679     cpp_errno (pfile, DL_ERROR,
 680                "converting escape sequence to execution character set");
 681
 682   return from + 1;
 683 }
 684 \f
 685 /* FROM is an array of cpp_string structures of length COUNT.  These
 686    are to be converted from the source to the execution character set,
 687    escape sequences translated, and finally all are to be
 688    concatenated.  WIDE indicates whether or not to produce a wide
 689    string.  The result is written into TO.  Returns true for success,
 690    false for failure.  */
 691 bool
 692 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
 693                       cpp_string *to, bool wide)
 694 {
 695   struct strbuf tbuf;
 696   const uchar *p, *base, *limit;
 697   size_t i;
 698   iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 699
 700   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
 701   tbuf.text = xmalloc (tbuf.asize);
 702   tbuf.len = 0;
 703
 704   for (i = 0; i < count; i++)
 705     {
 706       p = from[i].text;
 707       if (*p == 'L') p++;
 708       p++; /* skip leading quote */
 709       limit = from[i].text + from[i].len - 1; /* skip trailing quote */
 710
 711       for (;;)
 712         {
 713           base = p;
 714           while (p < limit && *p != '\\')
 715             p++;
 716           if (p > base)
 717             {
 718               /* We have a run of normal characters; these can be fed
 719                  directly to convert_cset.  */
 720               if (!convert_cset (cd, base, p - base, &tbuf))
 721                 goto fail;
 722             }
 723           if (p == limit)
 724             break;
 725
 726           p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
 727         }
 728     }
 729   /* NUL-terminate the 'to' buffer and translate it to a cpp_string
 730      structure.  */
 731   emit_numeric_escape (pfile, 0, &tbuf, wide);
 732   tbuf.text = xrealloc (tbuf.text, tbuf.len);
 733   to->text = tbuf.text;
 734   to->len = tbuf.len;
 735   return true;
 736
 737  fail:
 738   cpp_errno (pfile, DL_ERROR, "converting to execution character set");
 739   free (tbuf.text);
 740   return false;
 741 }
 742 \f
 743 /* Subroutine of cpp_interpret_charconst which performs the conversion
 744    to a number, for narrow strings.  STR is the string structure returned
 745    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
 746    cpp_interpret_charconst.  */
 747 static cppchar_t
 748 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
 749                          unsigned int *pchars_seen, int *unsignedp)
 750 {
 751   size_t width = CPP_OPTION (pfile, char_precision);
 752   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
 753   size_t mask = width_to_mask (width);
 754   size_t i;
 755   cppchar_t result, c;
 756   bool unsigned_p;
 757
 758   /* The value of a multi-character character constant, or a
 759      single-character character constant whose representation in the
 760      execution character set is more than one byte long, is
 761      implementation defined.  This implementation defines it to be the
 762      number formed by interpreting the byte sequence in memory as a
 763      big-endian binary number.  If overflow occurs, the high bytes are
 764      lost, and a warning is issued.
 765
 766      We don't want to process the NUL terminator handed back by
 767      cpp_interpret_string.  */
 768   result = 0;
 769   for (i = 0; i < str.len - 1; i++)
 770     {
 771       c = str.text[i] & mask;
 772       if (width < BITS_PER_CPPCHAR_T)
 773         result = (result << width) | c;
 774       else
 775         result = c;
 776     }
 777
 778   if (i > max_chars)
 779     {
 780       i = max_chars;
 781       cpp_error (pfile, DL_WARNING, "character constant too long for its type");
 782     }
 783   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
 784     cpp_error (pfile, DL_WARNING, "multi-character character constant");
 785
 786   /* Multichar constants are of type int and therefore signed.  */
 787   if (i > 1)
 788     unsigned_p = 0;
 789   else
 790     unsigned_p = CPP_OPTION (pfile, unsigned_char);
 791
 792   /* Truncate the constant to its natural width, and simultaneously
 793      sign- or zero-extend to the full width of cppchar_t.
 794      For single-character constants, the value is WIDTH bits wide.
 795      For multi-character constants, the value is INT_PRECISION bits wide.  */
 796   if (i > 1)
 797     width = CPP_OPTION (pfile, int_precision);
 798   if (width < BITS_PER_CPPCHAR_T)
 799     {
 800       mask = ((cppchar_t) 1 << width) - 1;
 801       if (unsigned_p || !(result & (1 << (width - 1))))
 802         result &= mask;
 803       else
 804         result |= ~mask;
 805     }
 806   *pchars_seen = i;
 807   *unsignedp = unsigned_p;
 808   return result;
 809 }
 810
 811 /* Subroutine of cpp_interpret_charconst which performs the conversion
 812    to a number, for wide strings.  STR is the string structure returned
 813    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
 814    cpp_interpret_charconst.  */
 815 static cppchar_t
 816 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
 817                        unsigned int *pchars_seen, int *unsignedp)
 818 {
 819   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
 820   size_t width = CPP_OPTION (pfile, wchar_precision);
 821   size_t cwidth = CPP_OPTION (pfile, char_precision);
 822   size_t mask = width_to_mask (width);
 823   size_t cmask = width_to_mask (cwidth);
 824   size_t nbwc = width / cwidth;
 825   size_t off, i;
 826   cppchar_t result = 0, c;
 827
 828   /* This is finicky because the string is in the target's byte order,
 829      which may not be our byte order.  Only the last character, ignoring
 830      the NUL terminator, is relevant.  */
 831   off = str.len - (nbwc * 2);
 832   result = 0;
 833   for (i = 0; i < nbwc; i++)
 834     {
 835       c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
 836       result = (result << cwidth) | (c & cmask);
 837     }
 838
 839   /* Wide character constants have type wchar_t, and a single
 840      character exactly fills a wchar_t, so a multi-character wide
 841      character constant is guaranteed to overflow.  */
 842   if (off > 0)
 843     cpp_error (pfile, DL_WARNING, "character constant too long for its type");
 844
 845   /* Truncate the constant to its natural width, and simultaneously
 846      sign- or zero-extend to the full width of cppchar_t.  */
 847   if (width < BITS_PER_CPPCHAR_T)
 848     {
 849       if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
 850         result &= mask;
 851       else
 852         result |= ~mask;
 853     }
 854
 855   *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
 856   *pchars_seen = 1;
 857   return result;
 858 }
 859
 860 /* Interpret a (possibly wide) character constant in TOKEN.
 861    PCHARS_SEEN points to a variable that is filled in with the number
 862    of characters seen, and UNSIGNEDP to a variable that indicates
 863    whether the result has signed type.  */
 864 cppchar_t
 865 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 866                          unsigned int *pchars_seen, int *unsignedp)
 867 {
 868   cpp_string str = { 0, 0 };
 869   bool wide = (token->type == CPP_WCHAR);
 870   cppchar_t result;
 871
 872   /* an empty constant will appear as L'' or '' */
 873   if (token->val.str.len == (size_t) (2 + wide))
 874     {
 875       cpp_error (pfile, DL_ERROR, "empty character constant");
 876       return 0;
 877     }
 878   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
 879     return 0;
 880
 881   if (wide)
 882     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
 883   else
 884     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
 885
 886   if (str.text != token->val.str.text)
 887     free ((void *)str.text);
 888
 889   return result;
 890 }