0ba7e930ab0ab8f584dfd7abb01f9c7296d8db51
[gcc.git] / gcc / cppcharset.c
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
4
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
27 #include "cppucnid.h"
28
29 /* Character set handling for C-family languages.
30
31 Terminological note: In what follows, "charset" or "character set"
32 will be taken to mean both an abstract set of characters and an
33 encoding for that set.
34
35 The C99 standard discusses two character sets: source and execution.
36 The source character set is used for internal processing in translation
37 phases 1 through 4; the execution character set is used thereafter.
38 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
39 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
40 of these terms). Furthermore, the "basic character set" (listed in
41 5.2.1p3) is to be encoded in each with values one byte wide, and is
42 to appear in the initial shift state.
43
44 It is not explicitly mentioned, but there is also a "wide execution
45 character set" used to encode wide character constants and wide
46 string literals; this is supposed to be the result of applying the
47 standard library function mbstowcs() to an equivalent narrow string
48 (6.4.5p5). However, the behavior of hexadecimal and octal
49 \-escapes is at odds with this; they are supposed to be translated
50 directly to wchar_t values (6.4.4.4p5,6).
51
52 The source character set is not necessarily the character set used
53 to encode physical source files on disk; translation phase 1 converts
54 from whatever that encoding is to the source character set.
55
56 The presence of universal character names in C99 (6.4.3 et seq.)
57 forces the source character set to be isomorphic to ISO 10646,
58 that is, Unicode. There is no such constraint on the execution
59 character set; note also that the conversion from source to
60 execution character set does not occur for identifiers (5.1.1.2p1#5).
61
62 For convenience of implementation, the source character set's
63 encoding of the basic character set should be identical to the
64 execution character set OF THE HOST SYSTEM's encoding of the basic
65 character set, and it should not be a state-dependent encoding.
66
67 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
68 depending on whether the host is based on ASCII or EBCDIC (see
69 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
70 Technical Report #16). It relies on the system library's iconv()
71 primitive to do charset conversion (specified in SUSv2). If this
72 primitive is not present, the source and execution character sets
73 must be identical and are limited to the basic ASCII or EBCDIC
74 range, and wide characters are implemented by padding narrow
75 characters to the size of wchar_t. */
76
#if !HAVE_ICONV
/* Stand-ins for the iconv API on hosts that lack it.  The uses of
   iconv(), iconv_open() and iconv_close() below are guarded only by
   `if' statements whose conditions are compile-time constants, so
   they must still compile and must not cause link errors.  Each
   stand-in fails with EINVAL and yields a value of the same type the
   real function returns: (iconv_t) -1 for iconv_open(), but
   (size_t) -1 for iconv(), which is what callers compare against.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x) 0
#endif
85
/* SOURCE_CHARSET is the name of the encoding used for the source
   character set, in the form handed to iconv_open() and compared
   against the requested execution charsets.  It is chosen from the
   host's basic character set; any other host charset is a build
   error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#else
#error "Unrecognized basic host character set"
#endif
93
/* This structure is used for a resizable string buffer, mostly by
   convert_cset and cpp_interpret_string.  The storage is obtained
   with xmalloc/xrealloc and is enlarged whenever LEN would exceed
   ASIZE.  */
struct strbuf
{
  uchar *text;   /* Start of the allocated storage.  */
  size_t asize;  /* Number of bytes currently allocated at TEXT.  */
  size_t len;    /* Number of bytes of TEXT already filled in.  */
};
102
/* Number of bytes by which a string buffer is enlarged each time it
   runs out of room, and the minimum initial allocation made by
   cpp_interpret_string.  A power of two; it covers a 64-byte run of
   text even if iconv quadruples its size (e.g. conversion from ASCII
   to UCS-4).  Longer strings simply cause the buffer to be grown
   again.  */
#define OUTBUF_BLOCK_SIZE 256
107
108 /* Subroutine of cpp_init_iconv: initialize and return an iconv
109 descriptor for conversion from FROM to TO. If iconv_open() fails,
110 issue an error and return (iconv_t) -1. Silently return
111 (iconv_t) -1 if FROM and TO are identical. */
112 static iconv_t
113 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
114 {
115 iconv_t dsc;
116
117 if (!strcmp (to, from))
118 return (iconv_t) -1;
119
120 dsc = iconv_open (to, from);
121 if (dsc == (iconv_t) -1)
122 {
123 if (errno == EINVAL)
124 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
125 "conversion from %s to %s not supported by iconv",
126 from, to);
127 else
128 cpp_errno (pfile, DL_ERROR, "iconv_open");
129 }
130 return dsc;
131 }
132
133 /* If charset conversion is requested, initialize iconv(3) descriptors
134 for conversion from the source character set to the execution
135 character sets. If iconv is not present in the C library, and
136 conversion is requested, issue an error. */
137
138 void
139 cpp_init_iconv (cpp_reader *pfile)
140 {
141 const char *ncset = CPP_OPTION (pfile, narrow_charset);
142 const char *wcset = CPP_OPTION (pfile, wide_charset);
143 const char *default_wcset;
144
145 bool be = CPP_OPTION (pfile, bytes_big_endian);
146
147 if (CPP_OPTION (pfile, wchar_precision) >= 32)
148 default_wcset = be ? "UCS-4BE" : "UCS-4LE";
149 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
150 default_wcset = be ? "UCS-2BE" : "UCS-2LE";
151 else
152 /* This effectively means that wide strings are not supported,
153 so don't do any conversion at all. */
154 default_wcset = SOURCE_CHARSET;
155
156 if (!HAVE_ICONV)
157 {
158 if (ncset && strcmp (ncset, SOURCE_CHARSET))
159 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
160 "no iconv implementation, cannot convert to %s", ncset);
161
162 if (wcset && strcmp (wcset, default_wcset))
163 cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
164 "no iconv implementation, cannot convert to %s", wcset);
165 }
166 else
167 {
168 if (!ncset)
169 ncset = SOURCE_CHARSET;
170 if (!wcset)
171 wcset = default_wcset;
172
173 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
174 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
175 }
176 }
177
178 void
179 _cpp_destroy_iconv (cpp_reader *pfile)
180 {
181 if (HAVE_ICONV)
182 {
183 if (pfile->narrow_cset_desc != (iconv_t) -1)
184 iconv_close (pfile->narrow_cset_desc);
185 if (pfile->wide_cset_desc != (iconv_t) -1)
186 iconv_close (pfile->wide_cset_desc);
187 }
188 }
189
190 /* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
191 according to the iconv descriptor CD. The result is appended to
192 the string buffer TO. If DESC is (iconv_t)-1 or iconv is not
193 available, the string is simply copied into TO.
194
195 Returns true on success, false on error. */
196
197 static bool
198 convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
199 {
200 if (!HAVE_ICONV || cd == (iconv_t)-1)
201 {
202 if (to->len + flen > to->asize)
203 {
204 to->asize = to->len + flen;
205 to->text = xrealloc (to->text, to->asize);
206 }
207 memcpy (to->text + to->len, from, flen);
208 to->len += flen;
209 return true;
210 }
211 else
212 {
213 char *inbuf, *outbuf;
214 size_t inbytesleft, outbytesleft;
215
216 /* Reset conversion descriptor and check that it is valid. */
217 if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
218 return false;
219
220 inbuf = (char *)from;
221 inbytesleft = flen;
222 outbuf = (char *)to->text + to->len;
223 outbytesleft = to->asize - to->len;
224
225 for (;;)
226 {
227 iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
228 if (__builtin_expect (inbytesleft == 0, 1))
229 {
230 to->len = to->asize - outbytesleft;
231 return true;
232 }
233 if (errno != E2BIG)
234 return false;
235
236 outbytesleft += OUTBUF_BLOCK_SIZE;
237 to->asize += OUTBUF_BLOCK_SIZE;
238 to->text = xrealloc (to->text, to->asize);
239 outbuf = (char *)to->text + to->asize - outbytesleft;
240 }
241 }
242 }
243
244 /* Utility routine that computes a mask of the form 0000...111... with
245 WIDTH 1-bits. */
246 static inline size_t
247 width_to_mask (size_t width)
248 {
249 width = MIN (width, BITS_PER_CPPCHAR_T);
250 if (width >= CHAR_BIT * sizeof (size_t))
251 return ~(size_t) 0;
252 else
253 return ((size_t) 1 << width) - 1;
254 }
255
256 \f
257
258 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
259 the start of an identifier, and 0 if C is not valid in an
260 identifier. We assume C has already gone through the checks of
261 _cpp_valid_ucn. The algorithm is a simple binary search on the
262 table defined in cppucnid.h. */
263
264 static int
265 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
266 {
267 int mn, mx, md;
268
269 mn = -1;
270 mx = ARRAY_SIZE (ucnranges);
271 while (mx - mn > 1)
272 {
273 md = (mn + mx) / 2;
274 if (c < ucnranges[md].lo)
275 mx = md;
276 else if (c > ucnranges[md].hi)
277 mn = md;
278 else
279 goto found;
280 }
281 return 0;
282
283 found:
284 /* When -pedantic, we require the character to have been listed by
285 the standard for the current language. Otherwise, we accept the
286 union of the acceptable sets for C++98 and C99. */
287 if (CPP_PEDANTIC (pfile)
288 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
289 || (CPP_OPTION (pfile, cplusplus)
290 && !(ucnranges[md].flags & CXX))))
291 return 0;
292
293 /* In C99, UCN digits may not begin identifiers. */
294 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
295 return 2;
296
297 return 1;
298 }
299
300 /* [lex.charset]: The character designated by the universal character
301 name \UNNNNNNNN is that character whose character short name in
302 ISO/IEC 10646 is NNNNNNNN; the character designated by the
303 universal character name \uNNNN is that character whose character
304 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
305 for a universal character name is less than 0x20 or in the range
306 0x7F-0x9F (inclusive), or if the universal character name
307 designates a character in the basic source character set, then the
308 program is ill-formed.
309
310 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
311 buffer end is delimited by a non-hex digit. Returns zero if UCNs
312 are not part of the relevant standard, or if the string beginning
313 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
314
315 Otherwise the nonzero value of the UCN, whether valid or invalid,
316 is returned. Diagnostics are emitted for invalid values. PSTR
317 is updated to point one beyond the UCN, or to the syntactically
318 invalid character.
319
320 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
321 an identifier, or 2 otherwise.
322 */
323
324 cppchar_t
325 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
326 const uchar *limit, int identifier_pos)
327 {
328 cppchar_t result, c;
329 unsigned int length;
330 const uchar *str = *pstr;
331 const uchar *base = str - 2;
332
333 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
334 cpp_error (pfile, DL_WARNING,
335 "universal character names are only valid in C++ and C99");
336 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
337 cpp_error (pfile, DL_WARNING,
338 "the meaning of '\\%c' is different in traditional C",
339 (int) str[-1]);
340
341 if (str[-1] == 'u')
342 length = 4;
343 else if (str[-1] == 'U')
344 length = 8;
345 else
346 abort();
347
348 result = 0;
349 do
350 {
351 c = *str;
352 if (!ISXDIGIT (c))
353 break;
354 str++;
355 result = (result << 4) + hex_value (c);
356 }
357 while (--length && str < limit);
358
359 *pstr = str;
360 if (length)
361 {
362 /* We'll error when we try it out as the start of an identifier. */
363 cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
364 (int) (str - base), base);
365 result = 1;
366 }
367 /* The standard permits $, @ and ` to be specified as UCNs. We use
368 hex escapes so that this also works with EBCDIC hosts. */
369 else if ((result < 0xa0
370 && (result != 0x24 && result != 0x40 && result != 0x60))
371 || (result & 0x80000000)
372 || (result >= 0xD800 && result <= 0xDFFF))
373 {
374 cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
375 (int) (str - base), base);
376 result = 1;
377 }
378 else if (identifier_pos)
379 {
380 int validity = ucn_valid_in_identifier (pfile, result);
381
382 if (validity == 0)
383 cpp_error (pfile, DL_ERROR,
384 "universal character %.*s is not valid in an identifier",
385 (int) (str - base), base);
386 else if (validity == 2 && identifier_pos == 1)
387 cpp_error (pfile, DL_ERROR,
388 "universal character %.*s is not valid at the start of an identifier",
389 (int) (str - base), base);
390 }
391 /* We don't accept UCNs if iconv is not available or will not
392 convert to the target wide character set. */
393 else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
394 {
395 /* XXX should be DL_SORRY */
396 cpp_error (pfile, DL_ERROR,
397 "universal character names are not supported in this configuration");
398 }
399
400
401 if (result == 0)
402 result = 1;
403
404 return result;
405 }
406
407 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
408 it to the execution character set and write the result into TBUF.
409 An advanced pointer is returned. Issues all relevant diagnostics.
410
411 UTF-8 encoding looks like this:
412
413 value range encoded as
414 00000000-0000007F 0xxxxxxx
415 00000080-000007FF 110xxxxx 10xxxxxx
416 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
417 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
418 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
419 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
420
421 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
422 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
423 never occur. Note also that any value that can be encoded by a
424 given row of the table can also be encoded by all successive rows,
425 but this is not done; only the shortest possible encoding for any
426 given value is valid. For instance, the character 07C0 could be
427 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
428 FC 80 80 80 9F 80. Only the first is valid. */
429
430 static const uchar *
431 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
432 struct strbuf *tbuf, bool wide)
433 {
434 int nbytes;
435 uchar buf[6], *p = &buf[6];
436 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
437 cppchar_t ucn;
438
439 from++; /* skip u/U */
440 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
441 if (!ucn)
442 return from;
443
444 nbytes = 1;
445 if (ucn < 0x80)
446 *--p = ucn;
447 else
448 {
449 do
450 {
451 *--p = ((ucn & 0x3F) | 0x80);
452 ucn >>= 6;
453 nbytes++;
454 }
455 while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
456 *--p = (ucn | masks[nbytes-1]);
457 }
458
459 if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
460 p, nbytes, tbuf))
461 cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
462
463 return from;
464 }
465
466 static void
467 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
468 struct strbuf *tbuf, bool wide)
469 {
470 if (wide)
471 {
472 /* We have to render this into the target byte order, which may not
473 be our byte order. */
474 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
475 size_t width = CPP_OPTION (pfile, wchar_precision);
476 size_t cwidth = CPP_OPTION (pfile, char_precision);
477 size_t cmask = width_to_mask (cwidth);
478 size_t nbwc = width / cwidth;
479 size_t i;
480 size_t off = tbuf->len;
481 cppchar_t c;
482
483 if (tbuf->len + nbwc > tbuf->asize)
484 {
485 tbuf->asize += OUTBUF_BLOCK_SIZE;
486 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
487 }
488
489 for (i = 0; i < nbwc; i++)
490 {
491 c = n & cmask;
492 n >>= cwidth;
493 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
494 }
495 tbuf->len += nbwc;
496 }
497 else
498 {
499 if (tbuf->len + 1 > tbuf->asize)
500 {
501 tbuf->asize += OUTBUF_BLOCK_SIZE;
502 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
503 }
504 tbuf->text[tbuf->len++] = n;
505 }
506 }
507
508 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
509 character set and write it into the string buffer TBUF. Returns an
510 advanced pointer, and issues diagnostics as necessary.
511 No character set translation occurs; this routine always produces the
512 execution-set character with numeric value equal to the given hex
513 number. You can, e.g. generate surrogate pairs this way. */
514 static const uchar *
515 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
516 struct strbuf *tbuf, bool wide)
517 {
518 cppchar_t c, n = 0, overflow = 0;
519 int digits_found = 0;
520 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
521 : CPP_OPTION (pfile, char_precision));
522 size_t mask = width_to_mask (width);
523
524 if (CPP_WTRADITIONAL (pfile))
525 cpp_error (pfile, DL_WARNING,
526 "the meaning of '\\x' is different in traditional C");
527
528 from++; /* skip 'x' */
529 while (from < limit)
530 {
531 c = *from;
532 if (! hex_p (c))
533 break;
534 from++;
535 overflow |= n ^ (n << 4 >> 4);
536 n = (n << 4) + hex_value (c);
537 digits_found = 1;
538 }
539
540 if (!digits_found)
541 {
542 cpp_error (pfile, DL_ERROR,
543 "\\x used with no following hex digits");
544 return from;
545 }
546
547 if (overflow | (n != (n & mask)))
548 {
549 cpp_error (pfile, DL_PEDWARN,
550 "hex escape sequence out of range");
551 n &= mask;
552 }
553
554 emit_numeric_escape (pfile, n, tbuf, wide);
555
556 return from;
557 }
558
559 /* Convert an octal escape, pointed to by FROM, to the execution
560 character set and write it into the string buffer TBUF. Returns an
561 advanced pointer, and issues diagnostics as necessary.
562 No character set translation occurs; this routine always produces the
563 execution-set character with numeric value equal to the given octal
564 number. */
565 static const uchar *
566 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
567 struct strbuf *tbuf, bool wide)
568 {
569 size_t count = 0;
570 cppchar_t c, n = 0;
571 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
572 : CPP_OPTION (pfile, char_precision));
573 size_t mask = width_to_mask (width);
574 bool overflow = false;
575
576 while (from < limit && count++ < 3)
577 {
578 c = *from;
579 if (c < '0' || c > '7')
580 break;
581 from++;
582 overflow |= n ^ (n << 3 >> 3);
583 n = (n << 3) + c - '0';
584 }
585
586 if (n != (n & mask))
587 {
588 cpp_error (pfile, DL_PEDWARN,
589 "octal escape sequence out of range");
590 n &= mask;
591 }
592
593 emit_numeric_escape (pfile, n, tbuf, wide);
594
595 return from;
596 }
597
598 /* Convert an escape sequence (pointed to by FROM) to its value on
599 the target, and to the execution character set. Do not scan past
600 LIMIT. Write the converted value into TBUF. Returns an advanced
601 pointer. Handles all relevant diagnostics. */
602 static const uchar *
603 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
604 struct strbuf *tbuf, bool wide)
605 {
606 /* Values of \a \b \e \f \n \r \t \v respectively. */
607 #if HOST_CHARSET == HOST_CHARSET_ASCII
608 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
609 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
610 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
611 #else
612 #error "unknown host character set"
613 #endif
614
615 uchar c;
616
617 c = *from;
618 switch (c)
619 {
620 /* UCNs, hex escapes, and octal escapes are processed separately. */
621 case 'u': case 'U':
622 return convert_ucn (pfile, from, limit, tbuf, wide);
623
624 case 'x':
625 return convert_hex (pfile, from, limit, tbuf, wide);
626 break;
627
628 case '0': case '1': case '2': case '3':
629 case '4': case '5': case '6': case '7':
630 return convert_oct (pfile, from, limit, tbuf, wide);
631
632 /* Various letter escapes. Get the appropriate host-charset
633 value into C. */
634 case '\\': case '\'': case '"': case '?': break;
635
636 case '(': case '{': case '[': case '%':
637 /* '\(', etc, can be used at the beginning of a line in a long
638 string split onto multiple lines with \-newline, to prevent
639 Emacs or other text editors from getting confused. '\%' can
640 be used to prevent SCCS from mangling printf format strings. */
641 if (CPP_PEDANTIC (pfile))
642 goto unknown;
643 break;
644
645 case 'b': c = charconsts[1]; break;
646 case 'f': c = charconsts[3]; break;
647 case 'n': c = charconsts[4]; break;
648 case 'r': c = charconsts[5]; break;
649 case 't': c = charconsts[6]; break;
650 case 'v': c = charconsts[7]; break;
651
652 case 'a':
653 if (CPP_WTRADITIONAL (pfile))
654 cpp_error (pfile, DL_WARNING,
655 "the meaning of '\\a' is different in traditional C");
656 c = charconsts[0];
657 break;
658
659 case 'e': case 'E':
660 if (CPP_PEDANTIC (pfile))
661 cpp_error (pfile, DL_PEDWARN,
662 "non-ISO-standard escape sequence, '\\%c'", (int) c);
663 c = charconsts[2];
664 break;
665
666 default:
667 unknown:
668 if (ISGRAPH (c))
669 cpp_error (pfile, DL_PEDWARN,
670 "unknown escape sequence '\\%c'", (int) c);
671 else
672 cpp_error (pfile, DL_PEDWARN,
673 "unknown escape sequence: '\\%03o'", (int) c);
674 }
675
676 /* Now convert what we have to the execution character set. */
677 if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
678 &c, 1, tbuf))
679 cpp_errno (pfile, DL_ERROR,
680 "converting escape sequence to execution character set");
681
682 return from + 1;
683 }
684 \f
685 /* FROM is an array of cpp_string structures of length COUNT. These
686 are to be converted from the source to the execution character set,
687 escape sequences translated, and finally all are to be
688 concatenated. WIDE indicates whether or not to produce a wide
689 string. The result is written into TO. Returns true for success,
690 false for failure. */
691 bool
692 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
693 cpp_string *to, bool wide)
694 {
695 struct strbuf tbuf;
696 const uchar *p, *base, *limit;
697 size_t i;
698 iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
699
700 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
701 tbuf.text = xmalloc (tbuf.asize);
702 tbuf.len = 0;
703
704 for (i = 0; i < count; i++)
705 {
706 p = from[i].text;
707 if (*p == 'L') p++;
708 p++; /* skip leading quote */
709 limit = from[i].text + from[i].len - 1; /* skip trailing quote */
710
711 for (;;)
712 {
713 base = p;
714 while (p < limit && *p != '\\')
715 p++;
716 if (p > base)
717 {
718 /* We have a run of normal characters; these can be fed
719 directly to convert_cset. */
720 if (!convert_cset (cd, base, p - base, &tbuf))
721 goto fail;
722 }
723 if (p == limit)
724 break;
725
726 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
727 }
728 }
729 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
730 structure. */
731 emit_numeric_escape (pfile, 0, &tbuf, wide);
732 tbuf.text = xrealloc (tbuf.text, tbuf.len);
733 to->text = tbuf.text;
734 to->len = tbuf.len;
735 return true;
736
737 fail:
738 cpp_errno (pfile, DL_ERROR, "converting to execution character set");
739 free (tbuf.text);
740 return false;
741 }
742 \f
743 /* Subroutine of cpp_interpret_charconst which performs the conversion
744 to a number, for narrow strings. STR is the string structure returned
745 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
746 cpp_interpret_charconst. */
747 static cppchar_t
748 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
749 unsigned int *pchars_seen, int *unsignedp)
750 {
751 size_t width = CPP_OPTION (pfile, char_precision);
752 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
753 size_t mask = width_to_mask (width);
754 size_t i;
755 cppchar_t result, c;
756 bool unsigned_p;
757
758 /* The value of a multi-character character constant, or a
759 single-character character constant whose representation in the
760 execution character set is more than one byte long, is
761 implementation defined. This implementation defines it to be the
762 number formed by interpreting the byte sequence in memory as a
763 big-endian binary number. If overflow occurs, the high bytes are
764 lost, and a warning is issued.
765
766 We don't want to process the NUL terminator handed back by
767 cpp_interpret_string. */
768 result = 0;
769 for (i = 0; i < str.len - 1; i++)
770 {
771 c = str.text[i] & mask;
772 if (width < BITS_PER_CPPCHAR_T)
773 result = (result << width) | c;
774 else
775 result = c;
776 }
777
778 if (i > max_chars)
779 {
780 i = max_chars;
781 cpp_error (pfile, DL_WARNING, "character constant too long for its type");
782 }
783 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
784 cpp_error (pfile, DL_WARNING, "multi-character character constant");
785
786 /* Multichar constants are of type int and therefore signed. */
787 if (i > 1)
788 unsigned_p = 0;
789 else
790 unsigned_p = CPP_OPTION (pfile, unsigned_char);
791
792 /* Truncate the constant to its natural width, and simultaneously
793 sign- or zero-extend to the full width of cppchar_t.
794 For single-character constants, the value is WIDTH bits wide.
795 For multi-character constants, the value is INT_PRECISION bits wide. */
796 if (i > 1)
797 width = CPP_OPTION (pfile, int_precision);
798 if (width < BITS_PER_CPPCHAR_T)
799 {
800 mask = ((cppchar_t) 1 << width) - 1;
801 if (unsigned_p || !(result & (1 << (width - 1))))
802 result &= mask;
803 else
804 result |= ~mask;
805 }
806 *pchars_seen = i;
807 *unsignedp = unsigned_p;
808 return result;
809 }
810
811 /* Subroutine of cpp_interpret_charconst which performs the conversion
812 to a number, for wide strings. STR is the string structure returned
813 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
814 cpp_interpret_charconst. */
815 static cppchar_t
816 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
817 unsigned int *pchars_seen, int *unsignedp)
818 {
819 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
820 size_t width = CPP_OPTION (pfile, wchar_precision);
821 size_t cwidth = CPP_OPTION (pfile, char_precision);
822 size_t mask = width_to_mask (width);
823 size_t cmask = width_to_mask (cwidth);
824 size_t nbwc = width / cwidth;
825 size_t off, i;
826 cppchar_t result = 0, c;
827
828 /* This is finicky because the string is in the target's byte order,
829 which may not be our byte order. Only the last character, ignoring
830 the NUL terminator, is relevant. */
831 off = str.len - (nbwc * 2);
832 result = 0;
833 for (i = 0; i < nbwc; i++)
834 {
835 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
836 result = (result << cwidth) | (c & cmask);
837 }
838
839 /* Wide character constants have type wchar_t, and a single
840 character exactly fills a wchar_t, so a multi-character wide
841 character constant is guaranteed to overflow. */
842 if (off > 0)
843 cpp_error (pfile, DL_WARNING, "character constant too long for its type");
844
845 /* Truncate the constant to its natural width, and simultaneously
846 sign- or zero-extend to the full width of cppchar_t. */
847 if (width < BITS_PER_CPPCHAR_T)
848 {
849 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
850 result &= mask;
851 else
852 result |= ~mask;
853 }
854
855 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
856 *pchars_seen = 1;
857 return result;
858 }
859
860 /* Interpret a (possibly wide) character constant in TOKEN.
861 PCHARS_SEEN points to a variable that is filled in with the number
862 of characters seen, and UNSIGNEDP to a variable that indicates
863 whether the result has signed type. */
864 cppchar_t
865 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
866 unsigned int *pchars_seen, int *unsignedp)
867 {
868 cpp_string str = { 0, 0 };
869 bool wide = (token->type == CPP_WCHAR);
870 cppchar_t result;
871
872 /* an empty constant will appear as L'' or '' */
873 if (token->val.str.len == (size_t) (2 + wide))
874 {
875 cpp_error (pfile, DL_ERROR, "empty character constant");
876 return 0;
877 }
878 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
879 return 0;
880
881 if (wide)
882 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
883 else
884 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
885
886 if (str.text != token->val.str.text)
887 free ((void *)str.text);
888
889 return result;
890 }