0ba7e930ab0ab8f584dfd7abb01f9c7296d8db51
1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "coretypes.h"
29 /* Character set handling for C-family languages.
31 Terminological note: In what follows, "charset" or "character set"
32 will be taken to mean both an abstract set of characters and an
33 encoding for that set.
35 The C99 standard discusses two character sets: source and execution.
36 The source character set is used for internal processing in translation
37 phases 1 through 4; the execution character set is used thereafter.
38 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
39 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
40 of these terms). Furthermore, the "basic character set" (listed in
41 5.2.1p3) is to be encoded in each with values one byte wide, and is
42 to appear in the initial shift state.
44 It is not explicitly mentioned, but there is also a "wide execution
45 character set" used to encode wide character constants and wide
46 string literals; this is supposed to be the result of applying the
47 standard library function mbstowcs() to an equivalent narrow string
48 (6.4.5p5). However, the behavior of hexadecimal and octal
49 \-escapes is at odds with this; they are supposed to be translated
50 directly to wchar_t values (6.4.4.4p5,6).
52 The source character set is not necessarily the character set used
53 to encode physical source files on disk; translation phase 1 converts
54 from whatever that encoding is to the source character set.
56 The presence of universal character names in C99 (6.4.3 et seq.)
57 forces the source character set to be isomorphic to ISO 10646,
58 that is, Unicode. There is no such constraint on the execution
59 character set; note also that the conversion from source to
60 execution character set does not occur for identifiers (5.1.1.2p1#5).
62 For convenience of implementation, the source character set's
63 encoding of the basic character set should be identical to the
64 execution character set OF THE HOST SYSTEM's encoding of the basic
65 character set, and it should not be a state-dependent encoding.
67 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
68 depending on whether the host is based on ASCII or EBCDIC (see
69 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
70 Technical Report #16). It relies on the system library's iconv()
71 primitive to do charset conversion (specified in SUSv2). If this
72 primitive is not present, the source and execution character sets
73 must be identical and are limited to the basic ASCII or EBCDIC
74 range, and wide characters are implemented by padding narrow
75 characters to the size of wchar_t. */
/* When the host has no iconv, provide stand-ins for iconv_open(),
   iconv() and iconv_close().  The uses below are guarded only by if
   statements with compile-time constant conditions, so they must
   still compile and link.  Each stand-in fails with the same kind of
   value as the function it replaces: iconv_open() yields (iconv_t)-1
   and iconv() yields (size_t)-1, both after setting errno to EINVAL.  */
#if !HAVE_ICONV
#define iconv_open(x, y)      (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e)      (errno = EINVAL, (size_t)-1)
#define iconv_close(x)        0
#endif
/* Name of the source character set, selected by the host's basic
   character set.  Any other host charset is a build error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#else
#error "Unrecognized basic host character set"
#endif
94 /* This structure is used for a resizable string buffer, mostly by
95 convert_cset and cpp_interpret_string. */
103 /* This is enough to hold any string that fits on a single 80-column
104 line, even if iconv quadruples its size (e.g. conversion from
105 ASCII to UCS-4) rounded up to a power of two. */
106 #define OUTBUF_BLOCK_SIZE 256
108 /* Subroutine of cpp_init_iconv: initialize and return an iconv
109 descriptor for conversion from FROM to TO. If iconv_open() fails,
110 issue an error and return (iconv_t) -1. Silently return
111 (iconv_t) -1 if FROM and TO are identical. */
113 init_iconv_desc (cpp_reader
*pfile
, const char *to
, const char *from
)
117 if (!strcmp (to
, from
))
120 dsc
= iconv_open (to
, from
);
121 if (dsc
== (iconv_t
) -1)
124 cpp_error (pfile
, DL_ERROR
, /* XXX should be DL_SORRY */
125 "conversion from %s to %s not supported by iconv",
128 cpp_errno (pfile
, DL_ERROR
, "iconv_open");
133 /* If charset conversion is requested, initialize iconv(3) descriptors
134 for conversion from the source character set to the execution
135 character sets. If iconv is not present in the C library, and
136 conversion is requested, issue an error. */
139 cpp_init_iconv (cpp_reader
*pfile
)
141 const char *ncset
= CPP_OPTION (pfile
, narrow_charset
);
142 const char *wcset
= CPP_OPTION (pfile
, wide_charset
);
143 const char *default_wcset
;
145 bool be
= CPP_OPTION (pfile
, bytes_big_endian
);
147 if (CPP_OPTION (pfile
, wchar_precision
) >= 32)
148 default_wcset
= be
? "UCS-4BE" : "UCS-4LE";
149 else if (CPP_OPTION (pfile
, wchar_precision
) >= 16)
150 default_wcset
= be
? "UCS-2BE" : "UCS-2LE";
152 /* This effectively means that wide strings are not supported,
153 so don't do any conversion at all. */
154 default_wcset
= SOURCE_CHARSET
;
158 if (ncset
&& strcmp (ncset
, SOURCE_CHARSET
))
159 cpp_error (pfile
, DL_ERROR
, /* XXX should be DL_SORRY */
160 "no iconv implementation, cannot convert to %s", ncset
);
162 if (wcset
&& strcmp (wcset
, default_wcset
))
163 cpp_error (pfile
, DL_ERROR
, /* XXX should be DL_SORRY */
164 "no iconv implementation, cannot convert to %s", wcset
);
169 ncset
= SOURCE_CHARSET
;
171 wcset
= default_wcset
;
173 pfile
->narrow_cset_desc
= init_iconv_desc (pfile
, ncset
, SOURCE_CHARSET
);
174 pfile
->wide_cset_desc
= init_iconv_desc (pfile
, wcset
, SOURCE_CHARSET
);
179 _cpp_destroy_iconv (cpp_reader
*pfile
)
183 if (pfile
->narrow_cset_desc
!= (iconv_t
) -1)
184 iconv_close (pfile
->narrow_cset_desc
);
185 if (pfile
->wide_cset_desc
!= (iconv_t
) -1)
186 iconv_close (pfile
->wide_cset_desc
);
190 /* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
191 according to the iconv descriptor CD. The result is appended to
192 the string buffer TO. If DESC is (iconv_t)-1 or iconv is not
193 available, the string is simply copied into TO.
195 Returns true on success, false on error. */
198 convert_cset (iconv_t cd
, const uchar
*from
, size_t flen
, struct strbuf
*to
)
200 if (!HAVE_ICONV
|| cd
== (iconv_t
)-1)
202 if (to
->len
+ flen
> to
->asize
)
204 to
->asize
= to
->len
+ flen
;
205 to
->text
= xrealloc (to
->text
, to
->asize
);
207 memcpy (to
->text
+ to
->len
, from
, flen
);
213 char *inbuf
, *outbuf
;
214 size_t inbytesleft
, outbytesleft
;
216 /* Reset conversion descriptor and check that it is valid. */
217 if (iconv (cd
, 0, 0, 0, 0) == (size_t)-1)
220 inbuf
= (char *)from
;
222 outbuf
= (char *)to
->text
+ to
->len
;
223 outbytesleft
= to
->asize
- to
->len
;
227 iconv (cd
, &inbuf
, &inbytesleft
, &outbuf
, &outbytesleft
);
228 if (__builtin_expect (inbytesleft
== 0, 1))
230 to
->len
= to
->asize
- outbytesleft
;
236 outbytesleft
+= OUTBUF_BLOCK_SIZE
;
237 to
->asize
+= OUTBUF_BLOCK_SIZE
;
238 to
->text
= xrealloc (to
->text
, to
->asize
);
239 outbuf
= (char *)to
->text
+ to
->asize
- outbytesleft
;
244 /* Utility routine that computes a mask of the form 0000...111... with
247 width_to_mask (size_t width
)
249 width
= MIN (width
, BITS_PER_CPPCHAR_T
);
250 if (width
>= CHAR_BIT
* sizeof (size_t))
253 return ((size_t) 1 << width
) - 1;
258 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
259 the start of an identifier, and 0 if C is not valid in an
260 identifier. We assume C has already gone through the checks of
261 _cpp_valid_ucn. The algorithm is a simple binary search on the
262 table defined in cppucnid.h. */
265 ucn_valid_in_identifier (cpp_reader
*pfile
, cppchar_t c
)
270 mx
= ARRAY_SIZE (ucnranges
);
274 if (c
< ucnranges
[md
].lo
)
276 else if (c
> ucnranges
[md
].hi
)
284 /* When -pedantic, we require the character to have been listed by
285 the standard for the current language. Otherwise, we accept the
286 union of the acceptable sets for C++98 and C99. */
287 if (CPP_PEDANTIC (pfile
)
288 && ((CPP_OPTION (pfile
, c99
) && !(ucnranges
[md
].flags
& C99
))
289 || (CPP_OPTION (pfile
, cplusplus
)
290 && !(ucnranges
[md
].flags
& CXX
))))
293 /* In C99, UCN digits may not begin identifiers. */
294 if (CPP_OPTION (pfile
, c99
) && (ucnranges
[md
].flags
& DIG
))
300 /* [lex.charset]: The character designated by the universal character
301 name \UNNNNNNNN is that character whose character short name in
302 ISO/IEC 10646 is NNNNNNNN; the character designated by the
303 universal character name \uNNNN is that character whose character
304 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
305 for a universal character name is less than 0x20 or in the range
306 0x7F-0x9F (inclusive), or if the universal character name
307 designates a character in the basic source character set, then the
308 program is ill-formed.
310 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
311 buffer end is delimited by a non-hex digit. Returns zero if UCNs
312 are not part of the relevant standard, or if the string beginning
313 at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'.
315 Otherwise the nonzero value of the UCN, whether valid or invalid,
316 is returned. Diagnostics are emitted for invalid values. PSTR
317 is updated to point one beyond the UCN, or to the syntactically
320 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
321 an identifier, or 2 otherwise.
325 _cpp_valid_ucn (cpp_reader
*pfile
, const uchar
**pstr
,
326 const uchar
*limit
, int identifier_pos
)
330 const uchar
*str
= *pstr
;
331 const uchar
*base
= str
- 2;
333 if (!CPP_OPTION (pfile
, cplusplus
) && !CPP_OPTION (pfile
, c99
))
334 cpp_error (pfile
, DL_WARNING
,
335 "universal character names are only valid in C++ and C99");
336 else if (CPP_WTRADITIONAL (pfile
) && identifier_pos
== 0)
337 cpp_error (pfile
, DL_WARNING
,
338 "the meaning of '\\%c' is different in traditional C",
343 else if (str
[-1] == 'U')
355 result
= (result
<< 4) + hex_value (c
);
357 while (--length
&& str
< limit
);
362 /* We'll error when we try it out as the start of an identifier. */
363 cpp_error (pfile
, DL_ERROR
, "incomplete universal character name %.*s",
364 (int) (str
- base
), base
);
367 /* The standard permits $, @ and ` to be specified as UCNs. We use
368 hex escapes so that this also works with EBCDIC hosts. */
369 else if ((result
< 0xa0
370 && (result
!= 0x24 && result
!= 0x40 && result
!= 0x60))
371 || (result
& 0x80000000)
372 || (result
>= 0xD800 && result
<= 0xDFFF))
374 cpp_error (pfile
, DL_ERROR
, "%.*s is not a valid universal character",
375 (int) (str
- base
), base
);
378 else if (identifier_pos
)
380 int validity
= ucn_valid_in_identifier (pfile
, result
);
383 cpp_error (pfile
, DL_ERROR
,
384 "universal character %.*s is not valid in an identifier",
385 (int) (str
- base
), base
);
386 else if (validity
== 2 && identifier_pos
== 1)
387 cpp_error (pfile
, DL_ERROR
,
388 "universal character %.*s is not valid at the start of an identifier",
389 (int) (str
- base
), base
);
391 /* We don't accept UCNs if iconv is not available or will not
392 convert to the target wide character set. */
393 else if (!HAVE_ICONV
|| pfile
->wide_cset_desc
== (iconv_t
) -1)
395 /* XXX should be DL_SORRY */
396 cpp_error (pfile
, DL_ERROR
,
397 "universal character names are not supported in this configuration");
407 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
408 it to the execution character set and write the result into TBUF.
409 An advanced pointer is returned. Issues all relevant diagnostics.
411 UTF-8 encoding looks like this:
413 value range encoded as
414 00000000-0000007F 0xxxxxxx
415 00000080-000007FF 110xxxxx 10xxxxxx
416 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
417 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
418 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
419 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
421 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
422 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
423 never occur. Note also that any value that can be encoded by a
424 given row of the table can also be encoded by all successive rows,
425 but this is not done; only the shortest possible encoding for any
426 given value is valid. For instance, the character 07C0 could be
427 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
428 FC 80 80 80 9F 80. Only the first is valid. */
431 convert_ucn (cpp_reader
*pfile
, const uchar
*from
, const uchar
*limit
,
432 struct strbuf
*tbuf
, bool wide
)
435 uchar buf
[6], *p
= &buf
[6];
436 static const uchar masks
[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
439 from
++; /* skip u/U */
440 ucn
= _cpp_valid_ucn (pfile
, &from
, limit
, 0);
451 *--p
= ((ucn
& 0x3F) | 0x80);
455 while (ucn
>= 0x3F || (ucn
& masks
[nbytes
-1]));
456 *--p
= (ucn
| masks
[nbytes
-1]);
459 if (!convert_cset (wide
? pfile
->wide_cset_desc
: pfile
->narrow_cset_desc
,
461 cpp_errno (pfile
, DL_ERROR
, "converting UCN to execution character set");
467 emit_numeric_escape (cpp_reader
*pfile
, cppchar_t n
,
468 struct strbuf
*tbuf
, bool wide
)
472 /* We have to render this into the target byte order, which may not
473 be our byte order. */
474 bool bigend
= CPP_OPTION (pfile
, bytes_big_endian
);
475 size_t width
= CPP_OPTION (pfile
, wchar_precision
);
476 size_t cwidth
= CPP_OPTION (pfile
, char_precision
);
477 size_t cmask
= width_to_mask (cwidth
);
478 size_t nbwc
= width
/ cwidth
;
480 size_t off
= tbuf
->len
;
483 if (tbuf
->len
+ nbwc
> tbuf
->asize
)
485 tbuf
->asize
+= OUTBUF_BLOCK_SIZE
;
486 tbuf
->text
= xrealloc (tbuf
->text
, tbuf
->asize
);
489 for (i
= 0; i
< nbwc
; i
++)
493 tbuf
->text
[off
+ (bigend
? nbwc
- i
- 1 : i
)] = c
;
499 if (tbuf
->len
+ 1 > tbuf
->asize
)
501 tbuf
->asize
+= OUTBUF_BLOCK_SIZE
;
502 tbuf
->text
= xrealloc (tbuf
->text
, tbuf
->asize
);
504 tbuf
->text
[tbuf
->len
++] = n
;
508 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
509 character set and write it into the string buffer TBUF. Returns an
510 advanced pointer, and issues diagnostics as necessary.
511 No character set translation occurs; this routine always produces the
512 execution-set character with numeric value equal to the given hex
513 number. You can, e.g. generate surrogate pairs this way. */
515 convert_hex (cpp_reader
*pfile
, const uchar
*from
, const uchar
*limit
,
516 struct strbuf
*tbuf
, bool wide
)
518 cppchar_t c
, n
= 0, overflow
= 0;
519 int digits_found
= 0;
520 size_t width
= (wide
? CPP_OPTION (pfile
, wchar_precision
)
521 : CPP_OPTION (pfile
, char_precision
));
522 size_t mask
= width_to_mask (width
);
524 if (CPP_WTRADITIONAL (pfile
))
525 cpp_error (pfile
, DL_WARNING
,
526 "the meaning of '\\x' is different in traditional C");
528 from
++; /* skip 'x' */
535 overflow
|= n
^ (n
<< 4 >> 4);
536 n
= (n
<< 4) + hex_value (c
);
542 cpp_error (pfile
, DL_ERROR
,
543 "\\x used with no following hex digits");
547 if (overflow
| (n
!= (n
& mask
)))
549 cpp_error (pfile
, DL_PEDWARN
,
550 "hex escape sequence out of range");
554 emit_numeric_escape (pfile
, n
, tbuf
, wide
);
559 /* Convert an octal escape, pointed to by FROM, to the execution
560 character set and write it into the string buffer TBUF. Returns an
561 advanced pointer, and issues diagnostics as necessary.
562 No character set translation occurs; this routine always produces the
563 execution-set character with numeric value equal to the given octal
566 convert_oct (cpp_reader
*pfile
, const uchar
*from
, const uchar
*limit
,
567 struct strbuf
*tbuf
, bool wide
)
571 size_t width
= (wide
? CPP_OPTION (pfile
, wchar_precision
)
572 : CPP_OPTION (pfile
, char_precision
));
573 size_t mask
= width_to_mask (width
);
574 bool overflow
= false;
576 while (from
< limit
&& count
++ < 3)
579 if (c
< '0' || c
> '7')
582 overflow
|= n
^ (n
<< 3 >> 3);
583 n
= (n
<< 3) + c
- '0';
588 cpp_error (pfile
, DL_PEDWARN
,
589 "octal escape sequence out of range");
593 emit_numeric_escape (pfile
, n
, tbuf
, wide
);
598 /* Convert an escape sequence (pointed to by FROM) to its value on
599 the target, and to the execution character set. Do not scan past
600 LIMIT. Write the converted value into TBUF. Returns an advanced
601 pointer. Handles all relevant diagnostics. */
603 convert_escape (cpp_reader
*pfile
, const uchar
*from
, const uchar
*limit
,
604 struct strbuf
*tbuf
, bool wide
)
606 /* Values of \a \b \e \f \n \r \t \v respectively. */
607 #if HOST_CHARSET == HOST_CHARSET_ASCII
608 static const uchar charconsts
[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
609 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
610 static const uchar charconsts
[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
612 #error "unknown host character set"
620 /* UCNs, hex escapes, and octal escapes are processed separately. */
622 return convert_ucn (pfile
, from
, limit
, tbuf
, wide
);
625 return convert_hex (pfile
, from
, limit
, tbuf
, wide
);
628 case '0': case '1': case '2': case '3':
629 case '4': case '5': case '6': case '7':
630 return convert_oct (pfile
, from
, limit
, tbuf
, wide
);
632 /* Various letter escapes. Get the appropriate host-charset
634 case '\\': case '\'': case '"': case '?': break;
636 case '(': case '{': case '[': case '%':
637 /* '\(', etc, can be used at the beginning of a line in a long
638 string split onto multiple lines with \-newline, to prevent
639 Emacs or other text editors from getting confused. '\%' can
640 be used to prevent SCCS from mangling printf format strings. */
641 if (CPP_PEDANTIC (pfile
))
645 case 'b': c
= charconsts
[1]; break;
646 case 'f': c
= charconsts
[3]; break;
647 case 'n': c
= charconsts
[4]; break;
648 case 'r': c
= charconsts
[5]; break;
649 case 't': c
= charconsts
[6]; break;
650 case 'v': c
= charconsts
[7]; break;
653 if (CPP_WTRADITIONAL (pfile
))
654 cpp_error (pfile
, DL_WARNING
,
655 "the meaning of '\\a' is different in traditional C");
660 if (CPP_PEDANTIC (pfile
))
661 cpp_error (pfile
, DL_PEDWARN
,
662 "non-ISO-standard escape sequence, '\\%c'", (int) c
);
669 cpp_error (pfile
, DL_PEDWARN
,
670 "unknown escape sequence '\\%c'", (int) c
);
672 cpp_error (pfile
, DL_PEDWARN
,
673 "unknown escape sequence: '\\%03o'", (int) c
);
676 /* Now convert what we have to the execution character set. */
677 if (!convert_cset (wide
? pfile
->wide_cset_desc
: pfile
->narrow_cset_desc
,
679 cpp_errno (pfile
, DL_ERROR
,
680 "converting escape sequence to execution character set");
685 /* FROM is an array of cpp_string structures of length COUNT. These
686 are to be converted from the source to the execution character set,
687 escape sequences translated, and finally all are to be
688 concatenated. WIDE indicates whether or not to produce a wide
689 string. The result is written into TO. Returns true for success,
690 false for failure. */
692 cpp_interpret_string (cpp_reader
*pfile
, const cpp_string
*from
, size_t count
,
693 cpp_string
*to
, bool wide
)
696 const uchar
*p
, *base
, *limit
;
698 iconv_t cd
= wide
? pfile
->wide_cset_desc
: pfile
->narrow_cset_desc
;
700 tbuf
.asize
= MAX (OUTBUF_BLOCK_SIZE
, from
->len
);
701 tbuf
.text
= xmalloc (tbuf
.asize
);
704 for (i
= 0; i
< count
; i
++)
708 p
++; /* skip leading quote */
709 limit
= from
[i
].text
+ from
[i
].len
- 1; /* skip trailing quote */
714 while (p
< limit
&& *p
!= '\\')
718 /* We have a run of normal characters; these can be fed
719 directly to convert_cset. */
720 if (!convert_cset (cd
, base
, p
- base
, &tbuf
))
726 p
= convert_escape (pfile
, p
+ 1, limit
, &tbuf
, wide
);
729 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
731 emit_numeric_escape (pfile
, 0, &tbuf
, wide
);
732 tbuf
.text
= xrealloc (tbuf
.text
, tbuf
.len
);
733 to
->text
= tbuf
.text
;
738 cpp_errno (pfile
, DL_ERROR
, "converting to execution character set");
743 /* Subroutine of cpp_interpret_charconst which performs the conversion
744 to a number, for narrow strings. STR is the string structure returned
745 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
746 cpp_interpret_charconst. */
748 narrow_str_to_charconst (cpp_reader
*pfile
, cpp_string str
,
749 unsigned int *pchars_seen
, int *unsignedp
)
751 size_t width
= CPP_OPTION (pfile
, char_precision
);
752 size_t max_chars
= CPP_OPTION (pfile
, int_precision
) / width
;
753 size_t mask
= width_to_mask (width
);
758 /* The value of a multi-character character constant, or a
759 single-character character constant whose representation in the
760 execution character set is more than one byte long, is
761 implementation defined. This implementation defines it to be the
762 number formed by interpreting the byte sequence in memory as a
763 big-endian binary number. If overflow occurs, the high bytes are
764 lost, and a warning is issued.
766 We don't want to process the NUL terminator handed back by
767 cpp_interpret_string. */
769 for (i
= 0; i
< str
.len
- 1; i
++)
771 c
= str
.text
[i
] & mask
;
772 if (width
< BITS_PER_CPPCHAR_T
)
773 result
= (result
<< width
) | c
;
781 cpp_error (pfile
, DL_WARNING
, "character constant too long for its type");
783 else if (i
> 1 && CPP_OPTION (pfile
, warn_multichar
))
784 cpp_error (pfile
, DL_WARNING
, "multi-character character constant");
786 /* Multichar constants are of type int and therefore signed. */
790 unsigned_p
= CPP_OPTION (pfile
, unsigned_char
);
792 /* Truncate the constant to its natural width, and simultaneously
793 sign- or zero-extend to the full width of cppchar_t.
794 For single-character constants, the value is WIDTH bits wide.
795 For multi-character constants, the value is INT_PRECISION bits wide. */
797 width
= CPP_OPTION (pfile
, int_precision
);
798 if (width
< BITS_PER_CPPCHAR_T
)
800 mask
= ((cppchar_t
) 1 << width
) - 1;
801 if (unsigned_p
|| !(result
& (1 << (width
- 1))))
807 *unsignedp
= unsigned_p
;
811 /* Subroutine of cpp_interpret_charconst which performs the conversion
812 to a number, for wide strings. STR is the string structure returned
813 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
814 cpp_interpret_charconst. */
816 wide_str_to_charconst (cpp_reader
*pfile
, cpp_string str
,
817 unsigned int *pchars_seen
, int *unsignedp
)
819 bool bigend
= CPP_OPTION (pfile
, bytes_big_endian
);
820 size_t width
= CPP_OPTION (pfile
, wchar_precision
);
821 size_t cwidth
= CPP_OPTION (pfile
, char_precision
);
822 size_t mask
= width_to_mask (width
);
823 size_t cmask
= width_to_mask (cwidth
);
824 size_t nbwc
= width
/ cwidth
;
826 cppchar_t result
= 0, c
;
828 /* This is finicky because the string is in the target's byte order,
829 which may not be our byte order. Only the last character, ignoring
830 the NUL terminator, is relevant. */
831 off
= str
.len
- (nbwc
* 2);
833 for (i
= 0; i
< nbwc
; i
++)
835 c
= bigend
? str
.text
[off
+ i
] : str
.text
[off
+ nbwc
- i
- 1];
836 result
= (result
<< cwidth
) | (c
& cmask
);
839 /* Wide character constants have type wchar_t, and a single
840 character exactly fills a wchar_t, so a multi-character wide
841 character constant is guaranteed to overflow. */
843 cpp_error (pfile
, DL_WARNING
, "character constant too long for its type");
845 /* Truncate the constant to its natural width, and simultaneously
846 sign- or zero-extend to the full width of cppchar_t. */
847 if (width
< BITS_PER_CPPCHAR_T
)
849 if (CPP_OPTION (pfile
, unsigned_wchar
) || !(result
& (1 << (width
- 1))))
855 *unsignedp
= CPP_OPTION (pfile
, unsigned_wchar
);
860 /* Interpret a (possibly wide) character constant in TOKEN.
861 PCHARS_SEEN points to a variable that is filled in with the number
862 of characters seen, and UNSIGNEDP to a variable that indicates
863 whether the result has signed type. */
865 cpp_interpret_charconst (cpp_reader
*pfile
, const cpp_token
*token
,
866 unsigned int *pchars_seen
, int *unsignedp
)
868 cpp_string str
= { 0, 0 };
869 bool wide
= (token
->type
== CPP_WCHAR
);
872 /* an empty constant will appear as L'' or '' */
873 if (token
->val
.str
.len
== (size_t) (2 + wide
))
875 cpp_error (pfile
, DL_ERROR
, "empty character constant");
878 else if (!cpp_interpret_string (pfile
, &token
->val
.str
, 1, &str
, wide
))
882 result
= wide_str_to_charconst (pfile
, str
, pchars_seen
, unsignedp
);
884 result
= narrow_str_to_charconst (pfile
, str
, pchars_seen
, unsignedp
);
886 if (str
.text
!= token
->val
.str
.text
)
887 free ((void *)str
.text
);