cpphash.h (U): New define, to correct type of string constants.
[gcc.git] / gcc / mbchar.c
1 /* Multibyte Character Functions.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3
4 This file is part of GNU CC.
5
6 GNU CC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 GNU CC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GNU CC; see the file COPYING. If not, write to
18 the Free Software Foundation, 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21 /* Note regarding cross compilation:
22
23 In general, translation of multibyte characters to wide characters can
24 only work in a native compiler since the translation function (mbtowc)
25 needs to know about both the source and target character encoding. However,
26 this particular implementation for JIS, SJIS and EUCJP source characters
27 will work for any compiler with a newlib target. Other targets may also
28 work provided that their wchar_t implementation is 2 bytes and the encoding
29 leaves the source character values unchanged (except for removing the
30 state shifting markers). */
31
32 #ifdef MULTIBYTE_CHARS
33 #include "config.h"
34 #include "system.h"
35 #include "mbchar.h"
36 #include <locale.h>
37
/* Class of one input byte, as assigned by the "C-JIS" decoder in
   local_mbtowc.  The class is the column index into the JIS state and
   action tables.  JIS_C_NUM is not a class; it is the number of
   classes (the column count of those tables).  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte for which ISJIS is true */
  OTHER,        /* every remaining byte */
  JIS_C_NUM
} JIS_CHAR_TYPE;
40
/* Decoder state for the "C-JIS" encoding.  The state is the row index
   into the JIS state and action tables.  JIS_S_NUM is not a state; it
   is the number of states (the row count of those tables).  */
typedef enum
{
  ASCII,        /* single-byte mode; also the initial state */
  A_ESC,        /* single-byte mode, ESC seen */
  A_ESC_DL,     /* single-byte mode, ESC '$' seen */
  JIS,          /* two-byte mode, at a character boundary */
  JIS_1,        /* two-byte mode, first byte seen */
  JIS_2,        /* two-byte mode, both bytes seen */
  J_ESC,        /* from JIS, ESC seen */
  J_ESC_BR,     /* from JIS, ESC '(' seen */
  J2_ESC,       /* from JIS_2, ESC seen */
  J2_ESC_BR,    /* from JIS_2, ESC '(' seen */
  INV,          /* invalid input sequence */
  JIS_S_NUM
} JIS_STATE;
43
/* What the "C-JIS" decoder in local_mbtowc does after classifying one
   input byte.  "Pending position" below is the decoder's pointer to the
   first byte of the character being assembled.  */
typedef enum
{
  COPYA,        /* deliver the single byte at the pending position;
                   the current byte is consumed */
  COPYJ,        /* deliver the two bytes at the pending position;
                   the current byte is consumed */
  COPYJ2,       /* deliver the two bytes at the pending position;
                   the current byte is left for the next call */
  MAKE_A,       /* move the pending position just past the current byte
                   (handled identically to MAKE_J) */
  MAKE_J,       /* move the pending position just past the current byte */
  NOOP,         /* nothing to do yet; read the next byte */
  EMPTY,        /* deliver a null wide character */
  ERROR         /* invalid sequence; the decoder returns -1 */
} JIS_ACTION;
46
47 /* State/action tables for processing JIS encoding:
48
49 Where possible, switches to JIS are grouped with proceding JIS characters
50 and switches to ASCII are grouped with preceding JIS characters.
51 Thus, maximum returned length is:
52 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6. */
53
54 static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
55 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/
56 /*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
57 /*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
58 /*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII},
59 /*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV },
60 /*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV },
61 /*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS },
62 /*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
63 /*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
64 /*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV },
65 /*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
66 };
67
68 static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
69 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */
70 /*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA},
71 /*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA},
72 /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
73 /*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
74 /*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
75 /*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
76 /*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
77 /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR},
78 /*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
79 /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR},
80 };
81
82
/* Name of the codeset that selects how local_mbtowc decodes its input
   and what local_mb_cur_max reports.  Those functions recognize
   "C-SJIS", "C-EUCJP" and "C-JIS"; a null pointer or a name of at most
   one character is treated as the "C" locale or an unknown locale.
   NOTE(review): nothing in this file assigns it -- presumably it is set
   by code elsewhere in the compiler; confirm against the callers.  */
const char *literal_codeset = NULL;
84
85 /* Store into *PWC (if PWC is not null) the wide character
86 corresponding to the multibyte character at the start of the
87 buffer S of size N. Return the number of bytes in the multibyte
88 character. Return -1 if the bytes do not form a valid character,
89 or 0 if S is null or points to a null byte.
90
91 This function behaves like the Standard C function mbtowc, except
92 it treats locale names of the form "C-..." specially. */
93
94 int
95 local_mbtowc (pwc, s, n)
96 wchar_t *pwc;
97 const char *s;
98 size_t n;
99 {
100 static JIS_STATE save_state = ASCII;
101 JIS_STATE curr_state = save_state;
102 const unsigned char *t = (const unsigned char *) s;
103
104 if (s != NULL && n == 0)
105 return -1;
106
107 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
108 /* This must be the "C" locale or unknown locale -- fall thru */
109 ;
110 else if (! strcmp (literal_codeset, "C-SJIS"))
111 {
112 int char1;
113 if (s == NULL)
114 /* Not state-dependent. */
115 return 0;
116
117 char1 = *t;
118 if (ISSJIS1 (char1))
119 {
120 int char2 = t[1];
121
122 if (n <= 1)
123 return -1;
124
125 if (ISSJIS2 (char2))
126 {
127 if (pwc != NULL)
128 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129 return 2;
130 }
131
132 return -1;
133 }
134
135 if (pwc != NULL)
136 *pwc = (wchar_t) *t;
137
138 if (*t == '\0')
139 return 0;
140
141 return 1;
142 }
143 else if (! strcmp (literal_codeset, "C-EUCJP"))
144 {
145 int char1;
146
147 if (s == NULL)
148 /* Not state-dependent. */
149 return 0;
150
151 char1 = *t;
152 if (ISEUCJP (char1))
153 {
154 int char2 = t[1];
155
156 if (n <= 1)
157 return -1;
158
159 if (ISEUCJP (char2))
160 {
161 if (pwc != NULL)
162 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163 return 2;
164 }
165
166 return -1;
167 }
168
169 if (pwc != NULL)
170 *pwc = (wchar_t) *t;
171
172 if (*t == '\0')
173 return 0;
174
175 return 1;
176 }
177 else if (! strcmp (literal_codeset, "C-JIS"))
178 {
179 JIS_ACTION action;
180 JIS_CHAR_TYPE ch;
181 const unsigned char *ptr;
182 size_t i, curr_ch;
183
184 if (s == NULL)
185 {
186 save_state = ASCII;
187 /* State-dependent. */
188 return 1;
189 }
190
191 ptr = t;
192
193 for (i = 0; i < n; i++)
194 {
195 curr_ch = t[i];
196 switch (curr_ch)
197 {
198 case JIS_ESC_CHAR:
199 ch = ESCAPE;
200 break;
201 case '$':
202 ch = DOLLAR;
203 break;
204 case '@':
205 ch = AT;
206 break;
207 case '(':
208 ch = BRACKET;
209 break;
210 case 'B':
211 ch = B;
212 break;
213 case 'J':
214 ch = J;
215 break;
216 case '\0':
217 ch = NUL;
218 break;
219 default:
220 if (ISJIS (curr_ch))
221 ch = JIS_CHAR;
222 else
223 ch = OTHER;
224 }
225
226 action = JIS_action_table[curr_state][ch];
227 curr_state = JIS_state_table[curr_state][ch];
228
229 switch (action)
230 {
231 case NOOP:
232 break;
233
234 case EMPTY:
235 if (pwc != NULL)
236 *pwc = (wchar_t) 0;
237
238 save_state = curr_state;
239 return i;
240
241 case COPYA:
242 if (pwc != NULL)
243 *pwc = (wchar_t) *ptr;
244 save_state = curr_state;
245 return i + 1;
246
247 case COPYJ:
248 if (pwc != NULL)
249 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
250
251 save_state = curr_state;
252 return i + 1;
253
254 case COPYJ2:
255 if (pwc != NULL)
256 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
257
258 save_state = curr_state;
259 return ptr - t + 2;
260
261 case MAKE_A:
262 case MAKE_J:
263 ptr = (const unsigned char *) (t + i + 1);
264 break;
265
266 case ERROR:
267 default:
268 return -1;
269 }
270 }
271
272 /* More than n bytes needed. */
273 return -1;
274 }
275
276 #ifdef CROSS_COMPILE
277 if (s == NULL)
278 /* Not state-dependent. */
279 return 0;
280
281 if (pwc != NULL)
282 *pwc = *s;
283 return 1;
284 #else
285
286 /* This must be the "C" locale or unknown locale. */
287 return mbtowc (pwc, s, n);
288 #endif
289 }
290
/* Return the number of bytes in the multibyte character at the start
   of the buffer S of size N, exactly as local_mbtowc would, but
   without storing the wide character anywhere.  The result is
   whatever local_mbtowc returns when given a null PWC, including its
   -1 and 0 results and its handling of a null S.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null destination makes local_mbtowc compute the length only.  */
  wchar_t *discard = NULL;

  return local_mbtowc (discard, s, n);
}
305
306 /* Return the maximum mumber of bytes in a multibyte character.
307
308 This function returns the same value as the Standard C macro MB_CUR_MAX,
309 except it treats locale names of the form "C-..." specially. */
310
311 int
312 local_mb_cur_max ()
313 {
314 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
315 ;
316 else if (! strcmp (literal_codeset, "C-SJIS"))
317 return 2;
318 else if (! strcmp (literal_codeset, "C-EUCJP"))
319 return 2;
320 else if (! strcmp (literal_codeset, "C-JIS"))
321 return 8; /* 3 + 2 + 3 */
322
323 #ifdef CROSS_COMPILE
324 return 1;
325 #else
326 if (MB_CUR_MAX > 0)
327 return MB_CUR_MAX;
328
329 return 1; /* default */
330 #endif
331 }
332 #else /* MULTIBYTE_CHARS */
/* Without MULTIBYTE_CHARS nothing above is compiled, so give the
   translation unit one declaration to silence the
   'ANSI C forbids an empty source file' warning.  */
extern int dummy;
334 #endif /* MULTIBYTE_CHARS */