cppcharset.c (one_iso88591_to_utf8): New function.

author Eric Christopher <echristo@gcc.gnu.org>

Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)

committer Eric Christopher <echristo@gcc.gnu.org>

Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)
author Eric Christopher <echristo@gcc.gnu.org>
Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)
committer Eric Christopher <echristo@gcc.gnu.org>
Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index d7afc1937c2db4ace8100f2120409c5097799629..8a3d1fdfaca3220156d809f77cc71c3f7aff0a33 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2004-01-16  Eric Christopher  <echristo@redhat.com>
+           Chandrakala Chavva <cchavva@redhat.com>
+
+       * cppcharset.c (one_iso88591_to_utf8): New function.
+       (convert_iso88591_utf8): Ditto. Use.
+       (conversion_tab): Use.
+       (_cpp_input_to_utf8): New function.
+       (_cpp_init_iconv_buffer): Ditto.
+       (_cpp_close_iconv_buffer): Ditto.
+       * cpphash.h: Prototype new functions.
+       (cpp_buffer): Add input_cset_desc.
+       * cppinit.c: Add input_charset default.
+       * cpplib.c (cpp_push_buffer): Support init and
+       close of iconv.
+       * cpplib.h (cpp_options): Add input_charset.
+
  2004-01-16  Kazu Hirata  <kazu@cs.umass.edu>
  
         * system.h (ASM_OUTPUT_SECTION_NAME): Poison.
@@ -14,23 +30,23 @@
         * fixinc/tests/base/sys/stat.h: Adapt for new hackname.
  
         * fixinc/inclhack.def (alpha___extern_prefix,
-       alpha___extern_prefix_standards): New hacks to obey 
+       alpha___extern_prefix_standards): New hacks to obey
         __PRAGMA_EXTERN_PREFIX.
         * fixinc/tests/base/testing.h [ALPHA___EXTERN_PREFIX_CHECK]: New
         test.
         * fixinc/tests/base/standards.h: Likewise.
-       
+
         * fixincl/inclhack.def (alpha_pthread): Tweak to match more
         variations.
         New testcase.
         * fixinc/tests/base/pthread.h: Handle it.
-       
+
         * fixincl/inclhack.def (bad_lval): Sort file list.
         Add many missing files up to Tru64 UNIX V5.1B.
         * gcc/fixinc/tests/base/libgen.h: Renamed to ...
         * gcc/fixinc/tests/base/dirent.h: ... this to match new file list
         order.
-       
+
         * fixinc/fixincl.x: Regenerate.
  
  2004-01-16  Mark Mitchell  <mark@codesourcery.com>
diff --git a/gcc/cppcharset.c b/gcc/cppcharset.c

index 1b2d0b2a091ec7ecbfb5f2ec22db018e48fb4ad4..5070366e3a8cfdfe0d4f508068b64cf9d16c2ba9 100644 (file)
--- a/gcc/cppcharset.c
+++ b/gcc/cppcharset.c
@@ -170,7 +170,7 @@ one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
  {
    static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
    static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-  
+
    cppchar_t c;
    const uchar *inbuf = *inbufp;
    size_t nbytes, i;
@@ -274,7 +274,7 @@ one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
     The return value is either 0 for success, or an errno value for
     failure, which may be E2BIG (need more space), EILSEQ (ill-formed
     input sequence), or EINVAL (incomplete input sequence).  */
-   
+
  static inline int
  one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
                    uchar **outbufp, size_t *outbytesleftp)
@@ -446,6 +446,31 @@ one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
    return 0;
  }
  
+/* Convert one ISO 8859-1 character to UTF-8.  The first 256 code
+   points of ISO 8859.1 have the same numeric values as the first 256
+   code points of Unicode, therefore the incoming ISO 8859.1 byte can
+   be passed directly to one_cppchar_to_utf8 (which expects a Unicode
+   value).
+
+   On success, *INBUFP and *INBYTESLEFTP are advanced past the single
+   byte consumed and 0 is returned.  Returns EINVAL if no input is
+   left, otherwise whatever error one_cppchar_to_utf8 reports; in
+   either failure case the input pointer and count are left
+   untouched.  BIGEND is unused; it is only present so the signature
+   matches the other one_* converters handed to conversion_loop.  */
+
+static int
+one_iso88591_to_utf8 (iconv_t bigend ATTRIBUTE_UNUSED, const uchar **inbufp,
+                     size_t *inbytesleftp, uchar **outbufp, size_t *outbytesleftp)
+{
+  const uchar *inbuf = *inbufp;
+  int rval;
+
+  /* Exactly one input byte is consumed per call, so the only invalid
+     situation is having none left.  (Testing for "> 1" here would
+     reject every multi-byte input and let an empty input through to
+     read past the buffer and wrap the size_t counter below.)  */
+  if (*inbytesleftp < 1)
+    return EINVAL;
+
+  rval = one_cppchar_to_utf8 ((cppchar_t)*inbuf, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  /* Only commit the input position once the output has been written.  */
+  *inbufp += 1;
+  *inbytesleftp -= 1;
+
+  return 0;
+}
+
+
  /* Helper routine for the next few functions.  The 'const' on
     one_conversion means that we promise not to modify what function is
     pointed to, which lets the inliner see through it.  */
@@ -489,7 +514,7 @@ conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
        outbuf = to->text + to->asize - outbytesleft;
      }
  }
-                
+
  
  /* These functions convert entire strings between character sets.
     They all have the signature
@@ -529,6 +554,14 @@ convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
    return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
  }
  
+/* Convert the FLEN bytes at FROM from ISO 8859-1 to UTF-8, storing
+   the result in TO, by running conversion_loop with
+   one_iso88591_to_utf8 as the per-character converter.  CD is passed
+   through to conversion_loop unchanged.  Returns whatever
+   conversion_loop returns.  */
+static bool
+convert_iso88591_utf8 (iconv_t cd, const uchar *from, size_t flen,
+                       struct _cpp_strbuf *to)
+{
+  return conversion_loop (one_iso88591_to_utf8, cd, from, flen, to);
+}
+
+
  /* Identity conversion, used when we have no alternative.  */
  static bool
  convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -606,6 +639,7 @@ static const struct conversion conversion_tab[] = {
    { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
    { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
    { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+  { "ISO-8859-1/UTF-8", convert_iso88591_utf8, (iconv_t)0 },
  };
  
  /* Subroutine of cpp_init_iconv: initialize and return a
@@ -619,7 +653,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
    struct cset_converter ret;
    char *pair;
    size_t i;
-  
+
    if (!strcasecmp (to, from))
      {
        ret.func = convert_no_conversion;
@@ -649,7 +683,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
        if (ret.cd == (iconv_t) -1)
         {
           if (errno == EINVAL)
-           cpp_error (pfile, CPP_DL_ERROR, /* XXX should be DL_SORRY */
+           cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
                        "conversion from %s to %s not supported by iconv",
                        from, to);
           else
@@ -660,7 +694,7 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
      }
    else
      {
-      cpp_error (pfile, CPP_DL_ERROR, /* XXX should be DL_SORRY */
+      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
                  "no iconv implementation, cannot convert from %s to %s",
                  from, to);
        ret.func = convert_no_conversion;
@@ -1270,7 +1304,7 @@ narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
    *unsignedp = unsigned_p;
    return result;
  }
-                        
+
  /* Subroutine of cpp_interpret_charconst which performs the conversion
     to a number, for wide strings.  STR is the string structure returned
     by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
@@ -1352,3 +1386,46 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
  
    return result;
  }
+
+/* Convert the LENGTH bytes at INPUT using the converter stored in the
+   current buffer (pfile->buffer->input_cset_desc) and return the
+   converted text in a freshly xmalloc'd buffer, followed by a
+   newline.  If the conversion fails, an error is issued, the
+   temporary buffer is released and NULL is returned.  On success the
+   returned buffer is not freed here.  */
+
+uchar *
+_cpp_input_to_utf8 (cpp_reader *pfile, const uchar *input, cppchar_t length)
+{
+  struct _cpp_strbuf tbuf;
+  struct cset_converter cvt = pfile->buffer->input_cset_desc;
+  size_t nl_pos;
+
+  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, length);
+  tbuf.text = xmalloc (tbuf.asize);
+  tbuf.len = 0;
+
+  if (!APPLY_CONVERSION (cvt, input, length, &tbuf))
+    {
+      cpp_error (pfile, CPP_DL_ERROR,
+                 "converting input to source character set.");
+      /* Do not leak the partially filled output buffer.  */
+      free (tbuf.text);
+      return NULL;
+    }
+
+  /* The newline goes right after the converted text, or at the very
+     start of the buffer when there was no input at all.  */
+  nl_pos = length ? tbuf.len : 0;
+
+  /* The converter may leave the buffer exactly full (asize starts out
+     as LENGTH for large inputs), so make sure there is room for the
+     terminating newline before storing it.  */
+  if (nl_pos >= tbuf.asize)
+    {
+      tbuf.asize = nl_pos + 1;
+      tbuf.text = xrealloc (tbuf.text, tbuf.asize);
+    }
+
+  tbuf.text[nl_pos] = '\n';
+
+  return tbuf.text;
+}
+
+/* Initialize the input character set converter of the current buffer
+   (pfile->buffer): the descriptor returned by init_iconv_desc for
+   converting from the character set named FROM to SOURCE_CHARSET is
+   stored in the buffer's input_cset_desc field.  No detection of the
+   input file format is done here; at present the input file is
+   assumed to be in iso-8859-1 format, to be converted to the source
+   character set format (UTF-8).  */
+
+void
+_cpp_init_iconv_buffer (cpp_reader *pfile, const char *from)
+{
+  pfile->buffer->input_cset_desc = init_iconv_desc (pfile, SOURCE_CHARSET,
+                                                   from);
+}
+
+/* Release the iconv descriptor held by the current buffer's input
+   converter.  Nothing is done unless iconv is available and the
+   converter's function is convert_using_iconv, i.e. unless the
+   descriptor actually came from iconv.  */
+void
+_cpp_close_iconv_buffer (cpp_reader *pfile)
+{
+  /* Without iconv there can be no descriptor to close.  */
+  if (!HAVE_ICONV)
+    return;
+
+  /* Other converters do not own an iconv descriptor.  */
+  if (pfile->buffer->input_cset_desc.func != convert_using_iconv)
+    return;
+
+  iconv_close (pfile->buffer->input_cset_desc.cd);
+}
diff --git a/gcc/cpphash.h b/gcc/cpphash.h

index 80cb04c5f529499a2dc1857b48f2958b3f7fc54a..6c13ea1c0b17254dce06070ce0c5eaf43d5c699b 100644 (file)
--- a/gcc/cpphash.h
+++ b/gcc/cpphash.h
@@ -270,7 +270,7 @@ struct cpp_buffer
    const uchar *cur;            /* Current location.  */
    const uchar *line_base;      /* Start of current physical line.  */
    const uchar *next_line;      /* Start of to-be-cleaned logical line.  */
-  
+
    const uchar *buf;            /* Entire character buffer.  */
    const uchar *rlimit;         /* Writable byte at end of file.  */
  
@@ -313,6 +313,10 @@ struct cpp_buffer
  
    /* Used for buffer overlays by cpptrad.c.  */
    const uchar *saved_cur, *saved_rlimit;
+
+  /* Descriptor for converting from the input character set to the
+     source character set.  */
+  struct cset_converter input_cset_desc;
  };
  
  /* A cpp_reader encapsulates the "state" of a pre-processor run.
@@ -557,6 +561,9 @@ extern void _cpp_init_internal_pragmas (cpp_reader *);
  extern void _cpp_do_file_change (cpp_reader *, enum lc_reason, const char *,
                                  unsigned int, unsigned int);
  extern void _cpp_pop_buffer (cpp_reader *);
+extern uchar *_cpp_input_to_utf8 (cpp_reader *, const unsigned char *, cppchar_t);
+extern void _cpp_init_iconv_buffer (cpp_reader *, const char *);
+extern void _cpp_close_iconv_buffer (cpp_reader *);
  
  /* In cpptrad.c.  */
  extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *);
diff --git a/gcc/cppinit.c b/gcc/cppinit.c

index 13326886778a1f378cb8fd2f95bec80934750b75..629da2734a7d13ae5b9c7c7082996e570f0ed7b3 100644 (file)
--- a/gcc/cppinit.c
+++ b/gcc/cppinit.c
@@ -161,6 +161,9 @@ cpp_create_reader (enum c_lang lang, hash_table *table)
    CPP_OPTION (pfile, narrow_charset) = 0;
    CPP_OPTION (pfile, wide_charset) = 0;
  
+  /* Default the input character set to iso-8859-1 for now. */
+  CPP_OPTION (pfile, input_charset) = "ISO-8859-1";
+
    /* A fake empty "directory" used as the starting point for files
       looked up without a search path.  Name cannot be '/' because we
       don't want to prepend anything at all to filenames using it.  All
diff --git a/gcc/cpplib.c b/gcc/cpplib.c

index 2b213cb461a80bcd2a6fd9ac61c361a6c261f15e..feb8717745b8d40803b9e61a5dbafcf953451797 100644 (file)
--- a/gcc/cpplib.c
+++ b/gcc/cpplib.c
@@ -549,14 +549,14 @@ do_undef (cpp_reader *pfile)
  /* Undefine a single macro/assertion/whatever.  */
  
  static int
-undefine_macros (cpp_reader *pfile, cpp_hashnode *h, 
+undefine_macros (cpp_reader *pfile, cpp_hashnode *h,
                  void *data_p ATTRIBUTE_UNUSED)
  {
    switch (h->type)
      {
      case NT_VOID:
        break;
-      
+
      case NT_MACRO:
        if (pfile->cb.undef)
          (*pfile->cb.undef) (pfile, pfile->directive_line, h);
@@ -855,7 +855,7 @@ do_linemarker (cpp_reader *pfile)
        cpp_string s = { 0, 0 };
        if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
         new_file = (const char *)s.text;
-      
+
        new_sysp = 0;
        flag = read_flag (pfile, 0);
        if (flag == 1)
@@ -1159,7 +1159,7 @@ do_pragma (cpp_reader *pfile)
        (*p->u.handler) (pfile);
        if (pfile->cb.line_change)
         (*pfile->cb.line_change) (pfile, pfile->cur_token, false);
-      
+
      }
    else if (pfile->cb.def_pragma)
      {
@@ -1925,6 +1925,7 @@ cpp_push_buffer (cpp_reader *pfile, const uchar *buffer, size_t len,
                  int from_stage3)
  {
    cpp_buffer *new = xobnew (&pfile->buffer_ob, cpp_buffer);
+  const char *input = CPP_OPTION (pfile, input_charset);
  
    /* Clears, amongst other things, if_stack and mi_cmacro.  */
    memset (new, 0, sizeof (cpp_buffer));
@@ -1936,6 +1937,8 @@ cpp_push_buffer (cpp_reader *pfile, const uchar *buffer, size_t len,
    new->need_line = true;
  
    pfile->buffer = new;
+  _cpp_init_iconv_buffer (pfile, input);
+
    return new;
  }
  
@@ -1957,6 +1960,8 @@ _cpp_pop_buffer (cpp_reader *pfile)
    /* In case of a missing #endif.  */
    pfile->state.skipping = 0;
  
+  _cpp_close_iconv_buffer (pfile);
+
    /* _cpp_do_file_change expects pfile->buffer to be the new one.  */
    pfile->buffer = buffer->prev;
  
diff --git a/gcc/cpplib.h b/gcc/cpplib.h

index 5f189245eb58e7799c5731d5d15d68c7e17b6f06..f7e12d200b7c65a7c04925111e24615da308d12f 100644 (file)
--- a/gcc/cpplib.h
+++ b/gcc/cpplib.h
@@ -332,6 +332,9 @@ struct cpp_options
    /* Holds the name of the target wide character set.  */
    const char *wide_charset;
  
+  /* Holds the name of the input character set.  */
+  const char *input_charset;
+
    /* True to warn about precompiled header files we couldn't use.  */
    bool warn_invalid_pch;
  
@@ -417,7 +420,7 @@ struct cpp_dir
    /* Mapping of file names for this directory for MS-DOS and related
       platforms.  A NULL-terminated array of (from, to) pairs.  */
    const char **name_map;
-    
+
    /* The C front end uses these to recognize duplicated
       directories in the search path.  */
    ino_t ino;
@@ -481,7 +484,7 @@ struct cpp_hashnode GTY(())
  {
    struct ht_identifier ident;
    unsigned int is_directive : 1;
-  unsigned int directive_index : 7;    /* If is_directive, 
+  unsigned int directive_index : 7;    /* If is_directive,
                                            then index into directive table.
                                            Otherwise, a NODE_OPERATOR.  */
    unsigned char rid_code;              /* Rid code - for front ends.  */
author	Eric Christopher <echristo@gcc.gnu.org>
	Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)
committer	Eric Christopher <echristo@gcc.gnu.org>
	Fri, 16 Jan 2004 22:37:49 +0000 (22:37 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/cppcharset.c		patch \| blob \| history
gcc/cpphash.h		patch \| blob \| history
gcc/cppinit.c		patch \| blob \| history
gcc/cpplib.c		patch \| blob \| history
gcc/cpplib.h		patch \| blob \| history