re PR preprocessor/33415 (Can't compile .cpp file with UTF-8 BOM.)

author Tom Tromey <tromey@redhat.com>

Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)

committer Tom Tromey <tromey@gcc.gnu.org>

Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)
author Tom Tromey <tromey@redhat.com>
Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)
committer Tom Tromey <tromey@gcc.gnu.org>
Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 23c6f7e99e30d6c9b13267c720446fff7fe50791..02e190f23b078509122864c75424c7c810aec046 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2008-04-21  Tom Tromey  <tromey@redhat.com>
+
+       PR libcpp/33415:
+       * gcc.dg/cpp/pr33415.c: New file.
+
  2008-04-21  Olivier Hainque  <hainque@adacore.com>
  
         * gnat.dg/bltins.adb: New testcase.
diff --git a/gcc/testsuite/gcc.dg/cpp/pr33415.c b/gcc/testsuite/gcc.dg/cpp/pr33415.c

new file mode 100644 (file)

index 0000000..28ffe23
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/pr33415.c
@@ -0,0 +1,6 @@
+ /* Test case for PR 33415.  Note that the first bytes of this file
+     are a UTF-8 BOM.  */
+
+/* { dg-do compile } */
+
+int f(void) { return 5; }
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog

index 9eef6efb3e9e0b04b860dd33a7e5cc97cd33a4fd..b80afd2550bb00fb9411268aef107df3a48bbfd6 100644 (file)
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,14 @@
+2008-04-21  Tom Tromey  <tromey@redhat.com>
+
+       PR libcpp/33415:
+       * charset.c (_cpp_convert_input): Add buffer_start argument.
+       Ignore UTF-8 BOM if seen.
+       * internal.h (_cpp_convert_input): Add argument.
+       * files.c (struct _cpp_file) <buffer_start>: New field.
+       (destroy_cpp_file): Free buffer_start, not buffer.
+       (_cpp_pop_file_buffer): Likewise.
+       (read_file_guts): Update.
+
  2008-04-18  Kris Van Hees <kris.van.hees@oracle.com>
  
         * include/cpp-id-data.h (UC): Was U, conflicts with U"..." literal.
diff --git a/libcpp/charset.c b/libcpp/charset.c

index 225cdb4915e5bf8850cc3ae982aa8aacf1e575df..d70d05cc0205435247e25ca66dd7bf46c79e163f 100644 (file)
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1,5 +1,5 @@
  /* CPP Library - charsets
-   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006
+   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008
     Free Software Foundation, Inc.
  
     Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
@@ -1637,18 +1637,24 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
     source file) from INPUT_CHARSET to the source character set.  INPUT
     points to the input buffer, SIZE is its allocated size, and LEN is
     the length of the meaningful data within the buffer.  The
-   translated buffer is returned, and *ST_SIZE is set to the length of
-   the meaningful data within the translated buffer.
-
-   INPUT is expected to have been allocated with xmalloc.  This function
-   will either return INPUT, or free it and return a pointer to another
-   xmalloc-allocated block of memory.  */
+   translated buffer is returned, *ST_SIZE is set to the length of
+   the meaningful data within the translated buffer, and *BUFFER_START
+   is set to the start of the returned buffer.  *BUFFER_START may
+   differ from the return value in the case of a BOM or other ignored
+   marker information.
+
+   INPUT is expected to have been allocated with xmalloc.  This
+   function will either set *BUFFER_START to INPUT, or free it and set
+   *BUFFER_START to a pointer to another xmalloc-allocated block of
+   memory.  */
  uchar * 
  _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
-                   uchar *input, size_t size, size_t len, off_t *st_size)
+                   uchar *input, size_t size, size_t len,
+                   const unsigned char **buffer_start, off_t *st_size)
  {
    struct cset_converter input_cset;
    struct _cpp_strbuf to;
+  unsigned char *buffer;
  
    input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
    if (input_cset.func == convert_no_conversion)
@@ -1689,8 +1695,24 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
    else
      to.text[to.len] = '\n';
  
+  buffer = to.text;
    *st_size = to.len;
-  return to.text;
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  /* The HOST_CHARSET test just above ensures that the source charset
+     is UTF-8.  So, ignore a UTF-8 BOM if we see one.  Note that
+     glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+     BOM -- however, even if it did, we would still need this code due
+     to the 'convert_no_conversion' case.  */
+  if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
+      && to.text[2] == 0xbf)
+    {
+      *st_size -= 3;
+      buffer += 3;
+    }
+#endif
+
+  *buffer_start = to.text;
+  return buffer;
  }
  
  /* Decide on the default encoding to assume for input files.  */
diff --git a/libcpp/files.c b/libcpp/files.c

index 2bc3a801e3562e0d391ac7de6e69770073abd574..1adc58d88a8594290df88657b101bce491089a0c 100644 (file)
--- a/libcpp/files.c
+++ b/libcpp/files.c
@@ -74,6 +74,10 @@ struct _cpp_file
    /* The contents of NAME after calling read_file().  */
    const uchar *buffer;
  
+  /* Pointer to the real start of BUFFER.  read_file() might increment
+     BUFFER; when freeing, this pointer must be used instead.  */
+  const uchar *buffer_start;
+
    /* The macro, if any, preventing re-inclusion.  */
    const cpp_hashnode *cmacro;
  
@@ -635,8 +639,11 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file)
      cpp_error (pfile, CPP_DL_WARNING,
                "%s is shorter than expected", file->path);
  
-  file->buffer = _cpp_convert_input (pfile, CPP_OPTION (pfile, input_charset),
-                                    buf, size, total, &file->st.st_size);
+  file->buffer = _cpp_convert_input (pfile,
+                                    CPP_OPTION (pfile, input_charset),
+                                    buf, size, total,
+                                    &file->buffer_start,
+                                    &file->st.st_size);
    file->buffer_valid = true;
  
    return true;
@@ -969,8 +976,8 @@ make_cpp_file (cpp_reader *pfile, cpp_dir *dir, const char *fname)
  static void
  destroy_cpp_file (_cpp_file *file)
  {
-  if (file->buffer)
-    free ((void *) file->buffer);
+  if (file->buffer_start)
+    free ((void *) file->buffer_start);
    free ((void *) file->name);
    free (file);
  }
@@ -1302,9 +1309,10 @@ _cpp_pop_file_buffer (cpp_reader *pfile, _cpp_file *file)
    /* Invalidate control macros in the #including file.  */
    pfile->mi_valid = false;
  
-  if (file->buffer)
+  if (file->buffer_start)
      {
-      free ((void *) file->buffer);
+      free ((void *) file->buffer_start);
+      file->buffer_start = NULL;
        file->buffer = NULL;
        file->buffer_valid = false;
      }
diff --git a/libcpp/internal.h b/libcpp/internal.h

index bf6c5f8c8d2e9ac8e4792fd8334fbcfe06c5826f..860fe2e53a278be987c444a45bd66cb83418c857 100644 (file)
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -1,5 +1,5 @@
  /* Part of CPP library.
-   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007
+   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008
     Free Software Foundation, Inc.
  
  This program is free software; you can redistribute it and/or modify it
@@ -644,7 +644,7 @@ extern cppchar_t _cpp_valid_ucn (cpp_reader *, const unsigned char **,
  extern void _cpp_destroy_iconv (cpp_reader *);
  extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
                                           unsigned char *, size_t, size_t,
-                                         off_t *);
+                                         const unsigned char **, off_t *);
  extern const char *_cpp_default_encoding (void);
  extern cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile,
                                                  const unsigned char *id,
author	Tom Tromey <tromey@redhat.com>
	Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)
committer	Tom Tromey <tromey@gcc.gnu.org>
	Mon, 21 Apr 2008 14:02:00 +0000 (14:02 +0000)
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.dg/cpp/pr33415.c	[new file with mode: 0644]	patch \| blob
libcpp/ChangeLog		patch \| blob \| history
libcpp/charset.c		patch \| blob \| history
libcpp/files.c		patch \| blob \| history
libcpp/internal.h		patch \| blob \| history