From 688e7a53446776e4d7b49472b06fec29ea69ff17 Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Mon, 21 Apr 2008 14:02:00 +0000 Subject: [PATCH] re PR preprocessor/33415 (Can't compile .cpp file with UTF-8 BOM.) libcpp PR libcpp/33415: * charset.c (_cpp_convert_input): Add buffer_start argument. Ignore UTF-8 BOM if seen. * internal.h (_cpp_convert_input): Add argument. * files.c (struct _cpp_file) <buffer_start>: New field. (destroy_cpp_file): Free buffer_start, not buffer. (_cpp_pop_file_buffer): Likewise. (read_file_guts): Update. gcc/testsuite PR libcpp/33415: * gcc.dg/cpp/pr33415.c: New file. From-SVN: r134507 --- gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/gcc.dg/cpp/pr33415.c | 6 +++++ libcpp/ChangeLog | 11 ++++++++ libcpp/charset.c | 40 +++++++++++++++++++++++------- libcpp/files.c | 20 ++++++++++----- libcpp/internal.h | 4 +-- 6 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/cpp/pr33415.c diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 23c6f7e99e3..02e190f23b0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2008-04-21 Tom Tromey + + PR libcpp/33415: + * gcc.dg/cpp/pr33415.c: New file. + 2008-04-21 Olivier Hainque * gnat.dg/bltins.adb: New testcase. diff --git a/gcc/testsuite/gcc.dg/cpp/pr33415.c b/gcc/testsuite/gcc.dg/cpp/pr33415.c new file mode 100644 index 00000000000..28ffe2372e4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/cpp/pr33415.c @@ -0,0 +1,6 @@ + /* Test case for PR 33415. Note that the first bytes of this file + are a UTF-8 BOM. */ + +/* { dg-do compile } */ + +int f(void) { return 5; } diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog index 9eef6efb3e9..b80afd2550b 100644 --- a/libcpp/ChangeLog +++ b/libcpp/ChangeLog @@ -1,3 +1,14 @@ +2008-04-21 Tom Tromey + + PR libcpp/33415: + * charset.c (_cpp_convert_input): Add buffer_start argument. + Ignore UTF-8 BOM if seen. + * internal.h (_cpp_convert_input): Add argument. + * files.c (struct _cpp_file) <buffer_start>: New field. 
+ (destroy_cpp_file): Free buffer_start, not buffer. + (_cpp_pop_file_buffer): Likewise. + (read_file_guts): Update. + 2008-04-18 Kris Van Hees * include/cpp-id-data.h (UC): Was U, conflicts with U"..." literal. diff --git a/libcpp/charset.c b/libcpp/charset.c index 225cdb4915e..d70d05cc020 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1,5 +1,5 @@ /* CPP Library - charsets - Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006 + Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008 Free Software Foundation, Inc. Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. @@ -1637,18 +1637,24 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is the length of the meaningful data within the buffer. The - translated buffer is returned, and *ST_SIZE is set to the length of - the meaningful data within the translated buffer. - - INPUT is expected to have been allocated with xmalloc. This function - will either return INPUT, or free it and return a pointer to another - xmalloc-allocated block of memory. */ + translated buffer is returned, *ST_SIZE is set to the length of + the meaningful data within the translated buffer, and *BUFFER_START + is set to the start of the returned buffer. *BUFFER_START may + differ from the return value in the case of a BOM or other ignored + marker information. + + INPUT is expected to have been allocated with xmalloc. This + function will either set *BUFFER_START to INPUT, or free it and set + *BUFFER_START to a pointer to another xmalloc-allocated block of + memory. 
*/ uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, - uchar *input, size_t size, size_t len, off_t *st_size) + uchar *input, size_t size, size_t len, + const unsigned char **buffer_start, off_t *st_size) { struct cset_converter input_cset; struct _cpp_strbuf to; + unsigned char *buffer; input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); if (input_cset.func == convert_no_conversion) @@ -1689,8 +1695,24 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, else to.text[to.len] = '\n'; + buffer = to.text; *st_size = to.len; - return to.text; +#if HOST_CHARSET == HOST_CHARSET_ASCII + /* The HOST_CHARSET test just above ensures that the source charset + is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that + glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + BOM -- however, even if it did, we would still need this code due + to the 'convert_no_conversion' case. */ + if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb + && to.text[2] == 0xbf) + { + *st_size -= 3; + buffer += 3; + } +#endif + + *buffer_start = to.text; + return buffer; } /* Decide on the default encoding to assume for input files. */ diff --git a/libcpp/files.c b/libcpp/files.c index 2bc3a801e35..1adc58d88a8 100644 --- a/libcpp/files.c +++ b/libcpp/files.c @@ -74,6 +74,10 @@ struct _cpp_file /* The contents of NAME after calling read_file(). */ const uchar *buffer; + /* Pointer to the real start of BUFFER. read_file() might increment + BUFFER; when freeing, this pointer must be used instead. */ + const uchar *buffer_start; + /* The macro, if any, preventing re-inclusion. 
*/ const cpp_hashnode *cmacro; @@ -635,8 +639,11 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file) cpp_error (pfile, CPP_DL_WARNING, "%s is shorter than expected", file->path); - file->buffer = _cpp_convert_input (pfile, CPP_OPTION (pfile, input_charset), - buf, size, total, &file->st.st_size); + file->buffer = _cpp_convert_input (pfile, + CPP_OPTION (pfile, input_charset), + buf, size, total, + &file->buffer_start, + &file->st.st_size); file->buffer_valid = true; return true; @@ -969,8 +976,8 @@ make_cpp_file (cpp_reader *pfile, cpp_dir *dir, const char *fname) static void destroy_cpp_file (_cpp_file *file) { - if (file->buffer) - free ((void *) file->buffer); + if (file->buffer_start) + free ((void *) file->buffer_start); free ((void *) file->name); free (file); } @@ -1302,9 +1309,10 @@ _cpp_pop_file_buffer (cpp_reader *pfile, _cpp_file *file) /* Invalidate control macros in the #including file. */ pfile->mi_valid = false; - if (file->buffer) + if (file->buffer_start) { - free ((void *) file->buffer); + free ((void *) file->buffer_start); + file->buffer_start = NULL; file->buffer = NULL; file->buffer_valid = false; } diff --git a/libcpp/internal.h b/libcpp/internal.h index bf6c5f8c8d2..860fe2e53a2 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -1,5 +1,5 @@ /* Part of CPP library. - Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007 + Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. 
This program is free software; you can redistribute it and/or modify it @@ -644,7 +644,7 @@ extern cppchar_t _cpp_valid_ucn (cpp_reader *, const unsigned char **, extern void _cpp_destroy_iconv (cpp_reader *); extern unsigned char *_cpp_convert_input (cpp_reader *, const char *, unsigned char *, size_t, size_t, - off_t *); + const unsigned char **, off_t *); extern const char *_cpp_default_encoding (void); extern cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile, const unsigned char *id, -- 2.30.2