On-demand locations within string-literals

author David Malcolm <dmalcolm@redhat.com>

Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)

committer David Malcolm <dmalcolm@gcc.gnu.org>

Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)
author David Malcolm <dmalcolm@redhat.com>
Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)
committer David Malcolm <dmalcolm@gcc.gnu.org>
Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 10d7224b36a786e423028eb241ef798c916a868e..40ca18ca3efb635614abe26cfc4f0739150a6ef4 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,61 @@
+2016-08-05  David Malcolm  <dmalcolm@redhat.com>
+
+       * input.c (string_concat::string_concat): New constructor.
+       (string_concat_db::string_concat_db): New constructor.
+       (string_concat_db::record_string_concatenation): New method.
+       (string_concat_db::get_string_concatenation): New method.
+       (string_concat_db::get_key_loc): New method.
+       (class auto_cpp_string_vec): New class.
+       (get_substring_ranges_for_loc): New function.
+       (get_source_range_for_substring): New function.
+       (get_num_source_ranges_for_substring): New function.
+       (class selftest::lexer_test_options): New class.
+       (struct selftest::lexer_test): New struct.
+       (class selftest::ebcdic_execution_charset): New class.
+       (selftest::ebcdic_execution_charset::s_singleton): New variable.
+       (selftest::lexer_test::lexer_test): New constructor.
+       (selftest::lexer_test::~lexer_test): New destructor.
+       (selftest::lexer_test::get_token): New method.
+       (selftest::assert_char_at_range): New function.
+       (ASSERT_CHAR_AT_RANGE): New macro.
+       (selftest::assert_num_substring_ranges): New function.
+       (ASSERT_NUM_SUBSTRING_RANGES): New macro.
+       (selftest::assert_has_no_substring_ranges): New function.
+       (ASSERT_HAS_NO_SUBSTRING_RANGES): New macro.
+       (selftest::test_lexer_string_locations_simple): New function.
+       (selftest::test_lexer_string_locations_ebcdic): New function.
+       (selftest::test_lexer_string_locations_hex): New function.
+       (selftest::test_lexer_string_locations_oct): New function.
+       (selftest::test_lexer_string_locations_letter_escape_1): New function.
+       (selftest::test_lexer_string_locations_letter_escape_2): New function.
+       (selftest::test_lexer_string_locations_ucn4): New function.
+       (selftest::test_lexer_string_locations_ucn8): New function.
+       (selftest::uint32_from_big_endian): New function.
+       (selftest::test_lexer_string_locations_wide_string): New function.
+       (selftest::uint16_from_big_endian): New function.
+       (selftest::test_lexer_string_locations_string16): New function.
+       (selftest::test_lexer_string_locations_string32): New function.
+       (selftest::test_lexer_string_locations_u8): New function.
+       (selftest::test_lexer_string_locations_utf8_source): New function.
+       (selftest::test_lexer_string_locations_concatenation_1): New
+       function.
+       (selftest::test_lexer_string_locations_concatenation_2): New
+       function.
+       (selftest::test_lexer_string_locations_concatenation_3): New
+       function.
+       (selftest::test_lexer_string_locations_macro): New function.
+       (selftest::test_lexer_string_locations_stringified_macro_argument):
+       New function.
+       (selftest::test_lexer_string_locations_non_string): New function.
+       (selftest::test_lexer_string_locations_long_line): New function.
+       (selftest::test_lexer_char_constants): New function.
+       (selftest::input_c_tests): Call the new test functions once per
+       case within the line_table test matrix.
+       * input.h (struct string_concat): New struct.
+       (struct location_hash): New struct.
+       (class string_concat_db): New class.
+       * substring-locations.h: New header.
+
  2016-08-05  Patrick Palka  <ppalka@gcc.gnu.org>
  
         PR tree-optimization/72810
diff --git a/gcc/c-family/ChangeLog b/gcc/c-family/ChangeLog

index aed494a5af1fc2c58bcd2616733b60858986fb67..d5cdfed537a45175323d2c6c1d130d370ca07140 100644 (file)
--- a/gcc/c-family/ChangeLog
+++ b/gcc/c-family/ChangeLog
@@ -1,3 +1,17 @@
+2016-08-05  David Malcolm  <dmalcolm@redhat.com>
+
+       * c-common.c: Include "substring-locations.h".
+       (get_cpp_ttype_from_string_type): New function.
+       (g_string_concat_db): New global.
+       (substring_loc::get_range): New method.
+       * c-common.h (g_string_concat_db): New declaration.
+       (class substring_loc): New class.
+       * c-lex.c (lex_string): When concatenating strings, capture the
+       locations of all tokens using a new obstack, and record the
+       concatenation locations within g_string_concat_db.
+       * c-opts.c (c_common_init_options): Construct g_string_concat_db
+       on the ggc-heap.
+
  2016-07-29  Marek Polacek  <polacek@redhat.com>
  
         PR c/71926
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c

index 27031b5e25fb95f59c2b7f1e33e06c4cc91c511a..569f000150f277b2356318ac199ad7a2769578a2 100644 (file)
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -45,6 +45,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-iterator.h"
  #include "opts.h"
  #include "gimplify.h"
+#include "substring-locations.h"
  
  cpp_reader *parse_in;          /* Declared in c-pragma.h.  */
  
@@ -1098,6 +1099,67 @@ fix_string_type (tree value)
    TREE_STATIC (value) = 1;
    return value;
  }
+
+/* Given a string of type STRING_TYPE, determine what kind of string
+   token would give an equivalent execution encoding: CPP_STRING,
+   CPP_STRING16, or CPP_STRING32.  Return CPP_OTHER in case of error.
+   This may not be exactly the string token type that initially created
+   the string, since CPP_WSTRING is indistinguishable from the 16/32 bit
+   string type at this point.
+
+   This effectively reverses part of the logic in lex_string and
+   fix_string_type.  */
+
+static enum cpp_ttype
+get_cpp_ttype_from_string_type (tree string_type)
+{
+  gcc_assert (string_type);
+  if (TREE_CODE (string_type) != ARRAY_TYPE)
+    return CPP_OTHER;
+
+  tree element_type = TREE_TYPE (string_type);
+  if (TREE_CODE (element_type) != INTEGER_TYPE)
+    return CPP_OTHER;
+
+  int bits_per_character = TYPE_PRECISION (element_type);
+  switch (bits_per_character)
+    {
+    case 8:
+      return CPP_STRING;  /* It could have also been CPP_UTF8STRING.  */
+    case 16:
+      return CPP_STRING16;
+    case 32:
+      return CPP_STRING32;
+    }
+
+  return CPP_OTHER;
+}
+
+/* The global record of string concatentations, for use in
+   extracting locations within string literals.  */
+
+GTY(()) string_concat_db *g_string_concat_db;
+
+/* Attempt to determine the source range of the substring.
+   If successful, return NULL and write the source range to *OUT_RANGE.
+   Otherwise return an error message.  Error messages are intended
+   for GCC developers (to help debugging) rather than for end-users.  */
+
+const char *
+substring_loc::get_range (source_range *out_range) const
+{
+  gcc_assert (out_range);
+
+  enum cpp_ttype tok_type = get_cpp_ttype_from_string_type (m_string_type);
+  if (tok_type == CPP_OTHER)
+    return "unrecognized string type";
+
+  return get_source_range_for_substring (parse_in, g_string_concat_db,
+                                        m_fmt_string_loc, tok_type,
+                                        m_start_idx, m_end_idx,
+                                        out_range);
+}
+
  \f
  /* Fold X for consideration by one of the warning functions when checking
     whether an expression has a constant value.  */
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h

index 8c80574a1469ab769cbbde9c1458b406ae9b6437..7b5da5736e9e78d14e0811c1b92fe25da67a7ae9 100644 (file)
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -1110,6 +1110,35 @@ extern time_t cb_get_source_date_epoch (cpp_reader *pfile);
     __TIME__ can store.  */
  #define MAX_SOURCE_DATE_EPOCH HOST_WIDE_INT_C (253402300799)
  
+extern GTY(()) string_concat_db *g_string_concat_db;
+
+/* libcpp can calculate location information about a range of characters
+   within a string literal, but doing so is non-trivial.
+
+   This class encapsulates such a source location, so that it can be
+   passed around (e.g. within c-format.c).  It is effectively a deferred
+   call into libcpp.  If needed by a diagnostic, the actual source_range
+   can be calculated by calling the get_range method.  */
+
+class substring_loc
+{
+ public:
+  substring_loc (location_t fmt_string_loc, tree string_type,
+                int start_idx, int end_idx)
+  : m_fmt_string_loc (fmt_string_loc), m_string_type (string_type),
+    m_start_idx (start_idx), m_end_idx (end_idx) {}
+
+  const char *get_range (source_range *out_range) const;
+
+  location_t get_fmt_string_loc () const { return m_fmt_string_loc; }
+
+ private:
+  location_t m_fmt_string_loc;
+  tree m_string_type;
+  int m_start_idx;
+  int m_end_idx;
+};
+
  /* In c-gimplify.c  */
  extern void c_genericize (tree);
  extern int c_gimplify_expr (tree *, gimple_seq *, gimple_seq *);
diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c

index 8f33d8616e38fd3a41bd4b5b403e76019cb89f1a..4c7e3852142ffe96cd385050add856840a2b17a9 100644 (file)
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -1097,13 +1097,16 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
    tree value;
    size_t concats = 0;
    struct obstack str_ob;
+  struct obstack loc_ob;
    cpp_string istr;
    enum cpp_ttype type = tok->type;
  
    /* Try to avoid the overhead of creating and destroying an obstack
       for the common case of just one string.  */
    cpp_string str = tok->val.str;
+  location_t init_loc = tok->src_loc;
    cpp_string *strs = &str;
+  location_t *locs = NULL;
  
    /* objc_at_sign_was_seen is only used when doing Objective-C string
       concatenation.  It is 'true' if we have seen an '@' before the
@@ -1142,16 +1145,21 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
           else
             error ("unsupported non-standard concatenation of string literals");
         }
+      /* FALLTHROUGH */
  
      case CPP_STRING:
        if (!concats)
         {
           gcc_obstack_init (&str_ob);
+         gcc_obstack_init (&loc_ob);
           obstack_grow (&str_ob, &str, sizeof (cpp_string));
+         obstack_grow (&loc_ob, &init_loc, sizeof (location_t));
         }
  
        concats++;
        obstack_grow (&str_ob, &tok->val.str, sizeof (cpp_string));
+      obstack_grow (&loc_ob, &tok->src_loc, sizeof (location_t));
+
        if (objc_string)
         objc_at_sign_was_seen = false;
        goto retry;
@@ -1164,7 +1172,10 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
    /* We have read one more token than we want.  */
    _cpp_backup_tokens (parse_in, 1);
    if (concats)
-    strs = XOBFINISH (&str_ob, cpp_string *);
+    {
+      strs = XOBFINISH (&str_ob, cpp_string *);
+      locs = XOBFINISH (&loc_ob, location_t *);
+    }
  
    if (concats && !objc_string && !in_system_header_at (input_location))
      warning (OPT_Wtraditional,
@@ -1176,6 +1187,12 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
      {
        value = build_string (istr.len, (const char *) istr.text);
        free (CONST_CAST (unsigned char *, istr.text));
+      if (concats)
+       {
+         gcc_assert (locs);
+         gcc_assert (g_string_concat_db);
+         g_string_concat_db->record_string_concatenation (concats + 1, locs);
+       }
      }
    else
      {
@@ -1227,7 +1244,10 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
    *valp = fix_string_type (value);
  
    if (concats)
-    obstack_free (&str_ob, 0);
+    {
+      obstack_free (&str_ob, 0);
+      obstack_free (&loc_ob, 0);
+    }
  
    return objc_string ? CPP_OBJC_STRING : type;
  }
diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c

index c11e7e7f53ccfdff2c5d6a5e3d5257f7271a0e9d..0715b2ef20062bb944850a19bb13630b4f422fba 100644 (file)
--- a/gcc/c-family/c-opts.c
+++ b/gcc/c-family/c-opts.c
@@ -216,6 +216,9 @@ c_common_init_options (unsigned int decoded_options_count,
    unsigned int i;
    struct cpp_callbacks *cb;
  
+  g_string_concat_db
+    = new (ggc_alloc <string_concat_db> ()) string_concat_db ();
+
    parse_in = cpp_create_reader (c_dialect_cxx () ? CLK_GNUCXX: CLK_GNUC89,
                                 ident_hash, line_table);
    cb = cpp_get_callbacks (parse_in);
diff --git a/gcc/input.c b/gcc/input.c

index f91a702d47b533f004e82caad34ebf47d8ec116e..d058b8aa81f1639156210f86e9722ed1cdcd940b 100644 (file)
--- a/gcc/input.c
+++ b/gcc/input.c
@@ -1189,6 +1189,279 @@ dump_location_info (FILE *stream)
                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
  }
  
+/* string_concat's constructor.  */
+
+string_concat::string_concat (int num, location_t *locs)
+  : m_num (num)
+{
+  m_locs = ggc_vec_alloc <location_t> (num);
+  for (int i = 0; i < num; i++)
+    m_locs[i] = locs[i];
+}
+
+/* string_concat_db's constructor.  */
+
+string_concat_db::string_concat_db ()
+{
+  m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
+}
+
+/* Record that a string concatenation occurred, covering NUM
+   string literal tokens.  LOCS is an array of size NUM, containing the
+   locations of the tokens.  A copy of LOCS is taken.  */
+
+void
+string_concat_db::record_string_concatenation (int num, location_t *locs)
+{
+  gcc_assert (num > 1);
+  gcc_assert (locs);
+
+  location_t key_loc = get_key_loc (locs[0]);
+
+  string_concat *concat
+    = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
+  m_table->put (key_loc, concat);
+}
+
+/* Determine if LOC was the location of the the initial token of a
+   concatenation of string literal tokens.
+   If so, *OUT_NUM is written to with the number of tokens, and
+   *OUT_LOCS with the location of an array of locations of the
+   tokens, and return true.  *OUT_LOCS is a borrowed pointer to
+   storage owned by the string_concat_db.
+   Otherwise, return false.  */
+
+bool
+string_concat_db::get_string_concatenation (location_t loc,
+                                           int *out_num,
+                                           location_t **out_locs)
+{
+  gcc_assert (out_num);
+  gcc_assert (out_locs);
+
+  location_t key_loc = get_key_loc (loc);
+
+  string_concat **concat = m_table->get (key_loc);
+  if (!concat)
+    return false;
+
+  *out_num = (*concat)->m_num;
+  *out_locs =(*concat)->m_locs;
+  return true;
+}
+
+/* Internal function.  Canonicalize LOC into a form suitable for
+   use as a key within the database, stripping away macro expansion,
+   ad-hoc information, and range information, using the location of
+   the start of LOC within an ordinary linemap.  */
+
+location_t
+string_concat_db::get_key_loc (location_t loc)
+{
+  loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
+                                 NULL);
+
+  loc = get_range_from_loc (line_table, loc).m_start;
+
+  return loc;
+}
+
+/* Helper class for use within get_substring_ranges_for_loc.
+   An vec of cpp_string with responsibility for releasing all of the
+   str->text for each str in the vector.  */
+
+class auto_cpp_string_vec :  public auto_vec <cpp_string>
+{
+ public:
+  auto_cpp_string_vec (int alloc)
+    : auto_vec <cpp_string> (alloc) {}
+
+  ~auto_cpp_string_vec ()
+  {
+    /* Clean up the copies within this vec.  */
+    int i;
+    cpp_string *str;
+    FOR_EACH_VEC_ELT (*this, i, str)
+      free (const_cast <unsigned char *> (str->text));
+  }
+};
+
+/* Attempt to populate RANGES with source location information on the
+   individual characters within the string literal found at STRLOC.
+   If CONCATS is non-NULL, then any string literals that the token at
+   STRLOC  was concatenated with are also added to RANGES.
+
+   Return NULL if successful, or an error message if any errors occurred (in
+   which case RANGES may be only partially populated and should not
+   be used).
+
+   This is implemented by re-parsing the relevant source line(s).  */
+
+static const char *
+get_substring_ranges_for_loc (cpp_reader *pfile,
+                             string_concat_db *concats,
+                             location_t strloc,
+                             enum cpp_ttype type,
+                             cpp_substring_ranges &ranges)
+{
+  gcc_assert (pfile);
+
+  if (strloc == UNKNOWN_LOCATION)
+    return "unknown location";
+
+  /* If string concatenation has occurred at STRLOC, get the locations
+     of all of the literal tokens making up the compound string.
+     Otherwise, just use STRLOC.  */
+  int num_locs = 1;
+  location_t *strlocs = &strloc;
+  if (concats)
+    concats->get_string_concatenation (strloc, &num_locs, &strlocs);
+
+  auto_cpp_string_vec strs (num_locs);
+  auto_vec <cpp_string_location_reader> loc_readers (num_locs);
+  for (int i = 0; i < num_locs; i++)
+    {
+      /* Get range of strloc.  We will use it to locate the start and finish
+        of the literal token within the line.  */
+      source_range src_range = get_range_from_loc (line_table, strlocs[i]);
+
+      if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
+       /* If the string is within a macro expansion, we can't get at the
+          end location.  */
+       return "macro expansion";
+
+      if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
+       /* If so, we can't reliably determine where the token started within
+          its line.  */
+       return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
+
+      if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
+       /* If so, we can't reliably determine where the token finished within
+          its line.  */
+       return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
+
+      expanded_location start
+       = expand_location_to_spelling_point (src_range.m_start);
+      expanded_location finish
+       = expand_location_to_spelling_point (src_range.m_finish);
+      if (start.file != finish.file)
+       return "range endpoints are in different files";
+      if (start.line != finish.line)
+       return "range endpoints are on different lines";
+      if (start.column > finish.column)
+       return "range endpoints are reversed";
+
+      int line_width;
+      const char *line = location_get_source_line (start.file, start.line,
+                                                  &line_width);
+      if (line == NULL)
+       return "unable to read source line";
+
+      /* Determine the location of the literal (including quotes
+        and leading prefix chars, such as the 'u' in a u""
+        token).  */
+      const char *literal = line + start.column - 1;
+      int literal_length = finish.column - start.column + 1;
+
+      gcc_assert (line_width >= (start.column - 1 + literal_length));
+      cpp_string from;
+      from.len = literal_length;
+      /* Make a copy of the literal, to avoid having to rely on
+        the lifetime of the copy of the line within the cache.
+        This will be released by the auto_cpp_string_vec dtor.  */
+      from.text = XDUPVEC (unsigned char, literal, literal_length);
+      strs.safe_push (from);
+
+      /* For very long lines, a new linemap could have started
+        halfway through the token.
+        Ensure that the loc_reader uses the linemap of the
+        *end* of the token for its start location.  */
+      const line_map_ordinary *final_ord_map;
+      linemap_resolve_location (line_table, src_range.m_finish,
+                               LRK_MACRO_EXPANSION_POINT, &final_ord_map);
+      location_t start_loc
+       = linemap_position_for_line_and_column (line_table, final_ord_map,
+                                               start.line, start.column);
+
+      cpp_string_location_reader loc_reader (start_loc, line_table);
+      loc_readers.safe_push (loc_reader);
+    }
+
+  /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
+  const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
+                                                loc_readers.address (),
+                                                num_locs, &ranges, type);
+  if (err)
+    return err;
+
+  /* Success: "ranges" should now contain information on the string.  */
+  return NULL;
+}
+
+/* Attempt to populate *OUT_RANGE with source location information on the
+   range of given characters within the string literal found at STRLOC.
+   START_IDX and END_IDX refer to offsets within the execution character
+   set.
+   If CONCATS is non-NULL, then any string literals that the token at
+   STRLOC was concatenated with are also considered.
+
+   This is implemented by re-parsing the relevant source line(s).
+
+   Return NULL if successful, or an error message if any errors occurred.
+   Error messages are intended for GCC developers (to help debugging) rather
+   than for end-users.  */
+
+const char *
+get_source_range_for_substring (cpp_reader *pfile,
+                               string_concat_db *concats,
+                               location_t strloc,
+                               enum cpp_ttype type,
+                               int start_idx, int end_idx,
+                               source_range *out_range)
+{
+  gcc_checking_assert (start_idx >= 0);
+  gcc_checking_assert (end_idx >= 0);
+  gcc_assert (out_range);
+
+  cpp_substring_ranges ranges;
+  const char *err
+    = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
+  if (err)
+    return err;
+
+  if (start_idx >= ranges.get_num_ranges ())
+    return "start_idx out of range";
+  if (end_idx >= ranges.get_num_ranges ())
+    return "end_idx out of range";
+
+  out_range->m_start = ranges.get_range (start_idx).m_start;
+  out_range->m_finish = ranges.get_range (end_idx).m_finish;
+  return NULL;
+}
+
+/* As get_source_range_for_substring, but write to *OUT the number
+   of ranges that are available.  */
+
+const char *
+get_num_source_ranges_for_substring (cpp_reader *pfile,
+                                    string_concat_db *concats,
+                                    location_t strloc,
+                                    enum cpp_ttype type,
+                                    int *out)
+{
+  gcc_assert (out);
+
+  cpp_substring_ranges ranges;
+  const char *err
+    = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
+
+  if (err)
+    return err;
+
+  *out = ranges.get_num_ranges ();
+  return NULL;
+}
+
  #if CHECKING_P
  
  namespace selftest {
@@ -1541,6 +1814,1259 @@ test_lexer (const line_table_case &case_)
    cpp_destroy (parser);
  }
  
+/* Forward decls.  */
+
+struct lexer_test;
+class lexer_test_options;
+
+/* A class for specifying options of a lexer_test.
+   The "apply" vfunc is called during the lexer_test constructor.  */
+
+class lexer_test_options
+{
+ public:
+  virtual void apply (lexer_test &) = 0;
+};
+
+/* A struct for writing lexer tests.  */
+
+struct lexer_test
+{
+  lexer_test (const line_table_case &case_, const char *content,
+             lexer_test_options *options);
+  ~lexer_test ();
+
+  const cpp_token *get_token ();
+
+  temp_source_file m_tempfile;
+  temp_line_table m_tmp_lt;
+  cpp_reader *m_parser;
+  string_concat_db m_concats;
+};
+
+/* Use an EBCDIC encoding for the execution charset, specifically
+   IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
+
+   This exercises iconv integration within libcpp.
+   Not every build of iconv supports the given charset,
+   so we need to flag this error and handle it gracefully.  */
+
+class ebcdic_execution_charset : public lexer_test_options
+{
+ public:
+  ebcdic_execution_charset () : m_num_iconv_errors (0)
+    {
+      gcc_assert (s_singleton == NULL);
+      s_singleton = this;
+    }
+  ~ebcdic_execution_charset ()
+    {
+      gcc_assert (s_singleton == this);
+      s_singleton = NULL;
+    }
+
+  void apply (lexer_test &test) FINAL OVERRIDE
+  {
+    cpp_options *cpp_opts = cpp_get_options (test.m_parser);
+    cpp_opts->narrow_charset = "IBM1047";
+
+    cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
+    callbacks->error = on_error;
+  }
+
+  static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
+                       int level ATTRIBUTE_UNUSED,
+                       int reason ATTRIBUTE_UNUSED,
+                       rich_location *richloc ATTRIBUTE_UNUSED,
+                       const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
+    ATTRIBUTE_FPTR_PRINTF(5,0)
+  {
+    gcc_assert (s_singleton);
+    /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
+       when the local iconv build doesn't support the conversion.  */
+    if (strstr (msgid, "not supported by iconv"))
+      {
+       s_singleton->m_num_iconv_errors++;
+       return true;
+      }
+
+    /* Otherwise, we have an unexpected error.  */
+    abort ();
+  }
+
+  bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
+
+ private:
+  static ebcdic_execution_charset *s_singleton;
+  int m_num_iconv_errors;
+};
+
+ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
+
+/* Constructor.  Override line_table with a new instance based on CASE_,
+   and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
+   start parsing the tempfile.  */
+
+lexer_test::lexer_test (const line_table_case &case_, const char *content,
+                       lexer_test_options *options) :
+  /* Create a tempfile and write the text to it.  */
+  m_tempfile (SELFTEST_LOCATION, ".c", content),
+  m_tmp_lt (case_),
+  m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
+  m_concats ()
+{
+  if (options)
+    options->apply (*this);
+
+  cpp_init_iconv (m_parser);
+
+  /* Parse the file.  */
+  const char *fname = cpp_read_main_file (m_parser,
+                                         m_tempfile.get_filename ());
+  ASSERT_NE (fname, NULL);
+}
+
+/* Destructor.  Verify that the next token in m_parser is EOF.  */
+
+lexer_test::~lexer_test ()
+{
+  location_t loc;
+  const cpp_token *tok;
+
+  tok = cpp_get_token_with_location (m_parser, &loc);
+  ASSERT_NE (tok, NULL);
+  ASSERT_EQ (tok->type, CPP_EOF);
+
+  cpp_finish (m_parser, NULL);
+  cpp_destroy (m_parser);
+}
+
+/* Get the next token from m_parser.  */
+
+const cpp_token *
+lexer_test::get_token ()
+{
+  location_t loc;
+  const cpp_token *tok;
+
+  tok = cpp_get_token_with_location (m_parser, &loc);
+  ASSERT_NE (tok, NULL);
+  return tok;
+}
+
+/* Verify that locations within string literals are correctly handled.  */
+
+/* Verify get_source_range_for_substring for token(s) at STRLOC,
+   using the string concatenation database for TEST.
+
+   Assert that the character at index IDX is on EXPECTED_LINE,
+   and that it begins at column EXPECTED_START_COL and ends at
+   EXPECTED_FINISH_COL (unless the locations are beyond
+   LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
+   columns).  */
+
+static void
+assert_char_at_range (const location &loc,
+                     lexer_test& test,
+                     location_t strloc, enum cpp_ttype type, int idx,
+                     int expected_line, int expected_start_col,
+                     int expected_finish_col)
+{
+  cpp_reader *pfile = test.m_parser;
+  string_concat_db *concats = &test.m_concats;
+
+  source_range actual_range;
+  const char *err
+    = get_source_range_for_substring (pfile, concats, strloc, type,
+                                     idx, idx, &actual_range);
+  if (should_have_column_data_p (strloc))
+    ASSERT_EQ_AT (loc, NULL, err);
+  else
+    {
+      ASSERT_STREQ_AT (loc,
+                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                      err);
+      return;
+    }
+
+  int actual_start_line = LOCATION_LINE (actual_range.m_start);
+  ASSERT_EQ_AT (loc, expected_line, actual_start_line);
+  int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
+  ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
+
+  if (should_have_column_data_p (actual_range.m_start))
+    {
+      int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
+      ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
+    }
+  if (should_have_column_data_p (actual_range.m_finish))
+    {
+      int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
+      ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
+    }
+}
+
+/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
+   the effective location of any errors.  */
+
+#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
+                            EXPECTED_START_COL, EXPECTED_FINISH_COL)   \
+  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
+                       (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
+                       (EXPECTED_FINISH_COL))
+
+/* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
+   using the string concatenation database for TEST.
+
+   Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
+
+static void
+assert_num_substring_ranges (const location &loc,
+                            lexer_test& test,
+                            location_t strloc,
+                            enum cpp_ttype type,
+                            int expected_num_ranges)
+{
+  cpp_reader *pfile = test.m_parser;
+  string_concat_db *concats = &test.m_concats;
+
+  int actual_num_ranges;
+  const char *err
+    = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
+                                          &actual_num_ranges);
+  if (should_have_column_data_p (strloc))
+    ASSERT_EQ_AT (loc, NULL, err);
+  else
+    {
+      ASSERT_STREQ_AT (loc,
+                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                      err);
+      return;
+    }
+  ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
+}
+
+/* Macro for calling assert_num_substring_ranges, supplying
+   SELFTEST_LOCATION for the effective location of any errors.  */
+
+#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
+                                   EXPECTED_NUM_RANGES)                \
+  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
+                              (TYPE), (EXPECTED_NUM_RANGES))
+
+
+/* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
+   returns an error (using the string concatenation database for TEST).  */
+
+static void
+assert_has_no_substring_ranges (const location &loc,
+                               lexer_test& test,
+                               location_t strloc,
+                               enum cpp_ttype type,
+                               const char *expected_err)
+{
+  cpp_reader *pfile = test.m_parser;
+  string_concat_db *concats = &test.m_concats;
+  cpp_substring_ranges ranges;
+  const char *actual_err
+    = get_substring_ranges_for_loc (pfile, concats, strloc,
+                                   type, ranges);
+  if (should_have_column_data_p (strloc))
+    ASSERT_STREQ_AT (loc, expected_err, actual_err);
+  else
+    ASSERT_STREQ_AT (loc,
+                    "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                    actual_err);
+}
+
+#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
+    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
+                                   (STRLOC), (TYPE), (ERR))
+
+/* Lex a simple string literal.  Verify the substring location data, before
+   and after running cpp_interpret_string on it.  */
+
+static void
+test_lexer_string_locations_simple (const line_table_case &case_)
+{
+  /* Digits 0-9 (with 0 at column 10), the simple way.
+     ....................000000000.11111111112.2222222223333333333
+     ....................123456789.01234567890.1234567890123456789
+     We add a trailing comment to ensure that we correctly locate
+     the end of the string literal token.  */
+  const char *content = "        \"0123456789\" /* not a string */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
+
+  /* At this point in lexing, the quote characters are treated as part of
+     the string (they are stripped off by cpp_interpret_string).  */
+
+  ASSERT_EQ (tok->val.str.len, 12);
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.  */
+  for (int i = 0; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
+                         10 + i, 10 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* As test_lexer_string_locations_simple, but use an EBCDIC execution
+   encoding.  */
+
+static void
+test_lexer_string_locations_ebcdic (const line_table_case &case_)
+{
+  /* EBCDIC support requires iconv.  */
+  if (!HAVE_ICONV)
+    return;
+
+  /* Digits 0-9 (with 0 at column 10), the simple way.
+     ....................000000000.11111111112.2222222223333333333
+     ....................123456789.01234567890.1234567890123456789
+     We add a trailing comment to ensure that we correctly locate
+     the end of the string literal token.  */
+  const char *content = "        \"0123456789\" /* not a string */\n";
+  ebcdic_execution_charset use_ebcdic;
+  lexer_test test (case_, content, &use_ebcdic);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
+
+  /* At this point in lexing, the quote characters are treated as part of
+     the string (they are stripped off by cpp_interpret_string).  */
+
+  ASSERT_EQ (tok->val.str.len, 12);
+
+  /* The remainder of the test requires an iconv implementation that
+     can convert from UTF-8 to the EBCDIC encoding requested above.  */
+  if (use_ebcdic.iconv_errors_occurred_p ())
+    return;
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  /* We should now have EBCDIC-encoded text, specifically
+     IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
+     The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
+  ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
+               (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify that we don't attempt to record substring location information
+     for such cases.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES
+    (test, tok->src_loc, type,
+     "execution character set != source character set");
+}
+
+/* Lex a string literal containing a hex-escaped character.
+   Verify the substring location data, before and after running
+   cpp_interpret_string on it.  */
+
+static void
+test_lexer_string_locations_hex (const line_table_case &case_)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
+     and with a space in place of digit 6, to terminate the escaped
+     hex code.
+     ....................000000000.111111.11112222.
+     ....................123456789.012345.67890123.  */
+  const char *content = "        \"01234\\x35 789\"\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
+  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
+
+  /* At this point in lexing, the quote characters are treated as part of
+     the string (they are stripped off by cpp_interpret_string).  */
+  ASSERT_EQ (tok->val.str.len, 15);
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.  */
+  for (int i = 0; i <= 4; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
+  for (int i = 6; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* Lex a string literal containing an octal-escaped character.
+   Verify the substring location data after running cpp_interpret_string
+   on it.  */
+
+static void
+test_lexer_string_locations_oct (const line_table_case &case_)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as "\065"
+     and with a space in place of digit 6, to terminate the escaped
+     octal code.
+     ....................000000000.111111.11112222.2222223333333333444
+     ....................123456789.012345.67890123.4567890123456789012  */
+  const char *content = "        \"01234\\065 789\" /* not a string */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.  */
+  for (int i = 0; i < 5; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
+  for (int i = 6; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* Test of string literal containing letter escapes.  */
+
+static void
+test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
+{
+  /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
+     .....................000000000.1.11111.1.1.11222.22222223333333
+     .....................123456789.0.12345.6.7.89012.34567890123456.  */
+  const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected tokens back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
+
+  /* Verify ranges of individual characters. */
+  /* "\t".  */
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                       0, 1, 10, 11);
+  /* "foo". */
+  for (int i = 1; i <= 3; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                         i, 1, 11 + i, 11 + i);
+  /* "\\" and "\n".  */
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                       4, 1, 15, 16);
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                       5, 1, 17, 18);
+
+  /* "bar".  */
+  for (int i = 6; i <= 8; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                         i, 1, 13 + i, 13 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 9);
+}
+
+/* Another test of a string literal containing a letter escape.
+   Based on string seen in
+     printf ("%-%\n");
+   in gcc.dg/format/c90-printf-1.c.  */
+
+static void
+test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
+{
+  /* .....................000000000.1111.11.1111.22222222223.
+     .....................123456789.0123.45.6789.01234567890.  */
+  const char *content = ("        \"%-%\\n\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected tokens back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
+
+  /* Verify ranges of individual characters. */
+  /* "%-%".  */
+  for (int i = 0; i < 3; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                         i, 1, 10 + i, 10 + i);
+  /* "\n".  */
+  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                       3, 1, 13, 14);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 4);
+}
+
+/* Lex a string literal containing UCN 4 characters.
+   Verify the substring location data after running cpp_interpret_string
+   on it.  */
+
+static void
+test_lexer_string_locations_ucn4 (const line_table_case &case_)
+{
+  /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
+     as UCN 4.
+     ....................000000000.111111.111122.222222223.33333333344444
+     ....................123456789.012345.678901.234567890.12345678901234  */
+  const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
+
+  /* Verify that cpp_interpret_string works.
+     The string should be encoded in the execution character
+     set.  Assuming that that is UTF-8, we should have the following:
+     -----------  ----  -----  -------  ----------------
+     Byte offset  Byte  Octal  Unicode  Source Column(s)
+     -----------  ----  -----  -------  ----------------
+     0            0x30         '0'      10
+     1            0x31         '1'      11
+     2            0x32         '2'      12
+     3            0x33         '3'      13
+     4            0x34         '4'      14
+     5            0xE2  \342   U+2174   15-20
+     6            0x85  \205    (cont)  15-20
+     7            0xB4  \264    (cont)  15-20
+     8            0xE2  \342   U+2175   21-26
+     9            0x85  \205    (cont)  21-26
+     10           0xB5  \265    (cont)  21-26
+     11           0x37         '7'      27
+     12           0x38         '8'      28
+     13           0x39         '9'      29
+     -----------  ----  -----  -------  ---------------.  */
+
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("01234\342\205\264\342\205\265789",
+               (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.
+     '01234'.  */
+  for (int i = 0; i <= 4; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+  /* U+2174.  */
+  for (int i = 5; i <= 7; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
+  /* U+2175.  */
+  for (int i = 8; i <= 10; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
+  /* '789'.  */
+  for (int i = 11; i <= 13; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 14);
+}
+
+/* Lex a string literal containing UCN 8 characters.
+   Verify the substring location data after running cpp_interpret_string
+   on it.  */
+
+static void
+test_lexer_string_locations_ucn8 (const line_table_case &case_)
+{
+  /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
+     ....................000000000.111111.1111222222.2222333333333.344444
+     ....................123456789.012345.6789012345.6789012345678.901234  */
+  const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
+                          "\"01234\\U00002174\\U00002175789\"");
+
+  /* Verify that cpp_interpret_string works.
+     The UTF-8 encoding of the string is identical to that from
+     the ucn4 testcase above; the only difference is the column
+     locations.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("01234\342\205\264\342\205\265789",
+               (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.
+     '01234'.  */
+  for (int i = 0; i <= 4; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+  /* U+2174.  */
+  for (int i = 5; i <= 7; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
+  /* U+2175.  */
+  for (int i = 8; i <= 10; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
+  /* '789' at columns 35-37  */
+  for (int i = 11; i <= 13; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 14);
+}
+
+/* Fetch a big-endian 32-bit value and convert to host endianness.  */
+
+static uint32_t
+uint32_from_big_endian (const uint32_t *ptr_be_value)
+{
+  const unsigned char *buf = (const unsigned char *)ptr_be_value;
+  return (((uint32_t) buf[0] << 24)
+         | ((uint32_t) buf[1] << 16)
+         | ((uint32_t) buf[2] << 8)
+         | (uint32_t) buf[3]);
+}
+
+/* Lex a wide string literal and verify that attempts to read substring
+   location data from it fail gracefully.  */
+
+static void
+test_lexer_string_locations_wide_string (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     ....................000000000.11111111112.22222222233333
+     ....................123456789.01234567890.12345678901234  */
+  const char *content = "       L\"0123456789\" /* non-str */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_WSTRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
+
+  /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_WSTRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  /* The cpp_reader defaults to big-endian with
+     CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
+     now be encoded as UTF-32BE.  */
+  const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
+  ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
+  ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
+  ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
+  ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* We don't yet support generating substring location information
+     for L"" strings.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES
+    (test, tok->src_loc, type,
+     "execution character set != source character set");
+}
+
+/* Fetch a big-endian 16-bit value and convert to host endianness.  */
+
+static uint16_t
+uint16_from_big_endian (const uint16_t *ptr_be_value)
+{
+  const unsigned char *buf = (const unsigned char *)ptr_be_value;
+  return ((uint16_t) buf[0] << 8) | (uint16_t) buf[1];
+}
+
+/* Lex a u"" string literal and verify that attempts to read substring
+   location data from it fail gracefully.  */
+
+static void
+test_lexer_string_locations_string16 (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     ....................000000000.11111111112.22222222233333
+     ....................123456789.01234567890.12345678901234  */
+  const char *content = "       u\"0123456789\" /* non-str */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING16);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
+
+  /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING16;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+
+  /* The cpp_reader defaults to big-endian, so dst_string should
+     now be encoded as UTF-16BE.  */
+  const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
+  ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
+  ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
+  ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
+  ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* We don't yet support generating substring location information
+     for L"" strings.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES
+    (test, tok->src_loc, type,
+     "execution character set != source character set");
+}
+
+/* Lex a U"" string literal and verify that attempts to read substring
+   location data from it fail gracefully.  */
+
+static void
+test_lexer_string_locations_string32 (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     ....................000000000.11111111112.22222222233333
+     ....................123456789.01234567890.12345678901234  */
+  const char *content = "       U\"0123456789\" /* non-str */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING32);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
+
+  /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING32;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+
+  /* The cpp_reader defaults to big-endian, so dst_string should
+     now be encoded as UTF-32BE.  */
+  const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
+  ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
+  ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
+  ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
+  ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* We don't yet support generating substring location information
+     for L"" strings.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES
+    (test, tok->src_loc, type,
+     "execution character set != source character set");
+}
+
+/* Lex a u8-string literal.
+   Verify the substring location data after running cpp_interpret_string
+   on it.  */
+
+static void
+test_lexer_string_locations_u8 (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     ....................000000000.11111111112.22222222233333
+     ....................123456789.01234567890.12345678901234  */
+  const char *content = "      u8\"0123456789\" /* non-str */\n";
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_UTF8STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.  */
+  for (int i = 0; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+}
+
+/* Lex a string literal containing UTF-8 source characters.
+   Verify the substring location data after running cpp_interpret_string
+   on it.  */
+
+static void
+test_lexer_string_locations_utf8_source (const line_table_case &case_)
+{
+ /* This string literal is written out to the source file as UTF-8,
+    and is of the form "before mojibake after", where "mojibake"
+    is written as the following four unicode code points:
+       U+6587 CJK UNIFIED IDEOGRAPH-6587
+       U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+       U+5316 CJK UNIFIED IDEOGRAPH-5316
+       U+3051 HIRAGANA LETTER KE.
+     Each of these is 3 bytes wide when encoded in UTF-8, whereas the
+     "before" and "after" are 1 byte per unicode character.
+
+     The numbering shown are "columns", which are *byte* numbers within
+     the line, rather than unicode character numbers.
+
+     .................... 000000000.1111111.
+     .................... 123456789.0123456.  */
+  const char *content = ("        \"before "
+                        /* U+6587 CJK UNIFIED IDEOGRAPH-6587
+                             UTF-8: 0xE6 0x96 0x87
+                             C octal escaped UTF-8: \346\226\207
+                           "column" numbers: 17-19.  */
+                        "\346\226\207"
+
+                        /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+                             UTF-8: 0xE5 0xAD 0x97
+                             C octal escaped UTF-8: \345\255\227
+                           "column" numbers: 20-22.  */
+                        "\345\255\227"
+
+                        /* U+5316 CJK UNIFIED IDEOGRAPH-5316
+                             UTF-8: 0xE5 0x8C 0x96
+                             C octal escaped UTF-8: \345\214\226
+                           "column" numbers: 23-25.  */
+                        "\345\214\226"
+
+                        /* U+3051 HIRAGANA LETTER KE
+                             UTF-8: 0xE3 0x81 0x91
+                             C octal escaped UTF-8: \343\201\221
+                           "column" numbers: 26-28.  */
+                        "\343\201\221"
+
+                        /* column numbers 29 onwards
+                         2333333.33334444444444
+                         9012345.67890123456789. */
+                        " after\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back, with the correct
+     location information.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ
+    (test.m_parser, tok,
+     "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ
+    ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
+     (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Verify ranges of individual characters.  This no longer includes the
+     quotes.
+     Assuming that both source and execution encodings are UTF-8, we have
+     a run of 25 octets in each.  */
+  for (int i = 0; i < 25; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 25);
+}
+
+/* Test of string literal concatenation.  */
+
+static void
+test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     .....................000000000.111111.11112222222222
+     .....................123456789.012345.67890123456789.  */
+  const char *content = ("        \"01234\" /* non-str */\n"
+                        "        \"56789\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  location_t input_locs[2];
+
+  /* Verify that we get the expected tokens back.  */
+  auto_vec <cpp_string> input_strings;
+  const cpp_token *tok_a = test.get_token ();
+  ASSERT_EQ (tok_a->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
+  input_strings.safe_push (tok_a->val.str);
+  input_locs[0] = tok_a->src_loc;
+
+  const cpp_token *tok_b = test.get_token ();
+  ASSERT_EQ (tok_b->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
+  input_strings.safe_push (tok_b->val.str);
+  input_locs[1] = tok_b->src_loc;
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser,
+                                     input_strings.address (), 2,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Simulate c-lex.c's lex_string in order to record concatenation.  */
+  test.m_concats.record_string_concatenation (2, input_locs);
+
+  location_t initial_loc = input_locs[0];
+
+  for (int i = 0; i <= 4; i++)
+    ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
+  for (int i = 5; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Another test of string literal concatenation.  */
+
+static void
+test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     .....................000000000.111.11111112222222
+     .....................123456789.012.34567890123456.  */
+  const char *content = ("        \"01\" /* non-str */\n"
+                        "        \"23\" /* non-str */\n"
+                        "        \"45\" /* non-str */\n"
+                        "        \"67\" /* non-str */\n"
+                        "        \"89\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  auto_vec <cpp_string> input_strings;
+  location_t input_locs[5];
+
+  /* Verify that we get the expected tokens back.  */
+  for (int i = 0; i < 5; i++)
+    {
+      const cpp_token *tok = test.get_token ();
+      ASSERT_EQ (tok->type, CPP_STRING);
+      input_strings.safe_push (tok->val.str);
+      input_locs[i] = tok->src_loc;
+    }
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser,
+                                     input_strings.address (), 5,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Simulate c-lex.c's lex_string in order to record concatenation.  */
+  test.m_concats.record_string_concatenation (5, input_locs);
+
+  location_t initial_loc = input_locs[0];
+
+  /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
+     detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
+     and expect get_source_range_for_substring to fail.
+     However, for a string concatenation test, we can have a case
+     where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
+     but subsequent strings can be after it.
+     Attempting to detect this within assert_char_at_range
+     would overcomplicate the logic for the common test cases, so
+     we detect it here.  */
+  if (should_have_column_data_p (input_locs[0])
+      && !should_have_column_data_p (input_locs[4]))
+    {
+      /* Verify that get_source_range_for_substring gracefully rejects
+        this case.  */
+      source_range actual_range;
+      const char *err
+       = get_source_range_for_substring (test.m_parser, &test.m_concats,
+                                         initial_loc, type, 0, 0,
+                                         &actual_range);
+      ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
+      return;
+    }
+
+  for (int i = 0; i < 5; i++)
+    for (int j = 0; j < 2; j++)
+      ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
+                           i + 1, 10 + j, 10 + j);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Another test of string literal concatenation, this time combined with
+   various kinds of escaped characters.  */
+
+static void
+test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
+     digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
+  const char *content
+    /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
+       .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
+    = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
+  lexer_test test (case_, content, NULL);
+
+  auto_vec <cpp_string> input_strings;
+  location_t input_locs[4];
+
+  /* Verify that we get the expected tokens back.  */
+  for (int i = 0; i < 4; i++)
+    {
+      const cpp_token *tok = test.get_token ();
+      ASSERT_EQ (tok->type, CPP_STRING);
+      input_strings.safe_push (tok->val.str);
+      input_locs[i] = tok->src_loc;
+    }
+
+  /* Verify that cpp_interpret_string works.  */
+  cpp_string dst_string;
+  const enum cpp_ttype type = CPP_STRING;
+  bool result = cpp_interpret_string (test.m_parser,
+                                     input_strings.address (), 4,
+                                     &dst_string, type);
+  ASSERT_TRUE (result);
+  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+  free (const_cast <unsigned char *> (dst_string.text));
+
+  /* Simulate c-lex.c's lex_string in order to record concatenation.  */
+  test.m_concats.record_string_concatenation (4, input_locs);
+
+  location_t initial_loc = input_locs[0];
+
+  for (int i = 0; i <= 4; i++)
+    ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
+  ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
+  ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
+  for (int i = 7; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Test of string literal in a macro.  */
+
+static void
+test_lexer_string_locations_macro (const line_table_case &case_)
+{
+  /* Digits 0-9.
+     .....................0000000001111111111.22222222223.
+     .....................1234567890123456789.01234567890.  */
+  const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
+                        "  MACRO");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected tokens back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_PADDING);
+
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+
+  /* Verify ranges of individual characters.  We ought to
+     see columns within the macro definition.  */
+  for (int i = 0; i <= 9; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                         i, 1, 20 + i, 20 + i);
+
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
+
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_PADDING);
+}
+
+/* Test of stringification of a macro argument.  */
+
+static void
+test_lexer_string_locations_stringified_macro_argument
+  (const line_table_case &case_)
+{
+  /* .....................000000000111111111122222222223.
+     .....................123456789012345678901234567890.  */
+  const char *content = ("#define MACRO(X) #X /* non-str */\n"
+                        "MACRO(foo)\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_PADDING);
+
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
+
+  /* We don't support getting the location of a stringified macro
+     argument.  Verify that it fails gracefully.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
+                                 "cpp_interpret_string_1 failed");
+
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_PADDING);
+
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_PADDING);
+}
+
+/* Ensure that we are fail gracefully if something attempts to pass
+   in a location that isn't a string literal token.  Seen on this code:
+
+     const char a[] = " %d ";
+     __builtin_printf (a, 0.5);
+                       ^
+
+   when c-format.c erroneously used the indicated one-character
+   location as the format string location, leading to a read past the
+   end of a string buffer in cpp_interpret_string_1.  */
+
+static void
+test_lexer_string_locations_non_string (const line_table_case &case_)
+{
+  /* .....................000000000111111111122222222223.
+     .....................123456789012345678901234567890.  */
+  const char *content = ("         a\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_NAME);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
+
+  /* At this point, libcpp is attempting to interpret the name as a
+     string literal, despite it not starting with a quote.  We don't detect
+     that, but we should at least fail gracefully.  */
+  ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
+                                 "cpp_interpret_string_1 failed");
+}
+
+/* Ensure that we can read substring information for a token which
+   starts in one linemap and ends in another .  Adapted from
+   gcc.dg/cpp/pr69985.c.  */
+
+static void
+test_lexer_string_locations_long_line (const line_table_case &case_)
+{
+  /* .....................000000.000111111111
+     .....................123456.789012346789.  */
+  const char *content = ("/* A very long line, so that we start a new line map.  */\n"
+                        "     \"0123456789012345678901234567890123456789"
+                        "0123456789012345678901234567890123456789"
+                        "0123456789012345678901234567890123456789"
+                        "0123456789\"\n");
+
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected token back.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_STRING);
+
+  if (!should_have_column_data_p (line_table->highest_location))
+    return;
+
+  /* Verify ranges of individual characters.  */
+  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 130);
+  for (int i = 0; i < 130; i++)
+    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+                         i, 2, 7 + i, 7 + i);
+}
+
+/* Test of lexing char constants.  */
+
+static void
+test_lexer_char_constants (const line_table_case &case_)
+{
+  /* Various char constants.
+     .....................0000000001111111111.22222222223.
+     .....................1234567890123456789.01234567890.  */
+  const char *content = ("         'a'\n"
+                        "        u'a'\n"
+                        "        U'a'\n"
+                        "        L'a'\n"
+                        "         'abc'\n");
+  lexer_test test (case_, content, NULL);
+
+  /* Verify that we get the expected tokens back.  */
+  /* 'a'.  */
+  const cpp_token *tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_CHAR);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
+
+  unsigned int chars_seen;
+  int unsignedp;
+  cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
+                                         &chars_seen, &unsignedp);
+  ASSERT_EQ (cc, 'a');
+  ASSERT_EQ (chars_seen, 1);
+
+  /* u'a'.  */
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_CHAR16);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
+
+  /* U'a'.  */
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_CHAR32);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
+
+  /* L'a'.  */
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_WCHAR);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
+
+  /* 'abc' (c-char-sequence).  */
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_CHAR);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
+}
  /* A table of interesting location_t values, giving one axis of our test
     matrix.  */
  
@@ -1599,6 +3125,27 @@ input_c_tests ()
           /* Run all tests for the given case within the test matrix.  */
           test_accessing_ordinary_linemaps (c);
           test_lexer (c);
+         test_lexer_string_locations_simple (c);
+         test_lexer_string_locations_ebcdic (c);
+         test_lexer_string_locations_hex (c);
+         test_lexer_string_locations_oct (c);
+         test_lexer_string_locations_letter_escape_1 (c);
+         test_lexer_string_locations_letter_escape_2 (c);
+         test_lexer_string_locations_ucn4 (c);
+         test_lexer_string_locations_ucn8 (c);
+         test_lexer_string_locations_wide_string (c);
+         test_lexer_string_locations_string16 (c);
+         test_lexer_string_locations_string32 (c);
+         test_lexer_string_locations_u8 (c);
+         test_lexer_string_locations_utf8_source (c);
+         test_lexer_string_locations_concatenation_1 (c);
+         test_lexer_string_locations_concatenation_2 (c);
+         test_lexer_string_locations_concatenation_3 (c);
+         test_lexer_string_locations_macro (c);
+         test_lexer_string_locations_stringified_macro_argument (c);
+         test_lexer_string_locations_non_string (c);
+         test_lexer_string_locations_long_line (c);
+         test_lexer_char_constants (c);
  
           num_cases_tested++;
         }
diff --git a/gcc/input.h b/gcc/input.h

index d51f950bcc96da7aeade1b8497e527b98df023de..c17e440e576fc6ab90900b492cdf5823ed223763 100644 (file)
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -95,4 +95,39 @@ void dump_location_info (FILE *stream);
  
  void diagnostics_file_cache_fini (void);
  
+struct GTY(()) string_concat
+{
+  string_concat (int num, location_t *locs);
+
+  int m_num;
+  location_t * GTY ((atomic)) m_locs;
+};
+
+struct location_hash : int_hash <location_t, UNKNOWN_LOCATION> { };
+
+class GTY(()) string_concat_db
+{
+ public:
+  string_concat_db ();
+  void record_string_concatenation (int num, location_t *locs);
+
+  bool get_string_concatenation (location_t loc,
+                                int *out_num,
+                                location_t **out_locs);
+
+ private:
+  static location_t get_key_loc (location_t loc);
+
+  /* For the fields to be private, we must grant access to the
+     generated code in gtype-desc.c.  */
+
+  friend void ::gt_ggc_mx_string_concat_db (void *x_p);
+  friend void ::gt_pch_nx_string_concat_db (void *x_p);
+  friend void ::gt_pch_p_16string_concat_db (void *this_obj, void *x_p,
+                                            gt_pointer_operator op,
+                                            void *cookie);
+
+  hash_map <location_hash, string_concat *> *m_table;
+};
+
  #endif
diff --git a/gcc/substring-locations.h b/gcc/substring-locations.h

new file mode 100644 (file)

index 0000000..274ebbe
--- /dev/null
+++ b/gcc/substring-locations.h
@@ -0,0 +1,30 @@
+/* Source locations within string literals.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_SUBSTRING_LOCATIONS_H
+#define GCC_SUBSTRING_LOCATIONS_H
+
+extern const char *get_source_range_for_substring (cpp_reader *pfile,
+                                                  string_concat_db *concats,
+                                                  location_t strloc,
+                                                  enum cpp_ttype type,
+                                                  int start_idx, int end_idx,
+                                                  source_range *out_range);
+
+#endif /* ! GCC_SUBSTRING_LOCATIONS_H */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 25af7832a593d9a6745fc88e1c4fe9433c7b8433..997efac8af0911dd1d7de50b48532473e8655ec0 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2016-08-05  David Malcolm  <dmalcolm@redhat.com>
+
+       * gcc.dg/plugin/diagnostic-test-string-literals-1.c: New file.
+       * gcc.dg/plugin/diagnostic-test-string-literals-2.c: New file.
+       * gcc.dg/plugin/diagnostic_plugin_test_string_literals.c: New file.
+       * gcc.dg/plugin/plugin.exp (plugin_test_list): Add the above new files.
+
  2016-08-05  Patrick Palka  <ppalka@gcc.gnu.org>
  
         PR tree-optimization/72810
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c

new file mode 100644 (file)

index 0000000..82689b4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c
@@ -0,0 +1,211 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdiagnostics-show-caret" } */
+
+/* This is a collection of unittests for ranges within string literals,
+   using diagnostic_plugin_test_string_literals, which handles
+   "__emit_string_literal_range" by generating a warning at the given
+   subset of a string literal.
+
+   The indices are 0-based.  It's easiest to verify things using string
+   literals that are runs of 0-based digits (to avoid having to count
+   characters).
+
+   LITERAL is a const void * to allow testing the various kinds of wide
+   string literal, rather than just const char *.  */
+
+extern void __emit_string_literal_range (const void *literal,
+                                        int start_idx, int end_idx);
+
+void
+test_simple_string_literal (void)
+{
+  __emit_string_literal_range ("0123456789", /* { dg-warning "range" } */
+                              6, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("0123456789",
+                                       ^~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_concatenated_string_literal (void)
+{
+  __emit_string_literal_range ("01234" "56789", /* { dg-warning "range" } */
+                              3, 6);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234" "56789",
+                                    ^~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_multiline_string_literal (void)
+{
+  __emit_string_literal_range ("01234" /* { dg-warning "range" } */
+                               "56789",
+                               3, 6);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234"
+                                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                "56789",
+                                ~~~  
+   { dg-end-multiline-output "" } */
+  /* FIXME: why does the above need two trailing spaces?  */
+}
+
+/* Tests of various unicode encodings.
+
+   Digits 0 through 9 are unicode code points:
+      U+0030 DIGIT ZERO
+      ...
+      U+0039 DIGIT NINE
+   However, these are not always valid as UCN (see the comment in
+   libcpp/charset.c:_cpp_valid_ucn).
+
+   Hence we need to test UCN using an alternative unicode
+   representation of numbers; let's use Roman numerals,
+   (though these start at one, not zero):
+      U+2170 SMALL ROMAN NUMERAL ONE
+      ...
+      U+2174 SMALL ROMAN NUMERAL FIVE  ("v")
+      U+2175 SMALL ROMAN NUMERAL SIX   ("vi")
+      ...
+      U+2178 SMALL ROMAN NUMERAL NINE.  */
+
+void
+test_hex (void)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
+     and with a space in place of digit 6, to terminate the escaped
+     hex code.  */
+  __emit_string_literal_range ("01234\x35 789", /* { dg-warning "range" } */
+                              3, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234\x35 789"
+                                    ^~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_oct (void)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as "\065"
+     and with a space in place of digit 6, to terminate the escaped
+     octal code.  */
+  __emit_string_literal_range ("01234\065 789", /* { dg-warning "range" } */
+                              3, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234\065 789"
+                                    ^~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_multiple (void)
+{
+  /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
+     digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
+  __emit_string_literal_range ("01234"  "\x35"  "\066"  "789", /* { dg-warning "range" } */
+                              3, 8);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234"  "\x35"  "\066"  "789",
+                                    ^~~~~~~~~~~~~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_ucn4 (void)
+{
+  /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
+     as UCN 4.
+     The resulting string is encoded as UTF-8.  Most of the digits are 1 byte
+     each, but digits 5 and 6 are encoded with 3 bytes each.
+     Hence to underline digits 4-7 we need to underling using bytes 4-11 in
+     the UTF-8 encoding.  */
+  __emit_string_literal_range ("01234\u2174\u2175789", /* { dg-warning "range" } */
+                              4, 11);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234\u2174\u2175789",
+                                     ^~~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_ucn8 (void)
+{
+  /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
+     The resulting string is the same as as in test_ucn4 above, and hence
+     has the same UTF-8 encoding, and so we again need to underline bytes
+     4-11 in the UTF-8 encoding in order to underline digits 4-7.  */
+  __emit_string_literal_range ("01234\U00002174\U00002175789", /* { dg-warning "range" } */
+                              4, 11);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range ("01234\U00002174\U00002175789",
+                                     ^~~~~~~~~~~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_u8 (void)
+{
+  /* Digits 0-9.  */
+  __emit_string_literal_range (u8"0123456789", /* { dg-warning "range" } */
+                              4, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range (u8"0123456789",
+                                       ^~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_u (void)
+{
+  /* Digits 0-9.  */
+  __emit_string_literal_range (u"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+                              4, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range (u"0123456789",
+                                ^~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_U (void)
+{
+  /* Digits 0-9.  */
+  __emit_string_literal_range (U"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+                              4, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range (U"0123456789",
+                                ^~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_L (void)
+{
+  /* Digits 0-9.  */
+  __emit_string_literal_range (L"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+                              4, 7);
+/* { dg-begin-multiline-output "" }
+   __emit_string_literal_range (L"0123456789",
+                                ^~~~~~~~~~~~~
+   { dg-end-multiline-output "" } */
+}
+
+void
+test_macro (void)
+{
+#define START "01234"  /* { dg-warning "range" } */
+  __emit_string_literal_range (START
+                               "56789",
+                               3, 6);
+/* { dg-begin-multiline-output "" }
+ #define START "01234"
+                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+   __emit_string_literal_range (START
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                                "56789",
+                                ~~~
+   { dg-end-multiline-output "" } */
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c

new file mode 100644 (file)

index 0000000..7851c02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+
+/* See the notes in diagnostic-test-string-literals-1.c.
+   This test case has caret-printing disabled.  */
+
+extern void __emit_string_literal_range (const void *literal,
+                                        int start_idx, int end_idx);
+/* Test of a stringified macro argument, by itself.  */
+
+void
+test_stringified_token_1 (int x)
+{
+#define STRINGIFY(EXPR) #EXPR
+
+  __emit_string_literal_range (STRINGIFY(x > 0), /* { dg-error "unable to read substring range: macro expansion" } */
+                               0, 4);
+
+#undef STRINGIFY
+}
+
+/* Test of a stringified token within a concatenation.  */
+
+void
+test_stringized_token_2 (int x)
+{
+#define EXAMPLE(EXPR, START_IDX, END_IDX)                      \
+  do {                                                         \
+    __emit_string_literal_range ("  before " #EXPR " after \n",        \
+                                START_IDX, END_IDX);           \
+  } while (0)
+
+  EXAMPLE(x > 0, 1, 6);
+  /* { dg-error "unable to read substring range: cpp_interpret_string_1 failed" "" { target *-*-* } 28 } */
+
+#undef EXAMPLE
+}
+
+/* Test of a doubly-stringified macro argument (by itself).  */
+
+void
+test_stringified_token_3 (int x)
+{
+#define XSTR(s) STR(s)
+#define STR(s) #s
+#define FOO 123456789
+  __emit_string_literal_range (XSTR (FOO), /* { dg-error "unable to read substring range: macro expansion" } */
+                               2, 3);
+
+#undef XSTR
+#undef STR
+#undef FOO
+}
+
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c b/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c

new file mode 100644 (file)

index 0000000..d44612a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c
@@ -0,0 +1,212 @@
+/* This plugin uses the diagnostics code to verify tracking of source code
+   locations within string literals.  */
+/* { dg-options "-O" } */
+
+#include "gcc-plugin.h"
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "toplev.h"
+#include "basic-block.h"
+#include "hash-table.h"
+#include "vec.h"
+#include "ggc.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "intl.h"
+#include "plugin-version.h"
+#include "c-family/c-common.h"
+#include "diagnostic.h"
+#include "context.h"
+#include "print-tree.h"
+#include "cpplib.h"
+#include "c-family/c-pragma.h"
+
+int plugin_is_GPL_compatible;
+
+/* A custom pass for printing string literal location information.  */
+
+const pass_data pass_data_test_string_literals =
+{
+  GIMPLE_PASS, /* type */
+  "test_string_literals", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_test_string_literals : public gimple_opt_pass
+{
+public:
+  pass_test_string_literals(gcc::context *ctxt)
+    : gimple_opt_pass(pass_data_test_string_literals, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) { return true; }
+  virtual unsigned int execute (function *);
+
+}; // class pass_test_string_literals
+
+/* Determine if STMT is a call with NUM_ARGS arguments to a function
+   named FUNCNAME.
+   If so, return STMT as a gcall *.  Otherwise return NULL.  */
+
+static gcall *
+check_for_named_call (gimple *stmt,
+                     const char *funcname, unsigned int num_args)
+{
+  gcc_assert (funcname);
+
+  gcall *call = dyn_cast <gcall *> (stmt);
+  if (!call)
+    return NULL;
+
+  tree fndecl = gimple_call_fndecl (call);
+  if (!fndecl)
+    return NULL;
+
+  if (strcmp (IDENTIFIER_POINTER (DECL_NAME (fndecl)), funcname))
+    return NULL;
+
+  if (gimple_call_num_args (call) != num_args)
+    {
+      error_at (stmt->location, "expected number of args: %i (got %i)",
+               num_args, gimple_call_num_args (call));
+      return NULL;
+    }
+
+  return call;
+}
+
+/* Emit a warning covering SRC_RANGE, with the caret at the start of
+   SRC_RANGE.  */
+
+static void
+emit_warning (source_range src_range)
+{
+  location_t loc
+    = make_location (src_range.m_start, src_range.m_start, src_range.m_finish);
+  warning_at (loc, 0, "range %i:%i-%i:%i",
+             LOCATION_LINE (src_range.m_start),
+             LOCATION_COLUMN (src_range.m_start),
+             LOCATION_LINE (src_range.m_finish),
+             LOCATION_COLUMN (src_range.m_finish));
+}
+
+/* Support code for verifying that we are correctly tracking ranges
+   within string literals, for use by diagnostic-test-string-literals-*.c.
+   Emit a warning showing the range of a string literal, for each call to
+   a function named "__emit_string_literal_range".
+   The initial argument should be a string literal; arguments 2 and 3
+   should be integer constants, giving the range within the string
+   to be printed.  */
+
+static void
+test_string_literals (gimple *stmt)
+{
+  gcall *call = check_for_named_call (stmt, "__emit_string_literal_range", 3);
+  if (!call)
+    return;
+
+  /* We expect an ADDR_EXPR with a STRING_CST inside it for the
+     initial arg.  */
+  tree t_addr_string = gimple_call_arg (call, 0);
+  if (TREE_CODE (t_addr_string) != ADDR_EXPR)
+    {
+      error_at (call->location, "string literal required for arg 1");
+      return;
+    }
+
+  tree t_string = TREE_OPERAND (t_addr_string, 0);
+  if (TREE_CODE (t_string) != STRING_CST)
+    {
+      error_at (call->location, "string literal required for arg 1");
+      return;
+    }
+
+  tree t_start_idx = gimple_call_arg (call, 1);
+  if (TREE_CODE (t_start_idx) != INTEGER_CST)
+    {
+      error_at (call->location, "integer constant required for arg 2");
+      return;
+    }
+  int start_idx = TREE_INT_CST_LOW (t_start_idx);
+
+  tree t_end_idx = gimple_call_arg (call, 2);
+  if (TREE_CODE (t_end_idx) != INTEGER_CST)
+    {
+      error_at (call->location, "integer constant required for arg 3");
+      return;
+    }
+  int end_idx = TREE_INT_CST_LOW (t_end_idx);
+
+  /* A STRING_CST doesn't have a location, but the ADDR_EXPR does.  */
+  location_t strloc = EXPR_LOCATION (t_addr_string);
+  source_range src_range;
+  substring_loc substr_loc (strloc, TREE_TYPE (t_string),
+                           start_idx, end_idx);
+  const char *err = substr_loc.get_range (&src_range);
+  if (err)
+    error_at (strloc, "unable to read substring range: %s", err);
+  else
+    emit_warning (src_range);
+}
+
+/* Call test_string_literals on every statement within FUN.  */
+
+unsigned int
+pass_test_string_literals::execute (function *fun)
+{
+  gimple_stmt_iterator gsi;
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, fun)
+    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+       gimple *stmt = gsi_stmt (gsi);
+       test_string_literals (stmt);
+      }
+
+  return 0;
+}
+
+/* Entrypoint for the plugin.  Create and register the custom pass.  */
+
+int
+plugin_init (struct plugin_name_args *plugin_info,
+            struct plugin_gcc_version *version)
+{
+  struct register_pass_info pass_info;
+  const char *plugin_name = plugin_info->base_name;
+  int argc = plugin_info->argc;
+  struct plugin_argument *argv = plugin_info->argv;
+
+  if (!plugin_default_version_check (version, &gcc_version))
+    return 1;
+
+  pass_info.pass = new pass_test_string_literals (g);
+  pass_info.reference_pass_name = "ssa";
+  pass_info.ref_pass_instance_number = 1;
+  pass_info.pos_op = PASS_POS_INSERT_AFTER;
+  register_callback (plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL,
+                    &pass_info);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/plugin.exp b/gcc/testsuite/gcc.dg/plugin/plugin.exp

index faebb75d114474b4044eddce30a4c0cc44aa1d7b..715038a02794200420925b9046a95add35eca9e1 100644 (file)
--- a/gcc/testsuite/gcc.dg/plugin/plugin.exp
+++ b/gcc/testsuite/gcc.dg/plugin/plugin.exp
@@ -70,6 +70,9 @@ set plugin_test_list [list \
           diagnostic-test-expressions-1.c } \
      { diagnostic_plugin_show_trees.c \
           diagnostic-test-show-trees-1.c } \
+    { diagnostic_plugin_test_string_literals.c \
+         diagnostic-test-string-literals-1.c \
+         diagnostic-test-string-literals-2.c } \
      { location_overflow_plugin.c \
           location-overflow-test-1.c \
           location-overflow-test-2.c } \
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog

index da3df67b8a872db4858e8703da0b83779c1899c9..3a530b7133af7c1fd5744eb3e6d1502d18fcebed 100644 (file)
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,49 @@
+2016-08-05  David Malcolm  <dmalcolm@redhat.com>
+
+       * charset.c (cpp_substring_ranges::cpp_substring_ranges): New
+       constructor.
+       (cpp_substring_ranges::~cpp_substring_ranges): New destructor.
+       (cpp_substring_ranges::add_range): New method.
+       (cpp_substring_ranges::add_n_ranges): New method.
+       (_cpp_valid_ucn): Add "char_range" and "loc_reader" params; if
+       they are non-NULL, read position information from *loc_reader
+       and update char_range->m_finish accordingly.
+       (convert_ucn): Add "char_range", "loc_reader", and "ranges"
+       params.  If loc_reader is non-NULL, read location information from
+       it, and update *ranges accordingly, using char_range.
+       Conditionalize the conversion into tbuf on tbuf being non-NULL.
+       (convert_hex): Likewise, conditionalizing the call to
+       emit_numeric_escape on tbuf.
+       (convert_oct): Likewise.
+       (convert_escape): Add params "loc_reader" and "ranges".  If
+       loc_reader is non-NULL, read location information from it, and
+       update *ranges accordingly.  Conditionalize the conversion into
+       tbuf on tbuf being non-NULL.
+       (cpp_interpret_string): Rename to...
+       (cpp_interpret_string_1): ...this, adding params "loc_readers" and
+       "out".  Use "to" to conditionalize the initialization and usage of
+       "tbuf", such as running the converter.  If "loc_readers" is
+       non-NULL, use the instances within it, reading location
+       information from them, and passing them to convert_escape; likewise
+       write to "out" if loc_readers is non-NULL.  Check for leading
+       quote and issue an error if it is not present.  Update boundary
+       check from "== limit" to ">= limit" to protect against erroneous
+       location values to calls that are not parsing string literals.
+       (cpp_interpret_string): Reimplement in terms to
+       cpp_interpret_string_1.
+       (noop_error_cb): New function.
+       (cpp_interpret_string_ranges): New function.
+       (cpp_string_location_reader::cpp_string_location_reader): New
+       constructor.
+       (cpp_string_location_reader::get_next): New method.
+       * include/cpplib.h (class cpp_string_location_reader): New class.
+       (class cpp_substring_ranges): New class.
+       (cpp_interpret_string_ranges): New prototype.
+       * internal.h (_cpp_valid_ucn): Add params "char_range" and
+       "loc_reader".
+       * lex.c (forms_identifier_p): Pass NULL for new params to
+       _cpp_valid_ucn.
+
  2016-08-01  Andreas Schwab  <schwab@suse.de>
  
         * include/cpplib.h: Fix comment typo.
diff --git a/libcpp/charset.c b/libcpp/charset.c

index 2d07942de734c931e513490fa2921ea0b8408746..3739d6ca034b2ff51e0d3e4fe1da446578a517b0 100644 (file)
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -812,6 +812,51 @@ cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
  
  \f
  
+/* cpp_substring_ranges's constructor. */
+
+cpp_substring_ranges::cpp_substring_ranges () :
+  m_ranges (NULL),
+  m_num_ranges (0),
+  m_alloc_ranges (8)
+{
+  m_ranges = XNEWVEC (source_range, m_alloc_ranges);
+}
+
+/* cpp_substring_ranges's destructor. */
+
+cpp_substring_ranges::~cpp_substring_ranges ()
+{
+  free (m_ranges);
+}
+
+/* Add RANGE to the vector of source_range information.  */
+
+void
+cpp_substring_ranges::add_range (source_range range)
+{
+  if (m_num_ranges >= m_alloc_ranges)
+    {
+      m_alloc_ranges *= 2;
+      m_ranges
+       = (source_range *)xrealloc (m_ranges,
+                                   sizeof (source_range) * m_alloc_ranges);
+    }
+  m_ranges[m_num_ranges++] = range;
+}
+
+/* Read NUM ranges from LOC_READER, adding them to the vector of source_range
+   information.  */
+
+void
+cpp_substring_ranges::add_n_ranges (int num,
+                                   cpp_string_location_reader &loc_reader)
+{
+  for (int i = 0; i < num; i++)
+    add_range (loc_reader.get_next ());
+}
+
+\f
+
  /* Utility routine that computes a mask of the form 0000...111... with
     WIDTH 1-bits.  */
  static inline size_t
@@ -980,18 +1025,27 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
     one beyond the UCN, or to the syntactically invalid character.
  
     IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
-   an identifier, or 2 otherwise.  */
+   an identifier, or 2 otherwise.
+
+   If CHAR_RANGE and LOC_READER are non-NULL, then position information is
+   read from *LOC_READER and CHAR_RANGE->m_finish is updated accordingly.  */
  
  bool
  _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
                 const uchar *limit, int identifier_pos,
-               struct normalize_state *nst, cppchar_t *cp)
+               struct normalize_state *nst, cppchar_t *cp,
+               source_range *char_range,
+               cpp_string_location_reader *loc_reader)
  {
    cppchar_t result, c;
    unsigned int length;
    const uchar *str = *pstr;
    const uchar *base = str - 2;
  
+  /* char_range and loc_reader must either be both NULL, or both be
+     non-NULL.  */
+  gcc_assert ((char_range != NULL) == (loc_reader != NULL));
+
    if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
      cpp_error (pfile, CPP_DL_WARNING,
                "universal character names are only valid in C++ and C99");
@@ -1021,6 +1075,8 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
        if (!ISXDIGIT (c))
         break;
        str++;
+      if (loc_reader)
+       char_range->m_finish = loc_reader->get_next ().m_finish;
        result = (result << 4) + hex_value (c);
      }
    while (--length && str < limit);
@@ -1086,11 +1142,18 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
  }
  
  /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
-   it to the execution character set and write the result into TBUF.
-   An advanced pointer is returned.  Issues all relevant diagnostics.  */
+   it to the execution character set and write the result into TBUF,
+   if TBUF is non-NULL.
+   An advanced pointer is returned.  Issues all relevant diagnostics.
+   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
+   contains the location of the character so far: location information
+   is read from *LOC_READER, and *RANGES is updated accordingly.  */
  static const uchar *
  convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
-            struct _cpp_strbuf *tbuf, struct cset_converter cvt)
+            struct _cpp_strbuf *tbuf, struct cset_converter cvt,
+            source_range char_range,
+            cpp_string_location_reader *loc_reader,
+            cpp_substring_ranges *ranges)
  {
    cppchar_t ucn;
    uchar buf[6];
@@ -1099,8 +1162,17 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
    int rval;
    struct normalize_state nst = INITIAL_NORMALIZE_STATE;
  
+  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
+  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
+
    from++;  /* Skip u/U.  */
-  _cpp_valid_ucn (pfile, &from, limit, 0, &nst, &ucn);
+
+  if (loc_reader)
+    /* The u/U is part of the spelling of this character.  */
+    char_range.m_finish = loc_reader->get_next ().m_finish;
+
+  _cpp_valid_ucn (pfile, &from, limit, 0, &nst,
+                 &ucn, &char_range, loc_reader);
  
    rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
    if (rval)
@@ -1109,9 +1181,20 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
        cpp_errno (pfile, CPP_DL_ERROR,
                  "converting UCN to source character set");
      }
-  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
-    cpp_errno (pfile, CPP_DL_ERROR,
-              "converting UCN to execution character set");
+  else
+    {
+      if (tbuf)
+       if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
+         cpp_errno (pfile, CPP_DL_ERROR,
+                    "converting UCN to execution character set");
+
+      if (loc_reader)
+       {
+         int num_encoded_bytes = 6 - bytesleft;
+         for (int i = 0; i < num_encoded_bytes; i++)
+           ranges->add_range (char_range);
+       }
+    }
  
    return from;
  }
@@ -1167,31 +1250,48 @@ emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
  }
  
  /* Convert a hexadecimal escape, pointed to by FROM, to the execution
-   character set and write it into the string buffer TBUF.  Returns an
-   advanced pointer, and issues diagnostics as necessary.
+   character set and write it into the string buffer TBUF (if non-NULL).
+   Returns an advanced pointer, and issues diagnostics as necessary.
     No character set translation occurs; this routine always produces the
     execution-set character with numeric value equal to the given hex
-   number.  You can, e.g. generate surrogate pairs this way.  */
+   number.  You can, e.g. generate surrogate pairs this way.
+   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
+   contains the location of the character so far: location information
+   is read from *LOC_READER, and *RANGES is updated accordingly.  */
  static const uchar *
  convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
-            struct _cpp_strbuf *tbuf, struct cset_converter cvt)
+            struct _cpp_strbuf *tbuf, struct cset_converter cvt,
+            source_range char_range,
+            cpp_string_location_reader *loc_reader,
+            cpp_substring_ranges *ranges)
  {
    cppchar_t c, n = 0, overflow = 0;
    int digits_found = 0;
    size_t width = cvt.width;
    size_t mask = width_to_mask (width);
  
+  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
+  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
+
    if (CPP_WTRADITIONAL (pfile))
      cpp_warning (pfile, CPP_W_TRADITIONAL,
                  "the meaning of '\\x' is different in traditional C");
  
-  from++;  /* Skip 'x'.  */
+  /* Skip 'x'.  */
+  from++;
+
+  /* The 'x' is part of the spelling of this character.  */
+  if (loc_reader)
+    char_range.m_finish = loc_reader->get_next ().m_finish;
+
    while (from < limit)
      {
        c = *from;
        if (! hex_p (c))
         break;
        from++;
+      if (loc_reader)
+       char_range.m_finish = loc_reader->get_next ().m_finish;
        overflow |= n ^ (n << 4 >> 4);
        n = (n << 4) + hex_value (c);
        digits_found = 1;
@@ -1211,7 +1311,10 @@ convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
        n &= mask;
      }
  
-  emit_numeric_escape (pfile, n, tbuf, cvt);
+  if (tbuf)
+    emit_numeric_escape (pfile, n, tbuf, cvt);
+  if (ranges)
+    ranges->add_range (char_range);
  
    return from;
  }
@@ -1221,10 +1324,16 @@ convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
     advanced pointer, and issues diagnostics as necessary.
     No character set translation occurs; this routine always produces the
     execution-set character with numeric value equal to the given octal
-   number.  */
+   number.
+   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
+   contains the location of the character so far: location information
+   is read from *LOC_READER, and *RANGES is updated accordingly.  */
  static const uchar *
  convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
-            struct _cpp_strbuf *tbuf, struct cset_converter cvt)
+            struct _cpp_strbuf *tbuf, struct cset_converter cvt,
+            source_range char_range,
+            cpp_string_location_reader *loc_reader,
+            cpp_substring_ranges *ranges)
  {
    size_t count = 0;
    cppchar_t c, n = 0;
@@ -1232,12 +1341,17 @@ convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
    size_t mask = width_to_mask (width);
    bool overflow = false;
  
+  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
+  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
+
    while (from < limit && count++ < 3)
      {
        c = *from;
        if (c < '0' || c > '7')
         break;
        from++;
+      if (loc_reader)
+       char_range.m_finish = loc_reader->get_next ().m_finish;
        overflow |= n ^ (n << 3 >> 3);
        n = (n << 3) + c - '0';
      }
@@ -1249,18 +1363,26 @@ convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
        n &= mask;
      }
  
-  emit_numeric_escape (pfile, n, tbuf, cvt);
+  if (tbuf)
+    emit_numeric_escape (pfile, n, tbuf, cvt);
+  if (ranges)
+    ranges->add_range (char_range);
  
    return from;
  }
  
  /* Convert an escape sequence (pointed to by FROM) to its value on
     the target, and to the execution character set.  Do not scan past
-   LIMIT.  Write the converted value into TBUF.  Returns an advanced
-   pointer.  Handles all relevant diagnostics.  */
+   LIMIT.  Write the converted value into TBUF, if TBUF is non-NULL.
+   Returns an advanced pointer.  Handles all relevant diagnostics.
+   If LOC_READER is non-NULL, then RANGES must be non-NULL: location
+   information is read from *LOC_READER, and *RANGES is updated
+   accordingly.  */
  static const uchar *
  convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
-               struct _cpp_strbuf *tbuf, struct cset_converter cvt)
+               struct _cpp_strbuf *tbuf, struct cset_converter cvt,
+               cpp_string_location_reader *loc_reader,
+               cpp_substring_ranges *ranges)
  {
    /* Values of \a \b \e \f \n \r \t \v respectively.  */
  #if HOST_CHARSET == HOST_CHARSET_ASCII
@@ -1273,20 +1395,28 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
  
    uchar c;
  
+  /* Record the location of the backslash.  */
+  source_range char_range;
+  if (loc_reader)
+    char_range = loc_reader->get_next ();
+
    c = *from;
    switch (c)
      {
        /* UCNs, hex escapes, and octal escapes are processed separately.  */
      case 'u': case 'U':
-      return convert_ucn (pfile, from, limit, tbuf, cvt);
+      return convert_ucn (pfile, from, limit, tbuf, cvt,
+                         char_range, loc_reader, ranges);
  
      case 'x':
-      return convert_hex (pfile, from, limit, tbuf, cvt);
+      return convert_hex (pfile, from, limit, tbuf, cvt,
+                         char_range, loc_reader, ranges);
        break;
  
      case '0':  case '1':  case '2':  case '3':
      case '4':  case '5':  case '6':  case '7':
-      return convert_oct (pfile, from, limit, tbuf, cvt);
+      return convert_oct (pfile, from, limit, tbuf, cvt,
+                         char_range, loc_reader, ranges);
  
        /* Various letter escapes.  Get the appropriate host-charset
          value into C.  */
@@ -1338,10 +1468,17 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
         }
      }
  
-  /* Now convert what we have to the execution character set.  */
-  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
-    cpp_errno (pfile, CPP_DL_ERROR,
-              "converting escape sequence to execution character set");
+  if (tbuf)
+    /* Now convert what we have to the execution character set.  */
+    if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
+      cpp_errno (pfile, CPP_DL_ERROR,
+                "converting escape sequence to execution character set");
+
+  if (loc_reader)
+    {
+      char_range.m_finish = loc_reader->get_next ().m_finish;
+      ranges->add_range (char_range);
+    }
  
    return from + 1;
  }
@@ -1374,28 +1511,52 @@ converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
     are to be converted from the source to the execution character set,
     escape sequences translated, and finally all are to be
     concatenated.  WIDE indicates whether or not to produce a wide
-   string.  The result is written into TO.  Returns true for success,
-   false for failure.  */
-bool
-cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
-                     cpp_string *to,  enum cpp_ttype type)
+   string.  If TO is non-NULL, the result is written into TO.
+   If LOC_READERS and OUT are non-NULL, then location information
+   is read from LOC_READERS (which must be an array of length COUNT),
+   and location information is written to *RANGES.
+
+   Returns true for success, false for failure.  */
+
+static bool
+cpp_interpret_string_1 (cpp_reader *pfile, const cpp_string *from, size_t count,
+                       cpp_string *to,  enum cpp_ttype type,
+                       cpp_string_location_reader *loc_readers,
+                       cpp_substring_ranges *out)
  {
    struct _cpp_strbuf tbuf;
    const uchar *p, *base, *limit;
    size_t i;
    struct cset_converter cvt = converter_for_type (pfile, type);
  
-  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
-  tbuf.text = XNEWVEC (uchar, tbuf.asize);
-  tbuf.len = 0;
+  /* loc_readers and out must either be both NULL, or both be non-NULL.  */
+  gcc_assert ((loc_readers != NULL) == (out != NULL));
+
+  if (to)
+    {
+      tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
+      tbuf.text = XNEWVEC (uchar, tbuf.asize);
+      tbuf.len = 0;
+    }
  
    for (i = 0; i < count; i++)
      {
+      cpp_string_location_reader *loc_reader = NULL;
+      if (loc_readers)
+       loc_reader = &loc_readers[i];
+
        p = from[i].text;
        if (*p == 'u')
         {
-         if (*++p == '8')
-           p++;
+         p++;
+         if (loc_reader)
+           loc_reader->get_next ();
+         if (*p == '8')
+           {
+             p++;
+             if (loc_reader)
+               loc_reader->get_next ();
+           }
         }
        else if (*p == 'L' || *p == 'U') p++;
        if (*p == 'R')
@@ -1414,13 +1575,43 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
  
           /* Raw strings are all normal characters; these can be fed
              directly to convert_cset.  */
-         if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
-           goto fail;
+         if (to)
+           if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
+             goto fail;
+
+         if (loc_reader)
+           {
+             /* If generating source ranges, assume we have a 1:1
+                correspondence between bytes in the source encoding and bytes
+                in the execution encoding (e.g. if we have a UTF-8 to UTF-8
+                conversion), so that this run of bytes in the source file
+                corresponds to a run of bytes in the execution string.
+                This requirement is guaranteed by an early-reject in
+                cpp_interpret_string_ranges.  */
+             gcc_assert (cvt.func == convert_no_conversion);
+             out->add_n_ranges (limit - p, *loc_reader);
+           }
  
           continue;
         }
  
-      p++; /* Skip leading quote.  */
+      /* If we don't now have a leading quote, something has gone wrong.
+        This can occur if cpp_interpret_string_ranges is handling a
+        stringified macro argument, but should not be possible otherwise.  */
+      if (*p != '"' && *p != '\'')
+       {
+         gcc_assert (out != NULL);
+         cpp_error (pfile, CPP_DL_ERROR, "missing open quote");
+         if (to)
+           free (tbuf.text);
+         return false;
+       }
+
+      /* Skip leading quote.  */
+      p++;
+      if (loc_reader)
+       loc_reader->get_next ();
+
        limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
  
        for (;;)
@@ -1432,29 +1623,130 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
             {
               /* We have a run of normal characters; these can be fed
                  directly to convert_cset.  */
-             if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
-               goto fail;
+             if (to)
+               if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
+                 goto fail;
+           /* Similar to above: assumes we have a 1:1 correspondence
+              between bytes in the source encoding and bytes in the
+              execution encoding.  */
+             if (loc_reader)
+               {
+                 gcc_assert (cvt.func == convert_no_conversion);
+                 out->add_n_ranges (p - base, *loc_reader);
+               }
             }
-         if (p == limit)
+         if (p >= limit)
             break;
  
-         p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
+         struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
+         p = convert_escape (pfile, p + 1, limit, tbuf_ptr, cvt,
+                             loc_reader, out);
         }
      }
-  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
-     structure.  */
-  emit_numeric_escape (pfile, 0, &tbuf, cvt);
-  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
-  to->text = tbuf.text;
-  to->len = tbuf.len;
+
+  if (to)
+    {
+      /* NUL-terminate the 'to' buffer and translate it to a cpp_string
+        structure.  */
+      emit_numeric_escape (pfile, 0, &tbuf, cvt);
+      tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
+      to->text = tbuf.text;
+      to->len = tbuf.len;
+    }
+
    return true;
  
   fail:
    cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
-  free (tbuf.text);
+  if (to)
+    free (tbuf.text);
    return false;
  }
  
+/* FROM is an array of cpp_string structures of length COUNT.  These
+   are to be converted from the source to the execution character set,
+   escape sequences translated, and finally all are to be
+   concatenated.  WIDE indicates whether or not to produce a wide
+   string.  The result is written into TO.  Returns true for success,
+   false for failure.  */
+bool
+cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
+                     cpp_string *to,  enum cpp_ttype type)
+{
+  return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
+}
+
+/* A "do nothing" error-handling callback for use by
+   cpp_interpret_string_ranges, so that it can temporarily suppress
+   error-handling.  */
+
+static bool
+noop_error_cb (cpp_reader *, int, int, rich_location *,
+              const char *, va_list *)
+{
+  /* no-op.  */
+  return true;
+}
+
+/* This function mimics the behavior of cpp_interpret_string, but
+   rather than generating a string in the execution character set,
+   *OUT is written to with the source code ranges of the characters
+   in such a string.
+   FROM and LOC_READERS should both be arrays of length COUNT.
+   Returns NULL for success, or an error message for failure.  */
+
+const char *
+cpp_interpret_string_ranges (cpp_reader *pfile, const cpp_string *from,
+                            cpp_string_location_reader *loc_readers,
+                            size_t count,
+                            cpp_substring_ranges *out,
+                            enum cpp_ttype type)
+{
+  /* There are a couple of cases in the range-handling in
+     cpp_interpret_string_1 that rely on there being a 1:1 correspondence
+     between bytes in the source encoding and bytes in the execution
+     encoding, so that each byte in the execution string can correspond
+     to the location of a byte in the source string.
+
+     This holds for the typical case of a UTF-8 to UTF-8 conversion.
+     Enforce this requirement by only attempting to track substring
+     locations if we have source encoding == execution encoding.
+
+     This is a stronger condition than we need, since we could e.g.
+     have ASCII to EBCDIC (with 1 byte per character before and after),
+     but it seems to be a reasonable restriction.  */
+  struct cset_converter cvt = converter_for_type (pfile, type);
+  if (cvt.func != convert_no_conversion)
+    return "execution character set != source character set";
+
+  /* For on-demand strings we have already lexed the strings, so there
+     should be no errors.  However, if we have bogus source location
+     data (or stringified macro arguments), the attempt to lex the
+     strings could fail with an error.  Temporarily install an
+     error-handler to catch the error, so that it can lead to this call
+     failing, rather than being emitted as a user-visible diagnostic.
+     If an error does occur, we should see it via the return value of
+     cpp_interpret_string_1.  */
+  bool (*saved_error_handler) (cpp_reader *, int, int, rich_location *,
+                              const char *, va_list *)
+    ATTRIBUTE_FPTR_PRINTF(5,0);
+
+  saved_error_handler = pfile->cb.error;
+  pfile->cb.error = noop_error_cb;
+
+  bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
+                                       loc_readers, out);
+
+  /* Restore the saved error-handler.  */
+  pfile->cb.error = saved_error_handler;
+
+  if (!result)
+    return "cpp_interpret_string_1 failed";
+
+  /* Success.  */
+  return NULL;
+}
+
  /* Subroutine of do_line and do_linemarker.  Convert escape sequences
     in a string, but do not perform character set conversion.  */
  bool
@@ -1818,3 +2110,39 @@ _cpp_default_encoding (void)
  
    return current_encoding;
  }
+
+/* Implementation of class cpp_string_location_reader.  */
+
+/* Constructor for cpp_string_location_reader.  */
+
+cpp_string_location_reader::
+cpp_string_location_reader (source_location src_loc,
+                           line_maps *line_table)
+: m_line_table (line_table)
+{
+  src_loc = get_range_from_loc (line_table, src_loc).m_start;
+
+  /* SRC_LOC might be a macro location.  It only makes sense to do
+     column-by-column calculations on ordinary maps, so get the
+     corresponding location in an ordinary map.  */
+  m_loc
+    = linemap_resolve_location (line_table, src_loc,
+                               LRK_SPELLING_LOCATION, NULL);
+
+  const line_map_ordinary *map
+    = linemap_check_ordinary (linemap_lookup (line_table, m_loc));
+  m_offset_per_column = (1 << map->m_range_bits);
+}
+
+/* Get the range of the next source byte.  */
+
+source_range
+cpp_string_location_reader::get_next ()
+{
+  source_range result;
+  result.m_start = m_loc;
+  result.m_finish = m_loc;
+  if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
+    m_loc += m_offset_per_column;
+  return result;
+}
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h

index 4e0084c1c7dac1dae0ad45adbd759a0a7435b789..659686bf2616ee03a1db13842569f1c223c70d88 100644 (file)
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -743,6 +743,51 @@ struct GTY(()) cpp_hashnode {
    union _cpp_hashnode_value GTY ((desc ("CPP_HASHNODE_VALUE_IDX (%1)"))) value;
  };
  
+/* A class for iterating through the source locations within a
+   string token (before escapes are interpreted, and before
+   concatenation).  */
+
+class cpp_string_location_reader {
+ public:
+  cpp_string_location_reader (source_location src_loc,
+                             line_maps *line_table);
+
+  source_range get_next ();
+
+ private:
+  source_location m_loc;
+  int m_offset_per_column;
+  line_maps *m_line_table;
+};
+
+/* A class for storing the source ranges of all of the characters within
+   a string literal, after escapes are interpreted, and after
+   concatenation.
+
+   This is not GTY-marked, as instances are intended to be temporary.  */
+
+class cpp_substring_ranges
+{
+ public:
+  cpp_substring_ranges ();
+  ~cpp_substring_ranges ();
+
+  int get_num_ranges () const { return m_num_ranges; }
+  source_range get_range (int idx) const
+  {
+    linemap_assert (idx < m_num_ranges);
+    return m_ranges[idx];
+  }
+
+  void add_range (source_range range);
+  void add_n_ranges (int num, cpp_string_location_reader &loc_reader);
+
+ private:
+  source_range *m_ranges;
+  int m_num_ranges;
+  int m_alloc_ranges;
+};
+
  /* Call this first to get a handle to pass to other functions.
  
     If you want cpplib to manage its own hashtable, pass in a NULL
@@ -829,6 +874,12 @@ extern cppchar_t cpp_interpret_charconst (cpp_reader *, const cpp_token *,
  extern bool cpp_interpret_string (cpp_reader *,
                                   const cpp_string *, size_t,
                                   cpp_string *, enum cpp_ttype);
+extern const char *cpp_interpret_string_ranges (cpp_reader *pfile,
+                                               const cpp_string *from,
+                                               cpp_string_location_reader *,
+                                               size_t count,
+                                               cpp_substring_ranges *out,
+                                               enum cpp_ttype type);
  extern bool cpp_interpret_string_notranslate (cpp_reader *,
                                               const cpp_string *, size_t,
                                               cpp_string *, enum cpp_ttype);
diff --git a/libcpp/internal.h b/libcpp/internal.h

index ca2b4988cf6f71e64f2c6dca506a82bba20dc4b0..4a5cd3c027cdcada2d6d6823067a3f237af2e6b4 100644 (file)
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -754,7 +754,9 @@ struct normalize_state
  extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **,
                             const unsigned char *, int,
                             struct normalize_state *state,
-                           cppchar_t *);
+                           cppchar_t *,
+                           source_range *char_range,
+                           cpp_string_location_reader *loc_reader);
  extern void _cpp_destroy_iconv (cpp_reader *);
  extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
                                           unsigned char *, size_t, size_t,
diff --git a/libcpp/lex.c b/libcpp/lex.c

index 236418dd78136e63b7ff60753a85bce2fcbdcc86..4e71965ebe793be4f6920fad0b9926d23f443a76 100644 (file)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1247,7 +1247,7 @@ forms_identifier_p (cpp_reader *pfile, int first,
        cppchar_t s;
        buffer->cur += 2;
        if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
-                         state, &s))
+                         state, &s, NULL, NULL))
         return true;
        buffer->cur -= 2;
      }
author	David Malcolm <dmalcolm@redhat.com>
	Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)
committer	David Malcolm <dmalcolm@gcc.gnu.org>
	Fri, 5 Aug 2016 18:08:33 +0000 (18:08 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/c-family/ChangeLog		patch \| blob \| history
gcc/c-family/c-common.c		patch \| blob \| history
gcc/c-family/c-common.h		patch \| blob \| history
gcc/c-family/c-lex.c		patch \| blob \| history
gcc/c-family/c-opts.c		patch \| blob \| history
gcc/input.c		patch \| blob \| history
gcc/input.h		patch \| blob \| history
gcc/substring-locations.h	[new file with mode: 0644]	patch \| blob
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/plugin/plugin.exp		patch \| blob \| history
libcpp/ChangeLog		patch \| blob \| history
libcpp/charset.c		patch \| blob \| history
libcpp/include/cpplib.h		patch \| blob \| history
libcpp/internal.h		patch \| blob \| history
libcpp/lex.c		patch \| blob \| history