From: Nick Clifton Date: Thu, 18 Nov 2021 16:48:19 +0000 (+0000) Subject: Add multibyte character warning option to the assembler. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=578c64a45a0e47fd0af53c77339ec0c26ef4874a;p=binutils-gdb.git Add multibyte character warning option to the assembler. * as.c (parse_args): Add support for --multibyte-handling. * as.h (multibyte_handling): Declare. * app.c (scan_for_multibyte_characters): New function. (do_scrub_chars): Call the new function if multibyte warning is enabled. * input-scrub.c (input_scrub_next_buffer): Call the multibyte scanning function if multibyte warnings are enabled. * symbols.c (struct symbol_flags): Add multibyte_warned bit. (symbol_init): Call the multibyte scanning function if multibyte symbol warnings are enabled. (S_SET_SEGMENT): Likewise. * NEWS: Mention the new feature. * doc/as.texi: Document the new feature. * testsuite/gas/all/multibyte.s: New test source file. * testsuite/gas/all/multibyte1.d: New test driver file. * testsuite/gas/all/multibyte1.l: New test expected output. * testsuite/gas/all/multibyte2.d: New test driver file. * testsuite/gas/all/multibyte2.l: New test expected output. * testsuite/gas/all/gas.exp: Run the new tests. --- diff --git a/gas/ChangeLog b/gas/ChangeLog index e9761e9a901..8732088c91e 100644 --- a/gas/ChangeLog +++ b/gas/ChangeLog @@ -1,3 +1,25 @@ +2021-11-18 Nick Clifton + + * as.c (parse_args): Add support for --multibyte-handling. + * as.h (multibyte_handling): Declare. + * app.c (scan_for_multibyte_characters): New function. + (do_scrub_chars): Call the new function if multibyte warning is + enabled. + * input-scrub.c (input_scrub_next_buffer): Call the multibyte + scanning function if multibyte warnings are enabled. + * symbols.c (struct symbol_flags): Add multibyte_warned bit. + (symbol_init): Call the multibyte scanning function if multibyte + symbol warnings are enabled. + (S_SET_SEGMENT): Likewise. + * NEWS: Mention the new feature. 
+ * doc/as.texi: Document the new feature. + * testsuite/gas/all/multibyte.s: New test source file. + * testsuite/gas/all/multibyte1.d: New test driver file. + * testsuite/gas/all/multibyte1.l: New test expected output. + * testsuite/gas/all/multibyte2.d: New test driver file. + * testsuite/gas/all/multibyte2.l: New test expected output. + * testsuite/gas/all/gas.exp: Run the new tests. + 2021-11-15 Eric Botcazou * doc/as.texi (File): Update description of .file 0 directive. diff --git a/gas/NEWS b/gas/NEWS index aac75220cfe..4288e6213dd 100644 --- a/gas/NEWS +++ b/gas/NEWS @@ -13,6 +13,14 @@ * Add support for Scalable Matrix Extension (SME) for AArch64. +* The --multibyte-handling=[allow|warn|warn-sym-only] option tells the + assembler what to do when it encounters multibyte characters in the input. The + default is to allow them. Setting the option to "warn" will generate a + warning message whenever any multibyte character is encountered. Setting the + option to "warn-sym-only" will make the assembler generate a warning whenever a + symbol is defined containing multibyte characters. (References to undefined + symbols will not generate warnings). + * Outputs of .ds.x directive and .tfloat directive with hex input from x86 assembler have been reduced from 12 bytes to 10 bytes to match the output of .tfloat directive. 
diff --git a/gas/app.c b/gas/app.c index 712bffef851..0c15b969007 100644 --- a/gas/app.c +++ b/gas/app.c @@ -345,6 +345,62 @@ process_escape (int ch) } } +#define MULTIBYTE_WARN_COUNT_LIMIT 10 +static unsigned int multibyte_warn_count = 0; + +/* Scan the bytes from START up to, but not including, END for any with a + value above 0x7f, ie bytes that belong to a multibyte character. If WARN + is false return true as soon as one is found. If WARN is true issue a + warning for each one found (at most MULTIBYTE_WARN_COUNT_LIMIT warnings in + total, counted across all calls) and return true if this call found any. + Returns false for an empty range, and for any warning scan made once the + limit has been reached. */ +bool +scan_for_multibyte_characters (const unsigned char * start, + const unsigned char * end, + bool warn) +{ + if (end <= start) + return false; + + /* Once the limit has been reached no further warnings are issued. */ + if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT) + return false; + + bool found = false; + + while (start < end) + { + unsigned char c; + + if ((c = * start++) <= 0x7f) + continue; + + if (!warn) + return true; + + found = true; + + const char * filename; + unsigned int lineno; + + filename = as_where (& lineno); + if (filename == NULL) + as_warn (_("multibyte character (%#x) encountered in input"), c); + else if (lineno == 0) + as_warn (_("multibyte character (%#x) encountered in %s"), c, filename); + else + as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno); + + if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT) + { + as_warn (_("further multibyte character warnings suppressed")); + break; + } + } + + return found; +} + /* This function is called to process input characters. The GET parameter is used to retrieve more input characters. GET should set its parameter to point to a buffer, and return the length of @@ -463,6 +519,11 @@ do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen) return 0; from = input_buffer; fromend = from + fromlen; + + if (multibyte_handling == multibyte_warn) + (void) scan_for_multibyte_characters ((const unsigned char *) from, + (const unsigned char* ) fromend, + true /* Generate warnings. 
*/); } while (1) diff --git a/gas/as.c b/gas/as.c index 7de8af246e1..8af04aa85b8 100644 --- a/gas/as.c +++ b/gas/as.c @@ -474,7 +474,7 @@ parse_args (int * pargc, char *** pargv) OPTION_DEBUG_PREFIX_MAP, OPTION_DEFSYM, OPTION_LISTING_LHS_WIDTH, - OPTION_LISTING_LHS_WIDTH2, + OPTION_LISTING_LHS_WIDTH2, /* = STD_BASE + 10 */ OPTION_LISTING_RHS_WIDTH, OPTION_LISTING_CONT_LINES, OPTION_DEPFILE, @@ -484,7 +484,7 @@ parse_args (int * pargc, char *** pargv) OPTION_GDWARF_3, OPTION_GDWARF_4, OPTION_GDWARF_5, - OPTION_GDWARF_SECTIONS, + OPTION_GDWARF_SECTIONS, /* = STD_BASE + 20 */ OPTION_GDWARF_CIE_VERSION, OPTION_STRIP_LOCAL_ABSOLUTE, OPTION_TRADITIONAL_FORMAT, @@ -494,7 +494,7 @@ parse_args (int * pargc, char *** pargv) OPTION_NOEXECSTACK, OPTION_SIZE_CHECK, OPTION_ELF_STT_COMMON, - OPTION_ELF_BUILD_NOTES, + OPTION_ELF_BUILD_NOTES, /* = STD_BASE + 30 */ OPTION_SECTNAME_SUBST, OPTION_ALTERNATE, OPTION_AL, @@ -503,7 +503,8 @@ parse_args (int * pargc, char *** pargv) OPTION_WARN_FATAL, OPTION_COMPRESS_DEBUG, OPTION_NOCOMPRESS_DEBUG, - OPTION_NO_PAD_SECTIONS /* = STD_BASE + 40 */ + OPTION_NO_PAD_SECTIONS, + OPTION_MULTIBYTE_HANDLING /* = STD_BASE + 40 */ /* When you add options here, check that they do not collide with OPTION_MD_BASE. See as.h. 
*/ }; @@ -581,6 +582,7 @@ parse_args (int * pargc, char *** pargv) ,{"target-help", no_argument, NULL, OPTION_TARGET_HELP} ,{"traditional-format", no_argument, NULL, OPTION_TRADITIONAL_FORMAT} ,{"warn", no_argument, NULL, OPTION_WARN} + ,{"multibyte-handling", required_argument, NULL, OPTION_MULTIBYTE_HANDLING} }; /* Construct the option lists from the standard list and the target @@ -683,6 +685,19 @@ parse_args (int * pargc, char *** pargv) flag_traditional_format = 1; break; + case OPTION_MULTIBYTE_HANDLING: + if (strcmp (optarg, "allow") == 0) + multibyte_handling = multibyte_allow; + else if (strcmp (optarg, "warn") == 0) + multibyte_handling = multibyte_warn; + else if (strcmp (optarg, "warn-sym-only") == 0) + multibyte_handling = multibyte_warn_syms; + else if (strcmp (optarg, "warn_sym_only") == 0) + multibyte_handling = multibyte_warn_syms; + else + as_fatal (_("unexpected argument to --multibyte-handling: '%s'"), optarg); + break; + case OPTION_VERSION: /* This output is intended to follow the GNU standards document. */ printf (_("GNU assembler %s\n"), BFD_VERSION_STRING); diff --git a/gas/as.h b/gas/as.h index f3f12fbd2f8..89dae1b6833 100644 --- a/gas/as.h +++ b/gas/as.h @@ -344,6 +344,14 @@ COMMON int linkrelax; COMMON int do_not_pad_sections_to_alignment; +enum multibyte_input_handling +{ + multibyte_allow = 0, /* Accept multibyte characters without complaint. */ + multibyte_warn, /* Warn about any multibyte character in the input. */ + multibyte_warn_syms /* Warn only when a symbol is defined with a name containing one. */ +}; +COMMON enum multibyte_input_handling multibyte_handling; + /* TRUE if we should produce a listing. 
*/ extern int listing; @@ -450,6 +458,7 @@ void input_scrub_insert_file (char *); char * input_scrub_new_file (const char *); char * input_scrub_next_buffer (char **bufp); size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t); +bool scan_for_multibyte_characters (const unsigned char *, const unsigned char *, bool); int gen_to_words (LITTLENUM_TYPE *, int, long); int had_err (void); int ignore_input (void); diff --git a/gas/doc/as.texi b/gas/doc/as.texi index 9c1924d4bbd..b83f50b0bfc 100644 --- a/gas/doc/as.texi +++ b/gas/doc/as.texi @@ -245,6 +245,7 @@ gcc(1), ld(1), and the Info entries for @file{binutils} and @file{ld}. [@b{--sectname-subst}] [@b{--size-check=[error|warning]}] [@b{--elf-stt-common=[no|yes]}] [@b{--generate-missing-build-notes=[no|yes]}] + [@b{--multibyte-handling=[allow|warn|warn-sym-only]}] [@b{--target-help}] [@var{target-options}] [@b{--}|@var{files} @dots{}] @c @@ -871,6 +872,18 @@ Set the maximum width of an input source line, as displayed in a listing, to Set the maximum number of lines printed in a listing for a single line of input to @var{number} + 1. +@item --multibyte-handling=allow +@itemx --multibyte-handling=warn +@itemx --multibyte-handling=warn-sym-only +Controls how the assembler handles multibyte characters in the input. The +default (which can be restored by using the @option{allow} argument) is to +allow such characters without complaint. Using the @option{warn} argument will +make the assembler generate a warning message whenever any multibyte character +is encountered. Using the @option{warn-sym-only} argument will only cause a +warning to be generated when a symbol is defined with a name that contains +multibyte characters. (References to undefined symbols will not generate a +warning). + @item --no-pad-sections Stop the assembler for padding the ends of output sections to the alignment of that section. 
The default is to pad the sections, but this can waste space @@ -2966,9 +2979,11 @@ are noted in @ref{Machine Dependencies}. @end ifset No symbol may begin with a digit. Case is significant. There is no length limit; all characters are significant. Multibyte characters -are supported. Symbols are delimited by characters not in that set, or by the -beginning of a file (since the source program must end with a newline, the end -of a file is not a possible symbol delimiter). @xref{Symbols}. +are supported, but note that the setting of the +@option{--multibyte-handling} option might prevent their use. Symbols +are delimited by characters not in that set, or by the beginning of a file +(since the source program must end with a newline, the end of a file is not a +possible symbol delimiter). @xref{Symbols}. Symbol names may also be enclosed in double quote @code{"} characters. In such cases any characters are allowed, except for the NUL character. If a double @@ -3858,11 +3873,18 @@ than @code{Foo}. Symbol names do not start with a digit. An exception to this rule is made for Local Labels. See below. -Multibyte characters are supported. To generate a symbol name containing +Multibyte characters are supported, but note that the setting of the +@option{--multibyte-handling} option might prevent their use. +To generate a symbol name containing multibyte characters enclose it within double quotes and use escape codes. cf @xref{Strings}. Generating a multibyte symbol name from a label is not currently supported. +Since multibyte symbol names are unusual, and could possibly be used +maliciously, @command{@value{AS}} provides a command line option +(@option{--multibyte-handling=warn-sym-only}) which can be used to generate a +warning message whenever a symbol name containing multibyte characters is defined. + Each symbol has exactly one name. Each name in an assembly language program refers to exactly one symbol. You may use that symbol name any number of times in a program. 
diff --git a/gas/input-scrub.c b/gas/input-scrub.c index b93afb26b43..c665402220e 100644 --- a/gas/input-scrub.c +++ b/gas/input-scrub.c @@ -377,6 +377,11 @@ input_scrub_next_buffer (char **bufp) ++p; } + if (multibyte_handling == multibyte_warn) + (void) scan_for_multibyte_characters ((const unsigned char *) p, + (const unsigned char *) limit, + true /* Generate warnings */); + /* We found a newline in the newly read chars. */ partial_where = p; partial_size = limit - p; diff --git a/gas/symbols.c b/gas/symbols.c index 3cb9425c4ce..889ec662149 100644 --- a/gas/symbols.c +++ b/gas/symbols.c @@ -82,6 +82,10 @@ struct symbol_flags /* Whether the symbol has been marked to be removed by a .symver directive. */ unsigned int removed : 1; + + /* Set when a warning about the symbol containing multibyte characters + is generated. */ + unsigned int multibyte_warned : 1; }; /* A pointer in the symbol may point to either a complete symbol @@ -198,7 +202,7 @@ static void * symbol_entry_find (htab_t table, const char *name) { hashval_t hash = htab_hash_string (name); - symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, hash, name, 0, 0, 0 } }; return htab_find_with_hash (table, &needle, hash); } @@ -309,6 +313,18 @@ symbol_init (symbolS *symbolP, const char *name, asection *sec, symbolP->bsym->name = name; symbolP->bsym->section = sec; + if (multibyte_handling == multibyte_warn_syms + && ! symbolP->flags.local_symbol + && sec != undefined_section + && ! symbolP->flags.multibyte_warned + && scan_for_multibyte_characters ((const unsigned char *) name, + (const unsigned char *) name + strlen (name), + false /* Do not warn. 
*/)) + { + as_warn (_("symbol '%s' contains multibyte characters"), name); + symbolP->flags.multibyte_warned = 1; + } + S_SET_VALUE (symbolP, valu); symbol_clear_list_pointers (symbolP); @@ -2427,7 +2443,21 @@ S_SET_SEGMENT (symbolS *s, segT seg) abort (); } else - s->bsym->section = seg; + { + if (multibyte_handling == multibyte_warn_syms + && ! s->flags.local_symbol + && seg != undefined_section + && ! s->flags.multibyte_warned + && scan_for_multibyte_characters ((const unsigned char *) s->name, + (const unsigned char *) s->name + strlen (s->name), + false)) + { + as_warn (_("symbol '%s' contains multibyte characters"), s->name); + s->flags.multibyte_warned = 1; + } + + s->bsym->section = seg; + } } void diff --git a/gas/testsuite/gas/all/gas.exp b/gas/testsuite/gas/all/gas.exp index 2c812b1fd79..5eee4f8abfa 100644 --- a/gas/testsuite/gas/all/gas.exp +++ b/gas/testsuite/gas/all/gas.exp @@ -502,3 +502,5 @@ run_dump_test "nop" run_dump_test "asciz" run_dump_test "pr27384" run_dump_test "pr27381" +run_dump_test "multibyte1" +run_dump_test "multibyte2" diff --git a/gas/testsuite/gas/all/multibyte.s b/gas/testsuite/gas/all/multibyte.s new file mode 100644 index 00000000000..f93ea4666be --- /dev/null +++ b/gas/testsuite/gas/all/multibyte.s @@ -0,0 +1,8 @@ + .text + .globl he‮oll‬ +he‮oll‬: + .nop + + .globl hello +hello: + .nop diff --git a/gas/testsuite/gas/all/multibyte1.d b/gas/testsuite/gas/all/multibyte1.d new file mode 100644 index 00000000000..dcbd54d0ff8 --- /dev/null +++ b/gas/testsuite/gas/all/multibyte1.d @@ -0,0 +1,3 @@ +#source: multibyte.s +#as: --multibyte-handling=warn +#warning_output: multibyte1.l diff --git a/gas/testsuite/gas/all/multibyte1.l b/gas/testsuite/gas/all/multibyte1.l new file mode 100644 index 00000000000..a592c17055b --- /dev/null +++ b/gas/testsuite/gas/all/multibyte1.l @@ -0,0 +1,12 @@ +[^:]*: Assembler messages: +[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0x80\) 
encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xac\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s +[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s +[^:]*: Warning: further multibyte character warnings suppressed diff --git a/gas/testsuite/gas/all/multibyte2.d b/gas/testsuite/gas/all/multibyte2.d new file mode 100644 index 00000000000..3a268dea4e9 --- /dev/null +++ b/gas/testsuite/gas/all/multibyte2.d @@ -0,0 +1,3 @@ +#source: multibyte.s +#as: --multibyte-handling=warn-sym-only +#warning_output: multibyte2.l diff --git a/gas/testsuite/gas/all/multibyte2.l b/gas/testsuite/gas/all/multibyte2.l new file mode 100644 index 00000000000..18d7ca5fd28 --- /dev/null +++ b/gas/testsuite/gas/all/multibyte2.l @@ -0,0 +1,2 @@ +[^:]*: Assembler messages: +[^:]*:3: Warning: symbol '.*' contains multibyte characters