binutils/strings.c

   1 /* strings -- print the strings of printable characters in files
   2    Copyright (C) 1993-2021 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18 \f
  19 /* Usage: strings [options] file...
  20
  21    Options:
  22    --all
  23    -a
  24    -            Scan each file in its entirety.
  25
  26    --data
  27    -d           Scan only the initialized data section(s) of object files.
  28
  29    --print-file-name
  30    -f           Print the name of the file before each string.
  31
  32    --bytes=min-len
  33    -n min-len
  34    -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
  35                 that are followed by a NUL or a newline.  Default is 4.
  36
  37    --radix={o,x,d}
  38    -t {o,x,d}   Print the offset within the file before each string,
  39                 in octal/hex/decimal.
  40
  41   --include-all-whitespace
  42   -w            By default tab and space are the only whitespace included in graphic
  43                 char sequences.  This option considers all of isspace() valid.
  44
  45    -o           Like -to.  (Some other implementations have -o like -to,
  46                 others like -td.  We chose one arbitrarily.)
  47
  48    --encoding={s,S,b,l,B,L}
  49    -e {s,S,b,l,B,L}
  50                 Select character encoding: 7-bit-character, 8-bit-character,
  51                 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
  52                 littleendian 32-bit.
  53
  54    --target=BFDNAME
  55    -T {bfdname}
  56                 Specify a non-default object file format.
  57
  58   --unicode={default|locale|invalid|hex|escape|highlight}
  59   -U {d|l|i|x|e|h}
  60                 Determine how to handle UTF-8 unicode characters.  The default
  61                 is no special treatment.  All other versions of this option
  62                 only apply if the encoding is valid and enabling the option
  63                 implies --encoding=S.
  64                 The 'locale' option displays the characters according to the
  65                 current locale.  The 'invalid' option treats them as
  66                 non-string characters.  The 'hex' option displays them as hex
  67                 byte sequences.  The 'escape' option displays them as escape
  68                 sequences and the 'highlight' option displays them as
  69                 coloured escape sequences.
  70
  71   --output-separator=sep_string
  72   -s sep_string String used to separate parsed strings in output.
  73                 Default is newline.
  74
  75    --help
  76    -h           Print the usage message on the standard output.
  77
  78    --version
  79    -V
  80    -v           Print the program version number.
  81
  82    Written by Richard Stallman <rms@gnu.ai.mit.edu>
  83    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
  84
  85 #include "sysdep.h"
  86 #include "bfd.h"
  87 #include "getopt.h"
  88 #include "libiberty.h"
  89 #include "safe-ctype.h"
  90 #include "bucomm.h"
  91
  92 #ifndef streq
  93 #define streq(a,b) (strcmp ((a),(b)) == 0)
  94 #endif
  95
  96 typedef enum unicode_display_type
  97 {
  98   unicode_default = 0,
  99   unicode_locale,
 100   unicode_escape,
 101   unicode_hex,
 102   unicode_highlight,
 103   unicode_invalid
 104 } unicode_display_type;
 105
 106 static unicode_display_type unicode_display = unicode_default;
 107
 108 #define STRING_ISGRAPHIC(c) \
 109       (   (c) >= 0 \
 110        && (c) <= 255 \
 111        && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
 112            || (include_all_whitespace && ISSPACE (c))) \
 113       )
 114
 115 #ifndef errno
 116 extern int errno;
 117 #endif
 118
 119 /* The BFD section flags that identify an initialized data section.  */
 120 #define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)
 121
 122 /* Radix for printing addresses (must be 8, 10 or 16).  */
 123 static int address_radix;
 124
 125 /* Minimum length of sequence of graphic chars to trigger output.  */
 126 static uint string_min;
 127
 128 /* Whether or not we include all whitespace as a graphic char.   */
 129 static bool include_all_whitespace;
 130
 131 /* TRUE means print address within file for each string.  */
 132 static bool print_addresses;
 133
 134 /* TRUE means print filename for each string.  */
 135 static bool print_filenames;
 136
 137 /* TRUE means for object files scan only the data section.  */
 138 static bool datasection_only;
 139
 140 /* The BFD object file format.  */
 141 static char *target;
 142
 143 /* The character encoding format.  */
 144 static char encoding;
 145 static int encoding_bytes;
 146
 147 /* Output string used to separate parsed strings  */
 148 static char *output_separator;
 149
 150 static struct option long_options[] =
 151 {
 152   {"all", no_argument, NULL, 'a'},
 153   {"bytes", required_argument, NULL, 'n'},
 154   {"data", no_argument, NULL, 'd'},
 155   {"encoding", required_argument, NULL, 'e'},
 156   {"help", no_argument, NULL, 'h'},
 157   {"include-all-whitespace", no_argument, NULL, 'w'},
 158   {"output-separator", required_argument, NULL, 's'},
 159   {"print-file-name", no_argument, NULL, 'f'},
 160   {"radix", required_argument, NULL, 't'},
 161   {"target", required_argument, NULL, 'T'},
 162   {"unicode", required_argument, NULL, 'U'},
 163   {"version", no_argument, NULL, 'v'},
 164   {NULL, 0, NULL, 0}
 165 };
 166
 167 static bool strings_file (char *);
 168 static void print_strings (const char *, FILE *, file_ptr, int, char *);
 169 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
 170 \f
 171 int main (int, char **);
 172
 173 int
 174 main (int argc, char **argv)
 175 {
 176   int optc;
 177   int exit_status = 0;
 178   bool files_given = false;
 179   char *s;
 180   int numeric_opt = 0;
 181
 182   setlocale (LC_ALL, "");
 183   bindtextdomain (PACKAGE, LOCALEDIR);
 184   textdomain (PACKAGE);
 185
 186   program_name = argv[0];
 187   xmalloc_set_program_name (program_name);
 188   bfd_set_error_program_name (program_name);
 189
 190   expandargv (&argc, &argv);
 191
 192   string_min = 4;
 193   include_all_whitespace = false;
 194   print_addresses = false;
 195   print_filenames = false;
 196   if (DEFAULT_STRINGS_ALL)
 197     datasection_only = false;
 198   else
 199     datasection_only = true;
 200   target = NULL;
 201   encoding = 's';
 202   output_separator = NULL;
 203
 204   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
 205                               long_options, (int *) 0)) != EOF)
 206     {
 207       switch (optc)
 208         {
 209         case 'a':
 210           datasection_only = false;
 211           break;
 212
 213         case 'd':
 214           datasection_only = true;
 215           break;
 216
 217         case 'f':
 218           print_filenames = true;
 219           break;
 220
 221         case 'H':
 222         case 'h':
 223           usage (stdout, 0);
 224
 225         case 'n':
 226           string_min = (int) strtoul (optarg, &s, 0);
 227           if (s != NULL && *s != 0)
 228             fatal (_("invalid integer argument %s"), optarg);
 229           break;
 230
 231         case 'w':
 232           include_all_whitespace = true;
 233           break;
 234
 235         case 'o':
 236           print_addresses = true;
 237           address_radix = 8;
 238           break;
 239
 240         case 't':
 241           print_addresses = true;
 242           if (optarg[1] != '\0')
 243             usage (stderr, 1);
 244           switch (optarg[0])
 245             {
 246             case 'o':
 247               address_radix = 8;
 248               break;
 249
 250             case 'd':
 251               address_radix = 10;
 252               break;
 253
 254             case 'x':
 255               address_radix = 16;
 256               break;
 257
 258             default:
 259               usage (stderr, 1);
 260             }
 261           break;
 262
 263         case 'T':
 264           target = optarg;
 265           break;
 266
 267         case 'e':
 268           if (optarg[1] != '\0')
 269             usage (stderr, 1);
 270           encoding = optarg[0];
 271           break;
 272
 273         case 's':
 274           output_separator = optarg;
 275           break;
 276
 277         case 'U':
 278           if (streq (optarg, "default") || streq (optarg, "d"))
 279             unicode_display = unicode_default;
 280           else if (streq (optarg, "locale") || streq (optarg, "l"))
 281             unicode_display = unicode_locale;
 282           else if (streq (optarg, "escape") || streq (optarg, "e"))
 283             unicode_display = unicode_escape;
 284           else if (streq (optarg, "invalid") || streq (optarg, "i"))
 285             unicode_display = unicode_invalid;
 286           else if (streq (optarg, "hex") || streq (optarg, "x"))
 287             unicode_display = unicode_hex;
 288           else if (streq (optarg, "highlight") || streq (optarg, "h"))
 289             unicode_display = unicode_highlight;
 290           else
 291             fatal (_("invalid argument to -U/--unicode: %s"), optarg);
 292           break;
 293
 294         case 'V':
 295         case 'v':
 296           print_version ("strings");
 297           break;
 298
 299         case '?':
 300           usage (stderr, 1);
 301
 302         default:
 303           numeric_opt = optind;
 304           break;
 305         }
 306     }
 307
 308   if (unicode_display != unicode_default)
 309     encoding = 'S';
 310
 311   if (numeric_opt != 0)
 312     {
 313       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
 314       if (s != NULL && *s != 0)
 315         fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
 316     }
 317   if (string_min < 1)
 318     fatal (_("invalid minimum string length %d"), string_min);
 319
 320   switch (encoding)
 321     {
 322     case 'S':
 323     case 's':
 324       encoding_bytes = 1;
 325       break;
 326     case 'b':
 327     case 'l':
 328       encoding_bytes = 2;
 329       break;
 330     case 'B':
 331     case 'L':
 332       encoding_bytes = 4;
 333       break;
 334     default:
 335       usage (stderr, 1);
 336     }
 337
 338   if (bfd_init () != BFD_INIT_MAGIC)
 339     fatal (_("fatal error: libbfd ABI mismatch"));
 340   set_default_bfd_target ();
 341
 342   if (optind >= argc)
 343     {
 344       datasection_only = false;
 345       SET_BINARY (fileno (stdin));
 346       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
 347       files_given = true;
 348     }
 349   else
 350     {
 351       for (; optind < argc; ++optind)
 352         {
 353           if (streq (argv[optind], "-"))
 354             datasection_only = false;
 355           else
 356             {
 357               files_given = true;
 358               exit_status |= !strings_file (argv[optind]);
 359             }
 360         }
 361     }
 362
 363   if (!files_given)
 364     usage (stderr, 1);
 365
 366   return (exit_status);
 367 }
 368 \f
 369 /* Scan section SECT of the file ABFD, whose printable name is
 370    FILENAME.  If it contains initialized data set GOT_A_SECTION and
 371    print the strings in it.  */
 372
 373 static void
 374 strings_a_section (bfd *abfd, asection *sect, const char *filename,
 375                    bool *got_a_section)
 376 {
 377   bfd_size_type sectsize;
 378   bfd_byte *mem;
 379
 380   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
 381     return;
 382
 383   sectsize = bfd_section_size (sect);
 384   if (sectsize == 0)
 385     return;
 386
 387   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
 388     {
 389       non_fatal (_("%s: Reading section %s failed: %s"),
 390                  filename, sect->name, bfd_errmsg (bfd_get_error ()));
 391       return;
 392     }
 393
 394   *got_a_section = true;
 395   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
 396   free (mem);
 397 }
 398
 399 /* Scan all of the sections in FILE, and print the strings
 400    in the initialized data section(s).
 401
 402    Return TRUE if successful,
 403    FALSE if not (such as if FILE is not an object file).  */
 404
 405 static bool
 406 strings_object_file (const char *file)
 407 {
 408   bfd *abfd;
 409   asection *s;
 410   bool got_a_section;
 411
 412   abfd = bfd_openr (file, target);
 413
 414   if (abfd == NULL)
 415     /* Treat the file as a non-object file.  */
 416     return false;
 417
 418   /* This call is mainly for its side effect of reading in the sections.
 419      We follow the traditional behavior of `strings' in that we don't
 420      complain if we don't recognize a file to be an object file.  */
 421   if (!bfd_check_format (abfd, bfd_object))
 422     {
 423       bfd_close (abfd);
 424       return false;
 425     }
 426
 427   got_a_section = false;
 428   for (s = abfd->sections; s != NULL; s = s->next)
 429     strings_a_section (abfd, s, file, &got_a_section);
 430
 431   if (!bfd_close (abfd))
 432     {
 433       bfd_nonfatal (file);
 434       return false;
 435     }
 436
 437   return got_a_section;
 438 }
 439
 440 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
 441
 442 static bool
 443 strings_file (char *file)
 444 {
 445   struct stat st;
 446
 447   /* get_file_size does not support non-S_ISREG files.  */
 448
 449   if (stat (file, &st) < 0)
 450     {
 451       if (errno == ENOENT)
 452         non_fatal (_("'%s': No such file"), file);
 453       else
 454         non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
 455                    file, strerror (errno));
 456       return false;
 457     }
 458   else if (S_ISDIR (st.st_mode))
 459     {
 460       non_fatal (_("Warning: '%s' is a directory"), file);
 461       return false;
 462     }
 463
 464   /* If we weren't told to scan the whole file,
 465      try to open it as an object file and only look at
 466      initialized data sections.  If that fails, fall back to the
 467      whole file.  */
 468   if (!datasection_only || !strings_object_file (file))
 469     {
 470       FILE *stream;
 471
 472       stream = fopen (file, FOPEN_RB);
 473       if (stream == NULL)
 474         {
 475           fprintf (stderr, "%s: ", program_name);
 476           perror (file);
 477           return false;
 478         }
 479
 480       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
 481
 482       if (fclose (stream) == EOF)
 483         {
 484           fprintf (stderr, "%s: ", program_name);
 485           perror (file);
 486           return false;
 487         }
 488     }
 489
 490   return true;
 491 }
 492 \f
 493 /* Read the next character, return EOF if none available.
 494    Assume that STREAM is positioned so that the next byte read
 495    is at address ADDRESS in the file.
 496
 497    If STREAM is NULL, do not read from it.
 498    The caller can supply a buffer of characters
 499    to be processed before the data in STREAM.
 500    MAGIC is the address of the buffer and
 501    MAGICCOUNT is how many characters are in it.  */
 502
 503 static long
 504 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
 505 {
 506   int c, i;
 507   long r = 0;
 508
 509   for (i = 0; i < encoding_bytes; i++)
 510     {
 511       if (*magiccount)
 512         {
 513           (*magiccount)--;
 514           c = *(*magic)++;
 515         }
 516       else
 517         {
 518           if (stream == NULL)
 519             return EOF;
 520
 521           /* Only use getc_unlocked if we found a declaration for it.
 522              Otherwise, libc is not thread safe by default, and we
 523              should not use it.  */
 524
 525 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 526           c = getc_unlocked (stream);
 527 #else
 528           c = getc (stream);
 529 #endif
 530           if (c == EOF)
 531             return EOF;
 532         }
 533
 534       (*address)++;
 535       r = (r << 8) | (c & 0xff);
 536     }
 537
 538   switch (encoding)
 539     {
 540     default:
 541       break;
 542     case 'l':
 543       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
 544       break;
 545     case 'L':
 546       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
 547            | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
 548       break;
 549     }
 550
 551   return r;
 552 }
 553
 554 /* Throw away one byte of a (possibly) multi-byte char C, updating
 555    address and buffer to suit.  */
 556
 557 static void
 558 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
 559 {
 560   static char tmp[4];
 561
 562   if (encoding_bytes > 1)
 563     {
 564       *address -= encoding_bytes - 1;
 565
 566       if (*magiccount == 0)
 567         {
 568           /* If no magic buffer exists, use temp buffer.  */
 569           switch (encoding)
 570             {
 571             default:
 572               break;
 573             case 'b':
 574               tmp[0] = c & 0xff;
 575               *magiccount = 1;
 576               break;
 577             case 'l':
 578               tmp[0] = (c >> 8) & 0xff;
 579               *magiccount = 1;
 580               break;
 581             case 'B':
 582               tmp[0] = (c >> 16) & 0xff;
 583               tmp[1] = (c >> 8) & 0xff;
 584               tmp[2] = c & 0xff;
 585               *magiccount = 3;
 586               break;
 587             case 'L':
 588               tmp[0] = (c >> 8) & 0xff;
 589               tmp[1] = (c >> 16) & 0xff;
 590               tmp[2] = (c >> 24) & 0xff;
 591               *magiccount = 3;
 592               break;
 593             }
 594           *magic = tmp;
 595         }
 596       else
 597         {
 598           /* If magic buffer exists, rewind.  */
 599           *magic -= encoding_bytes - 1;
 600           *magiccount += encoding_bytes - 1;
 601         }
 602     }
 603 }
 604
 605 static void
 606 print_filename_and_address (const char * filename, file_ptr address)
 607 {
 608   if (print_filenames)
 609     printf ("%s: ", filename);
 610
 611   if (! print_addresses)
 612     return;
 613
 614   switch (address_radix)
 615     {
 616     case 8:
 617       if (sizeof (address) > sizeof (long))
 618         {
 619 #ifndef __MSVCRT__
 620           printf ("%7llo ", (unsigned long long) address);
 621 #else
 622           printf ("%7I64o ", (unsigned long long) address);
 623 #endif
 624         }
 625       else
 626         printf ("%7lo ", (unsigned long) address);
 627       break;
 628
 629     case 10:
 630       if (sizeof (address) > sizeof (long))
 631         {
 632 #ifndef __MSVCRT__
 633           printf ("%7llu ", (unsigned long long) address);
 634 #else
 635           printf ("%7I64d ", (unsigned long long) address);
 636 #endif
 637         }
 638       else
 639         printf ("%7ld ", (long) address);
 640       break;
 641
 642     case 16:
 643       if (sizeof (address) > sizeof (long))
 644         {
 645 #ifndef __MSVCRT__
 646           printf ("%7llx ", (unsigned long long) address);
 647 #else
 648           printf ("%7I64x ", (unsigned long long) address);
 649 #endif
 650         }
 651       else
 652         printf ("%7lx ", (unsigned long) address);
 653       break;
 654     }
 655 }
 656
 657 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
 658    If the encoding is valid then returns the number of bytes it uses.  */
 659
 660 static unsigned int
 661 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
 662 {
 663   if (buffer[0] < 0xc0)
 664     return 0;
 665
 666   if (buflen < 2)
 667     return 0;
 668
 669   if ((buffer[1] & 0xc0) != 0x80)
 670     return 0;
 671
 672   if ((buffer[0] & 0x20) == 0)
 673     return 2;
 674
 675   if (buflen < 3)
 676     return 0;
 677
 678   if ((buffer[2] & 0xc0) != 0x80)
 679     return 0;
 680
 681   if ((buffer[0] & 0x10) == 0)
 682     return 3;
 683
 684   if (buflen < 4)
 685     return 0;
 686
 687   if ((buffer[3] & 0xc0) != 0x80)
 688     return 0;
 689
 690   return 4;
 691 }
 692
 693 /* Display a UTF-8 encoded character in BUFFER according to the setting
 694    of unicode_display.  The character is known to be valid.
 695    Returns the number of bytes consumed.  */
 696
 697 static uint
 698 display_utf8_char (const unsigned char * buffer)
 699 {
 700   uint j;
 701   uint utf8_len;
 702
 703   switch (buffer[0] & 0x30)
 704     {
 705     case 0x00:
 706     case 0x10:
 707       utf8_len = 2;
 708       break;
 709     case 0x20:
 710       utf8_len = 3;
 711       break;
 712     default:
 713       utf8_len = 4;
 714     }
 715
 716   switch (unicode_display)
 717     {
 718     default:
 719       fprintf (stderr, "ICE: unexpected unicode display type\n");
 720       break;
 721
 722     case unicode_escape:
 723     case unicode_highlight:
 724       if (unicode_display == unicode_highlight && isatty (1))
 725         printf ("\x1B[31;47m"); /* Red.  */
 726
 727       switch (utf8_len)
 728         {
 729         case 2:
 730           printf ("\\u%02x%02x",
 731                   ((buffer[0] & 0x1c) >> 2),
 732                   ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
 733           break;
 734
 735         case 3:
 736           printf ("\\u%02x%02x",
 737                   ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
 738                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
 739           break;
 740
 741         case 4:
 742           printf ("\\u%02x%02x%02x",
 743                   ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
 744                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
 745                   ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
 746           break;
 747         default:
 748           /* URG.  */
 749           break;
 750         }
 751
 752       if (unicode_display == unicode_highlight && isatty (1))
 753         printf ("\033[0m"); /* Default colour.  */
 754       break;
 755
 756     case unicode_hex:
 757       putchar ('<');
 758       printf ("0x");
 759       for (j = 0; j < utf8_len; j++)
 760         printf ("%02x", buffer [j]);
 761       putchar ('>');
 762       break;
 763
 764     case unicode_locale:
 765       printf ("%.1s", buffer);
 766       break;
 767     }
 768
 769   return utf8_len;
 770 }
 771
 772 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
 773    according to the setting of the unicode_display variable.  The buffer
 774    contains BUFLEN bytes.
 775
 776    Display the characters as if they started at ADDRESS and are contained in
 777    FILENAME.  */
 778
 779 static void
 780 print_unicode_buffer (const char *            filename,
 781                       file_ptr                address,
 782                       const unsigned char *   buffer,
 783                       unsigned long           buflen)
 784 {
 785   /* Paranoia checks...  */
 786   if (filename == NULL
 787       || buffer == NULL
 788       || unicode_display == unicode_default
 789       || encoding != 'S'
 790       || encoding_bytes != 1)
 791     {
 792       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
 793       return;
 794     }
 795
 796   if (buflen == 0)
 797     return;
 798
 799   /* We must only display strings that are at least string_min *characters*
 800      long.  So we scan the buffer in two stages.  First we locate the start
 801      of a potential string.  Then we walk along it until we have found
 802      string_min characters.  Then we go back to the start point and start
 803      displaying characters according to the unicode_display setting.  */
 804
 805   unsigned long start_point = 0;
 806   unsigned long i = 0;
 807   unsigned int char_len = 1;
 808   unsigned int num_found = 0;
 809
 810   for (i = 0; i < buflen; i += char_len)
 811     {
 812       int c = buffer[i];
 813
 814       char_len = 1;
 815
 816       /* Find the first potential character of a string.  */
 817       if (! STRING_ISGRAPHIC (c))
 818         {
 819           num_found = 0;
 820           continue;
 821         }
 822
 823       if (c > 126)
 824         {
 825           if (c < 0xc0)
 826             {
 827               num_found = 0;
 828               continue;
 829             }
 830
 831           if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
 832             {
 833               char_len = 1;
 834               num_found = 0;
 835               continue;
 836             }
 837
 838           if (unicode_display == unicode_invalid)
 839             {
 840               /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
 841               num_found = 0;
 842               continue;
 843             }
 844         }
 845
 846       if (num_found == 0)
 847         /* We have found a potential starting point for a string.  */
 848         start_point = i;
 849
 850       ++ num_found;
 851
 852       if (num_found >= string_min)
 853         break;
 854     }
 855
 856   if (num_found < string_min)
 857     return;
 858
 859   print_filename_and_address (filename, address + start_point);
 860
 861   /* We have found string_min characters.  Display them and any
 862      more that follow.  */
 863   for (i = start_point; i < buflen; i += char_len)
 864     {
 865       int c = buffer[i];
 866
 867       char_len = 1;
 868
 869       if (! STRING_ISGRAPHIC (c))
 870         break;
 871       else if (c < 127)
 872         putchar (c);
 873       else if (! is_valid_utf8 (buffer + i, buflen - i))
 874         break;
 875       else if (unicode_display == unicode_invalid)
 876         break;
 877       else
 878         char_len = display_utf8_char (buffer + i);
 879     }
 880
 881   if (output_separator)
 882     fputs (output_separator, stdout);
 883   else
 884     putchar ('\n');
 885
 886   /* FIXME: Using tail recursion here is lazy programming...  */
 887   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
 888 }
 889
 890 static int
 891 get_unicode_byte (FILE * stream, unsigned char * putback, uint * num_putback, uint * num_read)
 892 {
 893   if (* num_putback > 0)
 894     {
 895       * num_putback = * num_putback - 1;
 896       return putback [* num_putback];
 897     }
 898
 899   * num_read = * num_read + 1;
 900
 901 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 902   return getc_unlocked (stream);
 903 #else
 904   return getc (stream);
 905 #endif
 906 }
 907
 908 /* Helper function for print_unicode_stream.  */
 909
 910 static void
 911 print_unicode_stream_body (const char *     filename,
 912                            file_ptr         address,
 913                            FILE *           stream,
 914                            unsigned char *  putback_buf,
 915                            uint             num_putback,
 916                            unsigned char *  print_buf)
 917 {
 918   /* It would be nice if we could just read the stream into a buffer
 919      and then process if with print_unicode_buffer.  But the input
 920      might be huge or it might time-locked (eg stdin).  So instead
 921      we go one byte at a time...  */
 922
 923   file_ptr start_point = 0;
 924   uint num_read = 0;
 925   uint num_chars = 0;
 926   uint num_print = 0;
 927   int c = 0;
 928
 929   /* Find a series of string_min characters.  Put them into print_buf.  */
 930   do
 931     {
 932       if (num_chars >= string_min)
 933         break;
 934
 935       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 936       if (c == EOF)
 937         break;
 938
 939       if (! STRING_ISGRAPHIC (c))
 940         {
 941           num_chars = num_print = 0;
 942           continue;
 943         }
 944
 945       if (num_chars == 0)
 946         start_point = num_read - 1;
 947
 948       if (c < 127)
 949         {
 950           print_buf[num_print] = c;
 951           num_chars ++;
 952           num_print ++;
 953           continue;
 954         }
 955
 956       if (c < 0xc0)
 957         {
 958           num_chars = num_print = 0;
 959           continue;
 960         }
 961
 962       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
 963       char utf8[4];
 964
 965       utf8[0] = c;
 966       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 967       if (c == EOF)
 968         break;
 969       utf8[1] = c;
 970
 971       if ((utf8[1] & 0xc0) != 0x80)
 972         {
 973           /* Invalid UTF-8.  */
 974           putback_buf[num_putback++] = utf8[1];
 975           num_chars = num_print = 0;
 976           continue;
 977         }
 978       else if ((utf8[0] & 0x20) == 0)
 979         {
 980           /* A valid 2-byte UTF-8 encoding.  */
 981           if (unicode_display == unicode_invalid)
 982             {
 983               putback_buf[num_putback++] = utf8[1];
 984               num_chars = num_print = 0;
 985             }
 986           else
 987             {
 988               print_buf[num_print ++] = utf8[0];
 989               print_buf[num_print ++] = utf8[1];
 990               num_chars ++;
 991             }
 992           continue;
 993         }
 994
 995       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 996       if (c == EOF)
 997         break;
 998       utf8[2] = c;
 999
1000       if ((utf8[2] & 0xc0) != 0x80)
1001         {
1002           /* Invalid UTF-8.  */
1003           putback_buf[num_putback++] = utf8[2];
1004           putback_buf[num_putback++] = utf8[1];
1005           num_chars = num_print = 0;
1006           continue;
1007         }
1008       else if ((utf8[0] & 0x10) == 0)
1009         {
1010           /* A valid 3-byte UTF-8 encoding.  */
1011           if (unicode_display == unicode_invalid)
1012             {
1013               putback_buf[num_putback++] = utf8[2];
1014               putback_buf[num_putback++] = utf8[1];
1015               num_chars = num_print = 0;
1016             }
1017           else
1018             {
1019               print_buf[num_print ++] = utf8[0];
1020               print_buf[num_print ++] = utf8[1];
1021               print_buf[num_print ++] = utf8[2];
1022               num_chars ++;
1023             }
1024           continue;
1025         }
1026
1027       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1028       if (c == EOF)
1029         break;
1030       utf8[3] = c;
1031
1032       if ((utf8[3] & 0xc0) != 0x80)
1033         {
1034           /* Invalid UTF-8.  */
1035           putback_buf[num_putback++] = utf8[3];
1036           putback_buf[num_putback++] = utf8[2];
1037           putback_buf[num_putback++] = utf8[1];
1038           num_chars = num_print = 0;
1039         }
1040       /* We have a valid 4-byte UTF-8 encoding.  */
1041       else if (unicode_display == unicode_invalid)
1042         {
1043           putback_buf[num_putback++] = utf8[3];
1044           putback_buf[num_putback++] = utf8[1];
1045           putback_buf[num_putback++] = utf8[2];
1046           num_chars = num_print = 0;
1047         }
1048       else
1049         {
1050           print_buf[num_print ++] = utf8[0];
1051           print_buf[num_print ++] = utf8[1];
1052           print_buf[num_print ++] = utf8[2];
1053           print_buf[num_print ++] = utf8[3];
1054           num_chars ++;
1055         }
1056     }
1057   while (1);
1058
1059   if (num_chars >= string_min)
1060     {
1061       /* We know that we have string_min valid characters in print_buf,
1062          and there may be more to come in the stream.  Start displaying
1063          them.  */
1064
1065       print_filename_and_address (filename, address + start_point);
1066
1067       uint i;
1068       for (i = 0; i < num_print;)
1069         {
1070           if (print_buf[i] < 127)
1071             putchar (print_buf[i++]);
1072           else
1073             i += display_utf8_char (print_buf + i);
1074         }
1075
1076       /* OK so now we have to start read unchecked bytes.  */
1077
1078         /* Find a series of string_min characters.  Put them into print_buf.  */
1079       do
1080         {
1081           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1082           if (c == EOF)
1083             break;
1084
1085           if (! STRING_ISGRAPHIC (c))
1086             break;
1087
1088           if (c < 127)
1089             {
1090               putchar (c);
1091               continue;
1092             }
1093
1094           if (c < 0xc0)
1095             break;
1096
1097           /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1098           unsigned char utf8[4];
1099
1100           utf8[0] = c;
1101           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1102           if (c == EOF)
1103             break;
1104           utf8[1] = c;
1105
1106           if ((utf8[1] & 0xc0) != 0x80)
1107             {
1108               /* Invalid UTF-8.  */
1109               putback_buf[num_putback++] = utf8[1];
1110               break;
1111             }
1112           else if ((utf8[0] & 0x20) == 0)
1113             {
1114               /* Valid 2-byte UTF-8.  */
1115               if (unicode_display == unicode_invalid)
1116                 {
1117                   putback_buf[num_putback++] = utf8[1];
1118                   break;
1119                 }
1120               else
1121                 {
1122                   (void) display_utf8_char (utf8);
1123                   continue;
1124                 }
1125             }
1126
1127           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1128           if (c == EOF)
1129             break;
1130           utf8[2] = c;
1131
1132           if ((utf8[2] & 0xc0) != 0x80)
1133             {
1134               /* Invalid UTF-8.  */
1135               putback_buf[num_putback++] = utf8[2];
1136               putback_buf[num_putback++] = utf8[1];
1137               break;
1138             }
1139           else if ((utf8[0] & 0x10) == 0)
1140             {
1141               /* Valid 3-byte UTF-8.  */
1142               if (unicode_display == unicode_invalid)
1143                 {
1144                   putback_buf[num_putback++] = utf8[2];
1145                   putback_buf[num_putback++] = utf8[1];
1146                   break;
1147                 }
1148               else
1149                 {
1150                   (void) display_utf8_char (utf8);
1151                   continue;
1152                 }
1153             }
1154
1155           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1156           if (c == EOF)
1157             break;
1158           utf8[3] = c;
1159
1160           if ((utf8[3] & 0xc0) != 0x80)
1161             {
1162               /* Invalid UTF-8.  */
1163               putback_buf[num_putback++] = utf8[3];
1164               putback_buf[num_putback++] = utf8[2];
1165               putback_buf[num_putback++] = utf8[1];
1166               break;
1167             }
1168           else if (unicode_display == unicode_invalid)
1169             {
1170               putback_buf[num_putback++] = utf8[3];
1171               putback_buf[num_putback++] = utf8[2];
1172               putback_buf[num_putback++] = utf8[1];
1173               break;
1174             }
1175           else
1176             /* A valid 4-byte UTF-8 encoding.  */
1177             (void) display_utf8_char (utf8);
1178         }
1179       while (1);
1180
1181       if (output_separator)
1182         fputs (output_separator, stdout);
1183       else
1184         putchar ('\n');
1185     }
1186
1187   if (c != EOF)
1188     /* FIXME: Using tail recursion here is lazy, but it works.  */
1189     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1190 }
1191
1192 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1193    encountered according to the setting of the unicode_display variable.
1194    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1195
1196 static void
1197 print_unicode_stream (const char * filename,
1198                       file_ptr     address,
1199                       FILE *       stream)
1200 {
1201   /* Paranoia checks...  */
1202   if (filename == NULL
1203       || stream == NULL
1204       || unicode_display == unicode_default
1205       || encoding != 'S'
1206       || encoding_bytes != 1)
1207     {
1208       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1209       return;
1210     }
1211
1212   /* Allocate space for string_min 4-byte utf-8 characters.  */
1213   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1214   /* We should never have to put back more than 4 bytes.  */
1215   unsigned char putback_buf[5];
1216   uint num_putback = 0;
1217
1218   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1219   free (print_buf);
1220 }
1221 \f
1222 /* Find the strings in file FILENAME, read from STREAM.
1223    Assume that STREAM is positioned so that the next byte read
1224    is at address ADDRESS in the file.
1225
1226    If STREAM is NULL, do not read from it.
1227    The caller can supply a buffer of characters
1228    to be processed before the data in STREAM.
1229    MAGIC is the address of the buffer and
1230    MAGICCOUNT is how many characters are in it.
1231    Those characters come at address ADDRESS and the data in STREAM follow.  */
1232
1233 static void
1234 print_strings (const char *filename, FILE *stream, file_ptr address,
1235                int magiccount, char *magic)
1236 {
1237   if (unicode_display != unicode_default)
1238     {
1239       if (magic != NULL)
1240         print_unicode_buffer (filename, address,
1241                               (const unsigned char *) magic, magiccount);
1242
1243       if (stream != NULL)
1244         print_unicode_stream (filename, address, stream);
1245       return;
1246     }
1247
1248   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1249
1250   while (1)
1251     {
1252       file_ptr start;
1253       uint i;
1254       long c;
1255
1256       /* See if the next `string_min' chars are all graphic chars.  */
1257     tryline:
1258       start = address;
1259       for (i = 0; i < string_min; i++)
1260         {
1261           c = get_char (stream, &address, &magiccount, &magic);
1262           if (c == EOF)
1263             {
1264               free (buf);
1265               return;
1266             }
1267
1268           if (! STRING_ISGRAPHIC (c))
1269             {
1270               /* Found a non-graphic.  Try again starting with next byte.  */
1271               unget_part_char (c, &address, &magiccount, &magic);
1272               goto tryline;
1273             }
1274           buf[i] = c;
1275         }
1276
1277       /* We found a run of `string_min' graphic characters.  Print up
1278          to the next non-graphic character.  */
1279       print_filename_and_address (filename, start);
1280
1281       buf[i] = '\0';
1282       fputs (buf, stdout);
1283
1284       while (1)
1285         {
1286           c = get_char (stream, &address, &magiccount, &magic);
1287           if (c == EOF)
1288             break;
1289           if (! STRING_ISGRAPHIC (c))
1290             {
1291               unget_part_char (c, &address, &magiccount, &magic);
1292               break;
1293             }
1294           putchar (c);
1295         }
1296
1297       if (output_separator)
1298         fputs (output_separator, stdout);
1299       else
1300         putchar ('\n');
1301     }
1302   free (buf);
1303 }
1304 \f
1305 static void
1306 usage (FILE *stream, int status)
1307 {
1308   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1309   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1310   fprintf (stream, _(" The options are:\n"));
1311
1312   if (DEFAULT_STRINGS_ALL)
1313     fprintf (stream, _("\
1314   -a - --all                Scan the entire file, not just the data section [default]\n\
1315   -d --data                 Only scan the data sections in the file\n"));
1316   else
1317     fprintf (stream, _("\
1318   -a - --all                Scan the entire file, not just the data section\n\
1319   -d --data                 Only scan the data sections in the file [default]\n"));
1320
1321   fprintf (stream, _("\
1322   -f --print-file-name      Print the name of the file before each string\n\
1323   -n --bytes=[number]       Locate & print any NUL-terminated sequence of at\n\
1324   -<number>                   least [number] characters (default 4).\n\
1325   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1326   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1327   -o                        An alias for --radix=o\n\
1328   -T --target=<BFDNAME>     Specify the binary file format\n\
1329   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1330                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1331   --unicode={default|show|invalid|hex|escape|highlight}\n\
1332   -u {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1333   -s --output-separator=<string> String used to separate strings in output.\n\
1334   @<file>                   Read options from <file>\n\
1335   -h --help                 Display this information\n\
1336   -v -V --version           Print the program's version number\n"));
1337   list_supported_targets (program_name, stream);
1338   if (REPORT_BUGS_TO[0] && status == 0)
1339     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1340   exit (status);
1341 }