binutils/strings.c

   1 /* strings -- print the strings of printable characters in files
   2    Copyright (C) 1993-2021 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18 \f
  19 /* Usage: strings [options] file...
  20
  21    Options:
  22    --all
  23    -a
  24    -            Scan each file in its entirety.
  25
  26    --data
  27    -d           Scan only the initialized data section(s) of object files.
  28
  29    --print-file-name
  30    -f           Print the name of the file before each string.
  31
  32    --bytes=min-len
  33    -n min-len
  34    -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
  35                 that are followed by a NUL or a newline.  Default is 4.
  36
  37    --radix={o,x,d}
  38    -t {o,x,d}   Print the offset within the file before each string,
  39                 in octal/hex/decimal.
  40
  41   --include-all-whitespace
  42   -w            By default tab and space are the only whitespace included in graphic
  43                 char sequences.  This option considers all of isspace() valid.
  44
  45    -o           Like -to.  (Some other implementations have -o like -to,
  46                 others like -td.  We chose one arbitrarily.)
  47
  48    --encoding={s,S,b,l,B,L}
  49    -e {s,S,b,l,B,L}
  50                 Select character encoding: 7-bit-character, 8-bit-character,
  51                 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
  52                 littleendian 32-bit.
  53
  54    --target=BFDNAME
  55    -T {bfdname}
  56                 Specify a non-default object file format.
  57
  58   --unicode={default|locale|invalid|hex|escape|highlight}
  59   -U {d|l|i|x|e|h}
  60                 Determine how to handle UTF-8 unicode characters.  The default
  61                 is no special treatment.  All other versions of this option
  62                 only apply if the encoding is valid and enabling the option
  63                 implies --encoding=S.
  64                 The 'locale' option displays the characters according to the
  65                 current locale.  The 'invalid' option treats them as
  66                 non-string characters.  The 'hex' option displays them as hex
  67                 byte sequences.  The 'escape' option displays them as escape
  68                 sequences and the 'highlight' option displays them as
  69                 coloured escape sequences.
  70
  71   --output-separator=sep_string
  72   -s sep_string String used to separate parsed strings in output.
  73                 Default is newline.
  74
  75    --help
  76    -h           Print the usage message on the standard output.
  77
  78    --version
  79    -V
  80    -v           Print the program version number.
  81
  82    Written by Richard Stallman <rms@gnu.ai.mit.edu>
  83    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
  84
  85 #include "sysdep.h"
  86 #include "bfd.h"
  87 #include "getopt.h"
  88 #include "libiberty.h"
  89 #include "safe-ctype.h"
  90 #include "bucomm.h"
  91
  92 #ifndef streq
  93 #define streq(a,b) (strcmp ((a),(b)) == 0)
  94 #endif
  95
/* The ways in which UTF-8 encoded characters found in the input can be
   treated.  The value is chosen with the -U/--unicode option.  */
typedef enum unicode_display_type
{
  unicode_default = 0,  /* No special treatment.  */
  unicode_locale,       /* Display according to the current locale.  */
  unicode_escape,       /* Display as escape sequences.  */
  unicode_hex,          /* Display as hex byte sequences.  */
  unicode_highlight,    /* Display as coloured escape sequences.  */
  unicode_invalid       /* Treat as non-string characters.  */
} unicode_display_type;

/* The treatment currently in force.  Any value other than
   unicode_default makes main force the encoding to 'S'.  */
static unicode_display_type unicode_display = unicode_default;
 107
/* Non-zero if the character value C may form part of a string.  C has to
   lie in the range 0..255 and be one of: a tab, a character accepted by
   ISPRINT, any value above 127 when the encoding is 'S', or a character
   accepted by ISSPACE when include_all_whitespace is set.

   The macro reads the file-scope variables encoding and
   include_all_whitespace, and it evaluates C several times, so C must
   not have side effects.  */
#define STRING_ISGRAPHIC(c) \
      (   (c) >= 0 \
       && (c) <= 255 \
       && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
           || (include_all_whitespace && ISSPACE (c))) \
      )
 114
 115 #ifndef errno
 116 extern int errno;
 117 #endif
 118
/* The BFD section flags that identify an initialized data section.
   A section is only scanned when all three flags are set.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Radix for printing addresses (must be 8, 10 or 16).  Set by -o and
   -t/--radix, and only consulted when print_addresses is true.  */
static int address_radix;

/* Minimum length of sequence of graphic chars to trigger output.
   Defaults to 4; main rejects values below 1.  */
static unsigned int string_min;

/* Whether or not we include all whitespace as a graphic char.
   Set by -w/--include-all-whitespace.  */
static bool include_all_whitespace;

/* TRUE means print address within file for each string.  */
static bool print_addresses;

/* TRUE means print filename for each string.  */
static bool print_filenames;

/* TRUE means for object files scan only the data section.
   Cleared by -a/--all, by a "-" argument and when reading standard
   input.  */
static bool datasection_only;

/* The BFD object file format, as given with -T/--target, or NULL
   when the option was not used.  */
static char *target;

/* The character encoding format: one of 's', 'S', 'b', 'l', 'B' or 'L'.
   encoding_bytes is the matching number of bytes per character
   (1, 2 or 4).  */
static char encoding;
static int encoding_bytes;

/* Output string used to separate parsed strings.  NULL selects the
   default, a newline.  */
static char *output_separator;
 149
/* Long option table handed to getopt_long by main.  Every entry has a
   NULL flag pointer, so getopt_long returns the short option letter in
   the last field and both spellings share one case in main's switch.  */
static struct option long_options[] =
{
  {"all", no_argument, NULL, 'a'},
  {"bytes", required_argument, NULL, 'n'},
  {"data", no_argument, NULL, 'd'},
  {"encoding", required_argument, NULL, 'e'},
  {"help", no_argument, NULL, 'h'},
  {"include-all-whitespace", no_argument, NULL, 'w'},
  {"output-separator", required_argument, NULL, 's'},
  {"print-file-name", no_argument, NULL, 'f'},
  {"radix", required_argument, NULL, 't'},
  {"target", required_argument, NULL, 'T'},
  {"unicode", required_argument, NULL, 'U'},
  {"version", no_argument, NULL, 'v'},
  {NULL, 0, NULL, 0}       /* Terminator.  */
};
 166
 167 static bool strings_file (char *);
 168 static void print_strings (const char *, FILE *, file_ptr, int, char *);
 169 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
 170 \f
 171 int main (int, char **);
 172
 173 int
 174 main (int argc, char **argv)
 175 {
 176   int optc;
 177   int exit_status = 0;
 178   bool files_given = false;
 179   char *s;
 180   int numeric_opt = 0;
 181
 182   setlocale (LC_ALL, "");
 183   bindtextdomain (PACKAGE, LOCALEDIR);
 184   textdomain (PACKAGE);
 185
 186   program_name = argv[0];
 187   xmalloc_set_program_name (program_name);
 188   bfd_set_error_program_name (program_name);
 189
 190   expandargv (&argc, &argv);
 191
 192   string_min = 4;
 193   include_all_whitespace = false;
 194   print_addresses = false;
 195   print_filenames = false;
 196   if (DEFAULT_STRINGS_ALL)
 197     datasection_only = false;
 198   else
 199     datasection_only = true;
 200   target = NULL;
 201   encoding = 's';
 202   output_separator = NULL;
 203
 204   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
 205                               long_options, (int *) 0)) != EOF)
 206     {
 207       switch (optc)
 208         {
 209         case 'a':
 210           datasection_only = false;
 211           break;
 212
 213         case 'd':
 214           datasection_only = true;
 215           break;
 216
 217         case 'f':
 218           print_filenames = true;
 219           break;
 220
 221         case 'H':
 222         case 'h':
 223           usage (stdout, 0);
 224
 225         case 'n':
 226           string_min = (int) strtoul (optarg, &s, 0);
 227           if (s != NULL && *s != 0)
 228             fatal (_("invalid integer argument %s"), optarg);
 229           break;
 230
 231         case 'w':
 232           include_all_whitespace = true;
 233           break;
 234
 235         case 'o':
 236           print_addresses = true;
 237           address_radix = 8;
 238           break;
 239
 240         case 't':
 241           print_addresses = true;
 242           if (optarg[1] != '\0')
 243             usage (stderr, 1);
 244           switch (optarg[0])
 245             {
 246             case 'o':
 247               address_radix = 8;
 248               break;
 249
 250             case 'd':
 251               address_radix = 10;
 252               break;
 253
 254             case 'x':
 255               address_radix = 16;
 256               break;
 257
 258             default:
 259               usage (stderr, 1);
 260             }
 261           break;
 262
 263         case 'T':
 264           target = optarg;
 265           break;
 266
 267         case 'e':
 268           if (optarg[1] != '\0')
 269             usage (stderr, 1);
 270           encoding = optarg[0];
 271           break;
 272
 273         case 's':
 274           output_separator = optarg;
 275           break;
 276
 277         case 'U':
 278           if (streq (optarg, "default") || streq (optarg, "d"))
 279             unicode_display = unicode_default;
 280           else if (streq (optarg, "locale") || streq (optarg, "l"))
 281             unicode_display = unicode_locale;
 282           else if (streq (optarg, "escape") || streq (optarg, "e"))
 283             unicode_display = unicode_escape;
 284           else if (streq (optarg, "invalid") || streq (optarg, "i"))
 285             unicode_display = unicode_invalid;
 286           else if (streq (optarg, "hex") || streq (optarg, "x"))
 287             unicode_display = unicode_hex;
 288           else if (streq (optarg, "highlight") || streq (optarg, "h"))
 289             unicode_display = unicode_highlight;
 290           else
 291             fatal (_("invalid argument to -U/--unicode: %s"), optarg);
 292           break;
 293
 294         case 'V':
 295         case 'v':
 296           print_version ("strings");
 297           break;
 298
 299         case '?':
 300           usage (stderr, 1);
 301
 302         default:
 303           numeric_opt = optind;
 304           break;
 305         }
 306     }
 307
 308   if (unicode_display != unicode_default)
 309     encoding = 'S';
 310
 311   if (numeric_opt != 0)
 312     {
 313       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
 314       if (s != NULL && *s != 0)
 315         fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
 316     }
 317   if (string_min < 1)
 318     fatal (_("invalid minimum string length %d"), string_min);
 319
 320   switch (encoding)
 321     {
 322     case 'S':
 323     case 's':
 324       encoding_bytes = 1;
 325       break;
 326     case 'b':
 327     case 'l':
 328       encoding_bytes = 2;
 329       break;
 330     case 'B':
 331     case 'L':
 332       encoding_bytes = 4;
 333       break;
 334     default:
 335       usage (stderr, 1);
 336     }
 337
 338   if (bfd_init () != BFD_INIT_MAGIC)
 339     fatal (_("fatal error: libbfd ABI mismatch"));
 340   set_default_bfd_target ();
 341
 342   if (optind >= argc)
 343     {
 344       datasection_only = false;
 345       SET_BINARY (fileno (stdin));
 346       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
 347       files_given = true;
 348     }
 349   else
 350     {
 351       for (; optind < argc; ++optind)
 352         {
 353           if (streq (argv[optind], "-"))
 354             datasection_only = false;
 355           else
 356             {
 357               files_given = true;
 358               exit_status |= !strings_file (argv[optind]);
 359             }
 360         }
 361     }
 362
 363   if (!files_given)
 364     usage (stderr, 1);
 365
 366   return (exit_status);
 367 }
 368 \f
 369 /* Scan section SECT of the file ABFD, whose printable name is
 370    FILENAME.  If it contains initialized data set GOT_A_SECTION and
 371    print the strings in it.  */
 372
 373 static void
 374 strings_a_section (bfd *abfd, asection *sect, const char *filename,
 375                    bool *got_a_section)
 376 {
 377   bfd_size_type sectsize;
 378   bfd_byte *mem;
 379
 380   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
 381     return;
 382
 383   sectsize = bfd_section_size (sect);
 384   if (sectsize == 0)
 385     return;
 386
 387   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
 388     {
 389       non_fatal (_("%s: Reading section %s failed: %s"),
 390                  filename, sect->name, bfd_errmsg (bfd_get_error ()));
 391       return;
 392     }
 393
 394   *got_a_section = true;
 395   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
 396   free (mem);
 397 }
 398
 399 /* Scan all of the sections in FILE, and print the strings
 400    in the initialized data section(s).
 401
 402    Return TRUE if successful,
 403    FALSE if not (such as if FILE is not an object file).  */
 404
 405 static bool
 406 strings_object_file (const char *file)
 407 {
 408   bfd *abfd;
 409   asection *s;
 410   bool got_a_section;
 411
 412   abfd = bfd_openr (file, target);
 413
 414   if (abfd == NULL)
 415     /* Treat the file as a non-object file.  */
 416     return false;
 417
 418   /* This call is mainly for its side effect of reading in the sections.
 419      We follow the traditional behavior of `strings' in that we don't
 420      complain if we don't recognize a file to be an object file.  */
 421   if (!bfd_check_format (abfd, bfd_object))
 422     {
 423       bfd_close (abfd);
 424       return false;
 425     }
 426
 427   got_a_section = false;
 428   for (s = abfd->sections; s != NULL; s = s->next)
 429     strings_a_section (abfd, s, file, &got_a_section);
 430
 431   if (!bfd_close (abfd))
 432     {
 433       bfd_nonfatal (file);
 434       return false;
 435     }
 436
 437   return got_a_section;
 438 }
 439
 440 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
 441
 442 static bool
 443 strings_file (char *file)
 444 {
 445   struct stat st;
 446
 447   /* get_file_size does not support non-S_ISREG files.  */
 448
 449   if (stat (file, &st) < 0)
 450     {
 451       if (errno == ENOENT)
 452         non_fatal (_("'%s': No such file"), file);
 453       else
 454         non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
 455                    file, strerror (errno));
 456       return false;
 457     }
 458   else if (S_ISDIR (st.st_mode))
 459     {
 460       non_fatal (_("Warning: '%s' is a directory"), file);
 461       return false;
 462     }
 463
 464   /* If we weren't told to scan the whole file,
 465      try to open it as an object file and only look at
 466      initialized data sections.  If that fails, fall back to the
 467      whole file.  */
 468   if (!datasection_only || !strings_object_file (file))
 469     {
 470       FILE *stream;
 471
 472       stream = fopen (file, FOPEN_RB);
 473       if (stream == NULL)
 474         {
 475           fprintf (stderr, "%s: ", program_name);
 476           perror (file);
 477           return false;
 478         }
 479
 480       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
 481
 482       if (fclose (stream) == EOF)
 483         {
 484           fprintf (stderr, "%s: ", program_name);
 485           perror (file);
 486           return false;
 487         }
 488     }
 489
 490   return true;
 491 }
 492 \f
 493 /* Read the next character, return EOF if none available.
 494    Assume that STREAM is positioned so that the next byte read
 495    is at address ADDRESS in the file.
 496
 497    If STREAM is NULL, do not read from it.
 498    The caller can supply a buffer of characters
 499    to be processed before the data in STREAM.
 500    MAGIC is the address of the buffer and
 501    MAGICCOUNT is how many characters are in it.  */
 502
 503 static long
 504 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
 505 {
 506   int c, i;
 507   long r = 0;
 508
 509   for (i = 0; i < encoding_bytes; i++)
 510     {
 511       if (*magiccount)
 512         {
 513           (*magiccount)--;
 514           c = *(*magic)++;
 515         }
 516       else
 517         {
 518           if (stream == NULL)
 519             return EOF;
 520
 521           /* Only use getc_unlocked if we found a declaration for it.
 522              Otherwise, libc is not thread safe by default, and we
 523              should not use it.  */
 524
 525 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 526           c = getc_unlocked (stream);
 527 #else
 528           c = getc (stream);
 529 #endif
 530           if (c == EOF)
 531             return EOF;
 532         }
 533
 534       (*address)++;
 535       r = (r << 8) | (c & 0xff);
 536     }
 537
 538   switch (encoding)
 539     {
 540     default:
 541       break;
 542     case 'l':
 543       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
 544       break;
 545     case 'L':
 546       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
 547            | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
 548       break;
 549     }
 550
 551   return r;
 552 }
 553
 554 /* Throw away one byte of a (possibly) multi-byte char C, updating
 555    address and buffer to suit.  */
 556
 557 static void
 558 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
 559 {
 560   static char tmp[4];
 561
 562   if (encoding_bytes > 1)
 563     {
 564       *address -= encoding_bytes - 1;
 565
 566       if (*magiccount == 0)
 567         {
 568           /* If no magic buffer exists, use temp buffer.  */
 569           switch (encoding)
 570             {
 571             default:
 572               break;
 573             case 'b':
 574               tmp[0] = c & 0xff;
 575               *magiccount = 1;
 576               break;
 577             case 'l':
 578               tmp[0] = (c >> 8) & 0xff;
 579               *magiccount = 1;
 580               break;
 581             case 'B':
 582               tmp[0] = (c >> 16) & 0xff;
 583               tmp[1] = (c >> 8) & 0xff;
 584               tmp[2] = c & 0xff;
 585               *magiccount = 3;
 586               break;
 587             case 'L':
 588               tmp[0] = (c >> 8) & 0xff;
 589               tmp[1] = (c >> 16) & 0xff;
 590               tmp[2] = (c >> 24) & 0xff;
 591               *magiccount = 3;
 592               break;
 593             }
 594           *magic = tmp;
 595         }
 596       else
 597         {
 598           /* If magic buffer exists, rewind.  */
 599           *magic -= encoding_bytes - 1;
 600           *magiccount += encoding_bytes - 1;
 601         }
 602     }
 603 }
 604
 605 static void
 606 print_filename_and_address (const char * filename, file_ptr address)
 607 {
 608   if (print_filenames)
 609     printf ("%s: ", filename);
 610
 611   if (! print_addresses)
 612     return;
 613
 614   switch (address_radix)
 615     {
 616     case 8:
 617       if (sizeof (address) > sizeof (long))
 618         {
 619 #ifndef __MSVCRT__
 620           printf ("%7llo ", (unsigned long long) address);
 621 #else
 622           printf ("%7I64o ", (unsigned long long) address);
 623 #endif
 624         }
 625       else
 626         printf ("%7lo ", (unsigned long) address);
 627       break;
 628
 629     case 10:
 630       if (sizeof (address) > sizeof (long))
 631         {
 632 #ifndef __MSVCRT__
 633           printf ("%7llu ", (unsigned long long) address);
 634 #else
 635           printf ("%7I64d ", (unsigned long long) address);
 636 #endif
 637         }
 638       else
 639         printf ("%7ld ", (long) address);
 640       break;
 641
 642     case 16:
 643       if (sizeof (address) > sizeof (long))
 644         {
 645 #ifndef __MSVCRT__
 646           printf ("%7llx ", (unsigned long long) address);
 647 #else
 648           printf ("%7I64x ", (unsigned long long) address);
 649 #endif
 650         }
 651       else
 652         printf ("%7lx ", (unsigned long) address);
 653       break;
 654     }
 655 }
 656
 657 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
 658    If the encoding is valid then returns the number of bytes it uses.  */
 659
 660 static unsigned int
 661 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
 662 {
 663   if (buffer[0] < 0xc0)
 664     return 0;
 665
 666   if (buflen < 2)
 667     return 0;
 668
 669   if ((buffer[1] & 0xc0) != 0x80)
 670     return 0;
 671
 672   if ((buffer[0] & 0x20) == 0)
 673     return 2;
 674
 675   if (buflen < 3)
 676     return 0;
 677
 678   if ((buffer[2] & 0xc0) != 0x80)
 679     return 0;
 680
 681   if ((buffer[0] & 0x10) == 0)
 682     return 3;
 683
 684   if (buflen < 4)
 685     return 0;
 686
 687   if ((buffer[3] & 0xc0) != 0x80)
 688     return 0;
 689
 690   return 4;
 691 }
 692
 693 /* Display a UTF-8 encoded character in BUFFER according to the setting
 694    of unicode_display.  The character is known to be valid.
 695    Returns the number of bytes consumed.  */
 696
 697 static unsigned int
 698 display_utf8_char (const unsigned char * buffer)
 699 {
 700   unsigned int j;
 701   unsigned int utf8_len;
 702
 703   switch (buffer[0] & 0x30)
 704     {
 705     case 0x00:
 706     case 0x10:
 707       utf8_len = 2;
 708       break;
 709     case 0x20:
 710       utf8_len = 3;
 711       break;
 712     default:
 713       utf8_len = 4;
 714     }
 715
 716   switch (unicode_display)
 717     {
 718     default:
 719       fprintf (stderr, "ICE: unexpected unicode display type\n");
 720       break;
 721
 722     case unicode_escape:
 723     case unicode_highlight:
 724       if (unicode_display == unicode_highlight && isatty (1))
 725         printf ("\x1B[31;47m"); /* Red.  */
 726
 727       switch (utf8_len)
 728         {
 729         case 2:
 730           printf ("\\u%02x%02x",
 731                   ((buffer[0] & 0x1c) >> 2),
 732                   ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
 733           break;
 734
 735         case 3:
 736           printf ("\\u%02x%02x",
 737                   ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
 738                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
 739           break;
 740
 741         case 4:
 742           printf ("\\u%02x%02x%02x",
 743                   ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
 744                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
 745                   ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
 746           break;
 747         default:
 748           /* URG.  */
 749           break;
 750         }
 751
 752       if (unicode_display == unicode_highlight && isatty (1))
 753         printf ("\033[0m"); /* Default colour.  */
 754       break;
 755
 756     case unicode_hex:
 757       putchar ('<');
 758       printf ("0x");
 759       for (j = 0; j < utf8_len; j++)
 760         printf ("%02x", buffer [j]);
 761       putchar ('>');
 762       break;
 763
 764     case unicode_locale:
 765       printf ("%.1s", buffer);
 766       break;
 767     }
 768
 769   return utf8_len;
 770 }
 771
 772 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
 773    according to the setting of the unicode_display variable.  The buffer
 774    contains BUFLEN bytes.
 775
 776    Display the characters as if they started at ADDRESS and are contained in
 777    FILENAME.  */
 778
 779 static void
 780 print_unicode_buffer (const char *            filename,
 781                       file_ptr                address,
 782                       const unsigned char *   buffer,
 783                       unsigned long           buflen)
 784 {
 785   /* Paranoia checks...  */
 786   if (filename == NULL
 787       || buffer == NULL
 788       || unicode_display == unicode_default
 789       || encoding != 'S'
 790       || encoding_bytes != 1)
 791     {
 792       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
 793       return;
 794     }
 795
 796   if (buflen == 0)
 797     return;
 798
 799   /* We must only display strings that are at least string_min *characters*
 800      long.  So we scan the buffer in two stages.  First we locate the start
 801      of a potential string.  Then we walk along it until we have found
 802      string_min characters.  Then we go back to the start point and start
 803      displaying characters according to the unicode_display setting.  */
 804
 805   unsigned long start_point = 0;
 806   unsigned long i = 0;
 807   unsigned int char_len = 1;
 808   unsigned int num_found = 0;
 809
 810   for (i = 0; i < buflen; i += char_len)
 811     {
 812       int c = buffer[i];
 813
 814       char_len = 1;
 815
 816       /* Find the first potential character of a string.  */
 817       if (! STRING_ISGRAPHIC (c))
 818         {
 819           num_found = 0;
 820           continue;
 821         }
 822
 823       if (c > 126)
 824         {
 825           if (c < 0xc0)
 826             {
 827               num_found = 0;
 828               continue;
 829             }
 830
 831           if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
 832             {
 833               char_len = 1;
 834               num_found = 0;
 835               continue;
 836             }
 837
 838           if (unicode_display == unicode_invalid)
 839             {
 840               /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
 841               num_found = 0;
 842               continue;
 843             }
 844         }
 845
 846       if (num_found == 0)
 847         /* We have found a potential starting point for a string.  */
 848         start_point = i;
 849
 850       ++ num_found;
 851
 852       if (num_found >= string_min)
 853         break;
 854     }
 855
 856   if (num_found < string_min)
 857     return;
 858
 859   print_filename_and_address (filename, address + start_point);
 860
 861   /* We have found string_min characters.  Display them and any
 862      more that follow.  */
 863   for (i = start_point; i < buflen; i += char_len)
 864     {
 865       int c = buffer[i];
 866
 867       char_len = 1;
 868
 869       if (! STRING_ISGRAPHIC (c))
 870         break;
 871       else if (c < 127)
 872         putchar (c);
 873       else if (! is_valid_utf8 (buffer + i, buflen - i))
 874         break;
 875       else if (unicode_display == unicode_invalid)
 876         break;
 877       else
 878         char_len = display_utf8_char (buffer + i);
 879     }
 880
 881   if (output_separator)
 882     fputs (output_separator, stdout);
 883   else
 884     putchar ('\n');
 885
 886   /* FIXME: Using tail recursion here is lazy programming...  */
 887   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
 888 }
 889
 890 static int
 891 get_unicode_byte (FILE *          stream,
 892                   unsigned char * putback,
 893                   unsigned int *  num_putback,
 894                   unsigned int *  num_read)
 895 {
 896   if (* num_putback > 0)
 897     {
 898       * num_putback = * num_putback - 1;
 899       return putback [* num_putback];
 900     }
 901
 902   * num_read = * num_read + 1;
 903
 904 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 905   return getc_unlocked (stream);
 906 #else
 907   return getc (stream);
 908 #endif
 909 }
 910
 911 /* Helper function for print_unicode_stream.  */
 912
 913 static void
 914 print_unicode_stream_body (const char *     filename,
 915                            file_ptr         address,
 916                            FILE *           stream,
 917                            unsigned char *  putback_buf,
 918                            unsigned int     num_putback,
 919                            unsigned char *  print_buf)
 920 {
 921   /* It would be nice if we could just read the stream into a buffer
 922      and then process if with print_unicode_buffer.  But the input
 923      might be huge or it might time-locked (eg stdin).  So instead
 924      we go one byte at a time...  */
 925
 926   file_ptr start_point = 0;
 927   unsigned int num_read = 0;
 928   unsigned int num_chars = 0;
 929   unsigned int num_print = 0;
 930   int c = 0;
 931
 932   /* Find a series of string_min characters.  Put them into print_buf.  */
 933   do
 934     {
 935       if (num_chars >= string_min)
 936         break;
 937
 938       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 939       if (c == EOF)
 940         break;
 941
 942       if (! STRING_ISGRAPHIC (c))
 943         {
 944           num_chars = num_print = 0;
 945           continue;
 946         }
 947
 948       if (num_chars == 0)
 949         start_point = num_read - 1;
 950
 951       if (c < 127)
 952         {
 953           print_buf[num_print] = c;
 954           num_chars ++;
 955           num_print ++;
 956           continue;
 957         }
 958
 959       if (c < 0xc0)
 960         {
 961           num_chars = num_print = 0;
 962           continue;
 963         }
 964
 965       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
 966       char utf8[4];
 967
 968       utf8[0] = c;
 969       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 970       if (c == EOF)
 971         break;
 972       utf8[1] = c;
 973
 974       if ((utf8[1] & 0xc0) != 0x80)
 975         {
 976           /* Invalid UTF-8.  */
 977           putback_buf[num_putback++] = utf8[1];
 978           num_chars = num_print = 0;
 979           continue;
 980         }
 981       else if ((utf8[0] & 0x20) == 0)
 982         {
 983           /* A valid 2-byte UTF-8 encoding.  */
 984           if (unicode_display == unicode_invalid)
 985             {
 986               putback_buf[num_putback++] = utf8[1];
 987               num_chars = num_print = 0;
 988             }
 989           else
 990             {
 991               print_buf[num_print ++] = utf8[0];
 992               print_buf[num_print ++] = utf8[1];
 993               num_chars ++;
 994             }
 995           continue;
 996         }
 997
 998       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 999       if (c == EOF)
1000         break;
1001       utf8[2] = c;
1002
1003       if ((utf8[2] & 0xc0) != 0x80)
1004         {
1005           /* Invalid UTF-8.  */
1006           putback_buf[num_putback++] = utf8[2];
1007           putback_buf[num_putback++] = utf8[1];
1008           num_chars = num_print = 0;
1009           continue;
1010         }
1011       else if ((utf8[0] & 0x10) == 0)
1012         {
1013           /* A valid 3-byte UTF-8 encoding.  */
1014           if (unicode_display == unicode_invalid)
1015             {
1016               putback_buf[num_putback++] = utf8[2];
1017               putback_buf[num_putback++] = utf8[1];
1018               num_chars = num_print = 0;
1019             }
1020           else
1021             {
1022               print_buf[num_print ++] = utf8[0];
1023               print_buf[num_print ++] = utf8[1];
1024               print_buf[num_print ++] = utf8[2];
1025               num_chars ++;
1026             }
1027           continue;
1028         }
1029
1030       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1031       if (c == EOF)
1032         break;
1033       utf8[3] = c;
1034
1035       if ((utf8[3] & 0xc0) != 0x80)
1036         {
1037           /* Invalid UTF-8.  */
1038           putback_buf[num_putback++] = utf8[3];
1039           putback_buf[num_putback++] = utf8[2];
1040           putback_buf[num_putback++] = utf8[1];
1041           num_chars = num_print = 0;
1042         }
1043       /* We have a valid 4-byte UTF-8 encoding.  */
1044       else if (unicode_display == unicode_invalid)
1045         {
1046           putback_buf[num_putback++] = utf8[3];
1047           putback_buf[num_putback++] = utf8[1];
1048           putback_buf[num_putback++] = utf8[2];
1049           num_chars = num_print = 0;
1050         }
1051       else
1052         {
1053           print_buf[num_print ++] = utf8[0];
1054           print_buf[num_print ++] = utf8[1];
1055           print_buf[num_print ++] = utf8[2];
1056           print_buf[num_print ++] = utf8[3];
1057           num_chars ++;
1058         }
1059     }
1060   while (1);
1061
1062   if (num_chars >= string_min)
1063     {
1064       /* We know that we have string_min valid characters in print_buf,
1065          and there may be more to come in the stream.  Start displaying
1066          them.  */
1067
1068       print_filename_and_address (filename, address + start_point);
1069
1070       unsigned int i;
1071       for (i = 0; i < num_print;)
1072         {
1073           if (print_buf[i] < 127)
1074             putchar (print_buf[i++]);
1075           else
1076             i += display_utf8_char (print_buf + i);
1077         }
1078
1079       /* OK so now we have to start read unchecked bytes.  */
1080
1081       /* Find a series of string_min characters.  Put them into print_buf.  */
1082       do
1083         {
1084           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1085           if (c == EOF)
1086             break;
1087
1088           if (! STRING_ISGRAPHIC (c))
1089             break;
1090
1091           if (c < 127)
1092             {
1093               putchar (c);
1094               continue;
1095             }
1096
1097           if (c < 0xc0)
1098             break;
1099
1100           /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1101           unsigned char utf8[4];
1102
1103           utf8[0] = c;
1104           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1105           if (c == EOF)
1106             break;
1107           utf8[1] = c;
1108
1109           if ((utf8[1] & 0xc0) != 0x80)
1110             {
1111               /* Invalid UTF-8.  */
1112               putback_buf[num_putback++] = utf8[1];
1113               break;
1114             }
1115           else if ((utf8[0] & 0x20) == 0)
1116             {
1117               /* Valid 2-byte UTF-8.  */
1118               if (unicode_display == unicode_invalid)
1119                 {
1120                   putback_buf[num_putback++] = utf8[1];
1121                   break;
1122                 }
1123               else
1124                 {
1125                   (void) display_utf8_char (utf8);
1126                   continue;
1127                 }
1128             }
1129
1130           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1131           if (c == EOF)
1132             break;
1133           utf8[2] = c;
1134
1135           if ((utf8[2] & 0xc0) != 0x80)
1136             {
1137               /* Invalid UTF-8.  */
1138               putback_buf[num_putback++] = utf8[2];
1139               putback_buf[num_putback++] = utf8[1];
1140               break;
1141             }
1142           else if ((utf8[0] & 0x10) == 0)
1143             {
1144               /* Valid 3-byte UTF-8.  */
1145               if (unicode_display == unicode_invalid)
1146                 {
1147                   putback_buf[num_putback++] = utf8[2];
1148                   putback_buf[num_putback++] = utf8[1];
1149                   break;
1150                 }
1151               else
1152                 {
1153                   (void) display_utf8_char (utf8);
1154                   continue;
1155                 }
1156             }
1157
1158           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1159           if (c == EOF)
1160             break;
1161           utf8[3] = c;
1162
1163           if ((utf8[3] & 0xc0) != 0x80)
1164             {
1165               /* Invalid UTF-8.  */
1166               putback_buf[num_putback++] = utf8[3];
1167               putback_buf[num_putback++] = utf8[2];
1168               putback_buf[num_putback++] = utf8[1];
1169               break;
1170             }
1171           else if (unicode_display == unicode_invalid)
1172             {
1173               putback_buf[num_putback++] = utf8[3];
1174               putback_buf[num_putback++] = utf8[2];
1175               putback_buf[num_putback++] = utf8[1];
1176               break;
1177             }
1178           else
1179             /* A valid 4-byte UTF-8 encoding.  */
1180             (void) display_utf8_char (utf8);
1181         }
1182       while (1);
1183
1184       if (output_separator)
1185         fputs (output_separator, stdout);
1186       else
1187         putchar ('\n');
1188     }
1189
1190   if (c != EOF)
1191     /* FIXME: Using tail recursion here is lazy, but it works.  */
1192     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1193 }
1194
1195 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1196    encountered according to the setting of the unicode_display variable.
1197    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1198
1199 static void
1200 print_unicode_stream (const char * filename,
1201                       file_ptr     address,
1202                       FILE *       stream)
1203 {
1204   /* Paranoia checks...  */
1205   if (filename == NULL
1206       || stream == NULL
1207       || unicode_display == unicode_default
1208       || encoding != 'S'
1209       || encoding_bytes != 1)
1210     {
1211       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1212       return;
1213     }
1214
1215   /* Allocate space for string_min 4-byte utf-8 characters.  */
1216   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1217   /* We should never have to put back more than 4 bytes.  */
1218   unsigned char putback_buf[5];
1219   unsigned int num_putback = 0;
1220
1221   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1222   free (print_buf);
1223 }
1224 \f
1225 /* Find the strings in file FILENAME, read from STREAM.
1226    Assume that STREAM is positioned so that the next byte read
1227    is at address ADDRESS in the file.
1228
1229    If STREAM is NULL, do not read from it.
1230    The caller can supply a buffer of characters
1231    to be processed before the data in STREAM.
1232    MAGIC is the address of the buffer and
1233    MAGICCOUNT is how many characters are in it.
1234    Those characters come at address ADDRESS and the data in STREAM follow.  */
1235
1236 static void
1237 print_strings (const char *filename, FILE *stream, file_ptr address,
1238                int magiccount, char *magic)
1239 {
1240   if (unicode_display != unicode_default)
1241     {
1242       if (magic != NULL)
1243         print_unicode_buffer (filename, address,
1244                               (const unsigned char *) magic, magiccount);
1245
1246       if (stream != NULL)
1247         print_unicode_stream (filename, address, stream);
1248       return;
1249     }
1250
1251   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1252
1253   while (1)
1254     {
1255       file_ptr start;
1256       unsigned int i;
1257       long c;
1258
1259       /* See if the next `string_min' chars are all graphic chars.  */
1260     tryline:
1261       start = address;
1262       for (i = 0; i < string_min; i++)
1263         {
1264           c = get_char (stream, &address, &magiccount, &magic);
1265           if (c == EOF)
1266             {
1267               free (buf);
1268               return;
1269             }
1270
1271           if (! STRING_ISGRAPHIC (c))
1272             {
1273               /* Found a non-graphic.  Try again starting with next byte.  */
1274               unget_part_char (c, &address, &magiccount, &magic);
1275               goto tryline;
1276             }
1277           buf[i] = c;
1278         }
1279
1280       /* We found a run of `string_min' graphic characters.  Print up
1281          to the next non-graphic character.  */
1282       print_filename_and_address (filename, start);
1283
1284       buf[i] = '\0';
1285       fputs (buf, stdout);
1286
1287       while (1)
1288         {
1289           c = get_char (stream, &address, &magiccount, &magic);
1290           if (c == EOF)
1291             break;
1292           if (! STRING_ISGRAPHIC (c))
1293             {
1294               unget_part_char (c, &address, &magiccount, &magic);
1295               break;
1296             }
1297           putchar (c);
1298         }
1299
1300       if (output_separator)
1301         fputs (output_separator, stdout);
1302       else
1303         putchar ('\n');
1304     }
1305   free (buf);
1306 }
1307 \f
1308 static void
1309 usage (FILE *stream, int status)
1310 {
1311   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1312   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1313   fprintf (stream, _(" The options are:\n"));
1314
1315   if (DEFAULT_STRINGS_ALL)
1316     fprintf (stream, _("\
1317   -a - --all                Scan the entire file, not just the data section [default]\n\
1318   -d --data                 Only scan the data sections in the file\n"));
1319   else
1320     fprintf (stream, _("\
1321   -a - --all                Scan the entire file, not just the data section\n\
1322   -d --data                 Only scan the data sections in the file [default]\n"));
1323
1324   fprintf (stream, _("\
1325   -f --print-file-name      Print the name of the file before each string\n\
1326   -n --bytes=[number]       Locate & print any NUL-terminated sequence of at\n\
1327   -<number>                   least [number] characters (default 4).\n\
1328   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1329   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1330   -o                        An alias for --radix=o\n\
1331   -T --target=<BFDNAME>     Specify the binary file format\n\
1332   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1333                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1334   --unicode={default|show|invalid|hex|escape|highlight}\n\
1335   -u {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1336   -s --output-separator=<string> String used to separate strings in output.\n\
1337   @<file>                   Read options from <file>\n\
1338   -h --help                 Display this information\n\
1339   -v -V --version           Print the program's version number\n"));
1340   list_supported_targets (program_name, stream);
1341   if (REPORT_BUGS_TO[0] && status == 0)
1342     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1343   exit (status);
1344 }