gas: Update commit 4780e5e4933
[binutils-gdb.git] / binutils / strings.c
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2021 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18 \f
19 /* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a newline. Default is 4.
36
37 --radix={o,x,d}
38 -t {o,x,d} Print the offset within the file before each string,
39 in octal/hex/decimal.
40
41 --include-all-whitespace
   -w          By default tab and space are the only whitespace included in graphic
43 char sequences. This option considers all of isspace() valid.
44
45 -o Like -to. (Some other implementations have -o like -to,
46 others like -td. We chose one arbitrarily.)
47
48 --encoding={s,S,b,l,B,L}
49 -e {s,S,b,l,B,L}
50 Select character encoding: 7-bit-character, 8-bit-character,
51 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
52 littleendian 32-bit.
53
54 --target=BFDNAME
55 -T {bfdname}
56 Specify a non-default object file format.
57
58 --unicode={default|locale|invalid|hex|escape|highlight}
   -U {d|l|i|x|e|h}
60 Determine how to handle UTF-8 unicode characters. The default
61 is no special treatment. All other versions of this option
62 only apply if the encoding is valid and enabling the option
63 implies --encoding=S.
64 The 'locale' option displays the characters according to the
65 current locale. The 'invalid' option treats them as
66 non-string characters. The 'hex' option displays them as hex
67 byte sequences. The 'escape' option displays them as escape
68 sequences and the 'highlight' option displays them as
69 coloured escape sequences.
70
71 --output-separator=sep_string
72 -s sep_string String used to separate parsed strings in output.
73 Default is newline.
74
75 --help
76 -h Print the usage message on the standard output.
77
78 --version
79 -V
80 -v Print the program version number.
81
82 Written by Richard Stallman <rms@gnu.ai.mit.edu>
83 and David MacKenzie <djm@gnu.ai.mit.edu>. */
84
85 #include "sysdep.h"
86 #include "bfd.h"
87 #include "getopt.h"
88 #include "libiberty.h"
89 #include "safe-ctype.h"
90 #include "bucomm.h"
91
/* String equality helper: non-zero when A and B compare equal under
   strcmp.  Left alone if a header has already provided one.  */
#if !defined (streq)
# define streq(a, b) (0 == strcmp ((a), (b)))
#endif
95
/* The ways in which a UTF-8 encoded character found in the input can be
   treated.  Enumerators take consecutive values starting at zero.  */
enum unicode_display_type
{
  unicode_default,      /* No special treatment; the initial setting.  */
  unicode_locale,       /* Selected by --unicode=locale.  */
  unicode_escape,       /* Selected by --unicode=escape.  */
  unicode_hex,          /* Selected by --unicode=hex.  */
  unicode_highlight,    /* Selected by --unicode=highlight.  */
  unicode_invalid       /* Selected by --unicode=invalid.  */
};

typedef enum unicode_display_type unicode_display_type;

/* The treatment currently in effect.  */
static unicode_display_type unicode_display = unicode_default;
107
/* Non-zero when C counts as a character of a string.  C must lie in the
   range 0..255 and additionally be a tab, satisfy ISPRINT, be greater
   than 127 while the selected encoding is 'S', or satisfy ISSPACE while
   include_all_whitespace is set.

   C is expanded several times, so the argument must be free of side
   effects.  The macro reads the file-scope variables `encoding' and
   `include_all_whitespace'.  */
#define STRING_ISGRAPHIC(c) \
      (   (c) >= 0 \
       && (c) <= 255 \
       && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
           || (include_all_whitespace && ISSPACE (c))) \
      )
114
/* Declare errno ourselves when it is not provided as a macro.  */
#ifndef errno
extern int errno;
#endif

/* The BFD section flags that identify an initialized data section.
   strings_a_section skips any section lacking one of these bits.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Radix for printing addresses (must be 8, 10 or 16).  Assigned by the
   -o and -t options in main and only consulted when print_addresses
   is set.  */
static int address_radix;

/* Minimum length of sequence of graphic chars to trigger output.
   main starts it at 4 and rejects values below 1.  */
static unsigned int string_min;

/* Whether or not we include all whitespace as a graphic char.
   Set by -w; read by STRING_ISGRAPHIC.  */
static bool include_all_whitespace;

/* TRUE means print address within file for each string.  */
static bool print_addresses;

/* TRUE means print filename for each string.  */
static bool print_filenames;

/* TRUE means for object files scan only the data section.  */
static bool datasection_only;

/* The BFD object file format, as given to -T, or NULL when the option
   was not used.  */
static char *target;

/* The character encoding format: the letter given to -e, 's' when the
   option is absent.  */
static char encoding;

/* Number of input bytes that make up one character in the selected
   encoding; main sets it to 1, 2 or 4.  */
static int encoding_bytes;

/* Output string used to separate parsed strings.  NULL means that a
   newline is written after each string.  */
static char *output_separator;
149
/* Long option table handed to getopt_long.  Every entry maps a long
   name onto the short option letter that main's switch handles; the
   all-zero entry terminates the table.  */
static struct option long_options[] =
{
  { .name = "all",                    .has_arg = no_argument,       .flag = NULL, .val = 'a' },
  { .name = "bytes",                  .has_arg = required_argument, .flag = NULL, .val = 'n' },
  { .name = "data",                   .has_arg = no_argument,       .flag = NULL, .val = 'd' },
  { .name = "encoding",               .has_arg = required_argument, .flag = NULL, .val = 'e' },
  { .name = "help",                   .has_arg = no_argument,       .flag = NULL, .val = 'h' },
  { .name = "include-all-whitespace", .has_arg = no_argument,       .flag = NULL, .val = 'w' },
  { .name = "output-separator",       .has_arg = required_argument, .flag = NULL, .val = 's' },
  { .name = "print-file-name",        .has_arg = no_argument,       .flag = NULL, .val = 'f' },
  { .name = "radix",                  .has_arg = required_argument, .flag = NULL, .val = 't' },
  { .name = "target",                 .has_arg = required_argument, .flag = NULL, .val = 'T' },
  { .name = "unicode",                .has_arg = required_argument, .flag = NULL, .val = 'U' },
  { .name = "version",                .has_arg = no_argument,       .flag = NULL, .val = 'v' },
  { .name = NULL,                     .has_arg = 0,                 .flag = NULL, .val = 0 }
};
166
167 static bool strings_file (char *);
168 static void print_strings (const char *, FILE *, file_ptr, int, char *);
169 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
170 \f
171 int main (int, char **);
172
173 int
174 main (int argc, char **argv)
175 {
176 int optc;
177 int exit_status = 0;
178 bool files_given = false;
179 char *s;
180 int numeric_opt = 0;
181
182 setlocale (LC_ALL, "");
183 bindtextdomain (PACKAGE, LOCALEDIR);
184 textdomain (PACKAGE);
185
186 program_name = argv[0];
187 xmalloc_set_program_name (program_name);
188 bfd_set_error_program_name (program_name);
189
190 expandargv (&argc, &argv);
191
192 string_min = 4;
193 include_all_whitespace = false;
194 print_addresses = false;
195 print_filenames = false;
196 if (DEFAULT_STRINGS_ALL)
197 datasection_only = false;
198 else
199 datasection_only = true;
200 target = NULL;
201 encoding = 's';
202 output_separator = NULL;
203
204 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
205 long_options, (int *) 0)) != EOF)
206 {
207 switch (optc)
208 {
209 case 'a':
210 datasection_only = false;
211 break;
212
213 case 'd':
214 datasection_only = true;
215 break;
216
217 case 'f':
218 print_filenames = true;
219 break;
220
221 case 'H':
222 case 'h':
223 usage (stdout, 0);
224
225 case 'n':
226 string_min = (int) strtoul (optarg, &s, 0);
227 if (s != NULL && *s != 0)
228 fatal (_("invalid integer argument %s"), optarg);
229 break;
230
231 case 'w':
232 include_all_whitespace = true;
233 break;
234
235 case 'o':
236 print_addresses = true;
237 address_radix = 8;
238 break;
239
240 case 't':
241 print_addresses = true;
242 if (optarg[1] != '\0')
243 usage (stderr, 1);
244 switch (optarg[0])
245 {
246 case 'o':
247 address_radix = 8;
248 break;
249
250 case 'd':
251 address_radix = 10;
252 break;
253
254 case 'x':
255 address_radix = 16;
256 break;
257
258 default:
259 usage (stderr, 1);
260 }
261 break;
262
263 case 'T':
264 target = optarg;
265 break;
266
267 case 'e':
268 if (optarg[1] != '\0')
269 usage (stderr, 1);
270 encoding = optarg[0];
271 break;
272
273 case 's':
274 output_separator = optarg;
275 break;
276
277 case 'U':
278 if (streq (optarg, "default") || streq (optarg, "d"))
279 unicode_display = unicode_default;
280 else if (streq (optarg, "locale") || streq (optarg, "l"))
281 unicode_display = unicode_locale;
282 else if (streq (optarg, "escape") || streq (optarg, "e"))
283 unicode_display = unicode_escape;
284 else if (streq (optarg, "invalid") || streq (optarg, "i"))
285 unicode_display = unicode_invalid;
286 else if (streq (optarg, "hex") || streq (optarg, "x"))
287 unicode_display = unicode_hex;
288 else if (streq (optarg, "highlight") || streq (optarg, "h"))
289 unicode_display = unicode_highlight;
290 else
291 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
292 break;
293
294 case 'V':
295 case 'v':
296 print_version ("strings");
297 break;
298
299 case '?':
300 usage (stderr, 1);
301
302 default:
303 numeric_opt = optind;
304 break;
305 }
306 }
307
308 if (unicode_display != unicode_default)
309 encoding = 'S';
310
311 if (numeric_opt != 0)
312 {
313 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
314 if (s != NULL && *s != 0)
315 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
316 }
317 if (string_min < 1)
318 fatal (_("invalid minimum string length %d"), string_min);
319
320 switch (encoding)
321 {
322 case 'S':
323 case 's':
324 encoding_bytes = 1;
325 break;
326 case 'b':
327 case 'l':
328 encoding_bytes = 2;
329 break;
330 case 'B':
331 case 'L':
332 encoding_bytes = 4;
333 break;
334 default:
335 usage (stderr, 1);
336 }
337
338 if (bfd_init () != BFD_INIT_MAGIC)
339 fatal (_("fatal error: libbfd ABI mismatch"));
340 set_default_bfd_target ();
341
342 if (optind >= argc)
343 {
344 datasection_only = false;
345 SET_BINARY (fileno (stdin));
346 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
347 files_given = true;
348 }
349 else
350 {
351 for (; optind < argc; ++optind)
352 {
353 if (streq (argv[optind], "-"))
354 datasection_only = false;
355 else
356 {
357 files_given = true;
358 exit_status |= !strings_file (argv[optind]);
359 }
360 }
361 }
362
363 if (!files_given)
364 usage (stderr, 1);
365
366 return (exit_status);
367 }
368 \f
369 /* Scan section SECT of the file ABFD, whose printable name is
370 FILENAME. If it contains initialized data set GOT_A_SECTION and
371 print the strings in it. */
372
373 static void
374 strings_a_section (bfd *abfd, asection *sect, const char *filename,
375 bool *got_a_section)
376 {
377 bfd_size_type sectsize;
378 bfd_byte *mem;
379
380 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
381 return;
382
383 sectsize = bfd_section_size (sect);
384 if (sectsize == 0)
385 return;
386
387 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
388 {
389 non_fatal (_("%s: Reading section %s failed: %s"),
390 filename, sect->name, bfd_errmsg (bfd_get_error ()));
391 return;
392 }
393
394 *got_a_section = true;
395 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
396 free (mem);
397 }
398
399 /* Scan all of the sections in FILE, and print the strings
400 in the initialized data section(s).
401
402 Return TRUE if successful,
403 FALSE if not (such as if FILE is not an object file). */
404
405 static bool
406 strings_object_file (const char *file)
407 {
408 bfd *abfd;
409 asection *s;
410 bool got_a_section;
411
412 abfd = bfd_openr (file, target);
413
414 if (abfd == NULL)
415 /* Treat the file as a non-object file. */
416 return false;
417
418 /* This call is mainly for its side effect of reading in the sections.
419 We follow the traditional behavior of `strings' in that we don't
420 complain if we don't recognize a file to be an object file. */
421 if (!bfd_check_format (abfd, bfd_object))
422 {
423 bfd_close (abfd);
424 return false;
425 }
426
427 got_a_section = false;
428 for (s = abfd->sections; s != NULL; s = s->next)
429 strings_a_section (abfd, s, file, &got_a_section);
430
431 if (!bfd_close (abfd))
432 {
433 bfd_nonfatal (file);
434 return false;
435 }
436
437 return got_a_section;
438 }
439
440 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
441
442 static bool
443 strings_file (char *file)
444 {
445 struct stat st;
446
447 /* get_file_size does not support non-S_ISREG files. */
448
449 if (stat (file, &st) < 0)
450 {
451 if (errno == ENOENT)
452 non_fatal (_("'%s': No such file"), file);
453 else
454 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
455 file, strerror (errno));
456 return false;
457 }
458 else if (S_ISDIR (st.st_mode))
459 {
460 non_fatal (_("Warning: '%s' is a directory"), file);
461 return false;
462 }
463
464 /* If we weren't told to scan the whole file,
465 try to open it as an object file and only look at
466 initialized data sections. If that fails, fall back to the
467 whole file. */
468 if (!datasection_only || !strings_object_file (file))
469 {
470 FILE *stream;
471
472 stream = fopen (file, FOPEN_RB);
473 if (stream == NULL)
474 {
475 fprintf (stderr, "%s: ", program_name);
476 perror (file);
477 return false;
478 }
479
480 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
481
482 if (fclose (stream) == EOF)
483 {
484 fprintf (stderr, "%s: ", program_name);
485 perror (file);
486 return false;
487 }
488 }
489
490 return true;
491 }
492 \f
493 /* Read the next character, return EOF if none available.
494 Assume that STREAM is positioned so that the next byte read
495 is at address ADDRESS in the file.
496
497 If STREAM is NULL, do not read from it.
498 The caller can supply a buffer of characters
499 to be processed before the data in STREAM.
500 MAGIC is the address of the buffer and
501 MAGICCOUNT is how many characters are in it. */
502
503 static long
504 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
505 {
506 int c, i;
507 long r = 0;
508
509 for (i = 0; i < encoding_bytes; i++)
510 {
511 if (*magiccount)
512 {
513 (*magiccount)--;
514 c = *(*magic)++;
515 }
516 else
517 {
518 if (stream == NULL)
519 return EOF;
520
521 /* Only use getc_unlocked if we found a declaration for it.
522 Otherwise, libc is not thread safe by default, and we
523 should not use it. */
524
525 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
526 c = getc_unlocked (stream);
527 #else
528 c = getc (stream);
529 #endif
530 if (c == EOF)
531 return EOF;
532 }
533
534 (*address)++;
535 r = (r << 8) | (c & 0xff);
536 }
537
538 switch (encoding)
539 {
540 default:
541 break;
542 case 'l':
543 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
544 break;
545 case 'L':
546 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
547 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
548 break;
549 }
550
551 return r;
552 }
553
554 /* Throw away one byte of a (possibly) multi-byte char C, updating
555 address and buffer to suit. */
556
557 static void
558 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
559 {
560 static char tmp[4];
561
562 if (encoding_bytes > 1)
563 {
564 *address -= encoding_bytes - 1;
565
566 if (*magiccount == 0)
567 {
568 /* If no magic buffer exists, use temp buffer. */
569 switch (encoding)
570 {
571 default:
572 break;
573 case 'b':
574 tmp[0] = c & 0xff;
575 *magiccount = 1;
576 break;
577 case 'l':
578 tmp[0] = (c >> 8) & 0xff;
579 *magiccount = 1;
580 break;
581 case 'B':
582 tmp[0] = (c >> 16) & 0xff;
583 tmp[1] = (c >> 8) & 0xff;
584 tmp[2] = c & 0xff;
585 *magiccount = 3;
586 break;
587 case 'L':
588 tmp[0] = (c >> 8) & 0xff;
589 tmp[1] = (c >> 16) & 0xff;
590 tmp[2] = (c >> 24) & 0xff;
591 *magiccount = 3;
592 break;
593 }
594 *magic = tmp;
595 }
596 else
597 {
598 /* If magic buffer exists, rewind. */
599 *magic -= encoding_bytes - 1;
600 *magiccount += encoding_bytes - 1;
601 }
602 }
603 }
604
605 static void
606 print_filename_and_address (const char * filename, file_ptr address)
607 {
608 if (print_filenames)
609 printf ("%s: ", filename);
610
611 if (! print_addresses)
612 return;
613
614 switch (address_radix)
615 {
616 case 8:
617 if (sizeof (address) > sizeof (long))
618 {
619 #ifndef __MSVCRT__
620 printf ("%7llo ", (unsigned long long) address);
621 #else
622 printf ("%7I64o ", (unsigned long long) address);
623 #endif
624 }
625 else
626 printf ("%7lo ", (unsigned long) address);
627 break;
628
629 case 10:
630 if (sizeof (address) > sizeof (long))
631 {
632 #ifndef __MSVCRT__
633 printf ("%7llu ", (unsigned long long) address);
634 #else
635 printf ("%7I64d ", (unsigned long long) address);
636 #endif
637 }
638 else
639 printf ("%7ld ", (long) address);
640 break;
641
642 case 16:
643 if (sizeof (address) > sizeof (long))
644 {
645 #ifndef __MSVCRT__
646 printf ("%7llx ", (unsigned long long) address);
647 #else
648 printf ("%7I64x ", (unsigned long long) address);
649 #endif
650 }
651 else
652 printf ("%7lx ", (unsigned long) address);
653 break;
654 }
655 }
656
/* Return non-zero if the bytes starting at BUFFER form a valid
   multi-byte UTF-8 encoding.  If the encoding is valid then returns the
   number of bytes it uses (2, 3 or 4).  BUFLEN is the number of bytes
   available at BUFFER; nothing beyond that is examined.

   Only the structure of the sequence is checked: a lead byte in the
   range 0xc0..0xf7 followed by the number of continuation bytes
   (10xxxxxx) that it announces.  Single byte characters, stray
   continuation bytes and the bytes 0xf8..0xff, which cannot start a
   sequence of at most four bytes, all yield zero.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  /* Every encoding accepted here needs at least two bytes.  Checking
     this first also keeps us from reading an empty buffer.  */
  if (buflen < 2)
    return 0;

  if (buffer[0] < 0xc0 || buffer[0] > 0xf7)
    return 0;

  if ((buffer[1] & 0xc0) != 0x80)
    return 0;

  /* 110xxxxx: a two byte sequence.  */
  if ((buffer[0] & 0x20) == 0)
    return 2;

  if (buflen < 3)
    return 0;

  if ((buffer[2] & 0xc0) != 0x80)
    return 0;

  /* 1110xxxx: a three byte sequence.  */
  if ((buffer[0] & 0x10) == 0)
    return 3;

  if (buflen < 4)
    return 0;

  if ((buffer[3] & 0xc0) != 0x80)
    return 0;

  /* 11110xxx: a four byte sequence.  */
  return 4;
}
692
693 /* Display a UTF-8 encoded character in BUFFER according to the setting
694 of unicode_display. The character is known to be valid.
695 Returns the number of bytes consumed. */
696
697 static unsigned int
698 display_utf8_char (const unsigned char * buffer)
699 {
700 unsigned int j;
701 unsigned int utf8_len;
702
703 switch (buffer[0] & 0x30)
704 {
705 case 0x00:
706 case 0x10:
707 utf8_len = 2;
708 break;
709 case 0x20:
710 utf8_len = 3;
711 break;
712 default:
713 utf8_len = 4;
714 }
715
716 switch (unicode_display)
717 {
718 default:
719 fprintf (stderr, "ICE: unexpected unicode display type\n");
720 break;
721
722 case unicode_escape:
723 case unicode_highlight:
724 if (unicode_display == unicode_highlight && isatty (1))
725 printf ("\x1B[31;47m"); /* Red. */
726
727 switch (utf8_len)
728 {
729 case 2:
730 printf ("\\u%02x%02x",
731 ((buffer[0] & 0x1c) >> 2),
732 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
733 break;
734
735 case 3:
736 printf ("\\u%02x%02x",
737 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
738 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
739 break;
740
741 case 4:
742 printf ("\\u%02x%02x%02x",
743 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
744 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
745 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
746 break;
747 default:
748 /* URG. */
749 break;
750 }
751
752 if (unicode_display == unicode_highlight && isatty (1))
753 printf ("\033[0m"); /* Default colour. */
754 break;
755
756 case unicode_hex:
757 putchar ('<');
758 printf ("0x");
759 for (j = 0; j < utf8_len; j++)
760 printf ("%02x", buffer [j]);
761 putchar ('>');
762 break;
763
764 case unicode_locale:
765 printf ("%.1s", buffer);
766 break;
767 }
768
769 return utf8_len;
770 }
771
772 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
773 according to the setting of the unicode_display variable. The buffer
774 contains BUFLEN bytes.
775
776 Display the characters as if they started at ADDRESS and are contained in
777 FILENAME. */
778
779 static void
780 print_unicode_buffer (const char * filename,
781 file_ptr address,
782 const unsigned char * buffer,
783 unsigned long buflen)
784 {
785 /* Paranoia checks... */
786 if (filename == NULL
787 || buffer == NULL
788 || unicode_display == unicode_default
789 || encoding != 'S'
790 || encoding_bytes != 1)
791 {
792 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
793 return;
794 }
795
796 if (buflen == 0)
797 return;
798
799 /* We must only display strings that are at least string_min *characters*
800 long. So we scan the buffer in two stages. First we locate the start
801 of a potential string. Then we walk along it until we have found
802 string_min characters. Then we go back to the start point and start
803 displaying characters according to the unicode_display setting. */
804
805 unsigned long start_point = 0;
806 unsigned long i = 0;
807 unsigned int char_len = 1;
808 unsigned int num_found = 0;
809
810 for (i = 0; i < buflen; i += char_len)
811 {
812 int c = buffer[i];
813
814 char_len = 1;
815
816 /* Find the first potential character of a string. */
817 if (! STRING_ISGRAPHIC (c))
818 {
819 num_found = 0;
820 continue;
821 }
822
823 if (c > 126)
824 {
825 if (c < 0xc0)
826 {
827 num_found = 0;
828 continue;
829 }
830
831 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
832 {
833 char_len = 1;
834 num_found = 0;
835 continue;
836 }
837
838 if (unicode_display == unicode_invalid)
839 {
840 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
841 num_found = 0;
842 continue;
843 }
844 }
845
846 if (num_found == 0)
847 /* We have found a potential starting point for a string. */
848 start_point = i;
849
850 ++ num_found;
851
852 if (num_found >= string_min)
853 break;
854 }
855
856 if (num_found < string_min)
857 return;
858
859 print_filename_and_address (filename, address + start_point);
860
861 /* We have found string_min characters. Display them and any
862 more that follow. */
863 for (i = start_point; i < buflen; i += char_len)
864 {
865 int c = buffer[i];
866
867 char_len = 1;
868
869 if (! STRING_ISGRAPHIC (c))
870 break;
871 else if (c < 127)
872 putchar (c);
873 else if (! is_valid_utf8 (buffer + i, buflen - i))
874 break;
875 else if (unicode_display == unicode_invalid)
876 break;
877 else
878 char_len = display_utf8_char (buffer + i);
879 }
880
881 if (output_separator)
882 fputs (output_separator, stdout);
883 else
884 putchar ('\n');
885
886 /* FIXME: Using tail recursion here is lazy programming... */
887 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
888 }
889
/* Return the next input byte for the unicode stream scanner.

   Bytes that were put back are handed out first, last in first out:
   PUTBACK is used as a stack and *NUM_PUTBACK is its depth.  Only when
   the stack is empty is a byte read from STREAM, in which case
   *NUM_READ is incremented (also when the read yields EOF) and the
   result of getc is returned.  */

static int
get_unicode_byte (FILE *           stream,
                  unsigned char *  putback,
                  unsigned int *   num_putback,
                  unsigned int *   num_read)
{
  int byte;

  if (*num_putback != 0)
    {
      unsigned int top = *num_putback - 1;

      *num_putback = top;
      return putback[top];
    }

  ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  byte = getc_unlocked (stream);
#else
  byte = getc (stream);
#endif

  return byte;
}
910
911 /* Helper function for print_unicode_stream. */
912
913 static void
914 print_unicode_stream_body (const char * filename,
915 file_ptr address,
916 FILE * stream,
917 unsigned char * putback_buf,
918 unsigned int num_putback,
919 unsigned char * print_buf)
920 {
921 /* It would be nice if we could just read the stream into a buffer
922 and then process if with print_unicode_buffer. But the input
923 might be huge or it might time-locked (eg stdin). So instead
924 we go one byte at a time... */
925
926 file_ptr start_point = 0;
927 unsigned int num_read = 0;
928 unsigned int num_chars = 0;
929 unsigned int num_print = 0;
930 int c = 0;
931
932 /* Find a series of string_min characters. Put them into print_buf. */
933 do
934 {
935 if (num_chars >= string_min)
936 break;
937
938 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
939 if (c == EOF)
940 break;
941
942 if (! STRING_ISGRAPHIC (c))
943 {
944 num_chars = num_print = 0;
945 continue;
946 }
947
948 if (num_chars == 0)
949 start_point = num_read - 1;
950
951 if (c < 127)
952 {
953 print_buf[num_print] = c;
954 num_chars ++;
955 num_print ++;
956 continue;
957 }
958
959 if (c < 0xc0)
960 {
961 num_chars = num_print = 0;
962 continue;
963 }
964
965 /* We *might* have a UTF-8 sequence. Time to start peeking. */
966 char utf8[4];
967
968 utf8[0] = c;
969 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
970 if (c == EOF)
971 break;
972 utf8[1] = c;
973
974 if ((utf8[1] & 0xc0) != 0x80)
975 {
976 /* Invalid UTF-8. */
977 putback_buf[num_putback++] = utf8[1];
978 num_chars = num_print = 0;
979 continue;
980 }
981 else if ((utf8[0] & 0x20) == 0)
982 {
983 /* A valid 2-byte UTF-8 encoding. */
984 if (unicode_display == unicode_invalid)
985 {
986 putback_buf[num_putback++] = utf8[1];
987 num_chars = num_print = 0;
988 }
989 else
990 {
991 print_buf[num_print ++] = utf8[0];
992 print_buf[num_print ++] = utf8[1];
993 num_chars ++;
994 }
995 continue;
996 }
997
998 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
999 if (c == EOF)
1000 break;
1001 utf8[2] = c;
1002
1003 if ((utf8[2] & 0xc0) != 0x80)
1004 {
1005 /* Invalid UTF-8. */
1006 putback_buf[num_putback++] = utf8[2];
1007 putback_buf[num_putback++] = utf8[1];
1008 num_chars = num_print = 0;
1009 continue;
1010 }
1011 else if ((utf8[0] & 0x10) == 0)
1012 {
1013 /* A valid 3-byte UTF-8 encoding. */
1014 if (unicode_display == unicode_invalid)
1015 {
1016 putback_buf[num_putback++] = utf8[2];
1017 putback_buf[num_putback++] = utf8[1];
1018 num_chars = num_print = 0;
1019 }
1020 else
1021 {
1022 print_buf[num_print ++] = utf8[0];
1023 print_buf[num_print ++] = utf8[1];
1024 print_buf[num_print ++] = utf8[2];
1025 num_chars ++;
1026 }
1027 continue;
1028 }
1029
1030 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1031 if (c == EOF)
1032 break;
1033 utf8[3] = c;
1034
1035 if ((utf8[3] & 0xc0) != 0x80)
1036 {
1037 /* Invalid UTF-8. */
1038 putback_buf[num_putback++] = utf8[3];
1039 putback_buf[num_putback++] = utf8[2];
1040 putback_buf[num_putback++] = utf8[1];
1041 num_chars = num_print = 0;
1042 }
1043 /* We have a valid 4-byte UTF-8 encoding. */
1044 else if (unicode_display == unicode_invalid)
1045 {
1046 putback_buf[num_putback++] = utf8[3];
1047 putback_buf[num_putback++] = utf8[1];
1048 putback_buf[num_putback++] = utf8[2];
1049 num_chars = num_print = 0;
1050 }
1051 else
1052 {
1053 print_buf[num_print ++] = utf8[0];
1054 print_buf[num_print ++] = utf8[1];
1055 print_buf[num_print ++] = utf8[2];
1056 print_buf[num_print ++] = utf8[3];
1057 num_chars ++;
1058 }
1059 }
1060 while (1);
1061
1062 if (num_chars >= string_min)
1063 {
1064 /* We know that we have string_min valid characters in print_buf,
1065 and there may be more to come in the stream. Start displaying
1066 them. */
1067
1068 print_filename_and_address (filename, address + start_point);
1069
1070 unsigned int i;
1071 for (i = 0; i < num_print;)
1072 {
1073 if (print_buf[i] < 127)
1074 putchar (print_buf[i++]);
1075 else
1076 i += display_utf8_char (print_buf + i);
1077 }
1078
1079 /* OK so now we have to start read unchecked bytes. */
1080
1081 /* Find a series of string_min characters. Put them into print_buf. */
1082 do
1083 {
1084 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1085 if (c == EOF)
1086 break;
1087
1088 if (! STRING_ISGRAPHIC (c))
1089 break;
1090
1091 if (c < 127)
1092 {
1093 putchar (c);
1094 continue;
1095 }
1096
1097 if (c < 0xc0)
1098 break;
1099
1100 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1101 unsigned char utf8[4];
1102
1103 utf8[0] = c;
1104 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1105 if (c == EOF)
1106 break;
1107 utf8[1] = c;
1108
1109 if ((utf8[1] & 0xc0) != 0x80)
1110 {
1111 /* Invalid UTF-8. */
1112 putback_buf[num_putback++] = utf8[1];
1113 break;
1114 }
1115 else if ((utf8[0] & 0x20) == 0)
1116 {
1117 /* Valid 2-byte UTF-8. */
1118 if (unicode_display == unicode_invalid)
1119 {
1120 putback_buf[num_putback++] = utf8[1];
1121 break;
1122 }
1123 else
1124 {
1125 (void) display_utf8_char (utf8);
1126 continue;
1127 }
1128 }
1129
1130 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1131 if (c == EOF)
1132 break;
1133 utf8[2] = c;
1134
1135 if ((utf8[2] & 0xc0) != 0x80)
1136 {
1137 /* Invalid UTF-8. */
1138 putback_buf[num_putback++] = utf8[2];
1139 putback_buf[num_putback++] = utf8[1];
1140 break;
1141 }
1142 else if ((utf8[0] & 0x10) == 0)
1143 {
1144 /* Valid 3-byte UTF-8. */
1145 if (unicode_display == unicode_invalid)
1146 {
1147 putback_buf[num_putback++] = utf8[2];
1148 putback_buf[num_putback++] = utf8[1];
1149 break;
1150 }
1151 else
1152 {
1153 (void) display_utf8_char (utf8);
1154 continue;
1155 }
1156 }
1157
1158 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1159 if (c == EOF)
1160 break;
1161 utf8[3] = c;
1162
1163 if ((utf8[3] & 0xc0) != 0x80)
1164 {
1165 /* Invalid UTF-8. */
1166 putback_buf[num_putback++] = utf8[3];
1167 putback_buf[num_putback++] = utf8[2];
1168 putback_buf[num_putback++] = utf8[1];
1169 break;
1170 }
1171 else if (unicode_display == unicode_invalid)
1172 {
1173 putback_buf[num_putback++] = utf8[3];
1174 putback_buf[num_putback++] = utf8[2];
1175 putback_buf[num_putback++] = utf8[1];
1176 break;
1177 }
1178 else
1179 /* A valid 4-byte UTF-8 encoding. */
1180 (void) display_utf8_char (utf8);
1181 }
1182 while (1);
1183
1184 if (output_separator)
1185 fputs (output_separator, stdout);
1186 else
1187 putchar ('\n');
1188 }
1189
1190 if (c != EOF)
1191 /* FIXME: Using tail recursion here is lazy, but it works. */
1192 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1193 }
1194
1195 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1196 encountered according to the setting of the unicode_display variable.
1197 The stream is positioned at ADDRESS and is attached to FILENAME. */
1198
1199 static void
1200 print_unicode_stream (const char * filename,
1201 file_ptr address,
1202 FILE * stream)
1203 {
1204 /* Paranoia checks... */
1205 if (filename == NULL
1206 || stream == NULL
1207 || unicode_display == unicode_default
1208 || encoding != 'S'
1209 || encoding_bytes != 1)
1210 {
1211 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1212 return;
1213 }
1214
1215 /* Allocate space for string_min 4-byte utf-8 characters. */
1216 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1217 /* We should never have to put back more than 4 bytes. */
1218 unsigned char putback_buf[5];
1219 unsigned int num_putback = 0;
1220
1221 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1222 free (print_buf);
1223 }
1224 \f
1225 /* Find the strings in file FILENAME, read from STREAM.
1226 Assume that STREAM is positioned so that the next byte read
1227 is at address ADDRESS in the file.
1228
1229 If STREAM is NULL, do not read from it.
1230 The caller can supply a buffer of characters
1231 to be processed before the data in STREAM.
1232 MAGIC is the address of the buffer and
1233 MAGICCOUNT is how many characters are in it.
1234 Those characters come at address ADDRESS and the data in STREAM follow. */
1235
1236 static void
1237 print_strings (const char *filename, FILE *stream, file_ptr address,
1238 int magiccount, char *magic)
1239 {
1240 if (unicode_display != unicode_default)
1241 {
1242 if (magic != NULL)
1243 print_unicode_buffer (filename, address,
1244 (const unsigned char *) magic, magiccount);
1245
1246 if (stream != NULL)
1247 print_unicode_stream (filename, address, stream);
1248 return;
1249 }
1250
1251 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1252
1253 while (1)
1254 {
1255 file_ptr start;
1256 unsigned int i;
1257 long c;
1258
1259 /* See if the next `string_min' chars are all graphic chars. */
1260 tryline:
1261 start = address;
1262 for (i = 0; i < string_min; i++)
1263 {
1264 c = get_char (stream, &address, &magiccount, &magic);
1265 if (c == EOF)
1266 {
1267 free (buf);
1268 return;
1269 }
1270
1271 if (! STRING_ISGRAPHIC (c))
1272 {
1273 /* Found a non-graphic. Try again starting with next byte. */
1274 unget_part_char (c, &address, &magiccount, &magic);
1275 goto tryline;
1276 }
1277 buf[i] = c;
1278 }
1279
1280 /* We found a run of `string_min' graphic characters. Print up
1281 to the next non-graphic character. */
1282 print_filename_and_address (filename, start);
1283
1284 buf[i] = '\0';
1285 fputs (buf, stdout);
1286
1287 while (1)
1288 {
1289 c = get_char (stream, &address, &magiccount, &magic);
1290 if (c == EOF)
1291 break;
1292 if (! STRING_ISGRAPHIC (c))
1293 {
1294 unget_part_char (c, &address, &magiccount, &magic);
1295 break;
1296 }
1297 putchar (c);
1298 }
1299
1300 if (output_separator)
1301 fputs (output_separator, stdout);
1302 else
1303 putchar ('\n');
1304 }
1305 free (buf);
1306 }
1307 \f
1308 static void
1309 usage (FILE *stream, int status)
1310 {
1311 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1312 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1313 fprintf (stream, _(" The options are:\n"));
1314
1315 if (DEFAULT_STRINGS_ALL)
1316 fprintf (stream, _("\
1317 -a - --all Scan the entire file, not just the data section [default]\n\
1318 -d --data Only scan the data sections in the file\n"));
1319 else
1320 fprintf (stream, _("\
1321 -a - --all Scan the entire file, not just the data section\n\
1322 -d --data Only scan the data sections in the file [default]\n"));
1323
1324 fprintf (stream, _("\
1325 -f --print-file-name Print the name of the file before each string\n\
1326 -n --bytes=[number] Locate & print any NUL-terminated sequence of at\n\
1327 -<number> least [number] characters (default 4).\n\
1328 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1329 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1330 -o An alias for --radix=o\n\
1331 -T --target=<BFDNAME> Specify the binary file format\n\
1332 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1333 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1334 --unicode={default|show|invalid|hex|escape|highlight}\n\
1335 -u {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1336 -s --output-separator=<string> String used to separate strings in output.\n\
1337 @<file> Read options from <file>\n\
1338 -h --help Display this information\n\
1339 -v -V --version Print the program's version number\n"));
1340 list_supported_targets (program_name, stream);
1341 if (REPORT_BUGS_TO[0] && status == 0)
1342 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1343 exit (status);
1344 }