Re: Add --unicode option
[binutils-gdb.git] / binutils / strings.c
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2021 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18 \f
19 /* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a newline. Default is 4.
36
37 --radix={o,x,d}
38 -t {o,x,d} Print the offset within the file before each string,
39 in octal/hex/decimal.
40
41 --include-all-whitespace
42 -w By default tab and space are the only whitespace included in graphic
43 char sequences. This option considers all of isspace() valid.
44
45 -o Like -to. (Some other implementations have -o like -to,
46 others like -td. We chose one arbitrarily.)
47
48 --encoding={s,S,b,l,B,L}
49 -e {s,S,b,l,B,L}
50 Select character encoding: 7-bit-character, 8-bit-character,
51 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
52 littleendian 32-bit.
53
54 --target=BFDNAME
55 -T {bfdname}
56 Specify a non-default object file format.
57
58 --unicode={default|locale|invalid|hex|escape|highlight}
59 -U {d|l|i|x|e|h}
60 Determine how to handle UTF-8 unicode characters. The default
61 is no special treatment. All other versions of this option
62 only apply if the encoding is valid and enabling the option
63 implies --encoding=S.
64 The 'locale' option displays the characters according to the
65 current locale. The 'invalid' option treats them as
66 non-string characters. The 'hex' option displays them as hex
67 byte sequences. The 'escape' option displays them as escape
68 sequences and the 'highlight' option displays them as
69 coloured escape sequences.
70
71 --output-separator=sep_string
72 -s sep_string String used to separate parsed strings in output.
73 Default is newline.
74
75 --help
76 -h Print the usage message on the standard output.
77
78 --version
79 -V
80 -v Print the program version number.
81
82 Written by Richard Stallman <rms@gnu.ai.mit.edu>
83 and David MacKenzie <djm@gnu.ai.mit.edu>. */
84
85 #include "sysdep.h"
86 #include "bfd.h"
87 #include "getopt.h"
88 #include "libiberty.h"
89 #include "safe-ctype.h"
90 #include "bucomm.h"
91
/* streq (A, B) is non-zero when the two strings compare equal.  Only
   defined here if no earlier header has already supplied it.  */
#ifndef streq
#define streq(a,b) (!strcmp ((a), (b)))
#endif
95
/* The ways in which UTF-8 encoded characters can be treated.  The value
   in use is chosen by the -U/--unicode option handled in main.  */
typedef enum unicode_display_type
{
  unicode_default   = 0,  /* No special treatment.  */
  unicode_locale    = 1,  /* Display according to the current locale.  */
  unicode_escape    = 2,  /* Display as escape sequences.  */
  unicode_hex       = 3,  /* Display as hex byte sequences.  */
  unicode_highlight = 4,  /* Display as coloured escape sequences.  */
  unicode_invalid   = 5   /* Treat as non-string characters.  */
} unicode_display_type;

/* The treatment currently selected.  */
static unicode_display_type unicode_display = unicode_default;
107
/* Non-zero if C may be part of a displayed string.  C must lie in the
   range 0..255 and additionally be one of: a printable character, a
   tab, a byte above 127 when the encoding is 'S', or any whitespace
   character when include_all_whitespace is set.  The range test comes
   first so that ISPRINT and ISSPACE are only applied to byte values.
   Note that C is evaluated more than once.  */
#define STRING_ISGRAPHIC(c) \
  (0 <= (c) \
   && (c) <= 255 \
   && (ISPRINT (c) \
       || (c) == '\t' \
       || ((c) > 127 && encoding == 'S') \
       || (ISSPACE (c) && include_all_whitespace)))
114
115 #ifndef errno
116 extern int errno;
117 #endif
118
119 /* The BFD section flags that identify an initialized data section. */
120 #define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)
121
122 /* Radix for printing addresses (must be 8, 10 or 16). */
123 static int address_radix;
124
125 /* Minimum length of sequence of graphic chars to trigger output. */
126 static uint string_min;
127
128 /* Whether or not we include all whitespace as a graphic char. */
129 static bool include_all_whitespace;
130
131 /* TRUE means print address within file for each string. */
132 static bool print_addresses;
133
134 /* TRUE means print filename for each string. */
135 static bool print_filenames;
136
137 /* TRUE means for object files scan only the data section. */
138 static bool datasection_only;
139
140 /* The BFD object file format. */
141 static char *target;
142
143 /* The character encoding format. */
144 static char encoding;
145 static int encoding_bytes;
146
147 /* Output string used to separate parsed strings */
148 static char *output_separator;
149
/* Long option table for getopt_long.  Each long option maps onto the
   letter of the short option that main handles for it.  */
static struct option long_options[] =
{
  { "all",                    no_argument,       NULL, 'a' },
  { "bytes",                  required_argument, NULL, 'n' },
  { "data",                   no_argument,       NULL, 'd' },
  { "encoding",               required_argument, NULL, 'e' },
  { "help",                   no_argument,       NULL, 'h' },
  { "include-all-whitespace", no_argument,       NULL, 'w' },
  { "output-separator",       required_argument, NULL, 's' },
  { "print-file-name",        no_argument,       NULL, 'f' },
  { "radix",                  required_argument, NULL, 't' },
  { "target",                 required_argument, NULL, 'T' },
  { "unicode",                required_argument, NULL, 'U' },
  { "version",                no_argument,       NULL, 'v' },
  /* End of table marker.  */
  { NULL,                     0,                 NULL, 0 }
};
166
167 static bool strings_file (char *);
168 static void print_strings (const char *, FILE *, file_ptr, int, char *);
169 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
170 \f
171 int main (int, char **);
172
173 int
174 main (int argc, char **argv)
175 {
176 int optc;
177 int exit_status = 0;
178 bool files_given = false;
179 char *s;
180 int numeric_opt = 0;
181
182 setlocale (LC_ALL, "");
183 bindtextdomain (PACKAGE, LOCALEDIR);
184 textdomain (PACKAGE);
185
186 program_name = argv[0];
187 xmalloc_set_program_name (program_name);
188 bfd_set_error_program_name (program_name);
189
190 expandargv (&argc, &argv);
191
192 string_min = 4;
193 include_all_whitespace = false;
194 print_addresses = false;
195 print_filenames = false;
196 if (DEFAULT_STRINGS_ALL)
197 datasection_only = false;
198 else
199 datasection_only = true;
200 target = NULL;
201 encoding = 's';
202 output_separator = NULL;
203
204 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
205 long_options, (int *) 0)) != EOF)
206 {
207 switch (optc)
208 {
209 case 'a':
210 datasection_only = false;
211 break;
212
213 case 'd':
214 datasection_only = true;
215 break;
216
217 case 'f':
218 print_filenames = true;
219 break;
220
221 case 'H':
222 case 'h':
223 usage (stdout, 0);
224
225 case 'n':
226 string_min = (int) strtoul (optarg, &s, 0);
227 if (s != NULL && *s != 0)
228 fatal (_("invalid integer argument %s"), optarg);
229 break;
230
231 case 'w':
232 include_all_whitespace = true;
233 break;
234
235 case 'o':
236 print_addresses = true;
237 address_radix = 8;
238 break;
239
240 case 't':
241 print_addresses = true;
242 if (optarg[1] != '\0')
243 usage (stderr, 1);
244 switch (optarg[0])
245 {
246 case 'o':
247 address_radix = 8;
248 break;
249
250 case 'd':
251 address_radix = 10;
252 break;
253
254 case 'x':
255 address_radix = 16;
256 break;
257
258 default:
259 usage (stderr, 1);
260 }
261 break;
262
263 case 'T':
264 target = optarg;
265 break;
266
267 case 'e':
268 if (optarg[1] != '\0')
269 usage (stderr, 1);
270 encoding = optarg[0];
271 break;
272
273 case 's':
274 output_separator = optarg;
275 break;
276
277 case 'U':
278 if (streq (optarg, "default") || streq (optarg, "d"))
279 unicode_display = unicode_default;
280 else if (streq (optarg, "locale") || streq (optarg, "l"))
281 unicode_display = unicode_locale;
282 else if (streq (optarg, "escape") || streq (optarg, "e"))
283 unicode_display = unicode_escape;
284 else if (streq (optarg, "invalid") || streq (optarg, "i"))
285 unicode_display = unicode_invalid;
286 else if (streq (optarg, "hex") || streq (optarg, "x"))
287 unicode_display = unicode_hex;
288 else if (streq (optarg, "highlight") || streq (optarg, "h"))
289 unicode_display = unicode_highlight;
290 else
291 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
292 break;
293
294 case 'V':
295 case 'v':
296 print_version ("strings");
297 break;
298
299 case '?':
300 usage (stderr, 1);
301
302 default:
303 numeric_opt = optind;
304 break;
305 }
306 }
307
308 if (unicode_display != unicode_default)
309 encoding = 'S';
310
311 if (numeric_opt != 0)
312 {
313 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
314 if (s != NULL && *s != 0)
315 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
316 }
317 if (string_min < 1)
318 fatal (_("invalid minimum string length %d"), string_min);
319
320 switch (encoding)
321 {
322 case 'S':
323 case 's':
324 encoding_bytes = 1;
325 break;
326 case 'b':
327 case 'l':
328 encoding_bytes = 2;
329 break;
330 case 'B':
331 case 'L':
332 encoding_bytes = 4;
333 break;
334 default:
335 usage (stderr, 1);
336 }
337
338 if (bfd_init () != BFD_INIT_MAGIC)
339 fatal (_("fatal error: libbfd ABI mismatch"));
340 set_default_bfd_target ();
341
342 if (optind >= argc)
343 {
344 datasection_only = false;
345 SET_BINARY (fileno (stdin));
346 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
347 files_given = true;
348 }
349 else
350 {
351 for (; optind < argc; ++optind)
352 {
353 if (streq (argv[optind], "-"))
354 datasection_only = false;
355 else
356 {
357 files_given = true;
358 exit_status |= !strings_file (argv[optind]);
359 }
360 }
361 }
362
363 if (!files_given)
364 usage (stderr, 1);
365
366 return (exit_status);
367 }
368 \f
369 /* Scan section SECT of the file ABFD, whose printable name is
370 FILENAME. If it contains initialized data set GOT_A_SECTION and
371 print the strings in it. */
372
373 static void
374 strings_a_section (bfd *abfd, asection *sect, const char *filename,
375 bool *got_a_section)
376 {
377 bfd_size_type sectsize;
378 bfd_byte *mem;
379
380 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
381 return;
382
383 sectsize = bfd_section_size (sect);
384 if (sectsize == 0)
385 return;
386
387 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
388 {
389 non_fatal (_("%s: Reading section %s failed: %s"),
390 filename, sect->name, bfd_errmsg (bfd_get_error ()));
391 return;
392 }
393
394 *got_a_section = true;
395 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
396 free (mem);
397 }
398
399 /* Scan all of the sections in FILE, and print the strings
400 in the initialized data section(s).
401
402 Return TRUE if successful,
403 FALSE if not (such as if FILE is not an object file). */
404
405 static bool
406 strings_object_file (const char *file)
407 {
408 bfd *abfd;
409 asection *s;
410 bool got_a_section;
411
412 abfd = bfd_openr (file, target);
413
414 if (abfd == NULL)
415 /* Treat the file as a non-object file. */
416 return false;
417
418 /* This call is mainly for its side effect of reading in the sections.
419 We follow the traditional behavior of `strings' in that we don't
420 complain if we don't recognize a file to be an object file. */
421 if (!bfd_check_format (abfd, bfd_object))
422 {
423 bfd_close (abfd);
424 return false;
425 }
426
427 got_a_section = false;
428 for (s = abfd->sections; s != NULL; s = s->next)
429 strings_a_section (abfd, s, file, &got_a_section);
430
431 if (!bfd_close (abfd))
432 {
433 bfd_nonfatal (file);
434 return false;
435 }
436
437 return got_a_section;
438 }
439
440 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
441
442 static bool
443 strings_file (char *file)
444 {
445 struct stat st;
446
447 /* get_file_size does not support non-S_ISREG files. */
448
449 if (stat (file, &st) < 0)
450 {
451 if (errno == ENOENT)
452 non_fatal (_("'%s': No such file"), file);
453 else
454 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
455 file, strerror (errno));
456 return false;
457 }
458 else if (S_ISDIR (st.st_mode))
459 {
460 non_fatal (_("Warning: '%s' is a directory"), file);
461 return false;
462 }
463
464 /* If we weren't told to scan the whole file,
465 try to open it as an object file and only look at
466 initialized data sections. If that fails, fall back to the
467 whole file. */
468 if (!datasection_only || !strings_object_file (file))
469 {
470 FILE *stream;
471
472 stream = fopen (file, FOPEN_RB);
473 if (stream == NULL)
474 {
475 fprintf (stderr, "%s: ", program_name);
476 perror (file);
477 return false;
478 }
479
480 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
481
482 if (fclose (stream) == EOF)
483 {
484 fprintf (stderr, "%s: ", program_name);
485 perror (file);
486 return false;
487 }
488 }
489
490 return true;
491 }
492 \f
493 /* Read the next character, return EOF if none available.
494 Assume that STREAM is positioned so that the next byte read
495 is at address ADDRESS in the file.
496
497 If STREAM is NULL, do not read from it.
498 The caller can supply a buffer of characters
499 to be processed before the data in STREAM.
500 MAGIC is the address of the buffer and
501 MAGICCOUNT is how many characters are in it. */
502
503 static long
504 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
505 {
506 int c, i;
507 long r = 0;
508
509 for (i = 0; i < encoding_bytes; i++)
510 {
511 if (*magiccount)
512 {
513 (*magiccount)--;
514 c = *(*magic)++;
515 }
516 else
517 {
518 if (stream == NULL)
519 return EOF;
520
521 /* Only use getc_unlocked if we found a declaration for it.
522 Otherwise, libc is not thread safe by default, and we
523 should not use it. */
524
525 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
526 c = getc_unlocked (stream);
527 #else
528 c = getc (stream);
529 #endif
530 if (c == EOF)
531 return EOF;
532 }
533
534 (*address)++;
535 r = (r << 8) | (c & 0xff);
536 }
537
538 switch (encoding)
539 {
540 default:
541 break;
542 case 'l':
543 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
544 break;
545 case 'L':
546 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
547 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
548 break;
549 }
550
551 return r;
552 }
553
554 /* Throw away one byte of a (possibly) multi-byte char C, updating
555 address and buffer to suit. */
556
557 static void
558 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
559 {
560 static char tmp[4];
561
562 if (encoding_bytes > 1)
563 {
564 *address -= encoding_bytes - 1;
565
566 if (*magiccount == 0)
567 {
568 /* If no magic buffer exists, use temp buffer. */
569 switch (encoding)
570 {
571 default:
572 break;
573 case 'b':
574 tmp[0] = c & 0xff;
575 *magiccount = 1;
576 break;
577 case 'l':
578 tmp[0] = (c >> 8) & 0xff;
579 *magiccount = 1;
580 break;
581 case 'B':
582 tmp[0] = (c >> 16) & 0xff;
583 tmp[1] = (c >> 8) & 0xff;
584 tmp[2] = c & 0xff;
585 *magiccount = 3;
586 break;
587 case 'L':
588 tmp[0] = (c >> 8) & 0xff;
589 tmp[1] = (c >> 16) & 0xff;
590 tmp[2] = (c >> 24) & 0xff;
591 *magiccount = 3;
592 break;
593 }
594 *magic = tmp;
595 }
596 else
597 {
598 /* If magic buffer exists, rewind. */
599 *magic -= encoding_bytes - 1;
600 *magiccount += encoding_bytes - 1;
601 }
602 }
603 }
604
605 static void
606 print_filename_and_address (const char * filename, file_ptr address)
607 {
608 if (print_filenames)
609 printf ("%s: ", filename);
610
611 if (! print_addresses)
612 return;
613
614 switch (address_radix)
615 {
616 case 8:
617 if (sizeof (address) > sizeof (long))
618 {
619 #ifndef __MSVCRT__
620 printf ("%7llo ", (unsigned long long) address);
621 #else
622 printf ("%7I64o ", (unsigned long long) address);
623 #endif
624 }
625 else
626 printf ("%7lo ", (unsigned long) address);
627 break;
628
629 case 10:
630 if (sizeof (address) > sizeof (long))
631 {
632 #ifndef __MSVCRT__
633 printf ("%7llu ", (unsigned long long) address);
634 #else
635 printf ("%7I64d ", (unsigned long long) address);
636 #endif
637 }
638 else
639 printf ("%7ld ", (long) address);
640 break;
641
642 case 16:
643 if (sizeof (address) > sizeof (long))
644 {
645 #ifndef __MSVCRT__
646 printf ("%7llx ", (unsigned long long) address);
647 #else
648 printf ("%7I64x ", (unsigned long long) address);
649 #endif
650 }
651 else
652 printf ("%7lx ", (unsigned long) address);
653 break;
654 }
655 }
656
/* Check whether the bytes at the start of BUFFER, which holds BUFLEN
   bytes, have the shape of a multi-byte UTF-8 encoding: a lead byte of
   0xc0 or above followed by the number of continuation bytes (10xxxxxx)
   that the lead byte calls for.  Returns the length of the encoding in
   bytes (2, 3 or 4) if so, and zero otherwise.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  const unsigned char lead = buffer[0];
  unsigned int expected;
  unsigned int k;

  if (lead < 0xc0)
    return 0;

  /* Bits 0x20 and 0x10 of the lead byte select the sequence length.  */
  if ((lead & 0x20) == 0)
    expected = 2;
  else if ((lead & 0x10) == 0)
    expected = 3;
  else
    expected = 4;

  /* The whole sequence must lie inside the buffer...  */
  if (buflen < expected)
    return 0;

  /* ...and every byte after the lead must be a continuation byte.  */
  for (k = 1; k < expected; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return expected;
}
692
693 /* Display a UTF-8 encoded character in BUFFER according to the setting
694 of unicode_display. The character is known to be valid.
695 Returns the number of bytes consumed. */
696
697 static uint
698 display_utf8_char (const unsigned char * buffer)
699 {
700 uint j;
701 uint utf8_len;
702
703 switch (buffer[0] & 0x30)
704 {
705 case 0x00:
706 case 0x10:
707 utf8_len = 2;
708 break;
709 case 0x20:
710 utf8_len = 3;
711 break;
712 default:
713 utf8_len = 4;
714 }
715
716 switch (unicode_display)
717 {
718 default:
719 fprintf (stderr, "ICE: unexpected unicode display type\n");
720 break;
721
722 case unicode_escape:
723 case unicode_highlight:
724 if (unicode_display == unicode_highlight && isatty (1))
725 printf ("\x1B[31;47m"); /* Red. */
726
727 switch (utf8_len)
728 {
729 case 2:
730 printf ("\\u%02x%02x",
731 ((buffer[0] & 0x1c) >> 2),
732 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
733 break;
734
735 case 3:
736 printf ("\\u%02x%02x",
737 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
738 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
739 break;
740
741 case 4:
742 printf ("\\u%02x%02x%02x",
743 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
744 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
745 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
746 break;
747 default:
748 /* URG. */
749 break;
750 }
751
752 if (unicode_display == unicode_highlight && isatty (1))
753 printf ("\033[0m"); /* Default colour. */
754 break;
755
756 case unicode_hex:
757 putchar ('<');
758 printf ("0x");
759 for (j = 0; j < utf8_len; j++)
760 printf ("%02x", buffer [j]);
761 putchar ('>');
762 break;
763
764 case unicode_locale:
765 printf ("%.1s", buffer);
766 break;
767 }
768
769 return utf8_len;
770 }
771
772 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
773 according to the setting of the unicode_display variable. The buffer
774 contains BUFLEN bytes.
775
776 Display the characters as if they started at ADDRESS and are contained in
777 FILENAME. */
778
779 static void
780 print_unicode_buffer (const char * filename,
781 file_ptr address,
782 const unsigned char * buffer,
783 unsigned long buflen)
784 {
785 /* Paranoia checks... */
786 if (filename == NULL
787 || buffer == NULL
788 || unicode_display == unicode_default
789 || encoding != 'S'
790 || encoding_bytes != 1)
791 {
792 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
793 return;
794 }
795
796 if (buflen == 0)
797 return;
798
799 /* We must only display strings that are at least string_min *characters*
800 long. So we scan the buffer in two stages. First we locate the start
801 of a potential string. Then we walk along it until we have found
802 string_min characters. Then we go back to the start point and start
803 displaying characters according to the unicode_display setting. */
804
805 unsigned long start_point = 0;
806 unsigned long i = 0;
807 unsigned int char_len = 1;
808 unsigned int num_found = 0;
809
810 for (i = 0; i < buflen; i += char_len)
811 {
812 int c = buffer[i];
813
814 char_len = 1;
815
816 /* Find the first potential character of a string. */
817 if (! STRING_ISGRAPHIC (c))
818 {
819 num_found = 0;
820 continue;
821 }
822
823 if (c > 126)
824 {
825 if (c < 0xc0)
826 {
827 num_found = 0;
828 continue;
829 }
830
831 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
832 {
833 char_len = 1;
834 num_found = 0;
835 continue;
836 }
837
838 if (unicode_display == unicode_invalid)
839 {
840 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
841 num_found = 0;
842 continue;
843 }
844 }
845
846 if (num_found == 0)
847 /* We have found a potential starting point for a string. */
848 start_point = i;
849
850 ++ num_found;
851
852 if (num_found >= string_min)
853 break;
854 }
855
856 if (num_found < string_min)
857 return;
858
859 print_filename_and_address (filename, address + start_point);
860
861 /* We have found string_min characters. Display them and any
862 more that follow. */
863 for (i = start_point; i < buflen; i += char_len)
864 {
865 int c = buffer[i];
866
867 char_len = 1;
868
869 if (! STRING_ISGRAPHIC (c))
870 break;
871 else if (c < 127)
872 putchar (c);
873 else if (! is_valid_utf8 (buffer + i, buflen - i))
874 break;
875 else if (unicode_display == unicode_invalid)
876 break;
877 else
878 char_len = display_utf8_char (buffer + i);
879 }
880
881 if (output_separator)
882 fputs (output_separator, stdout);
883 else
884 putchar ('\n');
885
886 /* FIXME: Using tail recursion here is lazy programming... */
887 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
888 }
889
890 static int
891 get_unicode_byte (FILE * stream, unsigned char * putback, uint * num_putback, uint * num_read)
892 {
893 if (* num_putback > 0)
894 {
895 * num_putback = * num_putback - 1;
896 return putback [* num_putback];
897 }
898
899 * num_read = * num_read + 1;
900
901 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
902 return getc_unlocked (stream);
903 #else
904 return getc (stream);
905 #endif
906 }
907
908 /* Helper function for print_unicode_stream. */
909
910 static void
911 print_unicode_stream_body (const char * filename,
912 file_ptr address,
913 FILE * stream,
914 unsigned char * putback_buf,
915 uint num_putback,
916 unsigned char * print_buf)
917 {
918 /* It would be nice if we could just read the stream into a buffer
919 and then process if with print_unicode_buffer. But the input
920 might be huge or it might time-locked (eg stdin). So instead
921 we go one byte at a time... */
922
923 file_ptr start_point = 0;
924 uint num_read = 0;
925 uint num_chars = 0;
926 uint num_print = 0;
927 int c = 0;
928
929 /* Find a series of string_min characters. Put them into print_buf. */
930 do
931 {
932 if (num_chars >= string_min)
933 break;
934
935 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
936 if (c == EOF)
937 break;
938
939 if (! STRING_ISGRAPHIC (c))
940 {
941 num_chars = num_print = 0;
942 continue;
943 }
944
945 if (num_chars == 0)
946 start_point = num_read - 1;
947
948 if (c < 127)
949 {
950 print_buf[num_print] = c;
951 num_chars ++;
952 num_print ++;
953 continue;
954 }
955
956 if (c < 0xc0)
957 {
958 num_chars = num_print = 0;
959 continue;
960 }
961
962 /* We *might* have a UTF-8 sequence. Time to start peeking. */
963 char utf8[4];
964
965 utf8[0] = c;
966 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
967 if (c == EOF)
968 break;
969 utf8[1] = c;
970
971 if ((utf8[1] & 0xc0) != 0x80)
972 {
973 /* Invalid UTF-8. */
974 putback_buf[num_putback++] = utf8[1];
975 num_chars = num_print = 0;
976 continue;
977 }
978 else if ((utf8[0] & 0x20) == 0)
979 {
980 /* A valid 2-byte UTF-8 encoding. */
981 if (unicode_display == unicode_invalid)
982 {
983 putback_buf[num_putback++] = utf8[1];
984 num_chars = num_print = 0;
985 }
986 else
987 {
988 print_buf[num_print ++] = utf8[0];
989 print_buf[num_print ++] = utf8[1];
990 num_chars ++;
991 }
992 continue;
993 }
994
995 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
996 if (c == EOF)
997 break;
998 utf8[2] = c;
999
1000 if ((utf8[2] & 0xc0) != 0x80)
1001 {
1002 /* Invalid UTF-8. */
1003 putback_buf[num_putback++] = utf8[2];
1004 putback_buf[num_putback++] = utf8[1];
1005 num_chars = num_print = 0;
1006 continue;
1007 }
1008 else if ((utf8[0] & 0x10) == 0)
1009 {
1010 /* A valid 3-byte UTF-8 encoding. */
1011 if (unicode_display == unicode_invalid)
1012 {
1013 putback_buf[num_putback++] = utf8[2];
1014 putback_buf[num_putback++] = utf8[1];
1015 num_chars = num_print = 0;
1016 }
1017 else
1018 {
1019 print_buf[num_print ++] = utf8[0];
1020 print_buf[num_print ++] = utf8[1];
1021 print_buf[num_print ++] = utf8[2];
1022 num_chars ++;
1023 }
1024 continue;
1025 }
1026
1027 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1028 if (c == EOF)
1029 break;
1030 utf8[3] = c;
1031
1032 if ((utf8[3] & 0xc0) != 0x80)
1033 {
1034 /* Invalid UTF-8. */
1035 putback_buf[num_putback++] = utf8[3];
1036 putback_buf[num_putback++] = utf8[2];
1037 putback_buf[num_putback++] = utf8[1];
1038 num_chars = num_print = 0;
1039 }
1040 /* We have a valid 4-byte UTF-8 encoding. */
1041 else if (unicode_display == unicode_invalid)
1042 {
1043 putback_buf[num_putback++] = utf8[3];
1044 putback_buf[num_putback++] = utf8[1];
1045 putback_buf[num_putback++] = utf8[2];
1046 num_chars = num_print = 0;
1047 }
1048 else
1049 {
1050 print_buf[num_print ++] = utf8[0];
1051 print_buf[num_print ++] = utf8[1];
1052 print_buf[num_print ++] = utf8[2];
1053 print_buf[num_print ++] = utf8[3];
1054 num_chars ++;
1055 }
1056 }
1057 while (1);
1058
1059 if (num_chars >= string_min)
1060 {
1061 /* We know that we have string_min valid characters in print_buf,
1062 and there may be more to come in the stream. Start displaying
1063 them. */
1064
1065 print_filename_and_address (filename, address + start_point);
1066
1067 uint i;
1068 for (i = 0; i < num_print;)
1069 {
1070 if (print_buf[i] < 127)
1071 putchar (print_buf[i++]);
1072 else
1073 i += display_utf8_char (print_buf + i);
1074 }
1075
1076 /* OK so now we have to start read unchecked bytes. */
1077
1078 /* Find a series of string_min characters. Put them into print_buf. */
1079 do
1080 {
1081 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1082 if (c == EOF)
1083 break;
1084
1085 if (! STRING_ISGRAPHIC (c))
1086 break;
1087
1088 if (c < 127)
1089 {
1090 putchar (c);
1091 continue;
1092 }
1093
1094 if (c < 0xc0)
1095 break;
1096
1097 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1098 unsigned char utf8[4];
1099
1100 utf8[0] = c;
1101 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1102 if (c == EOF)
1103 break;
1104 utf8[1] = c;
1105
1106 if ((utf8[1] & 0xc0) != 0x80)
1107 {
1108 /* Invalid UTF-8. */
1109 putback_buf[num_putback++] = utf8[1];
1110 break;
1111 }
1112 else if ((utf8[0] & 0x20) == 0)
1113 {
1114 /* Valid 2-byte UTF-8. */
1115 if (unicode_display == unicode_invalid)
1116 {
1117 putback_buf[num_putback++] = utf8[1];
1118 break;
1119 }
1120 else
1121 {
1122 (void) display_utf8_char (utf8);
1123 continue;
1124 }
1125 }
1126
1127 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1128 if (c == EOF)
1129 break;
1130 utf8[2] = c;
1131
1132 if ((utf8[2] & 0xc0) != 0x80)
1133 {
1134 /* Invalid UTF-8. */
1135 putback_buf[num_putback++] = utf8[2];
1136 putback_buf[num_putback++] = utf8[1];
1137 break;
1138 }
1139 else if ((utf8[0] & 0x10) == 0)
1140 {
1141 /* Valid 3-byte UTF-8. */
1142 if (unicode_display == unicode_invalid)
1143 {
1144 putback_buf[num_putback++] = utf8[2];
1145 putback_buf[num_putback++] = utf8[1];
1146 break;
1147 }
1148 else
1149 {
1150 (void) display_utf8_char (utf8);
1151 continue;
1152 }
1153 }
1154
1155 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1156 if (c == EOF)
1157 break;
1158 utf8[3] = c;
1159
1160 if ((utf8[3] & 0xc0) != 0x80)
1161 {
1162 /* Invalid UTF-8. */
1163 putback_buf[num_putback++] = utf8[3];
1164 putback_buf[num_putback++] = utf8[2];
1165 putback_buf[num_putback++] = utf8[1];
1166 break;
1167 }
1168 else if (unicode_display == unicode_invalid)
1169 {
1170 putback_buf[num_putback++] = utf8[3];
1171 putback_buf[num_putback++] = utf8[2];
1172 putback_buf[num_putback++] = utf8[1];
1173 break;
1174 }
1175 else
1176 /* A valid 4-byte UTF-8 encoding. */
1177 (void) display_utf8_char (utf8);
1178 }
1179 while (1);
1180
1181 if (output_separator)
1182 fputs (output_separator, stdout);
1183 else
1184 putchar ('\n');
1185 }
1186
1187 if (c != EOF)
1188 /* FIXME: Using tail recursion here is lazy, but it works. */
1189 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1190 }
1191
1192 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1193 encountered according to the setting of the unicode_display variable.
1194 The stream is positioned at ADDRESS and is attached to FILENAME. */
1195
1196 static void
1197 print_unicode_stream (const char * filename,
1198 file_ptr address,
1199 FILE * stream)
1200 {
1201 /* Paranoia checks... */
1202 if (filename == NULL
1203 || stream == NULL
1204 || unicode_display == unicode_default
1205 || encoding != 'S'
1206 || encoding_bytes != 1)
1207 {
1208 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1209 return;
1210 }
1211
1212 /* Allocate space for string_min 4-byte utf-8 characters. */
1213 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1214 /* We should never have to put back more than 4 bytes. */
1215 unsigned char putback_buf[5];
1216 uint num_putback = 0;
1217
1218 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1219 free (print_buf);
1220 }
1221 \f
1222 /* Find the strings in file FILENAME, read from STREAM.
1223 Assume that STREAM is positioned so that the next byte read
1224 is at address ADDRESS in the file.
1225
1226 If STREAM is NULL, do not read from it.
1227 The caller can supply a buffer of characters
1228 to be processed before the data in STREAM.
1229 MAGIC is the address of the buffer and
1230 MAGICCOUNT is how many characters are in it.
1231 Those characters come at address ADDRESS and the data in STREAM follow. */
1232
1233 static void
1234 print_strings (const char *filename, FILE *stream, file_ptr address,
1235 int magiccount, char *magic)
1236 {
1237 if (unicode_display != unicode_default)
1238 {
1239 if (magic != NULL)
1240 print_unicode_buffer (filename, address,
1241 (const unsigned char *) magic, magiccount);
1242
1243 if (stream != NULL)
1244 print_unicode_stream (filename, address, stream);
1245 return;
1246 }
1247
1248 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1249
1250 while (1)
1251 {
1252 file_ptr start;
1253 uint i;
1254 long c;
1255
1256 /* See if the next `string_min' chars are all graphic chars. */
1257 tryline:
1258 start = address;
1259 for (i = 0; i < string_min; i++)
1260 {
1261 c = get_char (stream, &address, &magiccount, &magic);
1262 if (c == EOF)
1263 {
1264 free (buf);
1265 return;
1266 }
1267
1268 if (! STRING_ISGRAPHIC (c))
1269 {
1270 /* Found a non-graphic. Try again starting with next byte. */
1271 unget_part_char (c, &address, &magiccount, &magic);
1272 goto tryline;
1273 }
1274 buf[i] = c;
1275 }
1276
1277 /* We found a run of `string_min' graphic characters. Print up
1278 to the next non-graphic character. */
1279 print_filename_and_address (filename, start);
1280
1281 buf[i] = '\0';
1282 fputs (buf, stdout);
1283
1284 while (1)
1285 {
1286 c = get_char (stream, &address, &magiccount, &magic);
1287 if (c == EOF)
1288 break;
1289 if (! STRING_ISGRAPHIC (c))
1290 {
1291 unget_part_char (c, &address, &magiccount, &magic);
1292 break;
1293 }
1294 putchar (c);
1295 }
1296
1297 if (output_separator)
1298 fputs (output_separator, stdout);
1299 else
1300 putchar ('\n');
1301 }
1302 free (buf);
1303 }
1304 \f
1305 static void
1306 usage (FILE *stream, int status)
1307 {
1308 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1309 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1310 fprintf (stream, _(" The options are:\n"));
1311
1312 if (DEFAULT_STRINGS_ALL)
1313 fprintf (stream, _("\
1314 -a - --all Scan the entire file, not just the data section [default]\n\
1315 -d --data Only scan the data sections in the file\n"));
1316 else
1317 fprintf (stream, _("\
1318 -a - --all Scan the entire file, not just the data section\n\
1319 -d --data Only scan the data sections in the file [default]\n"));
1320
1321 fprintf (stream, _("\
1322 -f --print-file-name Print the name of the file before each string\n\
1323 -n --bytes=[number] Locate & print any NUL-terminated sequence of at\n\
1324 -<number> least [number] characters (default 4).\n\
1325 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1326 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1327 -o An alias for --radix=o\n\
1328 -T --target=<BFDNAME> Specify the binary file format\n\
1329 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1330 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1331 --unicode={default|show|invalid|hex|escape|highlight}\n\
1332 -u {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1333 -s --output-separator=<string> String used to separate strings in output.\n\
1334 @<file> Read options from <file>\n\
1335 -h --help Display this information\n\
1336 -v -V --version Print the program's version number\n"));
1337 list_supported_targets (program_name, stream);
1338 if (REPORT_BUGS_TO[0] && status == 0)
1339 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1340 exit (status);
1341 }