libjava/java/io/StreamTokenizer.java

   1 /* StreamTokenizer.java -- parses streams of characters into tokens
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002  Free Software Foundation
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package java.io;
  39
  40 /**
  41  * This class parses streams of characters into tokens.  There are a
  42  * million-zillion flags that can be set to control the parsing, as
  43  * described under the various method headings.
  44  *
  45  * @author Warren Levy <warrenl@cygnus.com>
  46  * @date October 25, 1998.
  47  */
  48 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
  49  * "The Java Language Specification", ISBN 0-201-63451-1
  50  * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
  51  * Status:  Believed complete and correct.
  52  */
  53
  54 public class StreamTokenizer
  55 {
  56   /** A constant indicating that the end of the stream has been read. */
  57   public static final int TT_EOF = -1;
  58
  59   /** A constant indicating that the end of the line has been read. */
  60   public static final int TT_EOL = '\n';
  61
  62   /** A constant indicating that a number token has been read. */
  63   public static final int TT_NUMBER = -2;
  64
  65   /** A constant indicating that a word token has been read. */
  66   public static final int TT_WORD = -3;
  67
  68   /** A constant indicating that no tokens have been read yet. */
  69   private static final int TT_NONE = -4;
  70
  71   /**
  72    * Contains the type of the token read resulting from a call to nextToken
  73    * The rules are as follows:
  74    * <ul>
  75    * <li>For a token consisting of a single ordinary character, this is the
  76    *     value of that character.
  77    * <li>For a quoted string, this is the value of the quote character
  78    * <li>For a word, this is TT_WORD
  79    * <li>For a number, this is TT_NUMBER
  80    * <li>For the end of the line, this is TT_EOL
  81    * <li>For the end of the stream, this is TT_EOF
  82    * </ul>
  83    */
  84   public int ttype = TT_NONE;
  85
  86   /** The String associated with word and string tokens. */
  87   public String sval;
  88
  89   /** The numeric value associated with number tokens. */
  90   public double nval;
  91
  92   /* Indicates whether end-of-line is recognized as a token. */
  93   private boolean eolSignificant = false;
  94
  95   /* Indicates whether word tokens are automatically made lower case. */
  96   private boolean lowerCase = false;
  97
  98   /* Indicates whether C++ style comments are recognized and skipped. */
  99   private boolean slashSlash = false;
 100
 101   /* Indicates whether C style comments are recognized and skipped. */
 102   private boolean slashStar = false;
 103
 104   /* Attribute tables of each byte from 0x00 to 0xFF. */
 105   private boolean[] whitespace = new boolean[256];
 106   private boolean[] alphabetic = new boolean[256];
 107   private boolean[] numeric = new boolean[256];
 108   private boolean[] quote = new boolean[256];
 109   private boolean[] comment = new boolean[256];
 110
 111   /* The Reader associated with this class. */
 112   private PushbackReader in;
 113
 114   /* Indicates if a token has been pushed back. */
 115   private boolean pushedBack = false;
 116
 117   /* Contains the current line number of the reader. */
 118   private int lineNumber = 1;
 119
 120   /**
 121    * This method reads bytes from an <code>InputStream</code> and tokenizes
 122    * them.  For details on how this method operates by default, see
 123    * <code>StreamTokenizer(Reader)</code>.
 124    *
 125    * @param in The <code>InputStream</code> to read from
 126    *
 127    * @deprecated Since JDK 1.1.
 128    */
 129   public StreamTokenizer(InputStream is)
 130   {
 131     this(new InputStreamReader(is));
 132   }
 133
 134   /**
 135    * This method initializes a new <code>StreamTokenizer</code> to read
 136    * characters from a <code>Reader</code> and parse them.  The char values
 137    * have their hight bits masked so that the value is treated a character
 138    * in the range of 0x0000 to 0x00FF.
 139    * <p>
 140    * This constructor sets up the parsing table to parse the stream in the
 141    * following manner:
 142    * <ul>
 143    * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
 144    *     are initialized as alphabetic
 145    * <li>The values 0x00 through 0x20 are initialized as whitespace
 146    * <li>The values '\'' and '"' are initialized as quote characters
 147    * <li>'/' is a comment character
 148    * <li>Numbers will be parsed
 149    * <li>EOL is not treated as significant
 150    * <li>C  and C++ (//) comments are not recognized
 151    * </ul>
 152    *
 153    * @param in The <code>Reader</code> to read chars from
 154    */
 155   public StreamTokenizer(Reader r)
 156   {
 157     in = new PushbackReader(r);
 158
 159     whitespaceChars(0x00, 0x20);
 160     wordChars('A', 'Z');
 161     wordChars('a', 'z');
 162     wordChars(0xA0, 0xFF);
 163     commentChar('/');
 164     quoteChar('\'');
 165     quoteChar('"');
 166     parseNumbers();
 167   }
 168
 169   /**
 170    * This method sets the comment attribute on the specified character.
 171    *
 172    * @param c The character to set the comment attribute for, passed as an int
 173    */
 174   public void commentChar(int ch)
 175   {
 176     if (ch >= 0 && ch <= 255)
 177       comment[ch] = true;
 178   }
 179
 180   /**
 181    * This method sets a flag that indicates whether or not the end of line
 182    * sequence terminates and is a token.  The defaults to <code>false</code>
 183    *
 184    * @param flag <code>true</code> if EOF is significant, <code>false</code>
 185    *             otherwise
 186    */
 187   public void eolIsSignificant(boolean flag)
 188   {
 189     eolSignificant = flag;
 190   }
 191
 192   /**
 193    * This method returns the current line number.  Note that if the
 194    * <code>pushBack()</code> method is called, it has no effect on the
 195    * line number returned by this method.
 196    *
 197    * @return The current line number
 198    */
 199   public int lineno()
 200   {
 201     return lineNumber;
 202   }
 203
 204   /**
 205    * This method sets a flag that indicates whether or not alphabetic
 206    * tokens that are returned should be converted to lower case.
 207    *
 208    * @param flag <code>true</code> to convert to lower case,
 209    *             <code>false</code> otherwise
 210    */
 211   public void lowerCaseMode(boolean flag)
 212   {
 213     lowerCase = flag;
 214   }
 215
 216   private boolean isWhitespace(int ch)
 217   {
 218     return (ch >= 0 && ch <= 255 && whitespace[ch]);
 219   }
 220
 221   private boolean isAlphabetic(int ch)
 222   {
 223     return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
 224   }
 225
 226   private boolean isNumeric(int ch)
 227   {
 228     return (ch >= 0 && ch <= 255 && numeric[ch]);
 229   }
 230
 231   private boolean isQuote(int ch)
 232   {
 233     return (ch >= 0 && ch <= 255 && quote[ch]);
 234   }
 235
 236   private boolean isComment(int ch)
 237   {
 238     return (ch >= 0 && ch <= 255 && comment[ch]);
 239   }
 240
 241   /**
 242    * This method reads the next token from the stream.  It sets the
 243    * <code>ttype</code> variable to the appropriate token type and
 244    * returns it.  It also can set <code>sval</code> or <code>nval</code>
 245    * as described below.  The parsing strategy is as follows:
 246    * <ul>
 247    * <li>Skip any whitespace characters.
 248    * <li>If a numeric character is encountered, attempt to parse a numeric
 249    * value.  Leading '-' characters indicate a numeric only if followed by
 250    * another non-'-' numeric.  The value of the numeric token is terminated
 251    * by either the first non-numeric encountered, or the second occurrence of
 252    * '-' or '.'.  The token type returned is TT_NUMBER and <code>nval</code>
 253    * is set to the value parsed.
 254    * <li>If an alphabetic character is parsed, all subsequent characters
 255    * are read until the first non-alphabetic or non-numeric character is
 256    * encountered.  The token type returned is TT_WORD and the value parsed
 257    * is stored in <code>sval</code>.  If lower case mode is set, the token
 258    * stored in <code>sval</code> is converted to lower case.  The end of line
 259    * sequence terminates a word only if EOL signficance has been turned on.
 260    * The start of a comment also terminates a word.  Any character with a
 261    * non-alphabetic and non-numeric attribute (such as white space, a quote,
 262    * or a commet) are treated as non-alphabetic and terminate the word.
 263    * <li>If a comment character is parsed, then all remaining characters on
 264    * the current line are skipped and another token is parsed.  Any EOL or
 265    * EOF's encountered are not discarded, but rather terminate the comment.
 266    * <li>If a quote character is parsed, then all characters up to the
 267    * second occurrence of the same quote character are parsed into a
 268    * <code>String</code>.  This <code>String</code> is stored as
 269    * <code>sval</code>, but is not converted to lower case, even if lower case
 270    * mode is enabled.  The token type returned is the value of the quote
 271    * character encountered.  Any escape sequences
 272    * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r
 273    * (carriage return), \" (double quote), \' (single quote), \\
 274    * (backslash), \XXX (octal esacpe)) are converted to the appropriate
 275    * char values.  Invalid esacape sequences are left in untranslated.
 276    * Unicode characters like ('\ u0000') are not recognized.
 277    * <li>If the C++ comment sequence "//" is encountered, and the parser
 278    * is configured to handle that sequence, then the remainder of the line
 279    * is skipped and another token is read exactly as if a character with
 280    * the comment attribute was encountered.
 281    * <li>If the C comment sequence "/*" is encountered, and the parser
 282    * is configured to handle that sequence, then all characters up to and
 283    * including the comment terminator sequence are discarded and another
 284    * token is parsed.
 285    * <li>If all cases above are not met, then the character is an ordinary
 286    * character that is parsed as a token by itself.  The char encountered
 287    * is returned as the token type.
 288    * </ul>
 289    *
 290    * @return The token type
 291    * @exception IOException If an I/O error occurs
 292    */
 293   public int nextToken() throws IOException
 294   {
 295     if (pushedBack)
 296       {
 297         pushedBack = false;
 298         if (ttype != TT_NONE)
 299           return ttype;
 300       }
 301
 302     sval = null;
 303     int ch;
 304
 305     // Skip whitespace.  Deal with EOL along the way.
 306     while (isWhitespace(ch = in.read()))
 307       if (ch == '\n' || ch == '\r')
 308         {
 309           lineNumber++;
 310
 311           // Throw away \n if in combination with \r.
 312           if (ch == '\r' && (ch = in.read()) != '\n')
 313             {
 314               if (ch != TT_EOF)
 315                 in.unread(ch);
 316             }
 317           if (eolSignificant)
 318             return (ttype = TT_EOL);
 319         }
 320
 321     if (ch == '/')
 322       if ((ch = in.read()) == '/' && slashSlash)
 323         {
 324           while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF)
 325             ;
 326           if (ch != TT_EOF)
 327             in.unread(ch);
 328           return nextToken(); // Recursive, but not too deep in normal cases
 329         }
 330       else if (ch == '*' && slashStar)
 331         {
 332           while (true)
 333             {
 334               ch = in.read();
 335               if (ch == '*')
 336                 {
 337                   if ((ch = in.read()) == '/')
 338                     break;
 339                   else if (ch != TT_EOF)
 340                     in.unread(ch);
 341                 }
 342               else if (ch == '\n' || ch == '\r')
 343                 {
 344                   lineNumber++;
 345                   if (ch == '\r' && (ch = in.read()) != '\n')
 346                     {
 347                       if (ch != TT_EOF)
 348                         in.unread(ch);
 349                     }
 350                 }
 351               else if (ch == TT_EOF)
 352                 {
 353                   break;
 354                 }
 355             }
 356           return nextToken(); // Recursive, but not too deep in normal cases
 357         }
 358       else
 359         {
 360           if (ch != TT_EOF)
 361             in.unread(ch);
 362           ch = '/';
 363         }
 364
 365     if (ch == TT_EOF)
 366       ttype = TT_EOF;
 367     else if (isNumeric(ch))
 368       {
 369         boolean isNegative = false;
 370         if (ch == '-')
 371           {
 372             // Read ahead to see if this is an ordinary '-' rather than numeric.
 373             ch = in.read();
 374             if (isNumeric(ch) && ch != '-')
 375               {
 376                 isNegative = true;
 377               }
 378             else
 379               {
 380                 if (ch != TT_EOF)
 381                   in.unread(ch);
 382                 return (ttype = '-');
 383               }
 384           }
 385
 386         StringBuffer tokbuf = new StringBuffer();
 387         tokbuf.append((char) ch);
 388
 389         int decCount = 0;
 390         while (isNumeric(ch = in.read()) && ch != '-')
 391           if (ch == '.' && decCount++ > 0)
 392             break;
 393           else
 394             tokbuf.append((char) ch);
 395
 396         if (ch != TT_EOF)
 397           in.unread(ch);
 398         ttype = TT_NUMBER;
 399         try
 400           {
 401             nval = Double.valueOf(tokbuf.toString()).doubleValue();
 402           }
 403         catch (NumberFormatException _)
 404           {
 405             nval = 0.0;
 406           }
 407         if (isNegative)
 408           nval = -nval;
 409       }
 410     else if (isAlphabetic(ch))
 411       {
 412         StringBuffer tokbuf = new StringBuffer();
 413         tokbuf.append((char) ch);
 414         while (isAlphabetic(ch = in.read()) || isNumeric(ch))
 415           tokbuf.append((char) ch);
 416         if (ch != TT_EOF)
 417           in.unread(ch);
 418         ttype = TT_WORD;
 419         sval = tokbuf.toString();
 420         if (lowerCase)
 421           sval = sval.toLowerCase();
 422       }
 423     else if (isComment(ch))
 424       {
 425         while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF)
 426           ;
 427         if (ch != TT_EOF)
 428           in.unread(ch);
 429         return nextToken();     // Recursive, but not too deep in normal cases.
 430       }
 431     else if (isQuote(ch))
 432       {
 433         ttype = ch;
 434         StringBuffer tokbuf = new StringBuffer();
 435         while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' &&
 436                ch != TT_EOF)
 437           {
 438             if (ch == '\\')
 439               switch (ch = in.read())
 440                 {
 441                   case 'a':     ch = 0x7;
 442                     break;
 443                   case 'b':     ch = '\b';
 444                     break;
 445                   case 'f':     ch = 0xC;
 446                     break;
 447                   case 'n':     ch = '\n';
 448                     break;
 449                   case 'r':     ch = '\r';
 450                     break;
 451                   case 't':     ch = '\t';
 452                     break;
 453                   case 'v':     ch = 0xB;
 454                     break;
 455                   case '\n':    ch = '\n';
 456                     break;
 457                   case '\r':    ch = '\r';
 458                     break;
 459                   case '\"':
 460                   case '\'':
 461                   case '\\':
 462                     break;
 463                   default:
 464                     int ch1, nextch;
 465                     if ((nextch = ch1 = ch) >= '0' && ch <= '7')
 466                       {
 467                         ch -= '0';
 468                         if ((nextch = in.read()) >= '0' && nextch <= '7')
 469                           {
 470                             ch = ch * 8 + nextch - '0';
 471                             if ((nextch = in.read()) >= '0' && nextch <= '7' &&
 472                                 ch1 >= '0' && ch1 <= '3')
 473                               {
 474                                 ch = ch * 8 + nextch - '0';
 475                                 nextch = in.read();
 476                               }
 477                           }
 478                       }
 479
 480                     if (nextch != TT_EOF)
 481                       in.unread(nextch);
 482                 }
 483
 484             tokbuf.append((char) ch);
 485           }
 486
 487         // Throw away matching quote char.
 488         if (ch != ttype && ch != TT_EOF)
 489           in.unread(ch);
 490
 491         sval = tokbuf.toString();
 492       }
 493     else
 494       {
 495         ttype = ch;
 496       }
 497
 498     return ttype;
 499   }
 500
 501   private void resetChar(int ch)
 502   {
 503     whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
 504       false;
 505   }
 506
 507   /**
 508    * This method makes the specified character an ordinary character.  This
 509    * means that none of the attributes (whitespace, alphabetic, numeric,
 510    * quote, or comment) will be set on this character.  This character will
 511    * parse as its own token.
 512    *
 513    * @param c The character to make ordinary, passed as an int
 514    */
 515   public void ordinaryChar(int ch)
 516   {
 517     if (ch >= 0 && ch <= 255)
 518       resetChar(ch);
 519   }
 520
 521   /**
 522    * This method makes all the characters in the specified range, range
 523    * terminators included, ordinary.  This means the none of the attributes
 524    * (whitespace, alphabetic, numeric, quote, or comment) will be set on
 525    * any of the characters in the range.  This makes each character in this
 526    * range parse as its own token.
 527    *
 528    * @param low The low end of the range of values to set the whitespace
 529    *            attribute for
 530    * @param high The high end of the range of values to set the whitespace
 531    *            attribute for
 532    */
 533   public void ordinaryChars(int low, int hi)
 534   {
 535     if (low < 0)
 536       low = 0;
 537     if (hi > 255)
 538       hi = 255;
 539     for (int i = low; i <= hi; i++)
 540       resetChar(i);
 541   }
 542
 543   /**
 544    * This method sets the numeric attribute on the characters '0' - '9' and
 545    * the characters '.' and '-'.
 546    */
 547   public void parseNumbers()
 548   {
 549     for (int i = 0; i <= 9; i++)
 550       numeric['0' + i] = true;
 551
 552     numeric['.'] = true;
 553     numeric['-'] = true;
 554   }
 555
 556   /**
 557    * Puts the current token back into the StreamTokenizer so
 558    * <code>nextToken</code> will return the same value on the next call.
 559    * May cause the lineno method to return an incorrect value
 560    * if lineno is called before the next call to nextToken.
 561    */
 562   public void pushBack()
 563   {
 564     pushedBack = true;
 565   }
 566
 567   /**
 568    * This method sets the quote attribute on the specified character.
 569    *
 570    * @param c The character to set the quote attribute for, passed as an int.
 571    */
 572   public void quoteChar(int ch)
 573   {
 574     if (ch >= 0 && ch <= 255)
 575       quote[ch] = true;
 576   }
 577
 578   /**
 579    * This method removes all attributes (whitespace, alphabetic, numeric,
 580    * quote, and comment) from all characters.  It is equivalent to calling
 581    * <code>ordinaryChars(0x00, 0xFF)</code>.
 582    *
 583    * @see #ordinaryChars(int, int)
 584    */
 585   public void resetSyntax()
 586   {
 587     ordinaryChars(0x00, 0xFF);
 588   }
 589
 590   /**
 591    * This method sets a flag that indicates whether or not "C++" language style
 592    * comments ("//" comments through EOL ) are handled by the parser.
 593    * If this is <code>true</code> commented out sequences are skipped and
 594    * ignored by the parser.  This defaults to <code>false</code>.
 595    *
 596    * @param flag <code>true</code> to recognized and handle "C++" style
 597    *             comments, <code>false</code> otherwise
 598    */
 599   public void slashSlashComments(boolean flag)
 600   {
 601     slashSlash = flag;
 602   }
 603
 604   /**
 605    * This method sets a flag that indicates whether or not "C" language style
 606    * comments (with nesting not allowed) are handled by the parser.
 607    * If this is <code>true</code> commented out sequences are skipped and
 608    * ignored by the parser.  This defaults to <code>false</code>.
 609    *
 610    * @param flag <code>true</code> to recognized and handle "C" style comments,
 611    *             <code>false</code> otherwise
 612    */
 613   public void slashStarComments(boolean flag)
 614   {
 615     slashStar = flag;
 616   }
 617
 618   /**
 619    * This method returns the current token value as a <code>String</code> in
 620    * the form "Token[x], line n", where 'n' is the current line numbers and
 621    * 'x' is determined as follows.
 622    * <p>
 623    * <ul>
 624    * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0
 625    * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"
 626    * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"
 627    * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code>
 628    * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
 629    * 'strnval' is <code>String.valueOf(nval)</code>.
 630    * <li>If <code>ttype</code> is a quote character, then 'x' is
 631    * <code>sval</code>
 632    * <li>For all other cases, 'x' is <code>ttype</code>
 633    * </ul>
 634    */
 635   public String toString()
 636   {
 637     String tempstr;
 638     if (ttype == TT_EOF)
 639       tempstr = "EOF";
 640     else if (ttype == TT_EOL)
 641       tempstr = "EOL";
 642     else if (ttype == TT_WORD)
 643       tempstr = sval;
 644     else if (ttype == TT_NUMBER)
 645       tempstr = "n=" + nval;
 646     else if (ttype == TT_NONE)
 647       tempstr = "NOTHING";
 648     else // must be an ordinary char.
 649       tempstr = "\'" + (char) ttype + "\'";
 650
 651     return "Token[" + tempstr + "], line " + lineno();
 652   }
 653
 654   /**
 655    * This method sets the whitespace attribute for all characters in the
 656    * specified range, range terminators included.
 657    *
 658    * @param low The low end of the range of values to set the whitespace
 659    *            attribute for
 660    * @param high The high end of the range of values to set the whitespace
 661    *             attribute for
 662    */
 663   public void whitespaceChars(int low, int hi)
 664   {
 665     if (low < 0)
 666       low = 0;
 667     if (hi > 255)
 668       hi = 255;
 669     for (int i = low; i <= hi; i++)
 670       {
 671         resetChar(i);
 672         whitespace[i] = true;
 673       }
 674   }
 675
 676   /**
 677    * This method sets the alphabetic attribute for all characters in the
 678    * specified range, range terminators included.
 679    *
 680    * @param low The low end of the range of values to set the alphabetic
 681    *            attribute for
 682    * @param high The high end of the range of values to set the alphabetic
 683    *             attribute for
 684    */
 685   public void wordChars(int low, int hi)
 686   {
 687     if (low < 0)
 688       low = 0;
 689     if (hi > 255)
 690       hi = 255;
 691     for (int i = low; i <= hi; i++)
 692       alphabetic[i] = true;
 693   }
 694 }