lex.c (java_new_lexer): Initialize new fields.
author: Tom Tromey <tromey@cygnus.com>
Thu, 26 Oct 2000 00:01:46 +0000 (00:01 +0000)
committer: Tom Tromey <tromey@gcc.gnu.org>
Thu, 26 Oct 2000 00:01:46 +0000 (00:01 +0000)
* lex.c (java_new_lexer): Initialize new fields.  Work around
broken iconv() implementations.
(java_read_char): Swap bytes if required.  Use fallback decoder if
required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap,
use_fallback.
Made out_buffer unsigned.

From-SVN: r37063

gcc/java/ChangeLog
gcc/java/lex.c
gcc/java/lex.h

diff --git a/gcc/java/ChangeLog b/gcc/java/ChangeLog
index 14069ab06e31ffc6f8ec86faf0061d3cca6b0b63..53408cf3977e842ef850d870be7cfaf3c965c7db 100644
--- a/gcc/java/ChangeLog
+++ b/gcc/java/ChangeLog
@@ -1,3 +1,15 @@
+2000-10-24  Tom Tromey  <tromey@cygnus.com>
+
+       * lex.c (java_new_lexer): Initialize new fields.  Work around
+       broken iconv() implementations.
+       (java_read_char): Swap bytes if required.  Use fallback decoder if
+       required.
+       (byteswap_init, need_byteswap): New globals.
+       (java_destroy_lexer): Only close iconv handle if it is in use.
+       * lex.h (java_lexer): New fields read_anything, byte_swap,
+       use_fallback.
+       Made out_buffer unsigned.
+
 2000-10-24  Alexandre Petit-Bianco  <apbianco@cygnus.com>
 
        * parse.y (register_incomplete_type): Include JDEP_FIELD as a case
diff --git a/gcc/java/lex.c b/gcc/java/lex.c
index 329d62877599fea125c99a515302e31cd2db72aa..b26499b314bd9355da21b090e65a76ed0c38537a 100644
--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
 
+/* This is nonzero if we have initialized `need_byteswap'.  */
+static int byteswap_init = 0;
+
+/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
+   big-endian order -- not native endian order.  We handle this by
+   doing a conversion once at startup and seeing what happens.  This
+   flag holds the results of this determination.  */
+static int need_byteswap = 0;
+
 void
 java_init_lex (finput, encoding)
      FILE *finput;
@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
 
 #ifdef HAVE_ICONV
   lex->handle = iconv_open ("UCS-2", encoding);
-  if (lex->handle == (iconv_t) -1)
+  if (lex->handle != (iconv_t) -1)
     {
-      /* FIXME: we should give a nice error based on errno here.  */
-      enc_error = 1;
+      lex->first = -1;
+      lex->last = -1;
+      lex->out_first = -1;
+      lex->out_last = -1;
+      lex->read_anything = 0;
+      lex->use_fallback = 0;
+
+      /* Work around broken iconv() implementations by doing checking at
+        runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
+        then all UCS-2 encoders will be broken.  Perhaps not a valid
+        assumption.  */
+      if (! byteswap_init)
+       {
+         iconv_t handle;
+
+         byteswap_init = 1;
+
+         handle = iconv_open ("UCS-2", "UTF-8");
+         if (handle != (iconv_t) -1)
+           {
+             unicode_t result;
+             unsigned char in[3];
+             char *inp, *outp;
+             size_t inc, outc, r;
+
+             /* This is the UTF-8 encoding of \ufeff.  */
+             in[0] = 0xef;
+             in[1] = 0xbb;
+             in[2] = 0xbf;
+
+             inp = in;
+             inc = 3;
+             outp = (char *) &result;
+             outc = 2;
+
+             r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
+             /* Conversion must be complete for us to use the result.  */
+             if (r != (size_t) -1 && inc == 0 && outc == 0)
+               need_byteswap = (result != 0xfeff);
+           }
+       }
+
+      lex->byte_swap = need_byteswap;
     }
-  lex->first = -1;
-  lex->last = -1;
-  lex->out_first = -1;
-  lex->out_last = -1;
-#else /* HAVE_ICONV */
-  if (strcmp (encoding, DEFAULT_ENCODING))
-    enc_error = 1;
+  else
 #endif /* HAVE_ICONV */
+    {
+      /* If iconv failed, use the internal decoder if the default
+        encoding was requested.  This code is used on platforms where
+        iconv() exists but is insufficient for our needs.  For
+        instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2.  */
+      if (strcmp (encoding, DEFAULT_ENCODING))
+       enc_error = 1;
+#ifdef HAVE_ICONV
+      else
+       lex->use_fallback = 1;
+#endif /* HAVE_ICONV */
+    }
 
   if (enc_error)
     fatal ("unknown encoding: `%s'", encoding);
@@ -233,7 +289,8 @@ java_destroy_lexer (lex)
      java_lexer *lex;
 {
 #ifdef HAVE_ICONV
-  iconv_close (lex->handle);
+  if (! lex->use_fallback)
+    iconv_close (lex->handle);
 #endif
   free (lex);
 }
@@ -250,140 +307,170 @@ java_read_char (lex)
     }
 
 #ifdef HAVE_ICONV
-  {
-    size_t ir, inbytesleft, in_save, out_count, out_save;
-    char *inp, *outp;
-    unicode_t result;
+  if (! lex->use_fallback)
+    {
+      size_t ir, inbytesleft, in_save, out_count, out_save;
+      char *inp, *outp;
+      unicode_t result;
 
-    /* If there is data which has already been converted, use it.  */
-    if (lex->out_first == -1 || lex->out_first >= lex->out_last)
-      {
-       lex->out_first = 0;
-       lex->out_last = 0;
+      /* If there is data which has already been converted, use it.  */
+      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+       {
+         lex->out_first = 0;
+         lex->out_last = 0;
 
-       while (1)
-         {
-           /* See if we need to read more data.  If FIRST == 0 then
-              the previous conversion attempt ended in the middle of
-              a character at the end of the buffer.  Otherwise we
-              only have to read if the buffer is empty.  */
-           if (lex->first == 0 || lex->first >= lex->last)
-             {
-               int r;
-
-               if (lex->first >= lex->last)
-                 {
-                   lex->first = 0;
-                   lex->last = 0;
-                 }
-               if (feof (lex->finput))
+         while (1)
+           {
+             /* See if we need to read more data.  If FIRST == 0 then
+                the previous conversion attempt ended in the middle of
+                a character at the end of the buffer.  Otherwise we
+                only have to read if the buffer is empty.  */
+             if (lex->first == 0 || lex->first >= lex->last)
+               {
+                 int r;
+
+                 if (lex->first >= lex->last)
+                   {
+                     lex->first = 0;
+                     lex->last = 0;
+                   }
+                 if (feof (lex->finput))
+                   return UEOF;
+                 r = fread (&lex->buffer[lex->last], 1,
+                            sizeof (lex->buffer) - lex->last,
+                            lex->finput);
+                 lex->last += r;
+               }
+
+             inbytesleft = lex->last - lex->first;
+             out_count = sizeof (lex->out_buffer) - lex->out_last;
+
+             if (inbytesleft == 0)
+               {
+                 /* We've tried to read and there is nothing left.  */
                  return UEOF;
-               r = fread (&lex->buffer[lex->last], 1,
-                          sizeof (lex->buffer) - lex->last,
-                          lex->finput);
-               lex->last += r;
-             }
+               }
 
-           inbytesleft = lex->last - lex->first;
-           out_count = sizeof (lex->out_buffer) - lex->out_last;
+             in_save = inbytesleft;
+             out_save = out_count;
+             inp = &lex->buffer[lex->first];
+             outp = &lex->out_buffer[lex->out_last];
+             ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
+                         &outp, &out_count);
 
-           if (inbytesleft == 0)
-             {
-               /* We've tried to read and there is nothing left.  */
-               return UEOF;
-             }
+             /* If we haven't read any bytes, then look to see if we
+                have read a BOM.  */
+             if (! lex->read_anything && out_save - out_count >= 2)
+               {
+                 unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
+                 if (uc == 0xfeff)
+                   {
+                     lex->byte_swap = 0;
+                     lex->out_first += 2;
+                   }
+                 else if (uc == 0xfffe)
+                   {
+                     lex->byte_swap = 1;
+                     lex->out_first += 2;
+                   }
+                 lex->read_anything = 1;
+               }
 
-           in_save = inbytesleft;
-           out_save = out_count;
-           inp = &lex->buffer[lex->first];
-           outp = &lex->out_buffer[lex->out_last];
-           ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
-                       &outp, &out_count);
-           lex->first += in_save - inbytesleft;
-           lex->out_last += out_save - out_count;
-
-           /* If we converted anything at all, move along.  */
-           if (out_count != out_save)
-             break;
+             if (lex->byte_swap)
+               {
+                 unsigned int i;
+                 for (i = 0; i < out_save - out_count; i += 2)
+                   {
+                     char t = lex->out_buffer[lex->out_last + i];
+                     lex->out_buffer[lex->out_last + i]
+                       = lex->out_buffer[lex->out_last + i + 1];
+                     lex->out_buffer[lex->out_last + i + 1] = t;
+                   }
+               }
 
-           if (ir == (size_t) -1)
-             {
-               if (errno == EINVAL)
-                 {
-                   /* This is ok.  This means that the end of our buffer
-                      is in the middle of a character sequence.  We just
-                      move the valid part of the buffer to the beginning
-                      to force a read.  */
-                   /* We use bcopy() because it should work for
-                      overlapping strings.  Use memmove() instead... */
-                   bcopy (&lex->buffer[lex->first], &lex->buffer[0],
-                          lex->last - lex->first);
-                   lex->last -= lex->first;
-                   lex->first = 0;
-                 }
-               else
-                 {
-                   /* A more serious error.  */
-                   java_lex_error ("unrecognized character in input stream",
-                                   0);
-                   return UEOF;
-                 }
-             }
-         }
-      }
+             lex->first += in_save - inbytesleft;
+             lex->out_last += out_save - out_count;
 
-    if (lex->out_first == -1 || lex->out_first >= lex->out_last)
-      {
-       /* Don't have any data.  */
-       return UEOF;
-      }
+             /* If we converted anything at all, move along.  */
+             if (out_count != out_save)
+               break;
 
-    /* Success.  We assume that UCS-2 is big-endian.  This appears to
-       be an ok assumption.  */
-    result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
-             | (unsigned char) lex->out_buffer[lex->out_first + 1]);
-    lex->out_first += 2;
-    return result;
-  }
-#else /* HAVE_ICONV */
-  {
-    int c, c1, c2;
-    c = getc (lex->finput);
-
-    if (c < 128)
-      return (unicode_t)c;
-    if (c == EOF)
-      return UEOF;
-    else
-      {
-       if ((c & 0xe0) == 0xc0)
-         {
-           c1 = getc (lex->finput);
-           if ((c1 & 0xc0) == 0x80)
-             return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
-           c = c1;
-         }
-       else if ((c & 0xf0) == 0xe0)
-         {
-           c1 = getc (lex->finput);
-           if ((c1 & 0xc0) == 0x80)
-             {
-               c2 = getc (lex->finput);
-               if ((c2 & 0xc0) == 0x80)
-                 return (unicode_t)(((c & 0xf) << 12) + 
-                                    (( c1 & 0x3f) << 6) + (c2 & 0x3f));
-               else
-                 c = c2;
-             }
-           else
-             c = c1;
-         }
+             if (ir == (size_t) -1)
+               {
+                 if (errno == EINVAL)
+                   {
+                     /* This is ok.  This means that the end of our buffer
+                        is in the middle of a character sequence.  We just
+                        move the valid part of the buffer to the beginning
+                        to force a read.  */
+                     /* We use bcopy() because it should work for
+                        overlapping strings.  Use memmove() instead... */
+                     bcopy (&lex->buffer[lex->first], &lex->buffer[0],
+                            lex->last - lex->first);
+                     lex->last -= lex->first;
+                     lex->first = 0;
+                   }
+                 else
+                   {
+                     /* A more serious error.  */
+                     java_lex_error ("unrecognized character in input stream",
+                                     0);
+                     return UEOF;
+                   }
+               }
+           }
+       }
 
-       /* We simply don't support invalid characters.  */
-       java_lex_error ("malformed UTF-8 character", 0);
-      }
-  }
+      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+       {
+         /* Don't have any data.  */
+         return UEOF;
+       }
+
+      /* Success.  */
+      result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
+      lex->out_first += 2;
+      return result;
+    }
+  else
 #endif /* HAVE_ICONV */
+    {
+      int c, c1, c2;
+      c = getc (lex->finput);
+
+      if (c < 128)
+       return (unicode_t)c;
+      if (c == EOF)
+       return UEOF;
+      else
+       {
+         if ((c & 0xe0) == 0xc0)
+           {
+             c1 = getc (lex->finput);
+             if ((c1 & 0xc0) == 0x80)
+               return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
+             c = c1;
+           }
+         else if ((c & 0xf0) == 0xe0)
+           {
+             c1 = getc (lex->finput);
+             if ((c1 & 0xc0) == 0x80)
+               {
+                 c2 = getc (lex->finput);
+                 if ((c2 & 0xc0) == 0x80)
+                   return (unicode_t)(((c & 0xf) << 12) + 
+                                      (( c1 & 0x3f) << 6) + (c2 & 0x3f));
+                 else
+                   c = c2;
+               }
+             else
+               c = c1;
+           }
+
+         /* We simply don't support invalid characters.  */
+         java_lex_error ("malformed UTF-8 character", 0);
+       }
+    }
 
   /* We only get here on error.  */
   return UEOF;
diff --git a/gcc/java/lex.h b/gcc/java/lex.h
index 71a030d2fefd50a04f9fe5794260542d98b039de..ae9eebb68e550904c9d3171bb1946d48f2fcc0ea 100644
--- a/gcc/java/lex.h
+++ b/gcc/java/lex.h
@@ -115,6 +115,16 @@ typedef struct java_lexer
   unicode_t unget_value;
 
 #ifdef HAVE_ICONV
+  /* Nonzero if we've read any bytes.  We only recognize the
+     byte-order-marker (BOM) as the first word.  */
+  int read_anything : 1;
+
+  /* Nonzero if we have to byte swap.  */
+  int byte_swap : 1;
+
+  /* Nonzero if we're using the fallback decoder.  */
+  int use_fallback : 1;
+
   /* The handle for the iconv converter we're using.  */
   iconv_t handle;
 
@@ -132,7 +142,7 @@ typedef struct java_lexer
   /* This is a buffer of characters already converted by iconv.  We
      use `char' here because we're assuming that iconv() converts to
      big-endian UCS-2, and then we convert it ourselves.  */
-  char out_buffer[1024];
+  unsigned char out_buffer[1024];
 
   /* Index of first valid output character.  -1 if no valid
      characters.  */