From 07b5e470a6e5a28f5e887d65c98174318b940812 Mon Sep 17 00:00:00 2001
From: Tom Tromey <tromey@cygnus.com>
Date: Thu, 26 Oct 2000 00:01:46 +0000
Subject: [PATCH] lex.c (java_new_lexer): Initialize new fields.

	* lex.c (java_new_lexer): Initialize new fields.  Work around
	broken iconv() implementations.
	(java_read_char): Swap bytes if required.  Use fallback decoder if
	required.
	(byteswap_init, need_byteswap): New globals.
	(java_destroy_lexer): Only close iconv handle if it is in use.
	* lex.h (java_lexer): New fields read_anything, byte_swap,
	use_fallback.
	Made out_buffer unsigned.

From-SVN: r37063
---
 gcc/java/ChangeLog |  12 ++
 gcc/java/lex.c     | 355 ++++++++++++++++++++++++++++-----------------
 gcc/java/lex.h     |  12 +-
 3 files changed, 244 insertions(+), 135 deletions(-)

diff --git a/gcc/java/ChangeLog b/gcc/java/ChangeLog
index 14069ab06e3..53408cf3977 100644
--- a/gcc/java/ChangeLog
+++ b/gcc/java/ChangeLog
@@ -1,3 +1,15 @@
+2000-10-24  Tom Tromey  <tromey@cygnus.com>
+
+	* lex.c (java_new_lexer): Initialize new fields.  Work around
+	broken iconv() implementations.
+	(java_read_char): Swap bytes if required.  Use fallback decoder if
+	required.
+	(byteswap_init, need_byteswap): New globals.
+	(java_destroy_lexer): Only close iconv handle if it is in use.
+	* lex.h (java_lexer): New fields read_anything, byte_swap,
+	use_fallback.
+	Made out_buffer unsigned.
+
 2000-10-24  Alexandre Petit-Bianco  <apbianco@cygnus.com>
 
 	* parse.y (register_incomplete_type): Include JDEP_FIELD as a case
diff --git a/gcc/java/lex.c b/gcc/java/lex.c
index 329d6287759..b26499b314b 100644
--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
 
+/* This is nonzero if we have initialized `need_byteswap'.  */
+static int byteswap_init = 0;
+
+/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
+   big-endian order -- not native endian order.  We handle this by
+   doing a conversion once at startup and seeing what happens.  This
+   flag holds the results of this determination.  */
+static int need_byteswap = 0;
+
 void
 java_init_lex (finput, encoding)
      FILE *finput;
@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
 
 #ifdef HAVE_ICONV
   lex->handle = iconv_open ("UCS-2", encoding);
-  if (lex->handle == (iconv_t) -1)
+  if (lex->handle != (iconv_t) -1)
     {
-      /* FIXME: we should give a nice error based on errno here.  */
-      enc_error = 1;
+      lex->first = -1;
+      lex->last = -1;
+      lex->out_first = -1;
+      lex->out_last = -1;
+      lex->read_anything = 0;
+      lex->use_fallback = 0;
+
+      /* Work around broken iconv() implementations by doing checking at
+	 runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
+	 then all UCS-2 encoders will be broken.  Perhaps not a valid
+	 assumption.  */
+      if (! byteswap_init)
+	{
+	  iconv_t handle;
+
+	  byteswap_init = 1;
+
+	  handle = iconv_open ("UCS-2", "UTF-8");
+	  if (handle != (iconv_t) -1)
+	    {
+	      unicode_t result;
+	      unsigned char in[3];
+	      char *inp, *outp;
+	      size_t inc, outc, r;
+
+	      /* This is the UTF-8 encoding of \ufeff.  */
+	      in[0] = 0xef;
+	      in[1] = 0xbb;
+	      in[2] = 0xbf;
+
+	      inp = in;
+	      inc = 3;
+	      outp = (char *) &result;
+	      outc = 2;
+
+	      r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
+	      /* Conversion must be complete for us to use the result.  */
+	      if (r != (size_t) -1 && inc == 0 && outc == 0)
+		need_byteswap = (result != 0xfeff);
+	    }
+	}
+
+      lex->byte_swap = need_byteswap;
     }
-  lex->first = -1;
-  lex->last = -1;
-  lex->out_first = -1;
-  lex->out_last = -1;
-#else /* HAVE_ICONV */
-  if (strcmp (encoding, DEFAULT_ENCODING))
-    enc_error = 1;
+  else
 #endif /* HAVE_ICONV */
+    {
+      /* If iconv failed, use the internal decoder if the default
+	 encoding was requested.  This code is used on platforms where
+	 iconv() exists but is insufficient for our needs.  For
+	 instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2.  */
+      if (strcmp (encoding, DEFAULT_ENCODING))
+	enc_error = 1;
+#ifdef HAVE_ICONV
+      else
+	lex->use_fallback = 1;
+#endif /* HAVE_ICONV */
+    }
 
   if (enc_error)
     fatal ("unknown encoding: `%s'", encoding);
@@ -233,7 +289,8 @@ java_destroy_lexer (lex)
      java_lexer *lex;
 {
 #ifdef HAVE_ICONV
-  iconv_close (lex->handle);
+  if (! lex->use_fallback)
+    iconv_close (lex->handle);
 #endif
   free (lex);
 }
@@ -250,140 +307,170 @@ java_read_char (lex)
     }
 
 #ifdef HAVE_ICONV
-  {
-    size_t ir, inbytesleft, in_save, out_count, out_save;
-    char *inp, *outp;
-    unicode_t result;
+  if (! lex->use_fallback)
+    {
+      size_t ir, inbytesleft, in_save, out_count, out_save;
+      char *inp, *outp;
+      unicode_t result;
 
-    /* If there is data which has already been converted, use it.  */
-    if (lex->out_first == -1 || lex->out_first >= lex->out_last)
-      {
-	lex->out_first = 0;
-	lex->out_last = 0;
+      /* Use already-converted data if any remains; otherwise convert more.  */
+      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+	{
+	  lex->out_first = 0;
+	  lex->out_last = 0;
 
-	while (1)
-	  {
-	    /* See if we need to read more data.  If FIRST == 0 then
-	       the previous conversion attempt ended in the middle of
-	       a character at the end of the buffer.  Otherwise we
-	       only have to read if the buffer is empty.  */
-	    if (lex->first == 0 || lex->first >= lex->last)
-	      {
-		int r;
-
-		if (lex->first >= lex->last)
-		  {
-		    lex->first = 0;
-		    lex->last = 0;
-		  }
-		if (feof (lex->finput))
+	  while (1)
+	    {
+	      /* See if we need to read more data.  If FIRST == 0 then
+		 the previous conversion attempt ended in the middle of
+		 a character at the end of the buffer.  Otherwise we
+		 only have to read if the buffer is empty.  */
+	      if (lex->first == 0 || lex->first >= lex->last)
+		{
+		  int r;
+
+		  if (lex->first >= lex->last)
+		    {
+		      lex->first = 0;
+		      lex->last = 0;
+		    }
+		  if (feof (lex->finput))
+		    return UEOF;
+		  r = fread (&lex->buffer[lex->last], 1,
+			     sizeof (lex->buffer) - lex->last,
+			     lex->finput);
+		  lex->last += r;
+		}
+
+	      inbytesleft = lex->last - lex->first;
+	      out_count = sizeof (lex->out_buffer) - lex->out_last;
+
+	      if (inbytesleft == 0)
+		{
+		  /* We've tried to read and there is nothing left.  */
 		  return UEOF;
-		r = fread (&lex->buffer[lex->last], 1,
-			   sizeof (lex->buffer) - lex->last,
-			   lex->finput);
-		lex->last += r;
-	      }
+		}
 
-	    inbytesleft = lex->last - lex->first;
-	    out_count = sizeof (lex->out_buffer) - lex->out_last;
+	      in_save = inbytesleft;
+	      out_save = out_count;
+	      inp = &lex->buffer[lex->first];
+	      outp = &lex->out_buffer[lex->out_last];
+	      ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
+			  &outp, &out_count);
 
-	    if (inbytesleft == 0)
-	      {
-		/* We've tried to read and there is nothing left.  */
-		return UEOF;
-	      }
+	      /* If this is the first output we have converted, look to
+		 see whether it begins with a BOM.  */
+	      if (! lex->read_anything && out_save - out_count >= 2)
+		{
+		  unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
+		  if (uc == 0xfeff)
+		    {
+		      lex->byte_swap = 0;
+		      lex->out_first += 2;
+		    }
+		  else if (uc == 0xfffe)
+		    {
+		      lex->byte_swap = 1;
+		      lex->out_first += 2;
+		    }
+		  lex->read_anything = 1;
+		}
 
-	    in_save = inbytesleft;
-	    out_save = out_count;
-	    inp = &lex->buffer[lex->first];
-	    outp = &lex->out_buffer[lex->out_last];
-	    ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
-			&outp, &out_count);
-	    lex->first += in_save - inbytesleft;
-	    lex->out_last += out_save - out_count;
-
-	    /* If we converted anything at all, move along.  */
-	    if (out_count != out_save)
-	      break;
+	      if (lex->byte_swap)
+		{
+		  unsigned int i;
+		  for (i = 0; i < out_save - out_count; i += 2)
+		    {
+		      char t = lex->out_buffer[lex->out_last + i];
+		      lex->out_buffer[lex->out_last + i]
+			= lex->out_buffer[lex->out_last + i + 1];
+		      lex->out_buffer[lex->out_last + i + 1] = t;
+		    }
+		}
 
-	    if (ir == (size_t) -1)
-	      {
-		if (errno == EINVAL)
-		  {
-		    /* This is ok.  This means that the end of our buffer
-		       is in the middle of a character sequence.  We just
-		       move the valid part of the buffer to the beginning
-		       to force a read.  */
-		    /* We use bcopy() because it should work for
-		       overlapping strings.  Use memmove() instead... */
-		    bcopy (&lex->buffer[lex->first], &lex->buffer[0],
-			   lex->last - lex->first);
-		    lex->last -= lex->first;
-		    lex->first = 0;
-		  }
-		else
-		  {
-		    /* A more serious error.  */
-		    java_lex_error ("unrecognized character in input stream",
-				    0);
-		    return UEOF;
-		  }
-	      }
-	  }
-      }
+	      lex->first += in_save - inbytesleft;
+	      lex->out_last += out_save - out_count;
 
-    if (lex->out_first == -1 || lex->out_first >= lex->out_last)
-      {
-	/* Don't have any data.  */
-	return UEOF;
-      }
+	      /* If we converted anything at all, move along.  */
+	      if (out_count != out_save)
+		break;
 
-    /* Success.  We assume that UCS-2 is big-endian.  This appears to
-       be an ok assumption.  */
-    result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
-	      | (unsigned char) lex->out_buffer[lex->out_first + 1]);
-    lex->out_first += 2;
-    return result;
-  }
-#else /* HAVE_ICONV */
-  {
-    int c, c1, c2;
-    c = getc (lex->finput);
-
-    if (c < 128)
-      return (unicode_t)c;
-    if (c == EOF)
-      return UEOF;
-    else
-      {
-	if ((c & 0xe0) == 0xc0)
-	  {
-	    c1 = getc (lex->finput);
-	    if ((c1 & 0xc0) == 0x80)
-	      return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
-	    c = c1;
-	  }
-	else if ((c & 0xf0) == 0xe0)
-	  {
-	    c1 = getc (lex->finput);
-	    if ((c1 & 0xc0) == 0x80)
-	      {
-		c2 = getc (lex->finput);
-		if ((c2 & 0xc0) == 0x80)
-		  return (unicode_t)(((c & 0xf) << 12) + 
-				     (( c1 & 0x3f) << 6) + (c2 & 0x3f));
-		else
-		  c = c2;
-	      }
-	    else
-	      c = c1;
-	  }
+	      if (ir == (size_t) -1)
+		{
+		  if (errno == EINVAL)
+		    {
+		      /* This is ok.  This means that the end of our buffer
+			 is in the middle of a character sequence.  We just
+			 move the valid part of the buffer to the beginning
+			 to force a read.  */
+		      /* We use bcopy() because it should work for
+			 overlapping strings.  Use memmove() instead... */
+		      bcopy (&lex->buffer[lex->first], &lex->buffer[0],
+			     lex->last - lex->first);
+		      lex->last -= lex->first;
+		      lex->first = 0;
+		    }
+		  else
+		    {
+		      /* A more serious error.  */
+		      java_lex_error ("unrecognized character in input stream",
+				      0);
+		      return UEOF;
+		    }
+		}
+	    }
+	}
 
-	/* We simply don't support invalid characters.  */
-	java_lex_error ("malformed UTF-8 character", 0);
-      }
-  }
+      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+	{
+	  /* Don't have any data.  */
+	  return UEOF;
+	}
+
+      /* Success.  */
+      result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
+      lex->out_first += 2;
+      return result;
+    }
+  else
 #endif /* HAVE_ICONV */
+    {
+      int c, c1, c2;
+      c = getc (lex->finput);
+
+      if (c < 128)
+	return (unicode_t)c;
+      if (c == EOF)
+	return UEOF;
+      else
+	{
+	  if ((c & 0xe0) == 0xc0)
+	    {
+	      c1 = getc (lex->finput);
+	      if ((c1 & 0xc0) == 0x80)
+		return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
+	      c = c1;
+	    }
+	  else if ((c & 0xf0) == 0xe0)
+	    {
+	      c1 = getc (lex->finput);
+	      if ((c1 & 0xc0) == 0x80)
+		{
+		  c2 = getc (lex->finput);
+		  if ((c2 & 0xc0) == 0x80)
+		    return (unicode_t)(((c & 0xf) << 12) + 
+				       (( c1 & 0x3f) << 6) + (c2 & 0x3f));
+		  else
+		    c = c2;
+		}
+	      else
+		c = c1;
+	    }
+
+	  /* We simply don't support invalid characters.  */
+	  java_lex_error ("malformed UTF-8 character", 0);
+	}
+    }
 
   /* We only get here on error.  */
   return UEOF;
diff --git a/gcc/java/lex.h b/gcc/java/lex.h
index 71a030d2fef..ae9eebb68e5 100644
--- a/gcc/java/lex.h
+++ b/gcc/java/lex.h
@@ -115,6 +115,16 @@ typedef struct java_lexer
   unicode_t unget_value;
 
 #ifdef HAVE_ICONV
+  /* Nonzero if we've read any bytes.  We only recognize the
+     byte-order-marker (BOM) as the first word.  */
+  int read_anything : 1;
+
+  /* Nonzero if we have to byte swap.  */
+  int byte_swap : 1;
+
+  /* Nonzero if we're using the fallback decoder.  */
+  int use_fallback : 1;
+
   /* The handle for the iconv converter we're using.  */
   iconv_t handle;
 
@@ -132,7 +142,7 @@ typedef struct java_lexer
   /* This is a buffer of characters already converted by iconv.  We
      use `char' here because we're assuming that iconv() converts to
      big-endian UCS-2, and then we convert it ourselves.  */
-  char out_buffer[1024];
+  unsigned char out_buffer[1024];
 
   /* Index of first valid output character.  -1 if no valid
      characters.  */
-- 
2.30.2