From: Jesse Rosenstock Date: Mon, 11 Nov 2002 07:36:41 +0000 (+0000) Subject: ISO_8859_1.java, [...]: New files. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8a423d779f598036c5e9048fb949494a7a98d564;p=gcc.git ISO_8859_1.java, [...]: New files. 2002-11-11 Jesse Rosenstock * gnu/java/nio/charset/ISO_8859_1.java, gnu/java/nio/charset/Provider.java, gnu/java/nio/charset/US_ASCII.java, gnu/java/nio/charset/UTF_16.java, gnu/java/nio/charset/UTF_16BE.java, gnu/java/nio/charset/UTF_16Decoder.java, gnu/java/nio/charset/UTF_16Encoder.java, gnu/java/nio/charset/UTF_16LE.java, gnu/java/nio/charset/UTF_8.java: New files. * Makefile.am (): Added new files. * Makefile.in: Regenerated. From-SVN: r59013 --- diff --git a/libjava/ChangeLog b/libjava/ChangeLog index 4ddee784389..0732a5629da 100644 --- a/libjava/ChangeLog +++ b/libjava/ChangeLog @@ -1,3 +1,18 @@ +2002-11-11 Jesse Rosenstock + + * gnu/java/nio/charset/ISO_8859_1.java, + gnu/java/nio/charset/Provider.java, + gnu/java/nio/charset/US_ASCII.java, + gnu/java/nio/charset/UTF_16.java, + gnu/java/nio/charset/UTF_16BE.java, + gnu/java/nio/charset/UTF_16Decoder.java, + gnu/java/nio/charset/UTF_16Encoder.java, + gnu/java/nio/charset/UTF_16LE.java, + gnu/java/nio/charset/UTF_8.java: New files. + * Makefile.am (): + Added new files. + * Makefile.in: Regenerated. + 2002-11-11 Michael Koch * java/nio/charset/CharacterCodingException.java: diff --git a/libjava/gnu/java/nio/charset/ISO_8859_1.java b/libjava/gnu/java/nio/charset/ISO_8859_1.java new file mode 100644 index 00000000000..f29fa260300 --- /dev/null +++ b/libjava/gnu/java/nio/charset/ISO_8859_1.java @@ -0,0 +1,132 @@ +/* ISO_8859_1.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * ISO-8859-1 charset. + * + * @author Jesse Rosenstock + */ +final class ISO_8859_1 extends Charset +{ + ISO_8859_1 () + { + super ("ISO-8859-1", new String[]{"ISO-LATIN-1"}); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1; + } + + public CharsetDecoder newDecoder () + { + return new Decoder (this); + } + + public CharsetEncoder newEncoder () + { + return new Encoder (this); + } + + private static final class Decoder extends CharsetDecoder + { + private Decoder (Charset cs) + { + super (cs, 1.0f, 1.0f); + } + + protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + byte b = in.get (); + + if (!out.hasRemaining ()) + { + in.position (in.position () - 1); + return CoderResult.OVERFLOW; + } + + out.put ((char) (b & 0xFF)); + } + + return CoderResult.UNDERFLOW; + } + } + + private static final class Encoder extends CharsetEncoder + { + private Encoder (Charset cs) + { + super (cs, 1.0f, 1.0f); + } + + protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + char c = in.get (); + + if (c > 0xFF) + { + in.position (in.position () - 1); + return CoderResult.unmappableForLength (1); + } + if (!out.hasRemaining ()) + { + in.position (in.position () - 1); + return CoderResult.OVERFLOW; + } + + out.put ((byte) c); + } + + return CoderResult.UNDERFLOW; + } + } +} diff --git a/libjava/gnu/java/nio/charset/Provider.java b/libjava/gnu/java/nio/charset/Provider.java new file mode 100644 index 00000000000..13f637113e5 --- /dev/null +++ b/libjava/gnu/java/nio/charset/Provider.java @@ -0,0 +1,135 @@ +/* Provider.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.charset.Charset; +import java.nio.charset.spi.CharsetProvider; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; + +/** + * Charset provider for the required charsets. Used by + * {@link Charset#charsetForName} and * {@link Charset#availableCharsets}. + * + * @author Jesse Rosenstock + * @see Charset + */ +public final class Provider extends CharsetProvider +{ + private static Provider singleton; + + static + { + synchronized (Provider.class) + { + singleton = null; + } + } + + /** + * Map from charset name to charset canonical name. + */ + private final HashMap canonicalNames; + + /** + * Map from canonical name to Charset. + * TODO: We may want to use soft references. We would then need to keep + * track of the class name to regenerate the object. + */ + private final HashMap charsets; + + private Provider () + { + // FIXME: We might need to make the name comparison case insensitive. + // Verify this with the Sun JDK. + canonicalNames = new HashMap (); + charsets = new HashMap (); + + // US-ASCII aka ISO646-US + addCharset (new US_ASCII ()); + + // ISO-8859-1 aka ISO-LATIN-1 + addCharset (new ISO_8859_1 ()); + + // UTF-8 + addCharset (new UTF_8 ()); + + // UTF-16BE + addCharset (new UTF_16BE ()); + + // UTF-16LE + addCharset (new UTF_16LE ()); + + // UTF-16 + addCharset (new UTF_16 ()); + } + + public Iterator charsets () + { + return Collections.unmodifiableCollection (charsets.values ()) + .iterator (); + } + + public Charset charsetForName (String charsetName) + { + return (Charset) charsets.get (canonicalize (charsetName)); + } + + private Object canonicalize (String charsetName) + { + Object o = canonicalNames.get (charsetName); + return o == null ? charsetName : o; + } + + private void addCharset (Charset cs) + { + String canonicalName = cs.name (); + charsets.put (canonicalName, cs); + + for (Iterator i = cs.aliases ().iterator (); i.hasNext (); ) + canonicalNames.put (i.next (), canonicalName); + } + + public static synchronized Provider provider () + { + if (singleton == null) + singleton = new Provider (); + return singleton; + } +} diff --git a/libjava/gnu/java/nio/charset/US_ASCII.java b/libjava/gnu/java/nio/charset/US_ASCII.java new file mode 100644 index 00000000000..a1ff2510402 --- /dev/null +++ b/libjava/gnu/java/nio/charset/US_ASCII.java @@ -0,0 +1,137 @@ +/* US_ASCII.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * US-ASCII charset. + * + * @author Jesse Rosenstock + */ +final class US_ASCII extends Charset +{ + US_ASCII () + { + super ("US-ASCII", new String[]{"ISO646-US"}); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII; + } + + public CharsetDecoder newDecoder () + { + return new Decoder (this); + } + + public CharsetEncoder newEncoder () + { + return new Encoder (this); + } + + private static final class Decoder extends CharsetDecoder + { + private Decoder (Charset cs) + { + super (cs, 1.0f, 1.0f); + } + + protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + byte b = in.get (); + + if (b < 0) + { + in.position (in.position () - 1); + return CoderResult.malformedForLength (1); + } + if (!out.hasRemaining ()) + { + in.position (in.position () - 1); + return CoderResult.OVERFLOW; + } + + out.put ((char) b); + } + + return CoderResult.UNDERFLOW; + } + } + + private static final class Encoder extends CharsetEncoder + { + private Encoder (Charset cs) + { + super (cs, 1.0f, 1.0f); + } + + protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + char c = in.get (); + + if (c > Byte.MAX_VALUE) + { + in.position (in.position () - 1); + return CoderResult.unmappableForLength (1); + } + if (!out.hasRemaining ()) + { + in.position (in.position () - 1); + return CoderResult.OVERFLOW; + } + + out.put ((byte) c); + } + + return CoderResult.UNDERFLOW; + } + } +} diff --git a/libjava/gnu/java/nio/charset/UTF_16.java b/libjava/gnu/java/nio/charset/UTF_16.java new file mode 100644 index 00000000000..18c9be7f4b2 --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_16.java @@ -0,0 +1,75 @@ +/* UTF_16.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * UTF-16 charset. + * + * @author Jesse Rosenstock + */ +final class UTF_16 extends Charset +{ + UTF_16 () + { + super ("UTF-16", null); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1 + || cs instanceof UTF_8 || cs instanceof UTF_16BE + || cs instanceof UTF_16LE || cs instanceof UTF_16; + } + + public CharsetDecoder newDecoder () + { + return new UTF_16Decoder (this, UTF_16Decoder.UNKNOWN_ENDIAN); + } + + public CharsetEncoder newEncoder () + { + return new UTF_16Encoder (this, UTF_16Encoder.BIG_ENDIAN, false); + } +} diff --git a/libjava/gnu/java/nio/charset/UTF_16BE.java b/libjava/gnu/java/nio/charset/UTF_16BE.java new file mode 100644 index 00000000000..6fb28cdf830 --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_16BE.java @@ -0,0 +1,75 @@ +/* UTF_16BE.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * UTF-16BE charset. + * + * @author Jesse Rosenstock + */ +final class UTF_16BE extends Charset +{ + UTF_16BE () + { + super ("UTF-16BE", null); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1 + || cs instanceof UTF_8 || cs instanceof UTF_16BE + || cs instanceof UTF_16LE || cs instanceof UTF_16; + } + + public CharsetDecoder newDecoder () + { + return new UTF_16Decoder (this, UTF_16Decoder.BIG_ENDIAN); + } + + public CharsetEncoder newEncoder () + { + return new UTF_16Encoder (this, UTF_16Encoder.BIG_ENDIAN, true); + } +} diff --git a/libjava/gnu/java/nio/charset/UTF_16Decoder.java b/libjava/gnu/java/nio/charset/UTF_16Decoder.java new file mode 100644 index 00000000000..c8e474d5741 --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_16Decoder.java @@ -0,0 +1,169 @@ +/* UTF_16Decoder.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; + +/** + * Decoder for UTF-16, UTF-15LE, and UTF-16BE. + * + * @author Jesse Rosenstock + */ +final class UTF_16Decoder extends CharsetDecoder +{ + // byte orders + static final int BIG_ENDIAN = 0; + static final int LITTLE_ENDIAN = 1; + static final int UNKNOWN_ENDIAN = 2; + + private static final char BYTE_ORDER_MARK = '\uFEFF'; + private static final char REVERSED_BYTE_ORDER_MARK = '\uFFFE'; + + private final int originalByteOrder; + private int byteOrder; + + UTF_16Decoder (Charset cs, int byteOrder) + { + super (cs, 0.5f, 1.0f); + this.originalByteOrder = byteOrder; + this.byteOrder = byteOrder; + } + + protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + + int inPos = in.position (); + try + { + while (in.remaining () >= 2) + { + byte b1 = in.get (); + byte b2 = in.get (); + + // handle byte order mark + if (byteOrder == UNKNOWN_ENDIAN) + { + char c = (char) ((b1 << 8) | b2); + if (c == BYTE_ORDER_MARK) + { + byteOrder = BIG_ENDIAN; + inPos += 2; + continue; + } + else if (c == REVERSED_BYTE_ORDER_MARK) + { + byteOrder = LITTLE_ENDIAN; + inPos += 2; + continue; + } + else + { + // assume big endian, do not consume bytes, + // continue with normal processing + byteOrder = BIG_ENDIAN; + } + } + + char c = byteOrder == BIG_ENDIAN ? (char) ((b1 << 8) | b2) + : (char) ((b2 << 8) | b1); + + if (0xD800 <= c && c <= 0xDFFF) + { + // c is a surrogate + + // make sure c is a high surrogate + if (c > 0xDBFF) + return CoderResult.malformedForLength (2); + if (in.remaining () < 2) + return CoderResult.UNDERFLOW; + byte b3 = in.get (); + byte b4 = in.get (); + char d = byteOrder == BIG_ENDIAN ? (char) ((b3 << 8) | b4) + : (char) ((b4 << 8) | b3); + // make sure d is a low surrogate + if (d < 0xDC00 || d > 0xDFFF) + return CoderResult.malformedForLength (2); + out.put (c); + out.put (d); + inPos += 4; + } + else + { + if (!out.hasRemaining ()) + return CoderResult.UNDERFLOW; + out.put (c); + inPos += 2; + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + in.position (inPos); + } + } + + /** + * Writes c to out in the byte order + * specified by byteOrder. + **/ + private void put (ByteBuffer out, char c) + { + if (byteOrder == BIG_ENDIAN) + { + out.put ((byte) (c >> 8)); + out.put ((byte) (c & 0xFF)); + } + else + { + out.put ((byte) (c & 0xFF)); + out.put ((byte) (c >> 8)); + } + } + + protected void implReset () + { + byteOrder = originalByteOrder; + } +} diff --git a/libjava/gnu/java/nio/charset/UTF_16Encoder.java b/libjava/gnu/java/nio/charset/UTF_16Encoder.java new file mode 100644 index 00000000000..b0cb9ed8ce1 --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_16Encoder.java @@ -0,0 +1,153 @@ +/* UTF_16Encoder.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * Encoder for UTF-16, UTF-15LE, and UTF-16BE. + * + * @author Jesse Rosenstock + */ +final class UTF_16Encoder extends CharsetEncoder +{ + // byte orders + static final int BIG_ENDIAN = 0; + static final int LITTLE_ENDIAN = 1; + + private static final char BYTE_ORDER_MARK = '\uFEFF'; + + private final int byteOrder; + private final boolean useByteOrderMark; + private boolean needsByteOrderMark; + + UTF_16Encoder (Charset cs, int byteOrder, boolean useByteOrderMark) + { + super (cs, 2.0f, + useByteOrderMark ? 4.0f : 2.0f, + byteOrder == BIG_ENDIAN + ? new byte[] { (byte) 0xFF, (byte) 0xFD } + : new byte[] { (byte) 0xFD, (byte) 0xFF }); + this.byteOrder = byteOrder; + this.useByteOrderMark = useByteOrderMark; + this.needsByteOrderMark = useByteOrderMark; + } + + protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + + if (needsByteOrderMark) + { + if (out.remaining () < 2) + return CoderResult.OVERFLOW; + put (out, BYTE_ORDER_MARK); + needsByteOrderMark = false; + } + + int inPos = in.position (); + try + { + while (in.hasRemaining ()) + { + char c = in.get (); + + if (0xD800 <= c && c <= 0xDFFF) + { + // c is a surrogate + + // make sure c is a high surrogate + if (c > 0xDBFF) + return CoderResult.malformedForLength (1); + if (in.remaining () < 1) + return CoderResult.UNDERFLOW; + char d = in.get (); + // make sure d is a low surrogate + if (d < 0xDC00 || d > 0xDFFF) + return CoderResult.malformedForLength (1); + put (out, c); + put (out, d); + inPos += 2; + } + else + { + if (out.remaining () < 2) + return CoderResult.OVERFLOW; + put (out, c); + inPos++; + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + in.position (inPos); + } + } + + /** + * Writes c to out in the byte order + * specified by byteOrder. + **/ + private void put (ByteBuffer out, char c) + { + if (byteOrder == BIG_ENDIAN) + { + out.put ((byte) (c >> 8)); + out.put ((byte) (c & 0xFF)); + } + else + { + out.put ((byte) (c & 0xFF)); + out.put ((byte) (c >> 8)); + } + } + + protected void implReset () + { + needsByteOrderMark = useByteOrderMark; + } + + // TODO: override canEncode(char) and canEncode(CharSequence) + // for performance +} diff --git a/libjava/gnu/java/nio/charset/UTF_16LE.java b/libjava/gnu/java/nio/charset/UTF_16LE.java new file mode 100644 index 00000000000..b914ae07273 --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_16LE.java @@ -0,0 +1,75 @@ +/* UTF_16LE.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * UTF-16LE charset. + * + * @author Jesse Rosenstock + */ +final class UTF_16LE extends Charset +{ + UTF_16LE () + { + super ("UTF-16LE", null); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1 + || cs instanceof UTF_8 || cs instanceof UTF_16BE + || cs instanceof UTF_16LE || cs instanceof UTF_16; + } + + public CharsetDecoder newDecoder () + { + return new UTF_16Decoder (this, UTF_16Decoder.LITTLE_ENDIAN); + } + + public CharsetEncoder newEncoder () + { + return new UTF_16Encoder (this, UTF_16Encoder.LITTLE_ENDIAN, true); + } +} diff --git a/libjava/gnu/java/nio/charset/UTF_8.java b/libjava/gnu/java/nio/charset/UTF_8.java new file mode 100644 index 00000000000..aa623b2f2cf --- /dev/null +++ b/libjava/gnu/java/nio/charset/UTF_8.java @@ -0,0 +1,279 @@ +/* UTF_8.java -- + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +02111-1307 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * UTF-8 charset. + * + *

UTF-8 references: + *

+ * + * @author Jesse Rosenstock + */ +final class UTF_8 extends Charset +{ + UTF_8 () + { + super ("UTF-8", null); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1 + || cs instanceof UTF_8 || cs instanceof UTF_16BE + || cs instanceof UTF_16LE || cs instanceof UTF_16; + } + + public CharsetDecoder newDecoder () + { + return new Decoder (this); + } + + public CharsetEncoder newEncoder () + { + return new Encoder (this); + } + + private static final class Decoder extends CharsetDecoder + { + private Decoder (Charset cs) + { + super (cs, 1.0f, 1.0f); + } + + protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + int inPos = 0; + try + { + while (in.hasRemaining ()) + { + char c; + byte b1 = in.get (); + int highNibble = (b1 >> 4) & 0xF; + + switch (highNibble) + { + case 0: case 1: case 2: case 3: + case 4: case 5: case 6: case 7: + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + out.put ((char) b1); + inPos++; + break; + + case 0xC: case 0xD: + byte b2; + if (in.remaining () < 1) + return CoderResult.UNDERFLOW; + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + if (!isContinuation (b2 = in.get ())) + return CoderResult.malformedForLength (1); + c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F)); + // check that we had the shortest encoding + if (c <= 0x7F) + return CoderResult.malformedForLength (2); + out.put (c); + inPos += 2; + break; + + case 0xE: + byte b3; + if (in.remaining () < 2) + return CoderResult.UNDERFLOW; + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + if (!isContinuation (b2 = in.get ())) + return CoderResult.malformedForLength (1); + if (!isContinuation (b3 = in.get ())) + return CoderResult.malformedForLength (1); + c = (char) (((b1 & 0x0F) << 12) + | ((b2 & 0x3F) << 6) + | (b3 & 0x3F)); + // check that we had the shortest encoding + if (c <= 0x7FF) + return CoderResult.malformedForLength (3); + out.put (c); + inPos += 3; + break; + + default: + return CoderResult.malformedForLength (1); + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + // In case we did a get(), then encountered an error, reset the + // position to before the error. If there was no error, this + // will benignly reset the position to the value it already has. + in.position (inPos); + } + } + + private static boolean isContinuation (byte b) + { + return (b & 0xC0) == 0x80; + } + } + + private static final class Encoder extends CharsetEncoder + { + private Encoder (Charset cs) + { + // According to + // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html + // On average, English takes slightly over one unit per code point. + // Most Latin-script languages take about 1.1 bytes. Greek, Russian, + // Arabic and Hebrew take about 1.7 bytes, and most others (including + // Japanese, Chinese, Korean and Hindi) take about 3 bytes. + // We assume we will be dealing with latin scripts, and use 1.1 + // for averageBytesPerChar. + super (cs, 1.1f, 4.0f); + } + + protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) + { + int inPos = 0; + try + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + int remaining = out.remaining (); + char c = in.get (); + + // UCS-4 range (hex.) UTF-8 octet sequence (binary) + // 0000 0000-0000 007F 0xxxxxxx + // 0000 0080-0000 07FF 110xxxxx 10xxxxxx + // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + + // Scalar Value UTF-16 byte 1 byte 2 byte 3 byte 4 + // 0000 0000 0xxx xxxx 0000 0000 0xxx xxxx 0xxx xxxx + // 0000 0yyy yyxx xxxx 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx + // zzzz yyyy yyxx xxxx zzzz yyyy yyxx xxxx 1110 zzzz 10yy yyyy 10xx xxxx + // u uuuu zzzz yyyy yyxx xxxx 1101 10ww wwzz zzyy 1111 0uuu 10uu zzzz 10yy yyyy 10xx xxxx + // + 1101 11yy yyxx xxxx + // Note: uuuuu = wwww + 1 + + if (c <= 0x7F) + { + if (remaining < 1) + return CoderResult.OVERFLOW; + out.put ((byte) c); + inPos++; + } + else if (c <= 0x7FF) + { + if (remaining < 2) + return CoderResult.OVERFLOW; + out.put ((byte) (0xC0 | (c >> 6))); + out.put ((byte) (0x80 | (c & 0x3F))); + inPos++; + } + else if (0xD800 <= c && c <= 0xDFFF) + { + if (remaining < 4) + return CoderResult.OVERFLOW; + + // we got a low surrogate without a preciding high one + if (c > 0xDBFF) + return CoderResult.malformedForLength (1); + + // high surrogates + if (!in.hasRemaining ()) + return CoderResult.UNDERFLOW; + + char d = in.get (); + + // make sure d is a low surrogate + if (d < 0xDC00 || d > 0xDFFF) + return CoderResult.malformedForLength (1); + + // make the 32 bit value + // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000; + int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000; + // assert value == value2; + out.put ((byte) (0xF0 | (value >> 18))); + out.put ((byte) (0x80 | ((value >> 12) & 0x3F))); + out.put ((byte) (0x80 | ((value >> 6) & 0x3F))); + out.put ((byte) (0x80 | ((value ) & 0x3F))); + + inPos += 2; + } + else + { + if (remaining < 3) + return CoderResult.OVERFLOW; + + out.put ((byte) (0xE0 | (c >> 12))); + out.put ((byte) (0x80 | ((c >> 6) & 0x3F))); + out.put ((byte) (0x80 | (c & 0x3F))); + inPos++; + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + // In case we did a get(), then encountered an error, reset the + // position to before the error. If there was no error, this + // will benignly reset the position to the value it already has. + in.position (inPos); + } + } + } +}