Initial revision
[gcc.git] / libjava / gnu / gcj / text / WordBreakIterator.java
1 // Default word BreakIterator.
2
3 /* Copyright (C) 1999 Cygnus Solutions
4
5 This file is part of libgcj.
6
7 This software is copyrighted work licensed under the terms of the
8 Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
9 details. */
10
11 package gnu.gcj.text;
12
13 import java.text.BreakIterator;
14 import java.text.CharacterIterator;
15
16 /**
17 * @author Tom Tromey <tromey@cygnus.com>
18 * @date March 22, 1999
19 * Written using The Unicode Standard, Version 2.0.
20 */
21
22 public class WordBreakIterator extends BaseBreakIterator
23 {
24 public Object clone ()
25 {
26 return new WordBreakIterator (this);
27 }
28
29 public WordBreakIterator ()
30 {
31 iter = null;
32 }
33
34 private WordBreakIterator (WordBreakIterator other)
35 {
36 iter = (CharacterIterator) other.iter.clone();
37 }
38
39 // Some methods to tell us different properties of characters.
40 private final boolean isHira (char c)
41 {
42 return c >= 0x3040 && c <= 0x309f;
43 }
44 private final boolean isKata (char c)
45 {
46 return c >= 0x30a0 && c <= 0x30ff;
47 }
48 private final boolean isHan (char c)
49 {
50 return c >= 0x4e00 && c <= 0x9fff;
51 }
52
53 public int next ()
54 {
55 int end = iter.getEndIndex();
56 if (iter.getIndex() == end)
57 return DONE;
58
59 while (iter.getIndex() < end)
60 {
61 char c = iter.current();
62 if (c == CharacterIterator.DONE)
63 break;
64 int type = Character.getType(c);
65
66 char n = iter.next();
67 if (n == CharacterIterator.DONE)
68 break;
69
70 // Break after paragraph separators.
71 if (type == Character.PARAGRAPH_SEPARATOR
72 || type == Character.LINE_SEPARATOR)
73 break;
74
75 // Break between letters and non-letters.
76 // FIXME: we treat apostrophe as part of a word. This
77 // is an English-ism.
78 boolean is_letter = Character.isLetter(c);
79 if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
80 && Character.isLetter(n))
81 break;
82
83 // Always break after certain symbols, such as punctuation.
84 // This heuristic is derived from hints in the JCL book and is
85 // not part of Unicode. It seems to be right, however.
86 // FIXME: we treat apostrophe as part of a word. This
87 // is an English-ism.
88 if (c != '\''
89 && (type == Character.DASH_PUNCTUATION
90 || type == Character.START_PUNCTUATION
91 || type == Character.END_PUNCTUATION
92 || type == Character.CONNECTOR_PUNCTUATION
93 || type == Character.OTHER_PUNCTUATION
94 || type == Character.MATH_SYMBOL
95 || type == Character.CURRENCY_SYMBOL
96 || type == Character.MODIFIER_SYMBOL
97 || type == Character.OTHER_SYMBOL
98 || type == Character.FORMAT
99 || type == Character.CONTROL))
100 break;
101
102 boolean is_hira = isHira (c);
103 boolean is_kata = isKata (c);
104 boolean is_han = isHan (c);
105
106 // Special case Japanese.
107 if (! is_hira && ! is_kata && ! is_han
108 && type != Character.NON_SPACING_MARK
109 && (isHira (n) || isKata (n) || isHan (n)))
110 break;
111
112 if (is_hira || is_kata || is_han || is_letter)
113 {
114 // Now we need to do some lookahead. We might need to do
115 // quite a bit of lookahead, so we save our position and
116 // restore it later.
117 int save = iter.getIndex();
118 // Skip string of non spacing marks.
119 while (n != CharacterIterator.DONE
120 && Character.getType(n) == Character.NON_SPACING_MARK)
121 n = iter.next();
122 if (n == CharacterIterator.DONE)
123 break;
124 if ((is_hira && ! isHira (n))
125 || (is_kata && ! isHira (n) && ! isKata (n))
126 || (is_han && ! isHira (n) && ! isHan (n))
127 // FIXME: we treat apostrophe as part of a word. This
128 // is an English-ism.
129 || (is_letter && ! Character.isLetter(n) && n != '\''))
130 break;
131 iter.setIndex(save);
132 }
133 }
134
135 return iter.getIndex();
136 }
137
138 public int previous ()
139 {
140 int start = iter.getBeginIndex();
141 if (iter.getIndex() == start)
142 return DONE;
143
144 while (iter.getIndex() >= start)
145 {
146 char c = iter.previous();
147 if (c == CharacterIterator.DONE)
148 break;
149
150 boolean is_hira = isHira (c);
151 boolean is_kata = isKata (c);
152 boolean is_han = isHan (c);
153 boolean is_letter = Character.isLetter(c);
154
155 char n = iter.previous();
156 if (n == CharacterIterator.DONE)
157 break;
158 iter.next();
159 int type = Character.getType(n);
160 // Break after paragraph separators.
161 if (type == Character.PARAGRAPH_SEPARATOR
162 || type == Character.LINE_SEPARATOR)
163 break;
164
165 // Break between letters and non-letters.
166 // FIXME: we treat apostrophe as part of a word. This
167 // is an English-ism.
168 if (n != '\'' && ! Character.isLetter(n)
169 && type != Character.NON_SPACING_MARK
170 && is_letter)
171 break;
172
173 // Always break after certain symbols, such as punctuation.
174 // This heuristic is derived from hints in the JCL book and is
175 // not part of Unicode. It seems to be right, however.
176 // FIXME: we treat apostrophe as part of a word. This
177 // is an English-ism.
178 if (n != '\''
179 && (type == Character.DASH_PUNCTUATION
180 || type == Character.START_PUNCTUATION
181 || type == Character.END_PUNCTUATION
182 || type == Character.CONNECTOR_PUNCTUATION
183 || type == Character.OTHER_PUNCTUATION
184 || type == Character.MATH_SYMBOL
185 || type == Character.CURRENCY_SYMBOL
186 || type == Character.MODIFIER_SYMBOL
187 || type == Character.OTHER_SYMBOL
188 || type == Character.FORMAT
189 || type == Character.CONTROL))
190 break;
191
192 // Special case Japanese.
193 if ((is_hira || is_kata || is_han)
194 && ! isHira (n) && ! isKata (n) && ! isHan (n)
195 && type != Character.NON_SPACING_MARK)
196 break;
197
198 // We might have to skip over non spacing marks to see what's
199 // on the other side.
200 if (! is_hira || (! is_letter && c != '\''))
201 {
202 int save = iter.getIndex();
203 while (n != CharacterIterator.DONE
204 && Character.getType(n) == Character.NON_SPACING_MARK)
205 n = iter.previous();
206 iter.setIndex(save);
207 // This is a strange case: a bunch of non-spacing marks at
208 // the beginning. We treat the current location as a word
209 // break.
210 if (n == CharacterIterator.DONE)
211 break;
212 if ((isHira (n) && ! is_hira)
213 || (isKata (n) && ! is_hira && ! is_kata)
214 || (isHan (n) && ! is_hira && ! is_han)
215 // FIXME: we treat apostrophe as part of a word. This
216 // is an English-ism.
217 || (! is_letter && c != '\'' && Character.isLetter(n)))
218 break;
219 }
220 }
221
222 return iter.getIndex();
223 }
224 }