1 // Default word BreakIterator.
3 /* Copyright (C) 1999 Cygnus Solutions
5 This file is part of libgcj.
7 This software is copyrighted work licensed under the terms of the
8 Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
13 import java
.text
.BreakIterator
;
14 import java
.text
.CharacterIterator
;
17 * @author Tom Tromey <tromey@cygnus.com>
18 * @date March 22, 1999
19 * Written using The Unicode Standard, Version 2.0.
22 public class WordBreakIterator
extends BaseBreakIterator
// Returns a copy of this iterator. The copy is built by the private
// copy constructor, which clones the source iterator's `iter` field.
24 public Object
clone ()
26 return new WordBreakIterator (this);
// Public no-argument constructor.
// NOTE(review): the constructor body is not present in this text;
// confirm what initial value it gives `iter`.
29 public WordBreakIterator ()
// Copy constructor used by clone(). Gives the new object its own clone of
// the other iterator's CharacterIterator rather than sharing the instance.
// NOTE(review): other.iter is dereferenced without a null check; confirm
// clone() cannot be reached while `iter` is still unset, otherwise this
// throws a NullPointerException.
34 private WordBreakIterator (WordBreakIterator other
)
36 iter
= (CharacterIterator
) other
.iter
.clone();
39 // Some methods to tell us different properties of characters.
// Returns true when c is in the inclusive range U+3040..U+309F, the range
// this class treats as Hiragana.
40 private final boolean isHira (char c
)
42 return c
>= 0x3040 && c
<= 0x309f;
// Returns true when c is in the inclusive range U+30A0..U+30FF, the range
// this class treats as Katakana.
44 private final boolean isKata (char c
)
46 return c
>= 0x30a0 && c
<= 0x30ff;
// Returns true when c is in the inclusive range U+4E00..U+9FFF, the range
// this class treats as Han ideographs. Characters outside that single
// range are never reported as Han by this test.
48 private final boolean isHan (char c
)
50 return c
>= 0x4e00 && c
<= 0x9fff;
// Forward scan: works from iter's current index toward iter.getEndIndex()
// and finally returns iter.getIndex().
// NOTE(review): the method signature (presumably next()), the declaration
// of the look-ahead character n, and the statement guarded by each `if`
// are not present in this text. Judging by the comments below, each test
// marks a position where the scan stops -- confirm against the full file.
55 int end
= iter
.getEndIndex();
// Special-cases an iterator that is already positioned at the end.
56 if (iter
.getIndex() == end
)
59 while (iter
.getIndex() < end
)
// c is the character at the current position.
61 char c
= iter
.current();
62 if (c
== CharacterIterator
.DONE
)
// General category of c; most of the tests below are driven by it.
64 int type
= Character
.getType(c
);
// n is the look-ahead character (presumably the one following c).
67 if (n
== CharacterIterator
.DONE
)
70 // Break after paragraph separators.
71 if (type
== Character
.PARAGRAPH_SEPARATOR
72 || type
== Character
.LINE_SEPARATOR
)
75 // Break between letters and non-letters.
76 // FIXME: we treat apostrophe as part of a word. This is an English-ism.
78 boolean is_letter
= Character
.isLetter(c
);
// True when c is not an apostrophe, not a letter and not a non-spacing
// mark, while n is a letter.
79 if (c
!= '\'' && ! is_letter
&& type
!= Character
.NON_SPACING_MARK
80 && Character
.isLetter(n
))
83 // Always break after certain symbols, such as punctuation.
84 // This heuristic is derived from hints in the JCL book and is
85 // not part of Unicode. It seems to be right, however.
86 // FIXME: we treat apostrophe as part of a word. This is an English-ism.
// NOTE(review): the opening of this condition is not present in this text;
// only the list of general categories that trigger it is visible.
89 && (type
== Character
.DASH_PUNCTUATION
90 || type
== Character
.START_PUNCTUATION
91 || type
== Character
.END_PUNCTUATION
92 || type
== Character
.CONNECTOR_PUNCTUATION
93 || type
== Character
.OTHER_PUNCTUATION
94 || type
== Character
.MATH_SYMBOL
95 || type
== Character
.CURRENCY_SYMBOL
96 || type
== Character
.MODIFIER_SYMBOL
97 || type
== Character
.OTHER_SYMBOL
98 || type
== Character
.FORMAT
99 || type
== Character
.CONTROL
))
// Script classification of c, using the range tests defined in this class.
102 boolean is_hira
= isHira (c
);
103 boolean is_kata
= isKata (c
);
104 boolean is_han
= isHan (c
);
106 // Special case Japanese.
// True when c is none of Hiragana/Katakana/Han and not a non-spacing
// mark, while n is one of those three scripts.
107 if (! is_hira
&& ! is_kata
&& ! is_han
108 && type
!= Character
.NON_SPACING_MARK
109 && (isHira (n
) || isKata (n
) || isHan (n
)))
// The look-ahead below only applies when c is a letter or one of the
// three Japanese scripts.
112 if (is_hira
|| is_kata
|| is_han
|| is_letter
)
114 // Now we need to do some lookahead. We might need to do
115 // quite a bit of lookahead, so we save our position and
// NOTE(review): the sentence above is cut off in this text; presumably
// the saved position is restored after the look-ahead -- confirm.
117 int save
= iter
.getIndex();
118 // Skip string of non spacing marks.
119 while (n
!= CharacterIterator
.DONE
120 && Character
.getType(n
) == Character
.NON_SPACING_MARK
)
// Ran off the end of the text while skipping marks.
122 if (n
== CharacterIterator
.DONE
)
// Script/letter transition tests against the first character n that is
// not a non-spacing mark:
//   Hiragana followed by non-Hiragana;
//   Katakana followed by something that is neither Hiragana nor Katakana;
//   Han followed by something that is neither Hiragana nor Han;
//   a letter followed by a non-letter other than an apostrophe.
124 if ((is_hira
&& ! isHira (n
))
125 || (is_kata
&& ! isHira (n
) && ! isKata (n
))
126 || (is_han
&& ! isHira (n
) && ! isHan (n
))
127 // FIXME: we treat apostrophe as part of a word. This
128 // is an English-ism.
129 || (is_letter
&& ! Character
.isLetter(n
) && n
!= '\''))
// The result is wherever the scan left the iterator.
135 return iter
.getIndex();
// Backward scan: works from iter's current index toward
// iter.getBeginIndex() and finally returns iter.getIndex().
// c is read by stepping the iterator back once and n by stepping it back
// again, so n is the character that precedes c in the text.
// NOTE(review): the statement guarded by each `if` is not present in this
// text. Judging by the comments below, each test marks a position where
// the scan stops -- confirm against the full file.
138 public int previous ()
140 int start
= iter
.getBeginIndex();
// Special-cases an iterator that is already positioned at the beginning.
141 if (iter
.getIndex() == start
)
// NOTE(review): CharacterIterator documents getIndex() as never being
// below getBeginIndex(), so this guard looks always true and the loop
// would have to end through the tests inside it -- confirm.
144 while (iter
.getIndex() >= start
)
146 char c
= iter
.previous();
147 if (c
== CharacterIterator
.DONE
)
// Classification of c, using the range tests defined in this class.
150 boolean is_hira
= isHira (c
);
151 boolean is_kata
= isKata (c
);
152 boolean is_han
= isHan (c
);
153 boolean is_letter
= Character
.isLetter(c
);
155 char n
= iter
.previous();
156 if (n
== CharacterIterator
.DONE
)
// General category of n, the earlier of the two characters.
159 int type
= Character
.getType(n
);
160 // Break after paragraph separators.
161 if (type
== Character
.PARAGRAPH_SEPARATOR
162 || type
== Character
.LINE_SEPARATOR
)
165 // Break between letters and non-letters.
166 // FIXME: we treat apostrophe as part of a word. This
167 // is an English-ism.
// NOTE(review): the end of this condition is not present in this text.
168 if (n
!= '\'' && ! Character
.isLetter(n
)
169 && type
!= Character
.NON_SPACING_MARK
173 // Always break after certain symbols, such as punctuation.
174 // This heuristic is derived from hints in the JCL book and is
175 // not part of Unicode. It seems to be right, however.
176 // FIXME: we treat apostrophe as part of a word. This
177 // is an English-ism.
// NOTE(review): the opening of this condition is not present in this text;
// only the list of general categories that trigger it is visible.
179 && (type
== Character
.DASH_PUNCTUATION
180 || type
== Character
.START_PUNCTUATION
181 || type
== Character
.END_PUNCTUATION
182 || type
== Character
.CONNECTOR_PUNCTUATION
183 || type
== Character
.OTHER_PUNCTUATION
184 || type
== Character
.MATH_SYMBOL
185 || type
== Character
.CURRENCY_SYMBOL
186 || type
== Character
.MODIFIER_SYMBOL
187 || type
== Character
.OTHER_SYMBOL
188 || type
== Character
.FORMAT
189 || type
== Character
.CONTROL
))
192 // Special case Japanese.
// True when c is Hiragana/Katakana/Han while n is none of those scripts
// and is not a non-spacing mark.
193 if ((is_hira
|| is_kata
|| is_han
)
194 && ! isHira (n
) && ! isKata (n
) && ! isHan (n
)
195 && type
!= Character
.NON_SPACING_MARK
)
198 // We might have to skip over non spacing marks to see what's
199 // on the other side.
// NOTE(review): this guard is not the mirror image of the forward scan's
// (is_hira || is_kata || is_han || is_letter); confirm it is intended.
200 if (! is_hira
|| (! is_letter
&& c
!= '\''))
// Remember the index before walking back over non-spacing marks.
202 int save
= iter
.getIndex();
203 while (n
!= CharacterIterator
.DONE
204 && Character
.getType(n
) == Character
.NON_SPACING_MARK
)
207 // This is a strange case: a bunch of non-spacing marks at
208 // the beginning. We treat the current location as a word break.
210 if (n
== CharacterIterator
.DONE
)
// Script/letter transition tests against the first character n that is
// not a non-spacing mark:
//   n is Hiragana and c is not;
//   n is Katakana and c is neither Hiragana nor Katakana;
//   n is Han and c is neither Hiragana nor Han;
//   n is a letter and c is neither a letter nor an apostrophe.
212 if ((isHira (n
) && ! is_hira
)
213 || (isKata (n
) && ! is_hira
&& ! is_kata
)
214 || (isHan (n
) && ! is_hira
&& ! is_han
)
215 // FIXME: we treat apostrophe as part of a word. This
216 // is an English-ism.
217 || (! is_letter
&& c
!= '\'' && Character
.isLetter(n
)))
// The result is wherever the scan left the iterator.
222 return iter
.getIndex();