0 | 0 |
package uniseg
|
|
1 |
|
|
2 |
import "unicode/utf8"
|
1 | 3 |
|
2 | 4 |
// The states of the word break parser.
|
3 | 5 |
const (
|
|
5 | 7 |
wbCR
|
6 | 8 |
wbLF
|
7 | 9 |
wbNewline
|
8 | |
wbZWJ
|
9 | 10 |
wbWSegSpace
|
|
11 |
wbHebrewLetter
|
|
12 |
wbALetter
|
|
13 |
wbWB7
|
|
14 |
wbWB7c
|
|
15 |
wbNumeric
|
|
16 |
wbWB11
|
|
17 |
wbKatakana
|
|
18 |
wbExtendNumLet
|
|
19 |
wbOddRI
|
|
20 |
wbEvenRI
|
|
21 |
wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
|
10 | 22 |
)
|
11 | 23 |
|
12 | 24 |
// The word break parser's breaking instructions.
|
|
15 | 27 |
wbBreak
|
16 | 28 |
)
|
17 | 29 |
|
18 | |
// The word break parser's state transitions. It's analogous to wbTransitions,
|
19 | |
// see comments there for details.
|
|
30 |
// The word break parser's state transitions. It's analogous to grTransitions,
|
|
31 |
// see comments there for details. Unicode version 14.0.0.
|
20 | 32 |
var wbTransitions = map[[2]int][3]int{
|
21 | 33 |
// WB3b.
|
22 | 34 |
{wbAny, prNewline}: {wbNewline, wbBreak, 32},
|
|
31 | 43 |
// WB3.
|
32 | 44 |
{wbCR, prLF}: {wbLF, wbDontBreak, 30},
|
33 | 45 |
|
34 | |
// WB3c.
|
35 | |
{wbAny, prZWJ}: {wbZWJ, wbBreak, 9990},
|
36 | |
{wbZWJ, prExtendedPictographic}: {wbAny, wbDontBreak, 33},
|
37 | |
|
38 | 46 |
// WB3d.
|
39 | 47 |
{wbAny, prWSegSpace}: {wbWSegSpace, wbBreak, 9990},
|
40 | 48 |
{wbWSegSpace, prWSegSpace}: {wbWSegSpace, wbDontBreak, 34},
|
|
49 |
|
|
50 |
// WB5.
|
|
51 |
{wbAny, prALetter}: {wbALetter, wbBreak, 9990},
|
|
52 |
{wbAny, prHebrewLetter}: {wbHebrewLetter, wbBreak, 9990},
|
|
53 |
{wbALetter, prALetter}: {wbALetter, wbDontBreak, 50},
|
|
54 |
{wbALetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
|
|
55 |
{wbHebrewLetter, prALetter}: {wbALetter, wbDontBreak, 50},
|
|
56 |
{wbHebrewLetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
|
|
57 |
|
|
58 |
// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
|
|
59 |
{wbWB7, prALetter}: {wbALetter, wbDontBreak, 70},
|
|
60 |
{wbWB7, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 70},
|
|
61 |
|
|
62 |
// WB7a.
|
|
63 |
{wbHebrewLetter, prSingleQuote}: {wbAny, wbDontBreak, 71},
|
|
64 |
|
|
65 |
// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
|
|
66 |
{wbWB7c, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 73},
|
|
67 |
|
|
68 |
// WB8.
|
|
69 |
{wbAny, prNumeric}: {wbNumeric, wbBreak, 9990},
|
|
70 |
{wbNumeric, prNumeric}: {wbNumeric, wbDontBreak, 80},
|
|
71 |
|
|
72 |
// WB9.
|
|
73 |
{wbALetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
|
|
74 |
{wbHebrewLetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
|
|
75 |
|
|
76 |
// WB10.
|
|
77 |
{wbNumeric, prALetter}: {wbALetter, wbDontBreak, 100},
|
|
78 |
{wbNumeric, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 100},
|
|
79 |
|
|
80 |
// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
|
|
81 |
{wbWB11, prNumeric}: {wbNumeric, wbDontBreak, 110},
|
|
82 |
|
|
83 |
// WB13.
|
|
84 |
{wbAny, prKatakana}: {wbKatakana, wbBreak, 9990},
|
|
85 |
{wbKatakana, prKatakana}: {wbKatakana, wbDontBreak, 130},
|
|
86 |
|
|
87 |
// WB13a.
|
|
88 |
{wbAny, prExtendNumLet}: {wbExtendNumLet, wbBreak, 9990},
|
|
89 |
{wbALetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
|
|
90 |
{wbHebrewLetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
|
|
91 |
{wbNumeric, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
|
|
92 |
{wbKatakana, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
|
|
93 |
{wbExtendNumLet, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
|
|
94 |
|
|
95 |
// WB13b.
|
|
96 |
{wbExtendNumLet, prALetter}: {wbALetter, wbDontBreak, 132},
|
|
97 |
{wbExtendNumLet, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 132},
|
|
98 |
{wbExtendNumLet, prNumeric}: {wbNumeric, wbDontBreak, 132},
|
|
99 |
{wbExtendNumLet, prKatakana}: {wbKatakana, wbDontBreak, 132}, // New state must be a wb* parser state, not a pr* property.
|
41 | 100 |
}
|
42 | 101 |
|
43 | 102 |
// transitionWordBreakState determines the new state of the word break parser
|
44 | 103 |
// given the current state and the next code point. It also returns whether a
|
45 | |
// word break was detected.
|
46 | |
func transitionWordBreakState(state int, r rune) (newState int, wordBreak bool) {
|
|
104 |
// word break was detected. If more than one code point is needed to determine
|
|
105 |
// the new state, the byte slice or the string starting after rune "r" can be
|
|
106 |
// used (whichever is not nil or empty) for further lookups.
|
|
107 |
func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
|
47 | 108 |
// Determine the property of the next character.
|
48 | 109 |
nextProperty := property(workBreakCodePoints, r)
|
49 | 110 |
|
50 | |
// Find the applicable transition.
|
|
111 |
// "Replacing Ignore Rules".
|
|
112 |
if nextProperty == prZWJ {
|
|
113 |
// WB4 (for zero-width joiners).
|
|
114 |
if state == wbNewline || state == wbCR || state == wbLF {
|
|
115 |
return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
|
|
116 |
}
|
|
117 |
return state | wbZWJBit, false
|
|
118 |
} else if nextProperty == prExtend || nextProperty == prFormat {
|
|
119 |
// WB4 (for Extend and Format).
|
|
120 |
if state == wbNewline || state == wbCR || state == wbLF {
|
|
121 |
return wbAny, true // Make sure we don't apply WB4 to WB3a.
|
|
122 |
}
|
|
123 |
return state, false
|
|
124 |
} else if nextProperty == prExtendedPictographic && state&wbZWJBit != 0 {
|
|
125 |
// WB3c.
|
|
126 |
return wbAny, false
|
|
127 |
}
|
|
128 |
if state >= 0 {
|
|
129 |
state = state &^ wbZWJBit
|
|
130 |
}
|
|
131 |
|
|
132 |
// Find the applicable transition in the table.
|
|
133 |
var rule int
|
51 | 134 |
transition, ok := wbTransitions[[2]int{state, nextProperty}]
|
52 | 135 |
if ok {
|
53 | 136 |
// We have a specific transition. We'll use it.
|
54 | |
return transition[0], transition[1] == wbBreak
|
55 | |
}
|
56 | |
|
57 | |
// No specific transition found. Try the less specific ones.
|
58 | |
transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
|
59 | |
transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
|
60 | |
if okAnyProp && okAnyState {
|
61 | |
// Both apply. We'll use a mix (see comments for wbTransitions).
|
62 | |
newState = transAnyState[0]
|
63 | |
wordBreak = transAnyState[1] == wbBreak
|
64 | |
if transAnyProp[2] < transAnyState[2] {
|
65 | |
wordBreak = transAnyProp[1] == wbBreak
|
66 | |
}
|
67 | |
return
|
68 | |
}
|
69 | |
|
70 | |
if okAnyProp {
|
71 | |
// We only have a specific state.
|
72 | |
return transAnyProp[0], transAnyProp[1] == wbBreak
|
73 | |
// This branch will probably never be reached because okAnyState will
|
74 | |
// always be true given the current transition map. But we keep it here
|
75 | |
// for future modifications to the transition map where this may not be
|
76 | |
// true anymore.
|
77 | |
}
|
78 | |
|
79 | |
if okAnyState {
|
80 | |
// We only have a specific property.
|
81 | |
return transAnyState[0], transAnyState[1] == wbBreak
|
82 | |
}
|
83 | |
|
84 | |
// No known transition. WB999: Any ÷ Any.
|
85 | |
return wbAny, true
|
|
137 |
newState, wordBreak, rule = transition[0], transition[1] == wbBreak, transition[2]
|
|
138 |
} else {
|
|
139 |
// No specific transition found. Try the less specific ones.
|
|
140 |
transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
|
|
141 |
transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
|
|
142 |
if okAnyProp && okAnyState {
|
|
143 |
// Both apply. We'll use a mix (see comments for grTransitions).
|
|
144 |
newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
|
|
145 |
if transAnyProp[2] < transAnyState[2] {
|
|
146 |
wordBreak, rule = transAnyProp[1] == wbBreak, transAnyProp[2]
|
|
147 |
}
|
|
148 |
} else if okAnyProp {
|
|
149 |
// We only have a specific state.
|
|
150 |
newState, wordBreak, rule = transAnyProp[0], transAnyProp[1] == wbBreak, transAnyProp[2]
|
|
151 |
// This branch will probably never be reached because okAnyState will
|
|
152 |
// always be true given the current transition map. But we keep it here
|
|
153 |
// for future modifications to the transition map where this may not be
|
|
154 |
// true anymore.
|
|
155 |
} else if okAnyState {
|
|
156 |
// We only have a specific property.
|
|
157 |
newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
|
|
158 |
} else {
|
|
159 |
// No known transition. WB999: Any ÷ Any.
|
|
160 |
newState, wordBreak, rule = wbAny, true, 9990
|
|
161 |
}
|
|
162 |
}
|
|
163 |
|
|
164 |
// For those rules that need to look up runes further in the string, we
|
|
165 |
// determine the property after nextProperty, skipping over Format, Extend,
|
|
166 |
// and ZWJ (according to WB4). It's -1 if not needed or if such a rune cannot
|
|
167 |
// be determined (because the text ends or the rune is faulty).
|
|
168 |
farProperty := -1
|
|
169 |
if rule > 60 &&
|
|
170 |
(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
|
|
171 |
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
|
|
172 |
nextProperty == prDoubleQuote || // WB7b.
|
|
173 |
nextProperty == prMidNum) { // WB12.
|
|
174 |
for {
|
|
175 |
var (
|
|
176 |
r rune
|
|
177 |
length int
|
|
178 |
)
|
|
179 |
if b != nil { // Byte slice version.
|
|
180 |
r, length = utf8.DecodeRune(b)
|
|
181 |
b = b[length:]
|
|
182 |
} else { // String version.
|
|
183 |
r, length = utf8.DecodeRuneInString(str)
|
|
184 |
str = str[length:]
|
|
185 |
}
|
|
186 |
if r == utf8.RuneError {
|
|
187 |
break
|
|
188 |
}
|
|
189 |
prop := property(workBreakCodePoints, r)
|
|
190 |
if prop == prExtend || prop == prFormat || prop == prZWJ {
|
|
191 |
continue
|
|
192 |
}
|
|
193 |
farProperty = prop
|
|
194 |
break
|
|
195 |
}
|
|
196 |
}
|
|
197 |
|
|
198 |
// WB6.
|
|
199 |
if rule > 60 &&
|
|
200 |
(state == wbALetter || state == wbHebrewLetter) &&
|
|
201 |
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
|
202 |
(farProperty == prALetter || farProperty == prHebrewLetter) {
|
|
203 |
return wbWB7, false
|
|
204 |
}
|
|
205 |
|
|
206 |
// WB7b.
|
|
207 |
if rule > 72 &&
|
|
208 |
state == wbHebrewLetter &&
|
|
209 |
nextProperty == prDoubleQuote &&
|
|
210 |
farProperty == prHebrewLetter {
|
|
211 |
return wbWB7c, false
|
|
212 |
}
|
|
213 |
|
|
214 |
// WB12.
|
|
215 |
if rule > 120 &&
|
|
216 |
state == wbNumeric &&
|
|
217 |
(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
|
218 |
farProperty == prNumeric {
|
|
219 |
return wbWB11, false
|
|
220 |
}
|
|
221 |
|
|
222 |
// WB15 and WB16.
|
|
223 |
if rule > 160 && nextProperty == prRegionalIndicator {
|
|
224 |
if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
|
|
225 |
// Transition into the first RI.
|
|
226 |
return wbOddRI, true
|
|
227 |
}
|
|
228 |
if state == wbOddRI {
|
|
229 |
// Don't break pairs of Regional Indicators.
|
|
230 |
return wbEvenRI, false
|
|
231 |
}
|
|
232 |
return wbOddRI, true // We can break after a pair.
|
|
233 |
}
|
|
234 |
|
|
235 |
return
|
86 | 236 |
}
|