Codebase list golang-github-rivo-uniseg / 780c326
Implemented word break rules. Not yet integrated. Not yet tested. Oliver 1 year, 9 months ago
3 changed file(s) with 198 addition(s) and 46 deletion(s). Raw diff Collapse all Expand all
2424 // The index of the next code point to be parsed.
2525 pos int
2626
27 // The current state of the code point parser.
28 state int
27 // The current state of the Grapheme code point parser.
28 graphemeState int
2929 }
3030
3131 // NewGraphemes returns a new grapheme cluster iterator.
6868
6969 // Calculate the next state.
7070 var boundary bool
71 g.state, boundary = transitionGraphemeState(g.state, g.codePoints[g.pos])
71 g.graphemeState, boundary = transitionGraphemeState(g.graphemeState, g.codePoints[g.pos])
7272
7373 // If we found a cluster boundary, let's stop here. The current cluster will
7474 // be the one that just ended.
127127 // Reset puts the iterator into its initial state such that the next call to
128128 // Next() sets it to the first grapheme cluster again.
129129 func (g *Graphemes) Reset() {
130 g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
130 g.start, g.end, g.pos, g.graphemeState = 0, 0, 0, grAny
131131 g.Next() // Parse ahead again.
132132 }
133133
3434 // from the transition with the lower rule number, prefer (3) if rule numbers
3535 // are equal. Stop.
3636 // 6. Assume grAny and grBoundary.
37 //
38 // Unicode version 14.0.0.
3739 var grTransitions = map[[2]int][3]int{
3840 // GB5
3941 {grAny, prCR}: {grCR, grBoundary, 50},
00 package uniseg
1
2 import "unicode/utf8"
13
24 // The states of the word break parser.
35 const (
57 wbCR
68 wbLF
79 wbNewline
8 wbZWJ
910 wbWSegSpace
11 wbHebrewLetter
12 wbALetter
13 wbWB7
14 wbWB7c
15 wbNumeric
16 wbWB11
17 wbKatakana
18 wbExtendNumLet
19 wbOddRI
20 wbEvenRI
21 wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
1022 )
1123
1224 // The word break parser's breaking instructions.
1527 wbBreak
1628 )
1729
18 // The word break parser's state transitions. It's anologous to wbTransitions,
19 // see comments there for details.
30 // The word break parser's state transitions. It's analogous to grTransitions,
31 // see comments there for details. Unicode version 14.0.0.
2032 var wbTransitions = map[[2]int][3]int{
2133 // WB3b.
2234 {wbAny, prNewline}: {wbNewline, wbBreak, 32},
3143 // WB3.
3244 {wbCR, prLF}: {wbLF, wbDontBreak, 30},
3345
34 // WB3c.
35 {wbAny, prZWJ}: {wbZWJ, wbBreak, 9990},
36 {wbZWJ, prExtendedPictographic}: {wbAny, wbDontBreak, 33},
37
3846 // WB3d.
3947 {wbAny, prWSegSpace}: {wbWSegSpace, wbBreak, 9990},
4048 {wbWSegSpace, prWSegSpace}: {wbWSegSpace, wbDontBreak, 34},
49
50 // WB5.
51 {wbAny, prALetter}: {wbALetter, wbBreak, 9990},
52 {wbAny, prHebrewLetter}: {wbHebrewLetter, wbBreak, 9990},
53 {wbALetter, prALetter}: {wbALetter, wbDontBreak, 50},
54 {wbALetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
55 {wbHebrewLetter, prALetter}: {wbALetter, wbDontBreak, 50},
56 {wbHebrewLetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
57
58 // WB7. Transitions to wbWB7 handled by transitionWordBreakState().
59 {wbWB7, prALetter}: {wbALetter, wbDontBreak, 70},
60 {wbWB7, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 70},
61
62 // WB7a.
63 {wbHebrewLetter, prSingleQuote}: {wbAny, wbDontBreak, 71},
64
65 // WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
66 {wbWB7c, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 73},
67
68 // WB8.
69 {wbAny, prNumeric}: {wbNumeric, wbBreak, 9990},
70 {wbNumeric, prNumeric}: {wbNumeric, wbDontBreak, 80},
71
72 // WB9.
73 {wbALetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
74 {wbHebrewLetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
75
76 // WB10.
77 {wbNumeric, prALetter}: {wbALetter, wbDontBreak, 100},
78 {wbNumeric, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 100},
79
80 // WB11. Transitions to wbWB11 handled by transitionWordBreakState().
81 {wbWB11, prNumeric}: {wbNumeric, wbDontBreak, 110},
82
83 // WB13.
84 {wbAny, prKatakana}: {wbKatakana, wbBreak, 9990},
85 {wbKatakana, prKatakana}: {wbKatakana, wbDontBreak, 130},
86
87 // WB13a.
88 {wbAny, prExtendNumLet}: {wbExtendNumLet, wbBreak, 9990},
89 {wbALetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
90 {wbHebrewLetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
91 {wbNumeric, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
92 {wbKatakana, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
93 {wbExtendNumLet, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
94
95 // WB13b. The first element of each value is the parser's new state, so it
95 // must be a wb* state constant, not a pr* property constant.
96 {wbExtendNumLet, prALetter}: {wbALetter, wbDontBreak, 132},
97 {wbExtendNumLet, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 132},
98 {wbExtendNumLet, prNumeric}: {wbNumeric, wbDontBreak, 132},
99 {wbExtendNumLet, prKatakana}: {wbKatakana, wbDontBreak, 132},
41100 }
42101
43102 // transitionWordBreakState determines the new state of the word break parser
44103 // given the current state and the next code point. It also returns whether a
45 // word break was detected.
46 func transitionWordBreakState(state int, r rune) (newState int, wordBreak bool) {
104 // word break was detected. If more than one code point is needed to determine
105 // the new state, the byte slice or the string starting after rune "r" can be
106 // used (whichever is not nil or empty) for further lookups.
107 func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
47108 // Determine the property of the next character.
48109 nextProperty := property(workBreakCodePoints, r)
49110
50 // Find the applicable transition.
111 // "Replacing Ignore Rules".
112 if nextProperty == prZWJ {
113 // WB4 (for zero-width joiners).
114 if state == wbNewline || state == wbCR || state == wbLF {
115 return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
116 }
117 return state | wbZWJBit, false
118 } else if nextProperty == prExtend || nextProperty == prFormat {
119 // WB4 (for Extend and Format).
120 if state == wbNewline || state == wbCR || state == wbLF {
121 return wbAny, true // Make sure we don't apply WB4 to WB3a.
122 }
123 return state, false
124 } else if nextProperty == prExtendedPictographic && state&wbZWJBit != 0 {
125 // WB3c.
126 return wbAny, false
127 }
128 if state >= 0 {
129 state = state &^ wbZWJBit
130 }
131
132 // Find the applicable transition in the table.
133 var rule int
51134 transition, ok := wbTransitions[[2]int{state, nextProperty}]
52135 if ok {
53136 // We have a specific transition. We'll use it.
54 return transition[0], transition[1] == wbBreak
55 }
56
57 // No specific transition found. Try the less specific ones.
58 transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
59 transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
60 if okAnyProp && okAnyState {
61 // Both apply. We'll use a mix (see comments for wbTransitions).
62 newState = transAnyState[0]
63 wordBreak = transAnyState[1] == wbBreak
64 if transAnyProp[2] < transAnyState[2] {
65 wordBreak = transAnyProp[1] == wbBreak
66 }
67 return
68 }
69
70 if okAnyProp {
71 // We only have a specific state.
72 return transAnyProp[0], transAnyProp[1] == wbBreak
73 // This branch will probably never be reached because okAnyState will
74 // always be true given the current transition map. But we keep it here
75 // for future modifications to the transition map where this may not be
76 // true anymore.
77 }
78
79 if okAnyState {
80 // We only have a specific property.
81 return transAnyState[0], transAnyState[1] == wbBreak
82 }
83
84 // No known transition. WB999: Any ÷ Any.
85 return wbAny, true
137 newState, wordBreak, rule = transition[0], transition[1] == wbBreak, transition[2]
138 } else {
139 // No specific transition found. Try the less specific ones.
140 transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
141 transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
142 if okAnyProp && okAnyState {
143 // Both apply. We'll use a mix (see comments for grTransitions).
144 newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
145 if transAnyProp[2] < transAnyState[2] {
146 wordBreak, rule = transAnyProp[1] == wbBreak, transAnyProp[2]
147 }
148 } else if okAnyProp {
149 // We only have a specific state.
150 newState, wordBreak, rule = transAnyProp[0], transAnyProp[1] == wbBreak, transAnyProp[2]
151 // This branch will probably never be reached because okAnyState will
152 // always be true given the current transition map. But we keep it here
153 // for future modifications to the transition map where this may not be
154 // true anymore.
155 } else if okAnyState {
156 // We only have a specific property.
157 newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
158 } else {
159 // No known transition. WB999: Any ÷ Any.
160 newState, wordBreak, rule = wbAny, true, 9990
161 }
162 }
163
164 // For those rules that need to look up runes further in the string, we
165 // determine the property after nextProperty, skipping over Format, Extend,
166 // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
167 // be determined (because the text ends or the rune is faulty).
168 farProperty := -1
169 if rule > 60 &&
170 (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
171 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
172 nextProperty == prDoubleQuote || // WB7b.
173 nextProperty == prMidNum) { // WB12.
174 for {
175 var (
176 r rune
177 length int
178 )
179 if b != nil { // Byte slice version.
180 r, length = utf8.DecodeRune(b)
181 b = b[length:]
182 } else { // String version.
183 r, length = utf8.DecodeRuneInString(str)
184 str = str[length:]
185 }
186 if r == utf8.RuneError {
187 break
188 }
189 prop := property(workBreakCodePoints, r)
190 if prop == prExtend || prop == prFormat || prop == prZWJ {
191 continue
192 }
193 farProperty = prop
194 break
195 }
196 }
197
198 // WB6.
199 if rule > 60 &&
200 (state == wbALetter || state == wbHebrewLetter) &&
201 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
202 (farProperty == prALetter || farProperty == prHebrewLetter) {
203 return wbWB7, false
204 }
205
206 // WB7b.
207 if rule > 72 &&
208 state == wbHebrewLetter &&
209 nextProperty == prDoubleQuote &&
210 farProperty == prHebrewLetter {
211 return wbWB7c, false
212 }
213
214 // WB12.
215 if rule > 120 &&
216 state == wbNumeric &&
217 (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
218 farProperty == prNumeric {
219 return wbWB11, false
220 }
221
222 // WB15 and WB16.
223 if rule > 160 && nextProperty == prRegionalIndicator {
224 if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
225 // Transition into the first RI.
226 return wbOddRI, true
227 }
228 if state == wbOddRI {
229 // Don't break pairs of Regional Indicators.
230 return wbEvenRI, false
231 }
232 return wbOddRI, true // We can break after a pair.
233 }
234
235 return
86236 }