Codebase list golang-github-rivo-uniseg / be02319
Ignoring LB3 also in Step functions. Oliver 1 year, 9 months ago
2 changed file(s) with 14 addition(s) and 11 deletion(s). Raw diff Collapse all Expand all
239239 }
240240 }
241241 // Output: First |line.
242 //‖Second |line.‖
242 //‖Second |line.|
243243 }
244244
245245 func ExampleStepString_lineBreaking() {
259259 }
260260 }
261261 // Output: First |line.
262 //‖Second |line.‖
262 //‖Second |line.|
263263 }
264264
265265 func ExampleGraphemes_graphemes() {
304304 }
305305 }
306306 // Output: First |line.
307 //‖Second |line.‖
308 }
307 //‖Second |line.|
308 }
7373 // has much better performance and makes no allocations. It lends itself well to
7474 // large byte slices.
7575 //
76 // Note that in accordance with UAX #14 LB3, the final segment will end with
77 // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
78 // to ignore this by checking if the length of the "rest" slice is 0.
76 // Note that this algorithm diverges from UAX #14 in LB3, in that the final
77 // segment will end with an optional line break (boundaries&MaskLine ==
78 // LineCanBreak) instead of a mandatory one. The reason for this is that when
79 // the text ends with a newline character, it is impossible to know whether the
80 // line break is due to that character or due to the end of the text. You can
81 // enforce LB3 yourself by checking if the length of the "rest" slice is 0.
7982 func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
8083 // An empty byte slice returns nothing.
8184 if len(b) == 0 {
8588 // Extract the first rune.
8689 r, length := utf8.DecodeRune(b)
8790 if len(b) <= length { // If we're already past the end, there is nothing else to parse.
88 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
91 return b, nil, LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
8992 }
9093
9194 // If we don't know the state, determine it now.
130133
131134 length += l
132135 if len(b) <= length {
133 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
136 return b, nil, LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
134137 }
135138 }
136139 }
145148 // Extract the first rune.
146149 r, length := utf8.DecodeRuneInString(str)
147150 if len(str) <= length { // If we're already past the end, there is nothing else to parse.
148 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
151 return str, "", LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
149152 }
150153
151154 // If we don't know the state, determine it now.
190193
191194 length += l
192195 if len(str) <= length {
193 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
196 return str, "", LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
194197 }
195198 }
196199 }