73 | 73 |
// has much better performance and makes no allocations. It lends itself well to
|
74 | 74 |
// large byte slices.
|
75 | 75 |
//
|
76 | |
// Note that in accordance with UAX #14 LB3, the final segment will end with
|
77 | |
// a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
|
78 | |
// to ignore this by checking if the length of the "rest" slice is 0.
|
|
76 |
// Note that this algorithm diverges from UAX #14 in LB3, in that the final
|
|
77 |
// segment will end with an optional rather than a mandatory line break (boundaries&MaskLine ==
|
|
78 |
// LineCanBreak). The reason for this is that when the text ends with a newline
|
|
79 |
// character, it is impossible to know whether the line break is due to that
|
|
80 |
// character or due to the end of the text. You can enforce LB3 yourself by
|
|
81 |
// checking if the length of the "rest" slice is 0.
|
79 | 82 |
func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
|
80 | 83 |
// An empty byte slice returns nothing.
|
81 | 84 |
if len(b) == 0 {
|
|
85 | 88 |
// Extract the first rune.
|
86 | 89 |
r, length := utf8.DecodeRune(b)
|
87 | 90 |
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
88 | |
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
|
91 |
return b, nil, LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
89 | 92 |
}
|
90 | 93 |
|
91 | 94 |
// If we don't know the state, determine it now.
|
|
130 | 133 |
|
131 | 134 |
length += l
|
132 | 135 |
if len(b) <= length {
|
133 | |
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
|
136 |
return b, nil, LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
134 | 137 |
}
|
135 | 138 |
}
|
136 | 139 |
}
|
|
145 | 148 |
// Extract the first rune.
|
146 | 149 |
r, length := utf8.DecodeRuneInString(str)
|
147 | 150 |
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
148 | |
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
|
151 |
return str, "", LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
149 | 152 |
}
|
150 | 153 |
|
151 | 154 |
// If we don't know the state, determine it now.
|
|
190 | 193 |
|
191 | 194 |
length += l
|
192 | 195 |
if len(str) <= length {
|
193 | |
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
|
196 |
return str, "", LineCanBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
194 | 197 |
}
|
195 | 198 |
}
|
196 | 199 |
}
|