Codebase list golang-github-rivo-uniseg / 56f4d68
Cleaning up latest actions and adding functions to detect trailing line breaks. Oliver 1 year, 9 months ago
5 changed file(s) with 34 addition(s) and 17 deletion(s). Raw diff Collapse all Expand all
55 // user-perceived characters. While iterating, it also provides information
66 // about word boundaries, sentence boundaries, and line breaks.
77 //
8 // After constructing the class via NewGraphemes(str) for a given string "str",
9 // Next() is called for every grapheme cluster in a loop until it returns false.
8 // After constructing the class via [NewGraphemes] for a given string "str",
9 // [Graphemes.Next] is called for every grapheme cluster in a loop until it returns false.
1010 // Inside the loop, information about the grapheme cluster as well as boundary
1111 // information is available via the various methods (see examples below).
1212 //
1313 // Using this class to iterate over a string is convenient but it is much slower
14 // than using this package's Step() or StepString() functions or any of the
14 // than using this package's [Step] or [StepString] functions or any of the
1515 // other specialized functions starting with "First".
1616 type Graphemes struct {
1717 // The original string.
5959 }
6060
6161 // Runes returns a slice of runes (code points) which corresponds to the current
62 // grapheme cluster. If the iterator is already past the end or Next() has not
62 // grapheme cluster. If the iterator is already past the end or [Graphemes.Next] has not
6363 // yet been called, nil is returned.
6464 func (g *Graphemes) Runes() []rune {
6565 if g.state < 0 {
6969 }
7070
7171 // Str returns a substring of the original string which corresponds to the
72 // current grapheme cluster. If the iterator is already past the end or Next()
72 // current grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
7373 // has not yet been called, an empty string is returned.
7474 func (g *Graphemes) Str() string {
7575 return g.cluster
7676 }
7777
7878 // Bytes returns a byte slice which corresponds to the current grapheme cluster.
79 // If the iterator is already past the end or Next() has not yet been called,
79 // If the iterator is already past the end or [Graphemes.Next] has not yet been called,
8080 // nil is returned.
8181 func (g *Graphemes) Bytes() []byte {
8282 if g.state < 0 {
8989 // positions into the original string. The first returned value "from" indexes
9090 // the first byte and the second returned value "to" indexes the first byte that
9191 // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
92 // the original string "str". If Next() has not yet been called, both values are
92 // the original string "str". If [Graphemes.Next] has not yet been called, both values are
9393 // 0. If the iterator is already past the end, both values are 1.
9494 func (g *Graphemes) Positions() (int, int) {
9595 if g.state == -1 {
119119 }
120120
121121 // LineBreak returns whether the line can be broken after the current grapheme
122 // cluster. A value of LineDontBreak means the line may not be broken, a value
123 // of LineMustBreak means the line must be broken, and a value of LineCanBreak
124 // means the line may or may not be broken.
122 // cluster. A value of [LineDontBreak] means the line may not be broken, a value
123 // of [LineMustBreak] means the line must be broken, and a value of
124 // [LineCanBreak] means the line may or may not be broken.
125125 func (g *Graphemes) LineBreak() int {
126126 if g.state == -1 {
127127 return LineDontBreak
133133 }
134134
135135 // Reset puts the iterator into its initial state such that the next call to
136 // Next() sets it to the first grapheme cluster again.
136 // [Graphemes.Next] sets it to the first grapheme cluster again.
137137 func (g *Graphemes) Reset() {
138138 g.state = -1
139139 g.offset = 0
209209 }
210210 }
211211
212 // FirstGraphemeClusterInString is like FirstGraphemeCluster() but its input and
212 // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
213213 // outputs are strings.
214214 func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, reserved, newState int) {
215215 // An empty string returns nothing.
3636 //
3737 // Note that in accordance with UAX #14 LB3, the final segment will end with
3838 // "mustBreak" set to true. You can choose to ignore this by checking if the
39 // length of the "rest" slice is 0.
39 // length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
40 // [HasTrailingLineBreakInString] on the returned segment.
4041 //
4142 // Note also that this algorithm may break within grapheme clusters. This is
4243 // addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
110111 }
111112 }
112113 }
114
115 // HasTrailingLineBreak returns true if the last rune in the given byte slice is
116 // one of the hard line break code points as defined in LB4 and LB5 of UAX #14.
117 func HasTrailingLineBreak(b []byte) bool {
118 r, _ := utf8.DecodeLastRune(b)
119 property, _ := propertyWithGenCat(lineBreakCodePoints, r)
120 return property == lbBK || property == lbCR || property == lbLF || property == lbNL
121 }
122
123 // HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
124 func HasTrailingLineBreakInString(str string) bool {
125 r, _ := utf8.DecodeLastRuneInString(str)
126 property, _ := propertyWithGenCat(lineBreakCodePoints, r)
127 return property == lbBK || property == lbCR || property == lbLF || property == lbNL
128 }
5050 }
5151 }
5252
53 // FirstSentenceInString is like FirstSentence() but its input and outputs are
53 // FirstSentenceInString is like [FirstSentence] but its input and outputs are
5454 // strings.
5555 func FirstSentenceInString(str string, state int) (sentence, rest string, newState int) {
5656 // An empty string returns nothing.
7575 //
7676 // Note that in accordance with UAX #14 LB3, the final segment will end with
7777 // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
78 // to ignore this by checking if the length of the "rest" slice is 0.
78 // to ignore this by checking if the length of the "rest" slice is 0 and calling
79 // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last cluster.
7980 func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
8081 // An empty byte slice returns nothing.
8182 if len(b) == 0 {
135136 }
136137 }
137138
138 // StepString is like Step() but its input and outputs are strings.
139 // StepString is like [Step] but its input and outputs are strings.
139140 func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
140141 // An empty string returns nothing.
141142 if len(str) == 0 {
5050 }
5151 }
5252
53 // FirstWordInString is like FirstWord() but its input and outputs are strings.
53 // FirstWordInString is like [FirstWord] but its input and outputs are strings.
5454 func FirstWordInString(str string, state int) (word, rest string, newState int) {
5555 // An empty string returns nothing.
5656 if len(str) == 0 {