Commit be02319ce361418ab740d65d05ecd990556cc528 - golang-github-rivo-uniseg

Ignoring LB3 also in Step functions. Oliver 1 year, 9 months ago

2 changed file(s) with 14 addition(s) and 11 deletion(s). Raw diff Collapse all Expand all

-4

examples_test.go less more

239	239	}
240	240	}
241	241	// Output: First \|line.
242		//‖Second \|line.‖
	242	//‖Second \|line.\|
243	243	}
244	244
245	245	func ExampleStepString_lineBreaking() {

259	259	}
260	260	}
261	261	// Output: First \|line.
262		//‖Second \|line.‖
	262	//‖Second \|line.\|
263	263	}
264	264
265	265	func ExampleGraphemes_graphemes() {

304	304	}
305	305	}
306	306	// Output: First \|line.
307		//‖Second \|line.‖
308		}
	307	//‖Second \|line.\|
	308	}

+10

-7

step.go less more

73	73	// has much better performance and makes no allocations. It lends itself well to
74	74	// large byte slices.
75	75	//
76		// Note that in accordance with UAX #14 LB3, the final segment will end with
77		// a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
78		// to ignore this by checking if the length of the "rest" slice is 0.
	76	// Note that this algorithm diverges from UAX #14 in LB3, in that the final
	77	// segment will end with an optional line break (boundaries&MaskLine ==
	78	// LineCanBreak) instead of a mandatory one. The reason for this is that when
	79	// the text ends with a newline character, it is impossible to know whether the
	80	// line break is due to that character or due to the end of the text. You can
	81	// enforce LB3 yourself by checking if the length of the "rest" slice is 0.
79	82	func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
80	83	// An empty byte slice returns nothing.
81	84	if len(b) == 0 {

85	88	// Extract the first rune.
86	89	r, length := utf8.DecodeRune(b)
87	90	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
88		return b, nil, LineMustBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
	91	return b, nil, LineCanBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
89	92	}
90	93
91	94	// If we don't know the state, determine it now.

130	133
131	134	length += l
132	135	if len(b) <= length {
133		return b, nil, LineMustBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
	136	return b, nil, LineCanBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
134	137	}
135	138	}
136	139	}

145	148	// Extract the first rune.
146	149	r, length := utf8.DecodeRuneInString(str)
147	150	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
148		return str, "", LineMustBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
	151	return str, "", LineCanBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
149	152	}
150	153
151	154	// If we don't know the state, determine it now.

190	193
191	194	length += l
192	195	if len(str) <= length {
193		return str, "", LineMustBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
	196	return str, "", LineCanBreak \| (1 << shiftWord) \| (1 << shiftSentence), grAny \| (wbAny << shiftWordState) \| (sbAny << shiftSentenceState) \| (lbAny << shiftLineState)
194	197	}
195	198	}
196	199	}