Commit 780c3268c0f98b0f9d84fbd6b3f7f73648fb86af - golang-github-rivo-uniseg

Implemented word break rules. Not yet integrated. Not yet tested. Oliver 1 year, 9 months ago

3 changed file(s) with 198 addition(s) and 46 deletion(s). Raw diff Collapse all Expand all

-4

grapheme.go less more

24	24	// The index of the next code point to be parsed.
25	25	pos int
26	26
27		// The current state of the code point parser.
28		state int
	27	// The current state of the Grapheme code point parser.
	28	graphemeState int
29	29	}
30	30
31	31	// NewGraphemes returns a new grapheme cluster iterator.

68	68
69	69	// Calculate the next state.
70	70	var boundary bool
71		g.state, boundary = transitionGraphemeState(g.state, g.codePoints[g.pos])
	71	g.graphemeState, boundary = transitionGraphemeState(g.graphemeState, g.codePoints[g.pos])
72	72
73	73	// If we found a cluster boundary, let's stop here. The current cluster will
74	74	// be the one that just ended.

127	127	// Reset puts the iterator into its initial state such that the next call to
128	128	// Next() sets it to the first grapheme cluster again.
129	129	func (g *Graphemes) Reset() {
130		g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
	130	g.start, g.end, g.pos, g.graphemeState = 0, 0, 0, grAny
131	131	g.Next() // Parse ahead again.
132	132	}
133	133

-0

graphemerules.go less more

34	34	// from the transition with the lower rule number, prefer (3) if rule numbers
35	35	// are equal. Stop.
36	36	// 6. Assume grAny and grBoundary.
	37	//
	38	// Unicode version 14.0.0.
37	39	var grTransitions = map[[2]int][3]int{
38	40	// GB5
39	41	{grAny, prCR}: {grCR, grBoundary, 50},

+192

-42

wordbreakrules.go less more

0	0	package uniseg
	1
	2	import "unicode/utf8"
1	3
2	4	// The states of the word break parser.
3	5	const (

5	7	wbCR
6	8	wbLF
7	9	wbNewline
8		wbZWJ
9	10	wbWSegSpace
	11	wbHebrewLetter
	12	wbALetter
	13	wbWB7
	14	wbWB7c
	15	wbNumeric
	16	wbWB11
	17	wbKatakana
	18	wbExtendNumLet
	19	wbOddRI
	20	wbEvenRI
	21	wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
10	22	)
11	23
12	24	// The word break parser's breaking instructions.

15	27	wbBreak
16	28	)
17	29
18		// The word break parser's state transitions. It's anologous to wbTransitions,
19		// see comments there for details.
	30	// The word break parser's state transitions. It's analogous to grTransitions,
	31	// see comments there for details. Unicode version 14.0.0.
20	32	var wbTransitions = map[[2]int][3]int{
21	33	// WB3b.
22	34	{wbAny, prNewline}: {wbNewline, wbBreak, 32},

31	43	// WB3.
32	44	{wbCR, prLF}: {wbLF, wbDontBreak, 30},
33	45
34		// WB3c.
35		{wbAny, prZWJ}: {wbZWJ, wbBreak, 9990},
36		{wbZWJ, prExtendedPictographic}: {wbAny, wbDontBreak, 33},
37
38	46	// WB3d.
39	47	{wbAny, prWSegSpace}: {wbWSegSpace, wbBreak, 9990},
40	48	{wbWSegSpace, prWSegSpace}: {wbWSegSpace, wbDontBreak, 34},
	49
	50	// WB5.
	51	{wbAny, prALetter}: {wbALetter, wbBreak, 9990},
	52	{wbAny, prHebrewLetter}: {wbHebrewLetter, wbBreak, 9990},
	53	{wbALetter, prALetter}: {wbALetter, wbDontBreak, 50},
	54	{wbALetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
	55	{wbHebrewLetter, prALetter}: {wbALetter, wbDontBreak, 50},
	56	{wbHebrewLetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
	57
	58	// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
	59	{wbWB7, prALetter}: {wbALetter, wbDontBreak, 70},
	60	{wbWB7, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 70},
	61
	62	// WB7a.
	63	{wbHebrewLetter, prSingleQuote}: {wbAny, wbDontBreak, 71},
	64
	65	// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
	66	{wbWB7c, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 73},
	67
	68	// WB8.
	69	{wbAny, prNumeric}: {wbNumeric, wbBreak, 9990},
	70	{wbNumeric, prNumeric}: {wbNumeric, wbDontBreak, 80},
	71
	72	// WB9.
	73	{wbALetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
	74	{wbHebrewLetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
	75
	76	// WB10.
	77	{wbNumeric, prALetter}: {wbALetter, wbDontBreak, 100},
	78	{wbNumeric, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 100},
	79
	80	// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
	81	{wbWB11, prNumeric}: {wbNumeric, wbDontBreak, 110},
	82
	83	// WB13.
	84	{wbAny, prKatakana}: {wbKatakana, wbBreak, 9990},
	85	{wbKatakana, prKatakana}: {wbKatakana, wbDontBreak, 130},
	86
	87	// WB13a.
	88	{wbAny, prExtendNumLet}: {wbExtendNumLet, wbBreak, 9990},
	89	{wbALetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
	90	{wbHebrewLetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
	91	{wbNumeric, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
	92	{wbKatakana, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
	93	{wbExtendNumLet, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
	94
	95	// WB13b.
	96	{wbExtendNumLet, prALetter}: {wbALetter, wbDontBreak, 132},
	97	{wbExtendNumLet, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 132},
	98	{wbExtendNumLet, prNumeric}: {wbNumeric, wbDontBreak, 132},
	99	{wbExtendNumLet, prKatakana}: {prKatakana, wbDontBreak, 132},
41	100	}
42	101
43	102	// transitionWordBreakState determines the new state of the word break parser
44	103	// given the current state and the next code point. It also returns whether a
45		// word break was detected.
46		func transitionWordBreakState(state int, r rune) (newState int, wordBreak bool) {
	104	// word break was detected. If more than one code point is needed to determine
	105	// the new state, the byte slice or the string starting after rune "r" can be
	106	// used (whichever is not nil or empty) for further lookups.
	107	func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
47	108	// Determine the property of the next character.
48	109	nextProperty := property(workBreakCodePoints, r)
49	110
50		// Find the applicable transition.
	111	// "Replacing Ignore Rules".
	112	if nextProperty == prZWJ {
	113	// WB4 (for zero-width joiners).
	114	if state == wbNewline || state == wbCR || state == wbLF {
	115	return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
	116	}
	117	return state | wbZWJBit, false
	118	} else if nextProperty == prExtend || nextProperty == prFormat {
	119	// WB4 (for Extend and Format).
	120	if state == wbNewline || state == wbCR || state == wbLF {
	121	return wbAny, true // Make sure we don't apply WB4 to WB3a.
	122	}
	123	return state, false
	124	} else if nextProperty == prExtendedPictographic && state&wbZWJBit != 0 {
	125	// WB3c.
	126	return wbAny, false
	127	}
	128	if state >= 0 {
	129	state = state &^ wbZWJBit
	130	}
	131
	132	// Find the applicable transition in the table.
	133	var rule int
51	134	transition, ok := wbTransitions[[2]int{state, nextProperty}]
52	135	if ok {
53	136	// We have a specific transition. We'll use it.
54		return transition[0], transition[1] == wbBreak
55		}
56
57		// No specific transition found. Try the less specific ones.
58		transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
59		transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
60		if okAnyProp && okAnyState {
61		// Both apply. We'll use a mix (see comments for wbTransitions).
62		newState = transAnyState[0]
63		wordBreak = transAnyState[1] == wbBreak
64		if transAnyProp[2] < transAnyState[2] {
65		wordBreak = transAnyProp[1] == wbBreak
66		}
67		return
68		}
69
70		if okAnyProp {
71		// We only have a specific state.
72		return transAnyProp[0], transAnyProp[1] == wbBreak
73		// This branch will probably never be reached because okAnyState will
74		// always be true given the current transition map. But we keep it here
75		// for future modifications to the transition map where this may not be
76		// true anymore.
77		}
78
79		if okAnyState {
80		// We only have a specific property.
81		return transAnyState[0], transAnyState[1] == wbBreak
82		}
83
84		// No known transition. WB999: Any ÷ Any.
85		return wbAny, true
	137	newState, wordBreak, rule = transition[0], transition[1] == wbBreak, transition[2]
	138	} else {
	139	// No specific transition found. Try the less specific ones.
	140	transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
	141	transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
	142	if okAnyProp && okAnyState {
	143	// Both apply. We'll use a mix (see comments for grTransitions).
	144	newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
	145	if transAnyProp[2] < transAnyState[2] {
	146	wordBreak, rule = transAnyProp[1] == wbBreak, transAnyProp[2]
	147	}
	148	} else if okAnyProp {
	149	// We only have a specific state.
	150	newState, wordBreak, rule = transAnyProp[0], transAnyProp[1] == wbBreak, transAnyProp[2]
	151	// This branch will probably never be reached because okAnyState will
	152	// always be true given the current transition map. But we keep it here
	153	// for future modifications to the transition map where this may not be
	154	// true anymore.
	155	} else if okAnyState {
	156	// We only have a specific property.
	157	newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
	158	} else {
	159	// No known transition. WB999: Any ÷ Any.
	160	newState, wordBreak, rule = wbAny, true, 9990
	161	}
	162	}
	163
	164	// For those rules that need to look up runes further in the string, we
	165	// determine the property after nextProperty, skipping over Format, Extend,
	166	// and ZWJ (according to WB4). It's -1 if not needed or if such a rune cannot
	167	// be determined (because the text ends or the rune is faulty).
	168	farProperty := -1
	169	if rule > 60 &&
	170	(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
	171	(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
	172	nextProperty == prDoubleQuote || // WB7b.
	173	nextProperty == prMidNum) { // WB12.
	174	for {
	175	var (
	176	r rune
	177	length int
	178	)
	179	if b != nil { // Byte slice version.
	180	r, length = utf8.DecodeRune(b)
	181	b = b[length:]
	182	} else { // String version.
	183	r, length = utf8.DecodeRuneInString(str)
	184	str = str[length:]
	185	}
	186	if r == utf8.RuneError {
	187	break
	188	}
	189	prop := property(workBreakCodePoints, r)
	190	if prop == prExtend || prop == prFormat || prop == prZWJ {
	191	continue
	192	}
	193	farProperty = prop
	194	break
	195	}
	196	}
	197
	198	// WB6.
	199	if rule > 60 &&
	200	(state == wbALetter || state == wbHebrewLetter) &&
	201	(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
	202	(farProperty == prALetter \|\| farProperty == prHebrewLetter) {
	203	return wbWB7, false
	204	}
	205
	206	// WB7b.
	207	if rule > 72 &&
	208	state == wbHebrewLetter &&
	209	nextProperty == prDoubleQuote &&
	210	farProperty == prHebrewLetter {
	211	return wbWB7c, false
	212	}
	213
	214	// WB12.
	215	if rule > 120 &&
	216	state == wbNumeric &&
	217	(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
	218	farProperty == prNumeric {
	219	return wbWB11, false
	220	}
	221
	222	// WB15 and WB16.
	223	if rule > 160 && nextProperty == prRegionalIndicator {
	224	if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
	225	// Transition into the first RI.
	226	return wbOddRI, true
	227	}
	228	if state == wbOddRI {
	229	// Don't break pairs of Regional Indicators.
	230	return wbEvenRI, false
	231	}
	232	return wbOddRI, true // We can break after a pair.
	233	}
	234
	235	return
86	236	}