Codebase list golang-github-rivo-uniseg / run/6ee89ecd-7000-4b73-b56e-0076d3e1c72c/main sentencerules.go
run/6ee89ecd-7000-4b73-b56e-0076d3e1c72c/main

Tree @run/6ee89ecd-7000-4b73-b56e-0076d3e1c72c/main (Download .tar.gz)

sentencerules.go @run/6ee89ecd-7000-4b73-b56e-0076d3e1c72c/mainraw · history · blame

package uniseg

import "unicode/utf8"

// The states of the sentence break parser. A state summarizes which part of a
// sentence break rule the code points consumed so far have matched. States are
// the first element of the keys (and of the values) in sbTransitions.
const (
	// No rule-specific context. Also serves as the wildcard state when
	// transitionSentenceBreakState falls back to less specific transitions.
	sbAny = iota
	// A CR was just read; a following LF must not be split from it (SB3).
	sbCR
	// A paragraph separator (e.g. Sep, LF, or CR LF) was just read; the next
	// code point starts a new sentence (SB4).
	sbParaSep
	// An ATerm was just read.
	sbATerm
	// An Upper was just read (left side of SB7).
	sbUpper
	// A Lower was just read (left side of SB7).
	sbLower
	// (Upper | Lower) ATerm was just read.
	sbSB7
	// ATerm followed by one or more Close was read.
	sbSB8Close
	// ATerm Close* followed by one or more Sp was read.
	sbSB8Sp
	// An STerm was just read.
	sbSTerm
	// STerm followed by one or more Close was read.
	sbSB8aClose
	// STerm Close* followed by one or more Sp was read.
	sbSB8aSp
)

// The sentence break parser's breaking instructions. They are stored as the
// second element of each sbTransitions value and are converted to a boolean
// ("is there a boundary before the next code point?") by
// transitionSentenceBreakState.
const (
	// No sentence boundary before the next code point.
	sbDontBreak = iota
	// A sentence boundary lies before the next code point.
	sbBreak
)

// The sentence break parser's state transitions. It's analogous to
// grTransitions, see comments there for details. Unicode version 14.0.0.
//
// Each key is {current state, property of the next code point}. Each value is
// {new state, breaking instruction, rule number}. When
// transitionSentenceBreakState has to combine a state-only match
// ({state, prAny}) with a property-only match ({sbAny, property}), the entry
// with the lower rule number decides whether to break, so rule numbers double
// as priorities (30 for SB3, 81 for SB8a, 9990 for the catch-all transitions).
var sbTransitions = map[[2]int][3]int{
	// SB3.
	{sbAny, prCR}: {sbCR, sbDontBreak, 9990},
	{sbCR, prLF}:  {sbParaSep, sbDontBreak, 30},

	// SB4.
	{sbAny, prSep}:     {sbParaSep, sbDontBreak, 9990},
	{sbAny, prLF}:      {sbParaSep, sbDontBreak, 9990},
	{sbParaSep, prAny}: {sbAny, sbBreak, 40},
	{sbCR, prAny}:      {sbAny, sbBreak, 40},

	// SB6.
	{sbAny, prATerm}:     {sbATerm, sbDontBreak, 9990},
	{sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
	{sbSB7, prNumeric}:   {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.

	// SB7.
	{sbAny, prUpper}:   {sbUpper, sbDontBreak, 9990},
	{sbAny, prLower}:   {sbLower, sbDontBreak, 9990},
	{sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
	{sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
	{sbSB7, prUpper}:   {sbUpper, sbDontBreak, 70},

	// SB8a.
	{sbAny, prSTerm}:           {sbSTerm, sbDontBreak, 9990},
	{sbATerm, prSContinue}:     {sbAny, sbDontBreak, 81},
	{sbATerm, prATerm}:         {sbATerm, sbDontBreak, 81},
	{sbATerm, prSTerm}:         {sbSTerm, sbDontBreak, 81},
	{sbSB7, prSContinue}:       {sbAny, sbDontBreak, 81},
	{sbSB7, prATerm}:           {sbATerm, sbDontBreak, 81},
	{sbSB7, prSTerm}:           {sbSTerm, sbDontBreak, 81},
	{sbSB8Close, prSContinue}:  {sbAny, sbDontBreak, 81},
	{sbSB8Close, prATerm}:      {sbATerm, sbDontBreak, 81},
	{sbSB8Close, prSTerm}:      {sbSTerm, sbDontBreak, 81},
	{sbSB8Sp, prSContinue}:     {sbAny, sbDontBreak, 81},
	{sbSB8Sp, prATerm}:         {sbATerm, sbDontBreak, 81},
	{sbSB8Sp, prSTerm}:         {sbSTerm, sbDontBreak, 81},
	{sbSTerm, prSContinue}:     {sbAny, sbDontBreak, 81},
	{sbSTerm, prATerm}:         {sbATerm, sbDontBreak, 81},
	{sbSTerm, prSTerm}:         {sbSTerm, sbDontBreak, 81},
	{sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
	{sbSB8aClose, prATerm}:     {sbATerm, sbDontBreak, 81},
	{sbSB8aClose, prSTerm}:     {sbSTerm, sbDontBreak, 81},
	{sbSB8aSp, prSContinue}:    {sbAny, sbDontBreak, 81},
	{sbSB8aSp, prATerm}:        {sbATerm, sbDontBreak, 81},
	{sbSB8aSp, prSTerm}:        {sbSTerm, sbDontBreak, 81},

	// SB9. A CR moves to sbCR (not sbParaSep) so that a following LF is still
	// kept together with it by SB3; any other code point after the CR breaks
	// due to SB4, exactly as it would after sbParaSep.
	{sbATerm, prClose}:     {sbSB8Close, sbDontBreak, 90},
	{sbSB7, prClose}:       {sbSB8Close, sbDontBreak, 90},
	{sbSB8Close, prClose}:  {sbSB8Close, sbDontBreak, 90},
	{sbATerm, prSp}:        {sbSB8Sp, sbDontBreak, 90},
	{sbSB7, prSp}:          {sbSB8Sp, sbDontBreak, 90},
	{sbSB8Close, prSp}:     {sbSB8Sp, sbDontBreak, 90},
	{sbSTerm, prClose}:     {sbSB8aClose, sbDontBreak, 90},
	{sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
	{sbSTerm, prSp}:        {sbSB8aSp, sbDontBreak, 90},
	{sbSB8aClose, prSp}:    {sbSB8aSp, sbDontBreak, 90},
	{sbATerm, prSep}:       {sbParaSep, sbDontBreak, 90},
	{sbATerm, prCR}:        {sbCR, sbDontBreak, 90},
	{sbATerm, prLF}:        {sbParaSep, sbDontBreak, 90},
	{sbSB7, prSep}:         {sbParaSep, sbDontBreak, 90},
	{sbSB7, prCR}:          {sbCR, sbDontBreak, 90},
	{sbSB7, prLF}:          {sbParaSep, sbDontBreak, 90},
	{sbSB8Close, prSep}:    {sbParaSep, sbDontBreak, 90},
	{sbSB8Close, prCR}:     {sbCR, sbDontBreak, 90},
	{sbSB8Close, prLF}:     {sbParaSep, sbDontBreak, 90},
	{sbSTerm, prSep}:       {sbParaSep, sbDontBreak, 90},
	{sbSTerm, prCR}:        {sbCR, sbDontBreak, 90},
	{sbSTerm, prLF}:        {sbParaSep, sbDontBreak, 90},
	{sbSB8aClose, prSep}:   {sbParaSep, sbDontBreak, 90},
	{sbSB8aClose, prCR}:    {sbCR, sbDontBreak, 90},
	{sbSB8aClose, prLF}:    {sbParaSep, sbDontBreak, 90},

	// SB10. Applies to both terminator kinds (ATerm and STerm), hence the
	// entries for sbSB8Sp as well as sbSB8aSp. CR moves to sbCR for the same
	// reason as in SB9.
	{sbSB8Sp, prSp}:    {sbSB8Sp, sbDontBreak, 100},
	{sbSB8aSp, prSp}:   {sbSB8aSp, sbDontBreak, 100},
	{sbSB8Sp, prSep}:   {sbParaSep, sbDontBreak, 100},
	{sbSB8Sp, prCR}:    {sbCR, sbDontBreak, 100},
	{sbSB8Sp, prLF}:    {sbParaSep, sbDontBreak, 100},
	{sbSB8aSp, prSep}:  {sbParaSep, sbDontBreak, 100},
	{sbSB8aSp, prCR}:   {sbCR, sbDontBreak, 100},
	{sbSB8aSp, prLF}:   {sbParaSep, sbDontBreak, 100},

	// SB11.
	{sbATerm, prAny}:     {sbAny, sbBreak, 110},
	{sbSB7, prAny}:       {sbAny, sbBreak, 110},
	{sbSB8Close, prAny}:  {sbAny, sbBreak, 110},
	{sbSB8Sp, prAny}:     {sbAny, sbBreak, 110},
	{sbSTerm, prAny}:     {sbAny, sbBreak, 110},
	{sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
	{sbSB8aSp, prAny}:    {sbAny, sbBreak, 110},
	// We'll always break after ParaSep due to SB4.
}

// transitionSentenceBreakState determines the new state of the sentence break
// parser given the current state and the next code point "r". It also returns
// whether a sentence boundary was detected before "r".
//
// Rule SB8 needs to look past "r". For that lookahead, the text following "r"
// must be supplied either in "b" (used whenever it is not nil) or in "str"
// (used otherwise). Neither is modified for the caller.
//
// A negative state is handled as rule SB1 when "r" is an Extend or Format code
// point: the function then reports a boundary and returns sbAny.
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
	// Determine the property of the next character.
	nextProperty := property(sentenceBreakCodePoints, r)

	// SB5 (Replacing Ignore Rules): Extend and Format code points are
	// transparent, i.e. they keep the current state and never cause a break.
	if nextProperty == prExtend || nextProperty == prFormat {
		if state == sbParaSep || state == sbCR {
			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
		}
		if state < 0 {
			return sbAny, true // SB1.
		}
		return state, false
	}

	// Find the applicable transition in the table.
	var rule int
	if transition, ok := sbTransitions[[2]int{state, nextProperty}]; ok {
		// We have a specific transition. We'll use it.
		newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
	} else {
		// No specific transition found. Try the less specific ones.
		transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
		transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
		switch {
		case okAnyProp && okAnyState:
			// Both apply. We'll use a mix (see comments for grTransitions): the
			// new state always comes from the property-specific transition, the
			// break decision from whichever rule has the lower number.
			newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
			if transAnyProp[2] < transAnyState[2] {
				sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
			}
		case okAnyProp:
			// We only have a specific state.
			// This branch will probably never be reached because okAnyState will
			// always be true given the current transition map. But we keep it here
			// for future modifications to the transition map where this may not be
			// true anymore.
			newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
		case okAnyState:
			// We only have a specific property.
			newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
		default:
			// No known transition. SB999: Any × Any.
			newState, sentenceBreak, rule = sbAny, false, 9990
		}
	}

	// SB8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower.
	// It only matters if no rule up to SB8 has already decided (rule > 80) and
	// we are on the left side of the rule (an ATerm-based state).
	if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
		// Check the right side of the rule: skip ahead until we hit a code
		// point that ends the skippable run.
		var length int
		for nextProperty != prOLetter &&
			nextProperty != prUpper &&
			nextProperty != prLower &&
			nextProperty != prSep &&
			nextProperty != prCR &&
			nextProperty != prLF &&
			nextProperty != prATerm &&
			nextProperty != prSTerm {
			// Move on to the next rune.
			if b != nil { // Byte slice version.
				r, length = utf8.DecodeRune(b)
				b = b[length:]
			} else { // String version.
				r, length = utf8.DecodeRuneInString(str)
				str = str[length:]
			}
			if length == 0 {
				// End of input. We must not test for utf8.RuneError here: that
				// value is also returned for a legitimate U+FFFD and for
				// invalid bytes (with a length of 1 or more), which have to be
				// skipped like any other code point instead of ending the
				// lookahead.
				break
			}
			nextProperty = property(sentenceBreakCodePoints, r)
		}
		if nextProperty == prLower {
			// A Lower follows, so SB8 forbids the break.
			return sbLower, false
		}
	}

	return newState, sentenceBreak
}