Codebase list golang-github-jdkato-prose / debian/1.2.1-1 summarize / syllables.go
debian/1.2.1-1

Tree @debian/1.2.1-1 (Download .tar.gz)

syllables.go @debian/1.2.1-1raw · history · blame

package summarize

import (
	"regexp"
	"strings"
	"unicode/utf8"
)

type clearFunc func(word string, suffixes []string) string

// Syllables returns the number of syllables in the string word.
//
// NOTE: This function expects a word (not raw text) as input.
func Syllables(word string) int {
	// See if we can leave early ...
	length := utf8.RuneCountInString(word)
	if length < 1 {
		return 0
	} else if length < 3 {
		return 1
	}

	// See if this is a known cornercase ...
	word = strings.ToLower(word)
	if syllables, ok := cornercases[word]; ok {
		return syllables
	}
	text, count := clean(word)

	// Count multiple vowels
	count += len(vowels.FindAllString(text, -1))

	count -= len(monosyllabicOne.FindAllString(text, -1))
	count -= len(monosyllabicTwo.FindAllString(text, -1))

	count += len(doubleSyllabicOne.FindAllString(text, -1))
	count += len(doubleSyllabicTwo.FindAllString(text, -1))
	count += len(doubleSyllabicThree.FindAllString(text, -1))
	count += len(doubleSyllabicFour.FindAllString(text, -1))

	if count < 1 {
		return 1
	}
	return count
}

func clean(word string) (string, int) {
	var prefix, suffix int
	word, prefix = clearPart(word, incrementToPrefix, trimAnyPrefix)
	word, suffix = clearPart(word, incrementToSuffix, trimAnySuffix)
	return word, prefix + suffix
}

func clearPart(s string, options [][]string, clearer clearFunc) (string, int) {
	old := s
	pos := len(options)
	for i, trim := range options {
		s = clearer(s, trim)
		if s != old {
			return s, (pos - i)
		}
	}
	return s, 0
}

func trimAnySuffix(word string, suffixes []string) string {
	for _, suffix := range suffixes {
		if strings.HasSuffix(word, suffix) {
			return strings.TrimSuffix(word, suffix)
		}
	}
	return word
}

func trimAnyPrefix(word string, prefixes []string) string {
	for _, prefix := range prefixes {
		if strings.HasPrefix(word, prefix) {
			return strings.TrimPrefix(word, prefix)
		}
	}
	return word
}

var cornercases = map[string]int{
	"abalone":     4,
	"abare":       3,
	"abed":        2,
	"abruzzese":   4,
	"abbruzzese":  4,
	"aborigine":   5,
	"aborigines":  5,
	"acreage":     3,
	"adame":       3,
	"adieu":       2,
	"adobe":       3,
	"anemone":     4,
	"apache":      3,
	"aphrodite":   4,
	"apostrophe":  4,
	"ariadne":     4,
	"cafe":        2,
	"cafes":       2,
	"calliope":    4,
	"catastrophe": 4,
	"chile":       2,
	"chloe":       2,
	"circe":       2,
	"coyote":      3,
	"epitome":     4,
	"facsimile":   4,
	"forever":     3,
	"gethsemane":  4,
	"guacamole":   4,
	"hyperbole":   4,
	"jesse":       2,
	"jukebox":     2,
	"karate":      3,
	"machete":     3,
	"maybe":       2,
	"people":      2,
	"recipe":      3,
	"sesame":      3,
	"shoreline":   2,
	"simile":      3,
	"syncope":     3,
	"tamale":      3,
	"yosemite":    4,
	"daphne":      2,
	"eurydice":    4,
	"euterpe":     3,
	"hermione":    4,
	"penelope":    4,
	"persephone":  4,
	"phoebe":      2,
	"zoe":         2,
}

var monosyllabicOne = regexp.MustCompile("cia(l|$)|" +
	"tia|" +
	"cius|" +
	"cious|" +
	"[^aeiou]giu|" +
	"[aeiouy][^aeiouy]ion|" +
	"iou|" +
	"sia$|" +
	"eous$|" +
	"[oa]gue$|" +
	".[^aeiuoycgltdb]{2,}ed$|" +
	".ely$|" +
	"^jua|" +
	"uai|" +
	"eau|" +
	"^busi$|" +
	"(" +
	"[aeiouy]" +
	"(" +
	"b|" +
	"c|" +
	"ch|" +
	"dg|" +
	"f|" +
	"g|" +
	"gh|" +
	"gn|" +
	"k|" +
	"l|" +
	"lch|" +
	"ll|" +
	"lv|" +
	"m|" +
	"mm|" +
	"n|" +
	"nc|" +
	"ng|" +
	"nch|" +
	"nn|" +
	"p|" +
	"r|" +
	"rc|" +
	"rn|" +
	"rs|" +
	"rv|" +
	"s|" +
	"sc|" +
	"sk|" +
	"sl|" +
	"squ|" +
	"ss|" +
	"th|" +
	"v|" +
	"y|" +
	"z" +
	")" +
	"ed$" +
	")|" +
	"(" +
	"[aeiouy]" +
	"(" +
	"b|" +
	"ch|" +
	"d|" +
	"f|" +
	"gh|" +
	"gn|" +
	"k|" +
	"l|" +
	"lch|" +
	"ll|" +
	"lv|" +
	"m|" +
	"mm|" +
	"n|" +
	"nch|" +
	"nn|" +
	"p|" +
	"r|" +
	"rn|" +
	"rs|" +
	"rv|" +
	"s|" +
	"sc|" +
	"sk|" +
	"sl|" +
	"squ|" +
	"ss|" +
	"st|" +
	"t|" +
	"th|" +
	"v|" +
	"y" +
	")" +
	"es$" +
	")",
)
var monosyllabicTwo = regexp.MustCompile("[aeiouy]" +
	"(" +
	"b|" +
	"c|" +
	"ch|" +
	"d|" +
	"dg|" +
	"f|" +
	"g|" +
	"gh|" +
	"gn|" +
	"k|" +
	"l|" +
	"ll|" +
	"lv|" +
	"m|" +
	"mm|" +
	"n|" +
	"nc|" +
	"ng|" +
	"nn|" +
	"p|" +
	"r|" +
	"rc|" +
	"rn|" +
	"rs|" +
	"rv|" +
	"s|" +
	"sc|" +
	"sk|" +
	"sl|" +
	"squ|" +
	"ss|" +
	"st|" +
	"t|" +
	"th|" +
	"v|" +
	"y|" +
	"z" +
	")" +
	"e$",
)
var doubleSyllabicOne = regexp.MustCompile("(?:" +
	"[^aeiouy]ie" +
	"(" +
	"r|" +
	"st|" +
	"t" +
	")|" +
	"[aeiouym]bl|" +
	"eo|" +
	"ism|" +
	"asm|" +
	"thm|" +
	"dnt|" +
	"uity|" +
	"dea|" +
	"gean|" +
	"oa|" +
	"ua|" +
	"eings?|" +
	"[aeiouy]sh?e[rsd]" +
	")$")

var doubleSyllabicTwo = regexp.MustCompile(
	"[^gq]ua[^auieo]|[aeiou]{3}|^(ia|mc|coa[dglx].)")

var doubleSyllabicThree = regexp.MustCompile(
	"[^aeiou]y[ae]|" +
		"[^l]lien|" +
		"riet|" +
		"dien|" +
		"iu|" +
		"io|" +
		"ii|" +
		"uen|" +
		"real|" +
		"iell|" +
		"eo[^aeiou]|" +
		"[aeiou]y[aeiou]",
)
var doubleSyllabicFour = regexp.MustCompile(
	"[^s]ia",
)
var vowels = regexp.MustCompile(
	"[aeiouy]+",
)
var incrementToPrefix = [][]string{
	{
		"above", "anti", "ante", "counter", "hyper", "afore", "agri", "infra",
		"intra", "inter", "over", "semi", "ultra", "under", "extra", "dia",
		"micro", "mega", "kilo", "pico", "nano", "macro"},
	{
		"un", "fore", "ware", "none", "non", "out", "post", "sub", "pre",
		"pro", "dis", "side"},
}
var incrementToSuffix = [][]string{
	{"ology", "ologist", "onomy", "onomist"},
	{"fully", "berry", "woman", "women"},
	{
		"ly", "less", "some", "ful", "er", "ers", "ness", "cian", "cians",
		"ment", "ments", "ette", "ettes", "ville", "villes", "ships", "ship",
		"side", "sides", "port", "ports", "shire", "shires", "tion", "tioned"},
}