// sentences_test.go - golang-gopkg-neurosnap-sentences.v1 (b7aea962-f804-4bc0-a9dc-6c3667da5872/main)
//
// sentences_test.go @b7aea962-f804-4bc0-a9dc-6c3667da5872/main — raw · history · blame

package sentences

import (
	"io/ioutil"
	"strings"
	"testing"

	td "github.com/neurosnap/sentences/data"
)

// loadTokenizer builds a sentence tokenizer from the training data named by
// data. The raw bytes come from td.Asset and are decoded with LoadTraining;
// a failure in either step panics with the returned error.
func loadTokenizer(data string) *DefaultSentenceTokenizer {
	// must aborts the calling test run on any setup error.
	must := func(err error) {
		if err != nil {
			panic(err)
		}
	}

	raw, err := td.Asset(data)
	must(err)

	storage, err := LoadTraining(raw)
	must(err)

	return NewSentenceTokenizer(storage)
}

// readFile returns the full contents of the file at fname as a string.
// It panics with the underlying error if the file cannot be read.
func readFile(fname string) string {
	raw, readErr := ioutil.ReadFile(fname)
	if readErr != nil {
		panic(readErr)
	}

	text := string(raw)
	return text
}

// getFileLocation prepends prefix to both file names and returns the two
// resulting paths as a two-element slice: the original text file first,
// the expected-output file second. No path separator is inserted; prefix
// is concatenated as-is.
func getFileLocation(prefix, original, expected string) []string {
	paths := make([]string, 0, 2)
	for _, name := range []string{original, expected} {
		paths = append(paths, strings.Join([]string{prefix, name}, ""))
	}
	return paths
}

// TestEnglish tokenizes every English fixture under test_files/english/ and
// compares the output, sentence by sentence, with the matching "_s" file,
// in which the expected sentences are separated by "{{sentence_break}}".
// Whitespace around each sentence is trimmed on both sides before comparing.
//
// The test fails on the first mismatch, when the tokenizer produces more
// sentences than the fixture lists, or when a non-blank expected sentence
// has no counterpart in the tokenizer output.
func TestEnglish(t *testing.T) {
	t.Log("Starting test suite ...")

	tokenizer := loadTokenizer("data/english.json")

	prefix := "test_files/english/"

	testFiles := [][]string{
		getFileLocation(prefix, "carolyn.txt", "carolyn_s.txt"),
		getFileLocation(prefix, "ecig.txt", "ecig_s.txt"),
		getFileLocation(prefix, "foul_ball.txt", "foul_ball_s.txt"),
		getFileLocation(prefix, "fbi.txt", "fbi_s.txt"),
		getFileLocation(prefix, "dre.txt", "dre_s.txt"),
		getFileLocation(prefix, "dr.txt", "dr_s.txt"),
		getFileLocation(prefix, "quotes.txt", "quotes_s.txt"),
		getFileLocation(prefix, "kiss.txt", "kiss_s.txt"),
		getFileLocation(prefix, "kentucky.txt", "kentucky_s.txt"),
		getFileLocation(prefix, "iphone6s.txt", "iphone6s_s.txt"),
		getFileLocation(prefix, "lebanon.txt", "lebanon_s.txt"),
		getFileLocation(prefix, "duma.txt", "duma_s.txt"),
		getFileLocation(prefix, "demolitions.txt", "demolitions_s.txt"),
		getFileLocation(prefix, "qa.txt", "qa_s.txt"),
		getFileLocation(prefix, "anarchy.txt", "anarchy_s.txt"),
		getFileLocation(prefix, "ethicist.txt", "ethicist_s.txt"),
		getFileLocation(prefix, "self_reliance.txt", "self_reliance_s.txt"),
		getFileLocation(prefix, "punct.txt", "punct_s.txt"),
		getFileLocation(prefix, "clinton.txt", "clinton_s.txt"),
		getFileLocation(prefix, "markets.txt", "markets_s.txt"),
		getFileLocation(prefix, "nyfed.txt", "nyfed_s.txt"),
	}

	for _, f := range testFiles {
		actualText := readFile(f[0])
		expectedText := readFile(f[1])
		expected := strings.Split(expectedText, "{{sentence_break}}")

		t.Log(f[0])
		sentences := tokenizer.Tokenize(actualText)
		for index, s := range sentences {
			sentence := strings.TrimSpace(s.Text)

			// The tokenizer produced more sentences than the fixture lists:
			// report that instead of panicking on an out-of-range index.
			if index >= len(expected) {
				t.Logf("Actual  : %q", sentence)
				t.Fatalf("%s line %d: Unexpected extra sentence, fixture only lists %d", f[0], index, len(expected))
			}

			want := strings.TrimSpace(expected[index])
			if sentence != want {
				t.Logf("Actual  : %q", sentence)
				t.Log("--------")
				t.Logf("Expected: %q", want)
				t.Fatalf("%s line %d: Actual sentence does not match expected sentence", f[0], index)
			}
		}

		// The tokenizer produced fewer sentences than the fixture lists.
		// Blank leftovers (e.g. from a trailing separator) are tolerated;
		// any real sentence left unmatched is a failure.
		for index := len(sentences); index < len(expected); index++ {
			if missing := strings.TrimSpace(expected[index]); missing != "" {
				t.Logf("Expected: %q", missing)
				t.Fatalf("%s line %d: Expected sentence is missing from tokenizer output", f[0], index)
			}
		}
	}
}

func TestSemicolon(t *testing.T) {
	t.Log("Tokenizer should parse sentences with semicolons")

	tokenizer := loadTokenizer("data/english.json")

	actualText := "I am here; you are over there.  Will the tokenizer output two complete sentences?"
	actual := tokenizer.Tokenize(actualText)

	expected := []string{
		"I am here; you are over there.",
		"  Will the tokenizer output two complete sentences?",
	}

	t.Logf("%v", actual)

	if len(actual) != len(expected) {
		t.Fatalf("Actual: %d, Expected: %d", len(actual), len(expected))
	}

	for index, sent := range actual {
		if sent.Text != expected[index] {
			t.Fatalf("Actual: %s\nExpected: %s", sent.Text, expected[index])
		}
	}
}

func TestEndOfTextNoPunct(t *testing.T) {
	t.Log("Tokenizer should break up sentences even if text doesn't end in punctuation")

	tokenizer := loadTokenizer("data/english.json")

	actualText := "Hi does this work?\n\nIt seems to.  This is great"
	actual := tokenizer.Tokenize(actualText)

	expected := []string{
		"Hi does this work?",
		"\n\nIt seems to.",
		"  This is great",
	}

	t.Logf("%v", actual)

	if len(actual) != len(expected) {
		t.Fatalf("Actual: %d, Expected: %d", len(actual), len(expected))
	}

	for index, sent := range actual {
		if sent.Text != expected[index] {
			t.Fatalf("Actual: %s\nExpected: %s", sent.Text, expected[index])
		}
	}
}

// TestWeirdEllipsis checks that an ellipsis written with spaces between the
// periods (". . .") does not split the text: the whole input must come back
// as a single sentence.
func TestWeirdEllipsis(t *testing.T) {
	t.Log("Tokenizer should not break up ellipsis")

	// The input and the only expected sentence are the same string.
	const input = "Harry Potter . . . what an honor."

	compareSentence(t, input, []string{input})
}

// TestNormalEllipsis checks that a conventional ellipsis ("...") does not
// split the text: the whole input must come back as a single sentence.
func TestNormalEllipsis(t *testing.T) {
	t.Log("Tokenizer should not break up ellipsis")

	// The input and the only expected sentence are the same string.
	const input = "Harry Potter ... what an honor."

	compareSentence(t, input, []string{input})
}

// TestSpacedPeriod checks that a period separated from the preceding word by
// a space still ends the sentence: the input must yield two sentences, the
// second keeping its leading space.
func TestSpacedPeriod(t *testing.T) {
	t.Log("Tokenizer should break up sentence with a barren period")

	compareSentence(t, "Hi my name is steve . what is your name?", []string{
		"Hi my name is steve .",
		" what is your name?",
	})
}

// compareSentence tokenizes actualText with a tokenizer loaded from
// "data/english.json" and fails t unless the result contains exactly
// len(expected) sentences whose Text values equal expected, in order.
//
// The comparison is exact; whitespace is not trimmed, so leading spaces and
// newlines in expected are significant.
func compareSentence(t *testing.T, actualText string, expected []string) {
	tokenizer := loadTokenizer("data/english.json")
	actual := tokenizer.Tokenize(actualText)

	t.Logf("Actual: %v", actual)

	if len(actual) != len(expected) {
		t.Fatalf("Actual: %d, Expected: %d", len(actual), len(expected))
	}

	for index, sent := range actual {
		if sent.Text != expected[index] {
			// %q keeps leading spaces and newlines visible; with %s two
			// strings that differ only in whitespace look identical.
			t.Fatalf("Sentence %d\nActual: %q\nExpected: %q", index, sent.Text, expected[index])
		}
	}
}
// Tree @b7aea962-f804-4bc0-a9dc-6c3667da5872/main (Download .tar.gz)
//
// sentences_test.go @b7aea962-f804-4bc0-a9dc-6c3667da5872/main — raw · history · blame