Codebase list golang-github-rivo-uniseg / 6ee89ecd-7000-4b73-b56e-0076d3e1c72c/v0.4.3 gen_properties.go
6ee89ecd-7000-4b73-b56e-0076d3e1c72c/v0.4.3

Tree @6ee89ecd-7000-4b73-b56e-0076d3e1c72c/v0.4.3 (Download .tar.gz)

gen_properties.go @6ee89ecd-7000-4b73-b56e-0076d3e1c72c/v0.4.3raw · history · blame

//go:build generate

// This program generates a Go source file containing a property table from
// Unicode Character Database data files (including the auxiliary ones). The
// command line arguments are as follows:
//
//  1. The name of the Unicode data file (just the filename, without extension).
//     Can be "-" (to skip) if the emoji flag is included.
//  2. The name of the locally generated Go file.
//  3. The name of the slice mapping code points to properties.
//  4. The name of the generator, for logging purposes.
//  5. (Optional) Flags, comma-separated. The following flags are available:
//     - "emojis=<property>": include the specified emoji properties (e.g.
//     "Extended_Pictographic").
//     - "gencat": include general category properties.
//
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
package main

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"go/format"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// URLs of the Unicode data files to download. They are pinned to a specific
// Unicode version rather than the latest one. When the package is upgraded to
// a new Unicode version, change the version in both URLs and regenerate the
// property files.
const (
	// propertyURL is a format string. Its "%s" verb is replaced with the data
	// file name given as the first command line argument.
	propertyURL = `https://www.unicode.org/Public/14.0.0/ucd/%s.txt`
	// emojiURL is the emoji data file that is read when an emoji property is
	// requested via the "emojis" flag.
	emojiURL    = `https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt`
)

// The regular expression for a line containing a code point range property.
// Capture groups: 1 is the first code point (4 to 6 upper-case hex digits), 3
// is the optional last code point of a "first..last" range, 4 is the property
// name, and 5 is the text of the trailing "#" comment.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)

// main is the entry point of the generator. It reads the command line
// arguments described at the top of this file, converts the requested Unicode
// data to Go source code via parse, formats that code, and writes it to the
// target file. Any failure terminates the program with a non-zero exit status.
func main() {
	if len(os.Args) < 5 {
		// Usage errors go to stderr so they are not lost when stdout is
		// redirected.
		fmt.Fprintln(os.Stderr, "Not enough arguments, see code for details")
		os.Exit(1)
	}

	log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
	log.SetFlags(0)

	// Parse flags. A flag without a value ("gencat") is stored as "yes".
	flags := make(map[string]string)
	if len(os.Args) >= 6 {
		for _, flag := range strings.Split(os.Args[5], ",") {
			// Split at the first "=" only so a value may itself contain "=".
			flagFields := strings.SplitN(flag, "=", 2)
			if len(flagFields) == 1 {
				flags[flagFields[0]] = "yes"
			} else {
				flags[flagFields[0]] = flagFields[1]
			}
		}
	}

	// Parse the text file and generate Go source code from it. A file name of
	// "-" leaves the main URL empty, which makes parse skip the main file.
	_, includeGeneralCategory := flags["gencat"]
	var mainURL string
	if os.Args[1] != "-" {
		mainURL = fmt.Sprintf(propertyURL, os.Args[1])
	}
	src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
	if err != nil {
		log.Fatal(err)
	}

	// Format the Go code.
	formatted, err := format.Source([]byte(src))
	if err != nil {
		// Fatalf is used because log.Fatal adds no space between a string
		// operand and the error.
		log.Fatalf("gofmt: %v", err)
	}

	// Save it to the (local) target file.
	log.Print("Writing to ", os.Args[2])
	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
		log.Fatal(err)
	}
}

// parse parses the Unicode Properties text files located at the given URLs and
// returns their equivalent Go source code to be used in the uniseg package. If
// "emojiProperty" is not an empty string, emoji code points for that emoji
// property (e.g. "Extended_Pictographic") will be included. In those cases, you
// may pass an empty "propertyURL" to skip parsing the main properties file. If
// "includeGeneralCategory" is true, the Unicode General Category property will
// be extracted from the comments and included in the output.
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
	if propertyURL == "" && emojiProperty == "" {
		return "", errors.New("no properties to parse")
	}

	// Temporary buffer to hold properties.
	var properties [][4]string

	// Open the first URL.
	if propertyURL != "" {
		log.Printf("Parsing %s", propertyURL)
		res, err := http.Get(propertyURL)
		if err != nil {
			return "", err
		}
		in1 := res.Body
		defer in1.Close()

		// Parse it.
		scanner := bufio.NewScanner(in1)
		num := 0
		for scanner.Scan() {
			num++
			line := strings.TrimSpace(scanner.Text())

			// Skip comments and empty lines.
			if strings.HasPrefix(line, "#") || line == "" {
				continue
			}

			// Everything else must be a code point range, a property and a comment.
			from, to, property, comment, err := parseProperty(line)
			if err != nil {
				return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
			}
			properties = append(properties, [4]string{from, to, property, comment})
		}
		if err := scanner.Err(); err != nil {
			return "", err
		}
	}

	// Open the second URL.
	if emojiProperty != "" {
		log.Printf("Parsing %s", emojiURL)
		res, err := http.Get(emojiURL)
		if err != nil {
			return "", err
		}
		in2 := res.Body
		defer in2.Close()

		// Parse it.
		scanner := bufio.NewScanner(in2)
		num := 0
		for scanner.Scan() {
			num++
			line := scanner.Text()

			// Skip comments, empty lines, and everything not containing
			// "Extended_Pictographic".
			if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
				continue
			}

			// Everything else must be a code point range, a property and a comment.
			from, to, property, comment, err := parseProperty(line)
			if err != nil {
				return "", fmt.Errorf("emojis line %d: %v", num, err)
			}
			properties = append(properties, [4]string{from, to, property, comment})
		}
		if err := scanner.Err(); err != nil {
			return "", err
		}
	}

	// Sort properties.
	sort.Slice(properties, func(i, j int) bool {
		left, _ := strconv.ParseUint(properties[i][0], 16, 64)
		right, _ := strconv.ParseUint(properties[j][0], 16, 64)
		return left < right
	})

	// Header.
	var (
		buf          bytes.Buffer
		emojiComment string
	)
	columns := 3
	if includeGeneralCategory {
		columns = 4
	}
	if emojiURL != "" {
		emojiComment = `
// and
// ` + emojiURL + `
// ("Extended_Pictographic" only)`
	}
	buf.WriteString(`package uniseg

// Code generated via go generate from gen_properties.go. DO NOT EDIT.

// ` + os.Args[3] + ` are taken from
// ` + propertyURL + emojiComment + `
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
	`)

	// Properties.
	for _, prop := range properties {
		if includeGeneralCategory {
			generalCategory := "gc" + prop[3][:2]
			if generalCategory == "gcL&" {
				generalCategory = "gcLC"
			}
			prop[3] = prop[3][3:]
			fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
		} else {
			fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
		}
	}

	// Tail.
	buf.WriteString("}")

	return buf.String(), nil
}

// parseProperty splits one data line of a Unicode properties text file into
// its parts: the first and last code point of the range (as hex strings), the
// property name, and the trailing comment. For a line that names a single code
// point instead of a range, "to" equals "from". An error is returned if the
// line does not match the property pattern.
func parseProperty(line string) (from, to, property, comment string, err error) {
	match := propertyPattern.FindStringSubmatch(line)
	if match == nil {
		return "", "", "", "", errors.New("no property found")
	}

	// Group 3 is empty when the line has no "..last" part.
	first, last := match[1], match[3]
	if last == "" {
		last = first
	}

	return first, last, match[4], match[5], nil
}

// translateProperty converts a property name from the Unicode data file into
// the name of the corresponding Go identifier: all underscores are removed and
// the given prefix is put in front.
func translateProperty(prefix, property string) string {
	// A negative count replaces every occurrence.
	compact := strings.Replace(property, "_", "", -1)
	return prefix + compact
}