base/gen_character_set.py - mozc (upstream/1.0.558.102)

Tree @upstream/1.0.558.102 (Download .tar.gz)

gen_character_set.py @upstream/1.0.558.102 — raw · history · blame

# -*- coding: utf-8 -*-
# Copyright 2010, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Module metadata: the name recorded as the author of this script.
__author__ = "taku"

import re
import string
import sys

# Matches a complete code point written as 2 to 4 hexadecimal digits.
# The pattern is anchored at the end of the string (\Z) so that longer
# tokens such as '20B9F' or '304B+309A' are rejected instead of being
# accepted because their first four characters happen to be hex digits.
kUnicodePat = re.compile(r'[0-9A-Fa-f]{2,4}\Z')

def IsValidUnicode(n):
  """Check whether n is a code point spelled as 2 to 4 hex digits.

  Args:
    n: string to test, without any '0x' or 'U+' prefix.

  Returns:
    A match object (truthy) when the whole string consists of 2 to 4
    hexadecimal digits, otherwise None.
  """
  # re.match() anchors at the start; the trailing \Z in the pattern anchors
  # the end, so together they require the entire string to match.
  return kUnicodePat.match(n)

def LoadJISX0201(filename):
  """Collect the Unicode code points listed in a JISX0201 mapping file.

  Lines starting with '#' are treated as comments.  For every other line the
  second whitespace-separated column is taken, its '0x' prefix is removed,
  and the remainder is kept when IsValidUnicode() accepts it.

  Args:
    filename: path of the mapping file to read.

  Returns:
    A set of hexadecimal code point strings (without the '0x' prefix).
  """
  result = set()
  # The with-statement guarantees the file is closed, even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      # str.split() is used like in the other loaders; the module-level
      # string.split() helper is deprecated and absent from Python 3.
      array = line.split()
      if len(array) < 2:
        # Blank or truncated line: there is no Unicode column to read.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUnicode(ucs2):
        result.add(ucs2)

  return result

def LoadJISX0208(filename):
  """Collect the Unicode code points listed in a JISX0208 mapping file.

  Lines starting with '#' are treated as comments.  For every other line the
  third whitespace-separated column is taken, its '0x' prefix is removed,
  and the remainder is kept when IsValidUnicode() accepts it.  FF3C and FF0D
  are always added to the result, whatever the file contains.

  Args:
    filename: path of the mapping file to read.

  Returns:
    A set of hexadecimal code point strings (without the '0x' prefix).
  """
  result = set()
  # The with-statement guarantees the file is closed, even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if len(array) < 3:
        # Blank or truncated line: there is no Unicode column to read.
        continue
      ucs2 = array[2].replace('0x', '')
      if IsValidUnicode(ucs2):
        result.add(ucs2)

  # FF3C (FULLWIDTH REVERSE SOLIDUS) should be in JISX0208
  result.add('FF3C')

  # FF0D (FULLWIDTH HYPHEN-MINUS) should be in JISX0208
  result.add('FF0D')

  return result

def LoadJISX0212(filename):
  """Collect the Unicode code points listed in a JISX0212 mapping file.

  Lines starting with '#' are treated as comments.  For every other line the
  second whitespace-separated column is taken, its '0x' prefix is removed,
  and the remainder is kept when IsValidUnicode() accepts it.

  Args:
    filename: path of the mapping file to read.

  Returns:
    A set of hexadecimal code point strings (without the '0x' prefix).
  """
  result = set()
  # The with-statement guarantees the file is closed, even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if len(array) < 2:
        # Blank or truncated line: there is no Unicode column to read.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUnicode(ucs2):
        result.add(ucs2)

  return result

def LoadCP932(filename):
  """Collect the Unicode code points listed in a CP932 mapping file.

  Lines starting with '#' are treated as comments.  For every other line the
  second whitespace-separated column is taken, its '0x' prefix is removed,
  and the remainder is kept when IsValidUnicode() accepts it.

  Args:
    filename: path of the mapping file to read.

  Returns:
    A set of hexadecimal code point strings (without the '0x' prefix).
  """
  result = set()
  # The with-statement guarantees the file is closed, even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if len(array) < 2:
        # Blank or truncated line: there is no Unicode column to read.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUnicode(ucs2):
        result.add(ucs2)

  return result

def LoadJISX0213(filename):
  """Collect the Unicode code points listed in a JISX0213 mapping file.

  Lines starting with '#' are treated as comments.  For every other line the
  second whitespace-separated column is taken, its 'U+' prefix is removed,
  and the remainder is kept when IsValidUnicode() accepts it.

  Args:
    filename: path of the mapping file to read.

  Returns:
    A set of hexadecimal code point strings (without the 'U+' prefix).
  """
  result = set()
  # The with-statement guarantees the file is closed, even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if len(array) < 2:
        # Blank or truncated line: there is no Unicode column to read.
        continue
      # Unlike the other tables, this one spells code points as 'U+XXXX'.
      ucs2 = array[1].replace('U+', '')
      if IsValidUnicode(ucs2):
        result.add(ucs2)

  return result

# The following characters have different mappings on Windows and Mac.
# Technically they are platform-dependent characters, but Mozc treats
# them as normal characters defined in JISX0208.
def LoadExceptions():
  """Return the code points that are always categorized as JISX0208.

  Returns:
    A set of four-digit uppercase hexadecimal code point strings.
  """
  # Treat the Unicode/Japanese incompatible characters as JISX0208.
  return set([
      '00A5',  # YEN SIGN
      # OVERLINE is U+203E.  U+003E is GREATER-THAN SIGN, an ASCII character
      # that Categorize() reports as ASCII before exceptions are consulted.
      '203E',  # OVERLINE
      '301C',  # WAVE DASH
      'FF5E',  # FULLWIDTH TILDE
      '2016',  # DOUBLE VERTICAL LINE
      '2225',  # PARALLEL TO
      '2212',  # MINUS SIGN
      'FF0D',  # FULLWIDTH HYPHEN-MINUS
      '00A2',  # CENT SIGN
      'FFE0',  # FULLWIDTH CENT SIGN
      '00A3',  # POUND SIGN
      'FFE1',  # FULLWIDTH POUND SIGN
      '00AC',  # NOT SIGN
      'FFE2',  # FULLWIDTH NOT SIGN
  ])

def Lookup(key, hash):
  """Return the one-letter membership flag of key in hash.

  Args:
    key: value to look up.
    hash: container supporting the `in` operator.

  Returns:
    "D" when key is contained in hash, "N" otherwise.
  """
  if key not in hash:
    return "N"
  return "D"

# Ordered (regexp, category) rules applied to a flag pattern.  A pattern is
# six "D"/"N" flags separated by single spaces, in the order
# "exceptions CP932 JISX0201 JISX0208 JISX0212 JISX0213".  The first rule
# whose regexp matches wins; '.' accepts either flag.
_kCategoryRules = (
    (re.compile(r"D . . . . ."), "JISX0208"),      # vendor specific
    (re.compile(r". N N N N N"), "UNICODE_ONLY"),  # not in JIS nor CP932
    (re.compile(r"N D N N N N"), "CP932"),         # only CP932
    (re.compile(r"N . D . . ."), "JISX0201"),      # at least in JISX0201
    (re.compile(r"N . N D . ."), "JISX0208"),      # at least in JISX0208
    (re.compile(r"N . N N D ."), "JISX0212"),      # in JISX0212
    (re.compile(r"N . N N N D"), "JISX0213"),      # in JISX0213
)

def Categorize(key, pattern):
  """Return the character set name for one code point.

  Args:
    key: code point as a hexadecimal string, e.g. "3042".
    pattern: six space-separated "D"/"N" flags in the order
      "exceptions CP932 JISX0201 JISX0208 JISX0212 JISX0213".

  Returns:
    One of "ASCII", "JISX0201", "JISX0208", "JISX0212", "JISX0213",
    "CP932" or "UNICODE_ONLY".

  Raises:
    ValueError: if no rule matches pattern.
  """
  # ASCII is decided by the code point alone; the flags are not consulted.
  if int(key, 16) <= 0x007F:
    return "ASCII"

  for regex, category in _kCategoryRules:
    if regex.search(pattern):
      return category

  # A real exception class is required here: raising a plain string is
  # itself a TypeError on Python 2.6+ and Python 3, which hides the message.
  raise ValueError('Cannot find pattern %s ' % (pattern))

def OutputTable():
  """Write the C++ definition of Util::GetCharacterSet() to stdout.

  The mapping tables are read from the files named by sys.argv[1] to
  sys.argv[5], in the order CP932, JISX0201, JISX0208, JISX0212, JISX0213.
  Every code point from 0x0000 to 0xFFFF is categorized, consecutive code
  points sharing a category are merged into runs, and every run whose
  category is not UNICODE_ONLY is emitted as a group of `case` labels
  followed by one `return`.  UNICODE_ONLY code points are left to the
  `default` label.
  """
  def Emit(text):
    # One output line: the text followed by a single newline.
    sys.stdout.write(text + "\n")

  # Load order is kept: exceptions first, then the files in argv order.
  exceptions = LoadExceptions()
  cp932 = LoadCP932(sys.argv[1])
  jisx0201 = LoadJISX0201(sys.argv[2])
  jisx0208 = LoadJISX0208(sys.argv[3])
  jisx0212 = LoadJISX0212(sys.argv[4])
  jisx0213 = LoadJISX0213(sys.argv[5])
  # The flag order below is the order Categorize() expects in its pattern.
  tables = (exceptions, cp932, jisx0201, jisx0208, jisx0212, jisx0213)

  # Category name of every code point, indexed by code point.
  categories = []
  for code in range(0, 65536):
    key = "%4.4X" % (code)
    flags = [Lookup(key, table) for table in tables]
    categories.append(Categorize(key, " ".join(flags)))

  # Merge neighbours with the same category into [name, first, limit) runs.
  runs = []
  first = 0
  for code in range(1, 65536):
    if categories[code] != categories[code - 1]:
      runs.append([categories[first], first, code])
      first = code
  runs.append([categories[first], first, 65536])

  Emit("Util::CharacterSet Util::GetCharacterSet(uint16 ucs2) {")
  Emit("  switch (ucs2) {")
  for name, begin, limit in runs:
    # UNICODE_ONLY is the switch default, so it needs no case labels.
    if name != "UNICODE_ONLY":
      for code in range(begin, limit):
        Emit("    case 0x%4.4X:" % (code))
      Emit("      return %s;" % (name))
      Emit("      break;")

  Emit("    default:")
  Emit("      return UNICODE_ONLY;")
  Emit("      break;")
  Emit("  }")   # end switch
  Emit("  return UNICODE_ONLY;")
  Emit("}")     # end function

def main():
  """Write the generated-file banner to stdout, then the lookup table."""
  banner = ("// This file is generated by base/gen_character_set.py",
            "// Do not edit me!")
  for text in banner:
    sys.stdout.write(text + "\n")
  OutputTable()

# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
  main()