base/gen_character_set.py - mozc (7306a9d)

Tree @7306a9d (Download .tar.gz)

gen_character_set.py @7306a9d — raw · history · blame

# -*- coding: utf-8 -*-
# Copyright 2010-2012, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__author__ = "taku"

import re
import string
import sys

# Matches a run of two to four hexadecimal digits.
# NOTE(review): not referenced anywhere else in the visible code -- confirm
# whether it is still needed.
kUnicodePat = re.compile(r'[0-9A-Fa-f]{2,4}')

def IsValidUCS2(n):
  """Returns True if |n| is a hex string whose value fits in 16 bits.

  Args:
    n: string to be parsed as a base-16 integer.

  Returns:
    True when |n| parses as hexadecimal and lies in [0, 0xFFFF];
    False when it is out of range or is not valid hexadecimal.
  """
  try:
    code_point = int(n, 16)
  except ValueError:
    # Not a hexadecimal number at all.
    return False
  return 0 <= code_point <= 0xFFFF

def IsValidUCS4(n):
  """Returns True if |n| is a hex string whose value fits in 31 bits.

  Args:
    n: string to be parsed as a base-16 integer.

  Returns:
    True when |n| parses as hexadecimal and lies in [0, 0x7FFFFFFF];
    False when it is out of range or is not valid hexadecimal.
  """
  try:
    code_point = int(n, 16)
  except ValueError:
    # Not a hexadecimal number at all.
    return False
  return 0 <= code_point <= 0x7FFFFFFF

def LoadJISX0201(filename):
  """Loads the Unicode code points listed in a JIS X 0201 mapping table.

  Lines starting with '#' and blank lines are skipped.  Every other line is
  split on whitespace and its second column, with any '0x' removed, is
  taken as the code point.

  Args:
    filename: path of the mapping table to read.

  Returns:
    A set of hexadecimal strings (spelled as in the file) whose values
    pass IsValidUCS2.

  Raises:
    IOError: if the file cannot be opened.
    IndexError: if a non-blank data line has fewer than two columns.
  """
  result = set()
  # 'with' guarantees the handle is closed even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      # str.split() replaces the deprecated string.split() helper and
      # matches what the other loaders in this file do.
      array = line.split()
      if not array:
        # Blank line: nothing to parse.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUCS2(ucs2):
        result.add(ucs2)

  return result

def LoadJISX0208(filename):
  """Loads the Unicode code points listed in a JIS X 0208 mapping table.

  Lines starting with '#' and blank lines are skipped.  Every other line is
  split on whitespace and its third column, with any '0x' removed, is
  taken as the code point.  'FF3C' and 'FF0D' are always added.

  Args:
    filename: path of the mapping table to read.

  Returns:
    A set of hexadecimal strings (spelled as in the file) whose values
    pass IsValidUCS2, plus 'FF3C' and 'FF0D'.

  Raises:
    IOError: if the file cannot be opened.
    IndexError: if a non-blank data line has fewer than three columns.
  """
  result = set()
  # 'with' guarantees the handle is closed even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if not array:
        # Blank line: nothing to parse.
        continue
      ucs2 = array[2].replace('0x', '')
      if IsValidUCS2(ucs2):
        result.add(ucs2)

  # FF3C (FULLWIDTH REVERSE SOLIDUS) should be in JISX0208
  result.add('FF3C')

  # FF0D (FULLWIDTH HYPHEN-MINUS) should be in JISX0208
  result.add('FF0D')

  return result

def LoadJISX0212(filename):
  """Loads the Unicode code points listed in a JIS X 0212 mapping table.

  Lines starting with '#' and blank lines are skipped.  Every other line is
  split on whitespace and its second column, with any '0x' removed, is
  taken as the code point.

  Args:
    filename: path of the mapping table to read.

  Returns:
    A set of hexadecimal strings (spelled as in the file) whose values
    pass IsValidUCS2.

  Raises:
    IOError: if the file cannot be opened.
    IndexError: if a non-blank data line has fewer than two columns.
  """
  result = set()
  # 'with' guarantees the handle is closed even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if not array:
        # Blank line: nothing to parse.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUCS2(ucs2):
        result.add(ucs2)

  return result

def LoadCP932(filename):
  """Loads the Unicode code points listed in a CP932 mapping table.

  Lines starting with '#' and blank lines are skipped.  Every other line is
  split on whitespace and its second column, with any '0x' removed, is
  taken as the code point.  Columns that are not valid hexadecimal are
  rejected by IsValidUCS2 and therefore ignored.

  Args:
    filename: path of the mapping table to read.

  Returns:
    A set of hexadecimal strings (spelled as in the file) whose values
    pass IsValidUCS2.

  Raises:
    IOError: if the file cannot be opened.
    IndexError: if a non-blank data line has fewer than two columns.
  """
  result = set()
  # 'with' guarantees the handle is closed even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if not array:
        # Blank line: nothing to parse.
        continue
      ucs2 = array[1].replace('0x', '')
      if IsValidUCS2(ucs2):
        result.add(ucs2)

  return result

def LoadJISX0213(filename):
  """Loads the Unicode code points listed in a JIS X 0213 mapping table.

  Lines starting with '#' and blank lines are skipped.  Every other line is
  split on whitespace and its second column is parsed as 'U+xxxx'.  For
  entries written as 'U+xxxx+xxxx' only the first code point is kept.

  Args:
    filename: path of the mapping table to read.

  Returns:
    A set of hexadecimal strings (spelled as in the file) whose values
    pass IsValidUCS4.

  Raises:
    IOError: if the file cannot be opened.
    IndexError: if a non-blank data line has fewer than two columns.
  """
  result = set()
  # 'with' guarantees the handle is closed even if parsing fails.
  with open(filename) as fh:
    for line in fh:
      if line.startswith('#'):
        continue
      array = line.split()
      if not array:
        # Blank line: nothing to parse.
        continue
      # Some JIS X 0213 characters are described as 'U+xxxx+xxxx'.
      ucs4 = array[1].replace('U+', '').split('+')[0]
      if IsValidUCS4(ucs4):
        result.add(ucs4)

  return result

# The following characters have different mappings on
# Windows and Mac. Technically, they are platform-dependent
# characters, but Mozc treats them as normal characters
# defined in JISX0208.
def LoadExceptions():
  """Returns the code points that are always categorized as JISX0208.

  These are the Unicode characters whose Japanese legacy-encoding mapping
  is incompatible between platforms.  Categorize reports "JISX0208" for
  every non-ASCII code point found in this set, regardless of the other
  tables.

  Returns:
    A set of four-digit uppercase hexadecimal strings.
  """
  # treat Unicode Japanese incompatible characters as JISX0208.
  return set([
      '00A5',  # YEN SIGN
      # OVERLINE is U+203E.  The value used to be '003E' (GREATER-THAN SIGN),
      # an ASCII character that Categorize reports as ASCII before this set
      # is ever consulted, so OVERLINE was silently missing.
      '203E',  # OVERLINE
      '301C',  # WAVE DASH
      'FF5E',  # FULL WIDTH TILDE
      '2016',  # DOUBLE VERTICAL LINE
      '2225',  # PARALLEL TO
      '2212',  # MINUS SIGN
      'FF0D',  # FULL WIDTH HYPHEN MINUS
      '00A2',  # CENT SIGN
      'FFE0',  # FULL WIDTH CENT SIGN
      '00A3',  # POUND SIGN
      'FFE1',  # FULL WIDTH POUND SIGN
      '00AC',  # NOT SIGN
      'FFE2',  # FULL WIDTH NOT SIGN
  ])

def Lookup(key, hash):
  """Returns "D" if |key| is contained in |hash|, otherwise "N".

  Args:
    key: value to look for.
    hash: any container supporting the 'in' operator.
  """
  return "D" if key in hash else "N"

def Categorize(key, pattern):
  """Maps a code point and its table-membership pattern to a category name.

  Args:
    key: the code point as a hexadecimal string.
    pattern: six space-separated flags, each "D" (present) or "N" (absent),
      in the order: exceptions, CP932, JISX0201, JISX0208, JISX0212,
      JISX0213.

  Returns:
    "ASCII" for code points up to U+007F; otherwise the name paired with
    the first entry of kMapping whose regexp matches |pattern|.

  Raises:
    ValueError: if |key| is not hexadecimal, or if no entry of kMapping
      matches |pattern|.
  """
  # ASCII
  if int(key, 16) <= 0x007F:
    return "ASCII"

  # "EXCEPTIONS CP932 JISX0201 JISX0208 JISX0212 JISX0213" and "D or N"
  # regexp => result mapping.  Order matters: the first match wins.
  kMapping = ( ( "D . . . . .",  "JISX0208" ),       # vendor specific
               ( ". N N N N N",  "UNICODE_ONLY"  ),  # not in JIS nor CP932
               ( "N D N N N N",  "CP932"    ),       # only CP932
               ( "N . D . . .",  "JISX0201" ),       # at least in JISX0201
               ( "N . N D . .",  "JISX0208" ),       # at least in JISX0208
               ( "N . N N D .",  "JISX0212" ),       # in JISX0212
               ( "N . N N N D",  "JISX0213" ) )      # in JISX0213

  for regexp, category in kMapping:
    if re.search(regexp, pattern):
      return category

  # A bare string is not a valid exception object; raising one only produces
  # a TypeError that hides this message.
  raise ValueError('Cannot find pattern %s ' % (pattern))

def OutputTable():
  """Prints the C++ function Util::GetCharacterSet() to standard output.

  The mapping tables are read from the paths given in sys.argv[1] to
  sys.argv[5], in the order CP932, JISX0201, JISX0208, JISX0212, JISX0213.
  Every code point from 0 up to the largest one found in any table is
  categorized, consecutive code points with the same category are grouped,
  and each group other than UNICODE_ONLY becomes a run of 'case' labels
  sharing one 'return'.
  """
  # Order matters: it defines the column order of the pattern handed to
  # Categorize (exceptions first, then the five tables).
  tables = [LoadExceptions(),
            LoadCP932(sys.argv[1]),
            LoadJISX0201(sys.argv[2]),
            LoadJISX0208(sys.argv[3]),
            LoadJISX0212(sys.argv[4]),
            LoadJISX0213(sys.argv[5])]

  # Largest code point that appears in any of the tables.
  max_char = 0
  for table in tables:
    largest = max([int(c, 16) for c in table])
    if largest > max_char:
      max_char = largest

  # Category name of every code point, indexed by code point.
  categories = []
  for code in xrange(0, max_char + 1):
    key = ("%5.4X" % (code)).lstrip(' ')
    pattern = " ".join([Lookup(key, table) for table in tables])
    categories.append(Categorize(key, pattern))

  # Grouping: collapse runs of consecutive code points sharing a category
  # into [category, [code points]] pairs.
  groups = []
  for code, name in enumerate(categories):
    if groups and groups[-1][0] == name:
      groups[-1][1].append(code)
    else:
      groups.append([name, [code]])

  print("Util::CharacterSet Util::GetCharacterSet(char32 ucs4) {")
  print("  switch (ucs4) {")
  for name, codes in groups:
    # UNICODE_ONLY is what the 'default' label returns, so it needs no cases.
    if name == "UNICODE_ONLY":
      continue
    for code in codes:
      print("    case 0x%8.8X:" % (code))
    print("      return %s;" % (name))
    print("      break;")

  print("    default:")
  print("      return UNICODE_ONLY;")
  print("      break;")
  print("  }")   # end switch
  print("  return UNICODE_ONLY;")
  print("}")     # end function

def main():
  """Prints the generated-file banner followed by the lookup function."""
  banner = ("// This file is generated by base/gen_character_set.py",
            "// Do not edit me!")
  for text in banner:
    print(text)
  OutputTable()

# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
  main()