Codebase list mozc / debian/2.18.2595.102+dfsg-1 src / rewriter / gen_single_kanji_rewriter_data.py
debian/2.18.2595.102+dfsg-1

Tree @debian/2.18.2595.102+dfsg-1 (Download .tar.gz)

gen_single_kanji_rewriter_data.py @debian/2.18.2595.102+dfsg-1raw · history · blame

# -*- coding: utf-8 -*-
# Copyright 2010-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Single kanji dictionary generator.

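How to run this script:
gen_single_kanji_rewriter_data.py
  --single_kanji_file=single_kanji.tsv
  --variant_file=variant_rule.txt
  --output_single_kanji_token=single_kanji_token.data
  --output_single_kanji_string=single_kanji_string.data
  --output_variant_types=variant_types.data
  --output_variant_tokens=variant_tokens.data
  --output_variant_strings=variant_strings.data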
"""

__author__ = "hidehiko"

import optparse
import struct

from build_tools import code_generator_util
from build_tools import serialized_string_array_builder


def ReadSingleKanji(stream):
  """Parses single kanji dictionary data from stream.

  Args:
    stream: The input to parse. It is passed through
      code_generator_util.SkipLineComment and then through
      code_generator_util.ParseColumnStream with num_column=2.

  Returns:
    A list of the parsed rows, sorted by their first column (the key).
  """
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, num_column=2)
  # For binary search by |key|, sort outputs here.  A key function is used
  # rather than a cmp function: list.sort() accepts a positional cmp function
  # (and the cmp builtin exists) only on Python 2, while key= works on both
  # Python 2 and Python 3.  The sort is stable, so the resulting order is the
  # same as comparing the first columns pairwise.
  return sorted(stream, key=lambda row: row[0])


def ReadVariant(stream):
  """Parses variant data from stream.

  A row with a single column starts a new variant type.  A row with two
  columns, (target, original), is recorded as an item of the most recently
  started variant type.  Rows of any other width, and two-column rows that
  appear before the first variant type, are ignored.

  Args:
    stream: The input to parse. It is passed through
      code_generator_util.SkipLineComment and then through
      code_generator_util.ParseColumnStream.

  Returns:
    A (variant_types, variant_items) tuple.  variant_types is the list of
    type names in order of appearance.  variant_items is a list of
    [target, original, type_index] lists sorted by target, where type_index
    is the position of the item's type in variant_types.
  """
  variant_types = []
  variant_items = []

  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream)
  for tokens in stream:
    if len(tokens) == 1:
      variant_types.append(tokens[0])
    elif len(tokens) == 2 and variant_types:
      (target, original) = tokens
      # The item belongs to the type declared most recently.
      variant_items.append([target, original, len(variant_types) - 1])

  # For binary search by |target|, sort variant items here.  A key function
  # is used rather than a cmp function: the positional cmp argument of
  # list.sort() and the cmp builtin exist only on Python 2, while key= works
  # on both Python 2 and Python 3 and yields the same (stable) order.
  variant_items.sort(key=lambda item: item[0])

  return (variant_types, variant_items)


def WriteSingleKanji(single_kanji_dic, output_tokens, output_string_array):
  """Writes the single kanji entries as a token file plus a string array.

  Entry i contributes two little-endian uint32 values to the token file,
  2 * i and 2 * i + 1, which are the positions of its key and its value in
  the string array.  See rewriter/single_kanji_rewriter.cc.

  Args:
    single_kanji_dic: Iterable of (key, value) pairs.
    output_tokens: Path of the binary token file to write.
    output_string_array: Destination passed to
      serialized_string_array_builder.SerializeToFile for the strings.
  """
  string_table = []
  with open(output_tokens, 'wb') as token_file:
    for entry_id, (key, value) in enumerate(single_kanji_dic):
      key_index = 2 * entry_id
      # Key and value sit next to each other in the string table.
      token_file.write(struct.pack('<II', key_index, key_index + 1))
      string_table.extend((key, value))
  serialized_string_array_builder.SerializeToFile(string_table,
                                                  output_string_array)


def WriteVariantInfo(variant_info,
                     output_variant_types,
                     output_variant_tokens,
                     output_variant_strings):
  """Writes the variant types, variant tokens and variant strings.

  Item i contributes three little-endian uint32 values to the token file:
  2 * i (position of its target in the string array), 2 * i + 1 (position
  of its original in the string array) and its variant type ID.  See
  rewriter/single_kanji_rewriter.cc.

  Args:
    variant_info: A (variant_types, variant_items) tuple, where each item
      unpacks as (target, original, variant_type).
    output_variant_types: Destination passed to SerializeToFile for the
      variant type names.
    output_variant_tokens: Path of the binary token file to write.
    output_variant_strings: Destination passed to SerializeToFile for the
      target/original strings.
  """
  type_names, items = variant_info

  serialized_string_array_builder.SerializeToFile(type_names,
                                                  output_variant_types)

  string_table = []
  with open(output_variant_tokens, 'wb') as token_file:
    for item_id, (target, original, type_id) in enumerate(items):
      target_index = 2 * item_id
      # Target and original sit next to each other in the string table.
      token_file.write(
          struct.pack('<III', target_index, target_index + 1, type_id))
      string_table.extend((target, original))

  serialized_string_array_builder.SerializeToFile(string_table,
                                                  output_variant_strings)


def _ParseOptions():
  """Parses the command line flags of this script.

  Returns:
    The optparse values object.  Every flag is stored under the attribute
    named after the flag without its leading dashes.
  """
  # (flag name without the leading dashes, help text), in registration order.
  flag_specs = (
      ('single_kanji_file', 'Single kanji file'),
      ('variant_file', 'Variant rule file'),
      ('output_single_kanji_token', 'Output Single Kanji token data.'),
      ('output_single_kanji_string', 'Output Single Kanji string data.'),
      ('output_variant_types', 'Output variant types.'),
      ('output_variant_tokens', 'Output variant tokens.'),
      ('output_variant_strings', 'Output variant strings.'),
  )

  parser = optparse.OptionParser()
  for flag_name, help_text in flag_specs:
    parser.add_option('--' + flag_name, dest=flag_name, help=help_text)

  # Positional arguments are not used by this script.
  (options, unused_args) = parser.parse_args()
  return options


def main():
  """Parses the flags, reads both input files, then writes the outputs."""
  opts = _ParseOptions()

  # Both inputs are parsed before any output is written.
  with open(opts.single_kanji_file, 'r') as kanji_file:
    kanji_entries = ReadSingleKanji(kanji_file)
  with open(opts.variant_file, 'r') as rule_file:
    variants = ReadVariant(rule_file)

  WriteSingleKanji(single_kanji_dic=kanji_entries,
                   output_tokens=opts.output_single_kanji_token,
                   output_string_array=opts.output_single_kanji_string)
  WriteVariantInfo(variant_info=variants,
                   output_variant_types=opts.output_variant_types,
                   output_variant_tokens=opts.output_variant_tokens,
                   output_variant_strings=opts.output_variant_strings)


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
  main()