Codebase list mozc / debian/1.13.1651.102-1 dictionary / pos_util.py
debian/1.13.1651.102-1

Tree @debian/1.13.1651.102-1 (Download .tar.gz)

pos_util.py @debian/1.13.1651.102-1raw · history · blame

# -*- coding: utf-8 -*-
# Copyright 2010-2014, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Utilities to handle pos related stuff for source code generation."""

__author__ = "hidehiko"


from collections import defaultdict
import logging
import re

from build_tools import code_generator_util


class PosDataBase(object):
  """Utility to look up data in id.def and special_pos.def."""
  def __init__(self):
    self.id_list = []

  def Parse(self, id_file, special_pos_file):
    id_list = []
    with open(id_file, 'r') as stream:
      stream = code_generator_util.SkipLineComment(stream)
      stream = code_generator_util.ParseColumnStream(stream, num_column=2)
      for pos_id, feature in stream:
        id_list.append((feature, int(pos_id)))

    max_id = max(pos_id for _, pos_id in id_list)
    with open(special_pos_file, 'r') as stream:
      stream = code_generator_util.SkipLineComment(stream)
      for pos_id, line in enumerate(stream, start=max_id + 1):
        id_list.append((line, pos_id))
    self.id_list = id_list

  def GetPosId(self, feature):
    """Returns id for the feature if found. Otherwise None."""
    assert feature
    for line, pos_id in self.id_list:
      # Return by prefix match.
      if line.startswith(feature): return pos_id

    logging.warning('Cannot find the POS for: %s', feature)

  @staticmethod
  def _GroupConsecutiveId(iterable):
    result = []
    for value in iterable:
      if result and result[-1] + 1 != value:
        yield result
        result = []
      result.append(value)
    if result:
      yield result

  def GetRange(self, pattern):
    id_list = [
        pos_id for line, pos_id in self.id_list if pattern.match(line)]
    id_list.sort()
    return [(id_range[0], id_range[-1])
            for id_range in PosDataBase._GroupConsecutiveId(id_list)]


class PosMatcher(object):
  def __init__(self, pos_database):
    self.pos_database = pos_database
    self._match_rule_map = {}

  def Parse(self, pos_matcher_rule_file):
    with open(pos_matcher_rule_file, 'r') as stream:
      stream = code_generator_util.SkipLineComment(stream)
      stream = code_generator_util.ParseColumnStream(stream, num_column=2)
      self._match_rule_map = dict(
          (name, (pattern, re.compile(pattern.replace('*', '[^,]+')), sortkey))
          for sortkey, (name, pattern) in enumerate(stream))

  def GetRuleNameList(self):
    """Returns a list of rule names in the original file's order."""
    sorted_rule_list = sorted(
        self._match_rule_map.items(), key=lambda item:item[1][2])
    return [rule_name for rule_name, _ in sorted_rule_list]

  def GetRange(self, name):
    return self.pos_database.GetRange(self._match_rule_map[name][1])

  def GetId(self, name):
    return self.pos_database.GetRange(self._match_rule_map[name][1])[0][0]

  def GetOriginalPattern(self, name):
    return self._match_rule_map[name][0]


class InflectionMap(object):
  """Utility to handle inflection map.

  Inflection map is a map from key to (form, value_suffix, key_suffix).
  """
  def __init__(self):
    self._map = {}

  def Parse(self, filepath):
    result = defaultdict(list)
    with open(filepath, 'r') as stream:
      stream = code_generator_util.SkipLineComment(stream)
      stream = code_generator_util.ParseColumnStream(stream, num_column=4)
      for key, form, value_suffix, key_suffix in stream:
        result[key].append((
            form,
            value_suffix if value_suffix != '*' else '',
            key_suffix if key_suffix != '*' else ''))
    self._map = result

  def Get(self, key):
    return self._map[key]


class UserPos(object):
  """Utility to handle user pos.

  The data is assoc list from user_pos (string) to conjugation_list.
  conjugation_list is a list of (value_suffix, key_suffix, pos_id).
  """
  def __init__(self, pos_database, inflection_map):
    self._pos_database = pos_database
    self._inflection_map = inflection_map
    self.data = []

  def Parse(self, filepath):
    result = []
    with open(filepath, 'r') as stream:
      stream = code_generator_util.SkipLineComment(stream)
      stream = code_generator_util.ParseColumnStream(stream, num_column=4)
      for user_pos, _, ctype, feature in stream:
        conjugation_list = []
        if ctype == '*':
          conjugation_list.append(
              (None, None, self._pos_database.GetPosId(feature)))
        else:
          for form, value_suffix, key_suffix in self._inflection_map.Get(ctype):
            # repalce <cfrom> with actual cform
            pos_id = self._pos_database.GetPosId(
                feature.replace('<cform>', form))

            # Known error items.
            # 動詞,自立,*,*,五段動詞,体言接続特殊2,*
            # 形容詞,自立,*,*,形容詞・アウオ段,文語基本形,*
            if pos_id is not None:
              conjugation_list.append((value_suffix, key_suffix, pos_id))

        result.append((user_pos, conjugation_list))
    self.data = result

  def GetPosId(self, pos):
    """Returns id of the given pos."""
    for user_pos, conjugation_list in self.data:
      if user_pos == pos:
        return conjugation_list[0][2]