Codebase list mozc / upstream/latest rewriter / gen_usage_rewriter_dictionary_main.cc
upstream/latest

Tree @upstream/latest (Download .tar.gz)

gen_usage_rewriter_dictionary_main.cc @upstream/latestraw · history · blame

// Copyright 2010-2014, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Usage dictionary generator:
// % gen_usage_rewriter_dictionary_main
//    --usage_data_file=usage_data.txt
//    --cforms_file=cforms.def
//    --output=output_header

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "base/file_stream.h"
#include "base/logging.h"
#include "base/util.h"

// Command line flags.
// --usage_data_file: path handed to LoadUsage() by Convert().
DEFINE_string(usage_data_file, "", "usage data file");
// --cforms_file: path handed to LoadConjugation() by Convert().
DEFINE_string(cforms_file, "", "cforms file");
// --output: path of the generated header.  When left empty, Convert() writes
// the generated text to cout instead.
DEFINE_string(output, "", "output header file");

namespace mozc {
namespace {
// One row of the cforms file as parsed by LoadConjugation().
struct ConjugationType {
  // Name of the conjugated form (second column of the row).
  string form;
  // Suffix on the value side (third column); "*" in the file is stored as "".
  string value_suffix;
  // Suffix on the key side (fourth column); "*" in the file is stored as "".
  string key_suffix;
};

// One row of the usage data file as parsed by LoadUsage().
// In every string column, "*" in the file is stored as "".
struct UsageItem {
  // First column of the row.
  string key;
  // Second column of the row.
  string value;
  // Third column of the row: name of the conjugation type.
  string conjugation;
  // Number assigned by LoadUsage() to |conjugation|; rows sharing the same
  // conjugation name share the same id, and ids are handed out in first-seen
  // order.
  int conjugation_id;
  // Fourth column of the row, after the "\n" escape sequence (backslash + 'n')
  // has been passed through Util::StringReplace() to become a real newline.
  string meaning;
};

// Ordering predicate used when sorting usage entries: returns true when the
// key of |l| orders before the key of |r|.  Only the key takes part in the
// comparison.
bool UsageItemKeynameCmp(const UsageItem& l, const UsageItem& r) {
  return l.key.compare(r.key) < 0;
}

// Load cforms_file
void LoadConjugation(const string &filename,
                     map<string, vector<ConjugationType> > *output,
                     map<string, ConjugationType> *baseform_map) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs.good());

  string line;
  vector<string> fields;
  while (!getline(ifs, line).fail()) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 4)  << "format error: " << line;

    ConjugationType tmp;
    tmp.form = fields[1];
    tmp.value_suffix = ((fields[2] == "*") ? "" : fields[2]);
    tmp.key_suffix   = ((fields[3] == "*") ? "" : fields[3]);
    (*output)[fields[0]].push_back(tmp);   // insert

    if (tmp.form == "\xE5\x9F\xBA\xE6\x9C\xAC\xE5\xBD\xA2") {  // 基本形
      (*baseform_map)[fields[0]] = tmp;
    }
  }
}

// Load usage_data_file
void LoadUsage(const string &filename,
               vector<UsageItem> *usage_entries,
               vector<string> *conjugation_list) {
  InputFileStream ifs(filename.c_str());

  if (!ifs.good()) {
    LOG(WARNING) << "Can't open file:" << filename;
    return;
  }

  string line;
  vector<string> fields;
  map<string, int> conjugation_id_map;

  int conjugation_id = 0;
  while (!getline(ifs, line).fail()) {
    if (line.empty() || line[0] == '#') {
      // starting with '#' is a comment line.
      continue;
    }
    fields.clear();
    Util::SplitStringAllowEmpty(line, "\t", &fields);
    CHECK_GE(fields.size(), 4) << "format error: " << line;

    UsageItem item;
    item.key = ((fields[0] == "*") ? "" : fields[0]);
    item.value = ((fields[1] == "*") ? "" : fields[1]);
    item.conjugation = ((fields[2] == "*") ? "" : fields[2]);
    string tmp = ((fields[3] == "*") ? "" : fields[3]);
    Util::StringReplace(tmp, "\\n", "\n", true, &item.meaning);

    map<string, int>::iterator it = conjugation_id_map.find(item.conjugation);
    if (it == conjugation_id_map.end()) {
      conjugation_id_map.insert(
        pair<string, int>(item.conjugation, conjugation_id));
      item.conjugation_id = conjugation_id;
      conjugation_list->push_back(item.conjugation);
      ++conjugation_id;
    } else {
      item.conjugation_id = it->second;
    }
    usage_entries->push_back(item);
  }
}

// Removes the "基本形" (base form) conjugation suffix from every usage entry.
//
// For each entry in |usage_entries| whose conjugation type has an entry in
// |baseform_map|, the trailing key_suffix.length() bytes are cut from the key
// and the trailing value_suffix.length() bytes are cut from the value.  The
// cut is purely length based; the tail is not compared with the suffix.
// Entries whose conjugation type is not in |baseform_map| are left untouched.
//
// A warning is logged when the key (or value) is not strictly longer than the
// suffix.  If it has exactly the suffix length it becomes empty.  If it is
// shorter than the suffix it is left unchanged: subtracting the lengths would
// wrap around as an unsigned value and string::erase() would throw
// std::out_of_range.
void RemoveBaseformConjugationSuffix(
  const map<string, ConjugationType> &baseform_map,
  vector<UsageItem> *usage_entries) {
  for (vector<UsageItem>::iterator usage_itr = usage_entries->begin();
      usage_itr != usage_entries->end(); ++usage_itr) {
    const map<string, ConjugationType>::const_iterator baseform_itr =
      baseform_map.find(usage_itr->conjugation);
    if (baseform_itr == baseform_map.end()) {
      continue;
    }
    const ConjugationType &type = baseform_itr->second;

    const size_t key_length = usage_itr->key.length();
    const size_t key_suffix_length = type.key_suffix.length();
    const size_t value_length = usage_itr->value.length();
    const size_t value_suffix_length = type.value_suffix.length();

    if (key_length <= key_suffix_length) {
      LOG(WARNING) << "key:[" << usage_itr->key << "] is not longer then "
                   << "baseform.key_suffix  of \"" << usage_itr->conjugation
                   << "\" : [" << type.key_suffix << "]";
    }
    if (value_length <= value_suffix_length) {
      LOG(WARNING) << "value:[" << usage_itr->value << "] is not longer then "
                   << "baseform.value_suffix  of \"" << usage_itr->conjugation
                   << "\" : [" << type.value_suffix << "]";
    }

    // Only cut when the string is at least as long as the suffix; otherwise
    // the erase position would underflow.
    if (key_length >= key_suffix_length) {
      usage_itr->key.erase(key_length - key_suffix_length);
    }
    if (value_length >= value_suffix_length) {
      usage_itr->value.erase(value_length - value_suffix_length);
    }
  }
}

void Convert() {
  // Load cforms_file
  map<string, vector<ConjugationType> > inflection_map;
  map<string, ConjugationType> baseform_map;
  LoadConjugation(FLAGS_cforms_file, &inflection_map, &baseform_map);

  // Load usage_data_file
  vector<UsageItem> usage_entries;
  vector<string> conjugation_list;
  LoadUsage(FLAGS_usage_data_file, &usage_entries, &conjugation_list);

  ostream *ofs = &cout;
  if (!FLAGS_output.empty()) {
    ofs = new OutputFileStream(FLAGS_output.c_str());
  }

  *ofs << "// This header file is generated by "
       << "gen_usage_rewriter_dictionary_main."
       << endl;

  // Output kConjugationNum
  *ofs << "static const int kConjugationNum = " <<
          conjugation_list.size() << ";" << endl;

  // Output kBaseConjugationSuffix
  *ofs << "static const ConjugationSuffix kBaseConjugationSuffix[] = {" << endl;
  for (size_t i = 0; i < conjugation_list.size(); ++i) {
    string value_suffix, key_suffix;
    Util::Escape(baseform_map[conjugation_list[i]].value_suffix, &value_suffix);
    Util::Escape(baseform_map[conjugation_list[i]].key_suffix, &key_suffix);
      *ofs << "  {\"" << value_suffix << "\", \"" << key_suffix << "\"},  "
        << "// " << conjugation_list[i] << endl;
  }
  *ofs << "};" << endl;

  // Output kConjugationSuffixData
  vector<int> conjugation_index(conjugation_list.size() + 1);
  *ofs << "static const ConjugationSuffix kConjugationSuffixData[] = {" << endl;
  int out_count = 0;
  for (size_t i = 0; i < conjugation_list.size(); ++i) {
    vector<ConjugationType> conjugations = inflection_map[conjugation_list[i]];
    conjugation_index[i] = out_count;
    if (conjugations.size() == 0) {
      *ofs << "  // " << i << ": (" << out_count << "-" << out_count
           << "): no conjugations" << endl;
      *ofs << "  {\"\",\"\"}," << endl;
      ++out_count;
    } else {
      typedef pair<string, string> StrPair;
      set<StrPair> key_and_value_suffix_set;
      for (size_t j = 0; j < conjugations.size(); ++j) {
        StrPair key_and_value_suffix(conjugations[j].value_suffix,
                                    conjugations[j].key_suffix);
        key_and_value_suffix_set.insert(key_and_value_suffix);
      }
      *ofs << "  // " << i << ": (" << out_count << "-"
           << (out_count + key_and_value_suffix_set.size()-1)
           << "): " << conjugation_list[i] << endl << " ";
      set<StrPair>::iterator itr;
      for (itr = key_and_value_suffix_set.begin();
          itr != key_and_value_suffix_set.end(); ++itr) {
        string value_suffix, key_suffix;
        Util::Escape(itr->first, &value_suffix);
        Util::Escape(itr->second, &key_suffix);
        *ofs << " {\"" << value_suffix <<
                "\", \"" << key_suffix << "\"},";
        ++out_count;
      }
      *ofs << endl;
    }
  }
  *ofs << "};" << endl;
  conjugation_index[conjugation_list.size()] = out_count;

  // Output kConjugationSuffixDataIndex
  *ofs << "static const int kConjugationSuffixDataIndex[] = {";
  for (size_t i = 0; i < conjugation_index.size(); ++i) {
    if (i != 0) {
      *ofs << ", ";
    }
    *ofs << conjugation_index[i];
  }
  *ofs << "};" << endl;

  RemoveBaseformConjugationSuffix(baseform_map, &usage_entries);
  sort(usage_entries.begin(), usage_entries.end(), UsageItemKeynameCmp);

  // Output kUsageDataSize
  *ofs << "static const size_t kUsageDataSize = "
       << usage_entries.size() << ";" << endl;

  // Output kUsageData_value
  *ofs << "static const UsageDictItem kUsageData_value[] = {" << endl;
  int32 usage_id = 0;
  for (vector<UsageItem>::iterator i = usage_entries.begin();
      i != usage_entries.end(); i++) {
    string key, value, meaning;
    Util::Escape(i->key, &key);
    Util::Escape(i->value, &value);
    Util::Escape(i->meaning, &meaning);
    *ofs <<  "  {" << usage_id << ", \"" << key << "\", "
        << "\"" << value << "\", "
        << "" << i->conjugation_id << ", "
        << "\"" << meaning << "\"}, // "
        << i->value << "(" << i->key << ")" << endl;
    ++usage_id;
  }
  *ofs << "  { 0, NULL, NULL, 0, NULL }" << endl;
  *ofs << "};" << endl;

  if (ofs != &cout) {
    delete ofs;
  }
}

}  // namespace
}  // namespace mozc

// Program entry point: hands the command line to InitGoogle() (presumably
// where the --usage_data_file, --cforms_file and --output flags get parsed;
// confirm against its declaration), runs the generator and reports success.
int main(int argc, char **argv) {
  const char *program_name = argv[0];
  InitGoogle(program_name, &argc, &argv, true);

  mozc::Convert();

  return 0;
}