Codebase list mozc / d2e045e dictionary / gen_user_pos_data_main.cc
d2e045e

Tree @d2e045e (Download .tar.gz)

gen_user_pos_data_main.cc @d2e045eraw · history · blame

// Copyright 2010-2012, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "base/base.h"
#include "base/file_stream.h"
#include "base/util.h"

// Command line flags of the user POS data generator.
// Input: id.def, special POS file, user-pos.def, cforms.def
// Output: pos_data.h
// All flags default to the empty string.

// Path of the POS id definition file; passed to POSUtil::Open().
DEFINE_string(id_file, "", "");
// Path of the special POS list; passed to POSUtil::Open().
DEFINE_string(special_pos_file, "", "");
// Path of the user POS definition file; read line by line in Convert().
DEFINE_string(user_pos_file, "", "");
// Path of the conjugation form file; passed to LoadConjugation().
DEFINE_string(cforms_file, "", "");
// Path of the generated file. When empty, Convert() writes to stdout.
DEFINE_string(output, "", "");
// Defined elsewhere; declared here so that main() can force it to true.
DECLARE_bool(logtostderr);

namespace mozc {
namespace {

// Lookup table from POS feature strings to numeric POS ids.
//
// The table is filled by Open() from two text files and queried by id(),
// which does a prefix match over the entries in the order they were loaded.
class POSUtil {
 public:
  // Loads the id table, replacing any previously loaded content.
  //
  // |id_file| (e.g. data/dictionary/id.def) holds one entry per line; each
  // line is split with Util::SplitStringUsing() on "\t " and must yield at
  // least two fields: the numeric id followed by the feature string.
  // |special_pos_file| holds one feature per line (the whole line is the
  // feature); these entries get consecutive ids starting right after the
  // largest id found in |id_file|.
  // Empty lines and lines starting with '#' are skipped in both files.
  //
  // Dies (CHECK) when a file cannot be opened, when a line of |id_file| has
  // fewer than two fields, or when an id does not fit into uint16.
  void Open(const string &id_file, const string &special_pos_file) {
    // Largest value representable by the uint16 ids stored in |ids_|.
    const int kMaxId = 0xFFFF;

    ids_.clear();
    int max_id = 0;

    {
      InputFileStream ifs(id_file.c_str());
      CHECK(ifs);
      string line;
      vector<string> fields;
      while (getline(ifs, line)) {
        if (line.empty() || line[0] == '#') {
          continue;
        }
        fields.clear();
        Util::SplitStringUsing(line, "\t ", &fields);
        CHECK_GE(fields.size(), 2);
        const int id = atoi32(fields[0].c_str());
        // Refuse ids that would silently wrap around in the cast below.
        CHECK_GE(id, 0);
        CHECK_GE(kMaxId, id);
        max_id = max(max_id, id);
        ids_.push_back(make_pair(fields[1], static_cast<uint16>(id)));
      }
    }

    {
      // Special POS entries are numbered after the largest regular id.
      ++max_id;
      InputFileStream ifs(special_pos_file.c_str());
      CHECK(ifs);
      string line;
      while (getline(ifs, line)) {
        if (line.empty() || line[0] == '#') {
          continue;
        }
        CHECK_GE(kMaxId, max_id);
        ids_.push_back(make_pair(line, static_cast<uint16>(max_id)));
        ++max_id;
      }
    }
  }

  // Returns the id of the first loaded entry whose feature string starts
  // with |feature|. |feature| must not be empty (CHECK).
  // When no entry matches, logs an error and returns 0, so callers cannot
  // tell "not found" apart from an entry whose id is really 0.
  uint16 id(const string &feature) const {
    CHECK(!feature.empty());
    for (size_t i = 0; i < ids_.size(); ++i) {
      // Prefix test that only looks at the first feature.size() characters
      // instead of searching the whole entry for |feature|.
      if (ids_[i].first.compare(0, feature.size(), feature) == 0) {
        return ids_[i].second;
      }
    }
    LOG(ERROR) << "Cannot find the POS for: " << feature;
    return 0;
  }

 private:
  // (feature string, id) pairs in load order: |id_file| entries first,
  // then the special POS entries.
  vector<pair<string, uint16> > ids_;
};

// Convenience wrapper around Util::Escape(): returns the escaped form of
// |str| by value instead of through an output parameter.
string Escape(const string &str) {
  string escaped;
  Util::Escape(str, &escaped);
  return escaped;
}

// One conjugated form of a conjugation type, as read from a line of the
// conjugation form file by LoadConjugation().
struct ConjugationType {
  // Name of the form (second field of the line). Convert() substitutes it
  // for "<cform>" in a POS feature to look up the POS id.
  string form;
  // Third field of the line, or empty when the file has "*". Emitted by
  // Convert() as the first string of a generated table entry.
  string value_suffix;
  // Fourth field of the line, or empty when the file has "*". Emitted by
  // Convert() as the second string of a generated table entry.
  string key_suffix;
};

void LoadConjugation(const string &filename,
                     map<string, vector<ConjugationType> > *output) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs);

  string line;
  vector<string> fields;
  while (getline(ifs, line)) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 4);

    ConjugationType tmp;
    tmp.form = fields[1];
    tmp.value_suffix = fields[2] == "*" ? "" : fields[2];
    tmp.key_suffix   = fields[3] == "*" ? "" : fields[3];
    (*output)[fields[0]].push_back(tmp);   // insert
  }
}

void Convert() {
  POSUtil util;
  util.Open(FLAGS_id_file, FLAGS_special_pos_file);

  map<string, vector<ConjugationType> > inflection_map;
  LoadConjugation(FLAGS_cforms_file, &inflection_map);

  InputFileStream ifs(FLAGS_user_pos_file.c_str());
  ostream *ofs = &cout;
  if (!FLAGS_output.empty()) {
    ofs = new OutputFileStream(FLAGS_output.c_str());
  }
  CHECK(ifs);
  CHECK(*ofs);
  string line;
  vector<string> fields, pos_fields;
  vector<pair<string, size_t> > pos_tokens;

  while (getline(ifs, line)) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 3);
    const string &user_pos = fields[0];
    const string ctype = fields[1];
    const string &feature = fields[2];

    if (ctype == "*") {
      const uint16 id = util.id(fields[2]);
      CHECK_NE(id, 0);
      *ofs << "static const ConjugationType kConjugation" << pos_tokens.size()
           << "[] = {" << endl;
      *ofs << "  { NULL, NULL, " << id << "}" << endl;
      *ofs << "};" << endl;
      pos_tokens.push_back(make_pair(user_pos, static_cast<size_t>(1)));
    } else {
      vector<ConjugationType> &forms = inflection_map[ctype];
      CHECK(!forms.empty());
      *ofs << "const ConjugationType kConjugation"
           << pos_tokens.size() << "[] = {" << endl;
      bool is_first = true;;
      size_t added = 0;
      for (size_t i = 0; i < forms.size(); ++i) {
        // repalce <cfrom> with actual cform
        string output;
        Util::StringReplace(feature, "<cform>", forms[i].form, true, &output);
        const uint16 id = util.id(output);
        if (id == 0) {
          LOG(ERROR) << "Cannot find id for:" << output;
          continue;
        }
        if (!is_first) {
          *ofs << ",";
        }
        *ofs << "  { ";
        *ofs << "\"" << Escape(forms[i].value_suffix) << "\"" << ", ";
        *ofs << "\"" << Escape(forms[i].key_suffix)    << "\"" << ", ";
        *ofs << id << " }";
        is_first = false;
        ++added;
      }
      CHECK_GT(added, 0);
      *ofs << "};" << endl;
      pos_tokens.push_back(make_pair(user_pos, added));
    }
  }

  *ofs << "const POSToken kPOSToken[] = {" << endl;
  for (size_t i = 0; i < pos_tokens.size(); ++i) {
    *ofs << "  { \"" << Escape(pos_tokens[i].first) << "\", "
         << pos_tokens[i].second << ", kConjugation" << i << " }," << endl;
  }
  *ofs << "  { NULL, 0, NULL }" << endl;
  *ofs << "};" << endl;

  if (ofs != &cout) {
    delete ofs;
  }
}
}  // namespace
}  // namespace mozc

// Entry point of the generator.
//
// The input files are normally given through --id_file, --special_pos_file,
// --user_pos_file and --cforms_file. When none of these flags is set and
// more than four command line arguments remain after InitGoogle(), the
// first four positional arguments are used instead, in that order.
// The effective input paths are logged, then Convert() does the work.
int main(int argc, char **argv) {
  // Send log output to stderr.
  FLAGS_logtostderr = true;
  // NOTE(review): presumably parses the flags and updates argc/argv;
  // confirm against InitGoogle().
  InitGoogle(argv[0], &argc, &argv, false);

  const bool no_input_flag_given =
      FLAGS_id_file.empty() &&
      FLAGS_special_pos_file.empty() &&
      FLAGS_user_pos_file.empty() &&
      FLAGS_cforms_file.empty();
  const bool has_positional_inputs = argc > 4;

  if (no_input_flag_given && has_positional_inputs) {
    FLAGS_id_file = argv[1];
    FLAGS_special_pos_file = argv[2];
    FLAGS_user_pos_file = argv[3];
    FLAGS_cforms_file = argv[4];
  }

  LOG(INFO) << FLAGS_id_file;
  LOG(INFO) << FLAGS_special_pos_file;
  LOG(INFO) << FLAGS_user_pos_file;
  LOG(INFO) << FLAGS_cforms_file;

  mozc::Convert();

  return 0;
}