Codebase list mozc / upstream/1.3.931.102 rewriter / gen_emoticon_rewriter_dictionary_main.cc
upstream/1.3.931.102

Tree @upstream/1.3.931.102 (Download .tar.gz)

gen_emoticon_rewriter_dictionary_main.cc @upstream/1.3.931.102raw · history · blame

// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Emoticon dictionary generator:
// % gen_emoticon_rewriter_dictionary_main
//    --input=input.tsv --output=output_header

#include <algorithm>
#include <cmath>
#include <map>
#include <vector>
#include <string>
#include "base/base.h"
#include "base/file_stream.h"
#include "base/util.h"
#include "rewriter/dictionary_generator.h"
#include "rewriter/embedded_dictionary.h"

DEFINE_string(input, "", "emoticon dictionary file");
DEFINE_string(output, "", "output header file");

namespace mozc {
namespace {

// Generates a description string for an emoticon from its readings.
//
// |keys| holds the readings of one emoticon and |key_count| maps a reading
// to its frequency.  Readings of |keys| that are missing from |key_count|
// are skipped.  The description consists of
// 1) the most general reading (highest frequency) and
// 2) the most specific reading (lowest frequency),
// joined by a single space.  When only one reading is usable, that reading
// alone is returned.  Ties in frequency are broken by the lexicographic
// order of the readings, because pair<int, string> compares the count first
// and the reading second.
//
// Dies via CHECK if none of |keys| is found in |key_count| (which includes
// the case where |keys| is empty).
string GetDescription(const vector<string> &keys,
                      const map<string, int> &key_count) {
  // (frequency, reading) for every reading that has a known frequency.
  vector<pair<int, string> > freq;
  freq.reserve(keys.size());
  for (size_t i = 0; i < keys.size(); ++i) {
    const map<string, int>::const_iterator it = key_count.find(keys[i]);
    if (it != key_count.end()) {
      freq.push_back(make_pair(it->second, keys[i]));
    }
  }
  CHECK(!freq.empty());

  // Only the two extremes are needed, so a full sort is unnecessary.  The
  // largest/smallest pair is exactly what the back/front of the sorted
  // sequence would be.
  const vector<pair<int, string> >::const_iterator most_general =
      max_element(freq.begin(), freq.end());
  if (freq.size() < 2) {
    return most_general->second;
  }

  const vector<pair<int, string> >::const_iterator most_specific =
      min_element(freq.begin(), freq.end());
  return most_general->second + " " + most_specific->second;
}

// Converts the emoticon TSV file into a text dictionary file.
//
// Input (|emoticon_dictionary_file|): the first line is read and discarded
// (presumably a header row).  Every following line must contain at least
// two tab-separated columns:
//   value <tab> readings (delimited by ASCII or full-width spaces)
// A third column is accepted and ignored.  Lines with more than three
// columns are still processed but produce a warning.
//
// Output (|output_file|): one line per (reading, emoticon) pair:
//   reading <tab> 0 <tab> 0 <tab> cost <tab> value [<tab> description]
// The cost starts at 10 and grows by 10 for each emoticon in input order.
// The description is built by GetDescription() from the emoticon's readings.
//
// Dies via CHECK if either file cannot be opened, if the input has no first
// line, if a line has fewer than two columns, or if a line has no readings.
void ConvertEmoticonTsvToTextDictionary(
    const string &emoticon_dictionary_file,
    const string &output_file) {
  // (value, readings) of every emoticon, in input order.
  vector<pair<string, vector<string> > > emoticon_data;
  // Number of times each reading occurs over the whole input.
  map<string, int> key_count;

  // Load data
  {
    InputFileStream ifs(emoticon_dictionary_file.c_str());
    CHECK(ifs);

    string line;
    CHECK(getline(ifs, line));  // consume and drop the first line

    while (getline(ifs, line)) {
      // Format:
      // value <tab> readings(space delimited)
      vector<string> fields;
      Util::SplitStringAllowEmpty(line, "\t", &fields);
      CHECK_GE(fields.size(), 2) << "format error: " << line;
      if (fields.size() > 3) {
        LOG(WARNING) << "ignore extra columns: " << line;
      }

      string keys_str;
      // \xE3\x80\x80 is full width space
      Util::StringReplace(fields[1], "\xE3\x80\x80", " ", true, &keys_str);
      vector<string> keys;
      Util::SplitStringUsing(keys_str, " ", &keys);
      // An entry without readings would otherwise abort later inside
      // GetDescription() without telling which line is at fault.
      CHECK(!keys.empty()) << "no readings: " << line;

      const string &value = fields[0];
      emoticon_data.push_back(make_pair(value, keys));
      for (size_t j = 0; j < keys.size(); ++j) {
        key_count[keys[j]]++;
      }
    }
  }

  // Output dictionary
  {
    OutputFileStream ofs(output_file.c_str());
    CHECK(ofs);
    int cost = 10;
    for (size_t i = 0; i < emoticon_data.size(); ++i) {
      const string &value = emoticon_data[i].first;
      const vector<string> &keys = emoticon_data[i].second;
      const string description = GetDescription(keys, key_count);
      for (size_t j = 0; j < keys.size(); ++j) {
        ofs << keys[j] << "\t0\t0\t" << cost << "\t" << value;
        if (!description.empty()) {
          ofs << '\t' << description;
        }
        // '\n' rather than endl: no need to flush after every line, the
        // stream is flushed when it is closed at the end of this scope.
        ofs << '\n';
      }
      cost += 10;
    }
  }
}
}  // namespace
}  // mozc

int main(int argc, char **argv) {
  InitGoogle(argv[0], &argc, &argv, true);

  const string tmp_text_file = FLAGS_output + ".txt";
  const char kHeaderName[] = "EmoticonData";

  mozc::ConvertEmoticonTsvToTextDictionary(FLAGS_input,
                                           tmp_text_file);

  mozc::EmbeddedDictionary::Compile(kHeaderName,
                                    tmp_text_file,
                                    FLAGS_output);

  mozc::Util::Unlink(tmp_text_file);

  return 0;
}