Codebase list mozc / 12cc99c rewriter / emoticon_rewriter.cc
12cc99c

Tree @12cc99c (Download .tar.gz)

emoticon_rewriter.cc @12cc99craw · history · blame

// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/emoticon_rewriter.h"

#include <algorithm>
#include <cstring>
#include <string>
#include <vector>
#include "base/base.h"
#include "base/singleton.h"
#include "base/util.h"
#include "config/config_handler.h"
#include "config/config.pb.h"
#include "converter/segments.h"
#include "rewriter/rewriter_interface.h"
#include "rewriter/embedded_dictionary.h"
#include "session/commands.pb.h"

namespace mozc {
namespace {

#include "rewriter/emoticon_rewriter_data.h"

class EmoticonDictionary {
 public:
  EmoticonDictionary()
      : dic_(new EmbeddedDictionary(kEmoticonData_token_data,
                                    kEmoticonData_token_size)) {}

  ~EmoticonDictionary() {}

  EmbeddedDictionary *GetDictionary() const {
    return dic_.get();
  }

 private:
  scoped_ptr<EmbeddedDictionary> dic_;
};

class ValueCostCompare {
 public:
  bool operator() (const EmbeddedDictionary::Value *a,
                   const EmbeddedDictionary::Value *b) const {
    return a->cost < b->cost;
  }
};

class IsEqualValue {
 public:
  bool operator() (const EmbeddedDictionary::Value *a,
                   const EmbeddedDictionary::Value *b) const {
    return strcmp(a->value, b->value) == 0;
  }
};

// Insert Emoticon into the |segment|
// Top |initial_insert_size| candidates are inserted from |initial_insert_pos|.
// Remained candidates are added to the buttom.
void InsertCandidates(const EmbeddedDictionary::Value *value,
                      size_t value_size,
                      size_t initial_insert_pos,
                      size_t initial_insert_size,
                      bool is_no_learning,
                      Segment *segment) {
  if (segment->candidates_size() == 0) {
    LOG(WARNING) << "candiadtes_size is 0";
    return;
  }

  const Segment::Candidate &base_candidate = segment->candidate(0);
  size_t offset = min(initial_insert_pos, segment->candidates_size());

  // Sort values by cost just in case
  vector<const EmbeddedDictionary::Value *> sorted_value;
  for (size_t i = 0; i < value_size; ++i) {
    sorted_value.push_back(&value[i]);
  }

  sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare());

  // after sorting the valeus by |cost|, adjacent candidates
  // will have the same value. It is almost OK to use std::unique to
  // remove dup entries, it is not a perfect way though.
  sorted_value.erase(unique(sorted_value.begin(),
                            sorted_value.end(),
                            IsEqualValue()),
                     sorted_value.end());

  for (size_t i = 0; i < sorted_value.size(); ++i) {
    Segment::Candidate *c = NULL;

    if (i < initial_insert_size) {
      c = segment->insert_candidate(offset);
      ++offset;
    } else {
      c = segment->push_back_candidate();
    }

    if (c == NULL) {
      LOG(ERROR) << "cannot insert candidate at " << offset;
      continue;
    }

    c->Init();
    // TODO(taku): set an appropriate POS here.
    c->lid = sorted_value[i]->lid;
    c->rid = sorted_value[i]->rid;
    c->cost = base_candidate.cost;
    c->value = sorted_value[i]->value;
    c->content_value = sorted_value[i]->value;
    c->key = base_candidate.key;
    c->content_key = base_candidate.content_key;
    // no full/half width normalizations
    c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
    c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
    if (is_no_learning) {
      c->attributes |= Segment::Candidate::NO_LEARNING;
    }

    //  "顔文字";
    const char kBaseEmoticonDescription[]
        = "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97";

    if (sorted_value[i]->description == NULL) {
      c->description = kBaseEmoticonDescription;
    } else {
      string description = kBaseEmoticonDescription;
      description.append(" ");
      description.append(sorted_value[i]->description);
      c->description = description;
    }
  }
}

bool RewriteCandidate(Segments *segments) {
  bool modified = false;
  for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
    const string &key = segments->conversion_segment(i).key();
    if (key.empty()) {
      LOG(ERROR) << "Key is empty";
      continue;
    }
    bool is_no_learning = false;
    const EmbeddedDictionary::Value *value = NULL;
    size_t value_size = 0;
    size_t initial_insert_size = 0;
    size_t initial_insert_pos = 0;

    // TODO(taku): Emoticon dictionary does not always include "facemark".
    // Displaying non-facemarks with "かおもじ" is not always correct.
    // We have to distinguish pure facemarks and other symbol marks.

    // "かおもじ"
    if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") {
      // When key is "かおもじ", default candidate size should be small enough.
      // It is safe to expand all candidates at this time.
      const EmbeddedDictionary::Token *token
          = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
      CHECK(token);
      // set large value(100) so that all candidates are pushed to the bottom
      value = token->value;
      value_size = token->value_size;
      initial_insert_pos = 100;
      initial_insert_size = token->value_size;
      // "かお"
    } else if (key == "\xE3\x81\x8B\xE3\x81\x8A") {
      // When key is "かお", expand all candidates in conservative way.
      const EmbeddedDictionary::Token *token
          = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
      CHECK(token);
      // first 6 candidates are inserted at 4 th position.
      // Other candidates are pushed to the buttom.
      value = token->value;
      value_size = token->value_size;
      initial_insert_pos = 4;
      initial_insert_size = 6;
    } else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F"
               "\xE3\x82\x89\xE3\x81\x84") {   // "ふくわらい"
      // Choose one emoticon randomly from the dictionary.
      // TODO(taku): want to make it "generate" more funny emoticon.
      const EmbeddedDictionary::Token *token
          = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
      CHECK(token);
      uint32 n = 0;
      // use secure random not to predict the next emoticon.
      if (Util::GetSecureRandomSequence(reinterpret_cast<char *>(&n),
                                        sizeof(n))) {
        value = token->value + n % token->value_size;
        value_size = 1;
        initial_insert_pos = 4;
        initial_insert_size = 1;
        is_no_learning = true;   // do not learn this candidate.
      }
    } else {
      const EmbeddedDictionary::Token *token
          = Singleton<EmoticonDictionary>::get()->GetDictionary()->Lookup(key);
      // by default, insert canidate at 7 th position.
      if (token != NULL) {
        value = token->value;
        value_size = token->value_size;
        initial_insert_pos = 6;
        initial_insert_size = token == NULL ? 0 : token->value_size;
      }
    }

    if (value == NULL || value_size == 0) {
      continue;
    }

    InsertCandidates(value, value_size,
                     initial_insert_pos,
                     initial_insert_size,
                     is_no_learning,
                     segments->mutable_conversion_segment(i));
    modified = true;
  }

  return modified;
}
}  // namespace

EmoticonRewriter::EmoticonRewriter() {}

EmoticonRewriter::~EmoticonRewriter() {}

int EmoticonRewriter::capability() const {
  return RewriterInterface::CONVERSION;
}

bool EmoticonRewriter::Rewrite(Segments *segments) const {
  if (!GET_CONFIG(use_emoticon_conversion)) {
    VLOG(2) << "no use_emoticon_conversion";
    return false;
  }
  return RewriteCandidate(segments);
}
}  // namespace mozc