// Copyright 2010-2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/emoji_rewriter.h"
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/util.h"
#include "config/config_handler.h"
#include "converter/segments.h"
#include "protocol/commands.pb.h"
#include "protocol/config.pb.h"
#include "request/conversion_request.h"
#include "usage_stats/usage_stats.h"
// EmojiRewriter:
// Converts HIRAGANA strings to emoji characters, if they are names of emojis.
namespace mozc {
using commands::Request;
namespace {
// Description label attached to every emoji candidate. IsEmojiCandidate()
// searches candidate descriptions for this text to recognize emoji
// candidates.
const char kEmoji[] = "絵文字";
// Reading that makes RewriteCandidates() offer every emoji in the data set
// instead of looking up a single token.
const char kEmojiKey[] = "えもじ";
// Default index at which emoji candidates are inserted for a matched reading.
// InsertToken() clamps it to the number of candidates already in the segment.
const size_t kDefaultInsertPos = 6;
// Inserts a new emoji candidate into |segment| at index |*insert_position|.
//
// |key| is copied into both key and content_key, and |value| into both value
// and content_value.  The candidate's description always starts with kEmoji;
// when |description| is non-empty it is appended through
// Util::AppendStringWithDelimiter() with a single space as the delimiter.
//
// Returns true on success and increments |*insert_position| so that the next
// call inserts right after this candidate.  Returns false, leaving
// |*insert_position| untouched, when the segment does not hand back a
// candidate.
bool InsertCandidate(StringPiece key,
                     StringPiece value,
                     StringPiece description,
                     int cost,
                     Segment *segment,
                     size_t *insert_position) {
  Segment::Candidate *candidate = segment->insert_candidate(*insert_position);
  if (candidate == nullptr) {
    // Log the index itself.  Streaming the pointer would print a memory
    // address, which is useless when diagnosing the failure.
    LOG(ERROR) << "cannot insert candidate at " << *insert_position
               << "th position nor tail of candidates.";
    return false;
  }
  ++*insert_position;

  candidate->Init();
  // Fill 0 (BOS/EOS) pos code intentionally.
  candidate->lid = 0;
  candidate->rid = 0;
  candidate->cost = cost;
  candidate->value.assign(value.data(), value.size());
  candidate->content_value.assign(value.data(), value.size());
  candidate->key.assign(key.data(), key.size());
  candidate->content_key.assign(key.data(), key.size());

  // The kEmoji label comes first so that the candidate can be recognized as
  // an emoji later from its description alone.
  candidate->description.assign(kEmoji);
  if (!description.empty()) {
    Util::AppendStringWithDelimiter(
        " ", description, &(candidate->description));
  }

  candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
  candidate->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
  return true;
}
// Appends |adding| to |descriptions| unless it is empty or an identical entry
// is already stored.  Only exact matches count as duplicates; a description
// that merely contains (or is contained in) an existing one is still added.
void AddDescription(StringPiece adding, std::vector<string> *descriptions) {
  DCHECK(descriptions);
  if (adding.empty()) {
    return;
  }
  for (const string &existing : *descriptions) {
    if (adding == existing) {
      // Already recorded; keep the list free of duplicates.
      return;
    }
  }
  descriptions->emplace_back(adding.data(), adding.size());
}
// Inserts the candidates for the single emoji entry referenced by |iter|.
//
// Up to two candidates are inserted, in this order:
//   1. The UTF-8 emoji string, when |available_carrier| contains
//      Request::UNICODE_EMOJI and the entry's emoji string is non-empty.
//   2. The entry's Android PUA code point encoded as UTF-8, when at least one
//      of the DOCOMO / SOFTBANK / KDDI bits is set and provides a non-empty
//      description.  The distinct carrier descriptions are joined with a
//      space and used as the candidate description.
//
// Both candidates use |key| and |cost|.  |*insert_position| is advanced by
// InsertCandidate() for every candidate that is actually inserted.
// Returns true if at least one candidate was inserted.
bool InsertEmojiData(StringPiece key,
                     EmojiRewriter::EmojiDataIterator iter,
                     const SerializedStringArray &string_array,
                     int cost,
                     int32 available_carrier,
                     Segment *segment,
                     size_t *insert_position) {
  bool any_inserted = false;

  // Candidate for the Unicode 6.0 emoji.
  StringPiece unicode_emoji = string_array[iter.emoji_index()];
  if ((available_carrier & Request::UNICODE_EMOJI) && !unicode_emoji.empty()) {
    if (InsertCandidate(key, unicode_emoji,
                        string_array[iter.description_utf8_index()],
                        cost, segment, insert_position)) {
      any_inserted = true;
    }
  }

  // Gather the descriptions of every requested carrier, without duplicates.
  std::vector<string> carrier_descriptions;
  if (available_carrier & Request::DOCOMO_EMOJI) {
    AddDescription(string_array[iter.description_docomo_index()],
                   &carrier_descriptions);
  }
  if (available_carrier & Request::SOFTBANK_EMOJI) {
    AddDescription(string_array[iter.description_softbank_index()],
                   &carrier_descriptions);
  }
  if (available_carrier & Request::KDDI_EMOJI) {
    AddDescription(string_array[iter.description_kddi_index()],
                   &carrier_descriptions);
  }
  if (carrier_descriptions.empty()) {
    // No carrier emoji was requested, or none has a description.
    return any_inserted;
  }

  // Encode the PUA code point to utf8 and fill it to candidate.
  string pua_utf8;
  string joined_description;
  Util::UCS4ToUTF8Append(iter.android_pua(), &pua_utf8);
  Util::JoinStrings(carrier_descriptions, " ", &joined_description);
  if (InsertCandidate(key, pua_utf8, joined_description.c_str(), cost,
                      segment, insert_position)) {
    any_inserted = true;
  }
  return any_inserted;
}
// Returns the cost to give to inserted emoji candidates: the cost of the
// segment's first candidate, or 0 when the segment has no candidates.
int GetEmojiCost(const Segment &segment) {
  if (segment.candidates_size() == 0) {
    return 0;
  }
  return segment.candidate(0).cost;
}
// Inserts candidates for every emoji entry in [begin, end) after the
// candidates that |segment| already holds.  All inserted candidates use |key|
// and the cost returned by GetEmojiCost() for the segment as it was on entry.
// Returns true if at least one candidate was inserted.
bool InsertAllEmojiData(StringPiece key,
                        EmojiRewriter::EmojiDataIterator begin,
                        EmojiRewriter::EmojiDataIterator end,
                        const SerializedStringArray &string_array,
                        int32 available_carrier,
                        Segment *segment) {
  bool any_inserted = false;
  // Start at the tail of the segment; InsertEmojiData() moves the position
  // forward as candidates are added.
  size_t tail_position = segment->candidates_size();
  int emoji_cost = GetEmojiCost(*segment);
  while (begin != end) {
    if (InsertEmojiData(key, begin, string_array, emoji_cost,
                        available_carrier, segment, &tail_position)) {
      any_inserted = true;
    }
    ++begin;
  }
  return any_inserted;
}
// Inserts candidates for the emoji entries in |range| (first inclusive,
// second exclusive).  Insertion starts at kDefaultInsertPos, or at the end of
// the segment when it holds fewer candidates than that.  All inserted
// candidates use |key| and the cost returned by GetEmojiCost() for the
// segment as it was on entry.
// Returns true if at least one candidate was inserted.
bool InsertToken(StringPiece key,
                 EmojiRewriter::IteratorRange range,
                 const SerializedStringArray &string_array,
                 int32 available_carrier,
                 Segment *segment) {
  bool any_inserted = false;
  size_t position = std::min(segment->candidates_size(), kDefaultInsertPos);
  int emoji_cost = GetEmojiCost(*segment);
  while (range.first != range.second) {
    if (InsertEmojiData(key, range.first, string_array, emoji_cost,
                        available_carrier, segment, &position)) {
      any_inserted = true;
    }
    ++range.first;
  }
  return any_inserted;
}
} // namespace
// Constructs the rewriter from the emoji data exposed by |data_manager|.
// GetEmojiRewriterData() receives token_array_data_ directly as an output
// argument and also hands back the string array data, which is then installed
// into string_array_.
EmojiRewriter::EmojiRewriter(const DataManagerInterface &data_manager) {
StringPiece string_array_data;
data_manager.GetEmojiRewriterData(&token_array_data_, &string_array_data);
// NOTE(review): the verification lives inside DCHECK only, so it presumably
// does not run in builds where DCHECK is compiled out -- confirm.
DCHECK(SerializedStringArray::VerifyData(string_array_data));
string_array_.Set(string_array_data);
}
// Defaulted destructor: no cleanup code is written here.
EmojiRewriter::~EmojiRewriter() = default;
// Returns the rewriter capability requested by the client.
//
// The capability of the EmojiRewriter is decided by the client's request.
// The bit representations of RewriterInterface::CapabilityType and
// Request::RewriterCapability are expected to be exactly the same, so the
// requested value is returned as is.
int EmojiRewriter::capability(const ConversionRequest &request) const {
  const auto &client_request = request.request();
  return client_request.emoji_rewriter_capability();
}
bool EmojiRewriter::Rewrite(const ConversionRequest &request,
Segments *segments) const {
if (!request.config().use_emoji_conversion()) {
VLOG(2) << "no use_emoji_conversion";
return false;
}
int32 available_emoji_carrier = request.request().available_emoji_carrier();
if (available_emoji_carrier == 0) {
VLOG(2) << "No available emoji carrier.";
return false;
}
CHECK(segments != NULL);
return RewriteCandidates(available_emoji_carrier, segments);
}
// Updates usage stats: increments the "CommitEmoji" counter once for every
// conversion segment whose committed candidate is an emoji.  Does nothing
// when emoji conversion is turned off in the config.
void EmojiRewriter::Finish(const ConversionRequest &request,
                           Segments *segments) {
  if (!request.config().use_emoji_conversion()) {
    return;
  }

  for (size_t index = 0; index < segments->conversion_segments_size();
       ++index) {
    const Segment &segment = segments->conversion_segment(index);
    // Only segments that hold candidates and have been committed
    // (FIXED_VALUE) are counted.
    const bool committed = segment.candidates_size() != 0 &&
                           segment.segment_type() == Segment::FIXED_VALUE;
    // The Mozc converter moves the committed candidate to index 0, so that
    // is the one to inspect.
    if (committed && IsEmojiCandidate(segment.candidate(0))) {
      usage_stats::UsageStats::IncrementCount("CommitEmoji");
    }
  }
}
// Returns true if |candidate|'s description contains the kEmoji label.  The
// label is searched for anywhere in the description, not only at its start.
bool EmojiRewriter::IsEmojiCandidate(const Segment::Candidate &candidate) {
  const string::size_type label_pos = candidate.description.find(kEmoji);
  return label_pos != string::npos;
}
// Returns the range of emoji tokens registered for |key|, or the empty range
// (end(), end()) when |key| is not present in the string array.
//
// The lookup has two steps: |key| is located in string_array_ with
// std::lower_bound, then the tokens carrying that string's index are
// selected with std::equal_range.  Both searches presumably rely on the
// underlying data being sorted, as those algorithms require.
std::pair<EmojiRewriter::EmojiDataIterator, EmojiRewriter::EmojiDataIterator>
EmojiRewriter::LookUpToken(StringPiece key) const {
  auto key_pos =
      std::lower_bound(string_array_.begin(), string_array_.end(), key);
  // lower_bound only yields the first element not less than |key|, so an
  // exact match has to be confirmed explicitly.
  const bool key_missing =
      (key_pos == string_array_.end()) || (*key_pos != key);
  if (key_missing) {
    return std::pair<EmojiDataIterator, EmojiDataIterator>(end(), end());
  }

  // Pick the tokens that refer to the index of the matched string.
  return std::equal_range(begin(), end(), key_pos.index());
}
// Adds emoji candidates to every conversion segment whose reading names an
// emoji.
//
// For each segment the key is passed through
// Util::FullWidthAsciiToHalfWidthAscii() to obtain the reading:
//   - an empty reading is skipped;
//   - the reading kEmojiKey expands to all emoji, inserted after the
//     existing candidates;
//   - any other reading is looked up with LookUpToken() and, if found, its
//     emoji are inserted through InsertToken().
// Returns true if at least one candidate was inserted into any segment.
bool EmojiRewriter::RewriteCandidates(
    int32 available_emoji_carrier, Segments *segments) const {
  bool modified = false;
  // Declared outside the loop and handed to the conversion helper as its
  // output buffer for every segment.
  string reading;
  for (size_t index = 0; index < segments->conversion_segments_size();
       ++index) {
    Segment *target = segments->mutable_conversion_segment(index);
    Util::FullWidthAsciiToHalfWidthAscii(target->key(), &reading);
    if (reading.empty()) {
      continue;
    }

    if (reading == kEmojiKey) {
      // When key is "えもじ", we expect to expand all Emoji characters.
      if (InsertAllEmojiData(reading, begin(), end(), string_array_,
                             available_emoji_carrier, target)) {
        modified = true;
      }
    } else {
      const auto range = LookUpToken(reading);
      if (range.first == range.second) {
        VLOG(2) << "Token not found: " << reading;
      } else if (InsertToken(reading, range, string_array_,
                             available_emoji_carrier, target)) {
        modified = true;
      }
    }
  }
  return modified;
}
} // namespace mozc