Codebase list mozc / debian/2.18.2595.102+dfsg-1 src / rewriter / language_aware_rewriter.cc
debian/2.18.2595.102+dfsg-1

Tree @debian/2.18.2595.102+dfsg-1 (Download .tar.gz)

language_aware_rewriter.cc @debian/2.18.2595.102+dfsg-1

4a370c6
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c9a3d3
f0b2fd3
 
 
dbe8005
9070e4f
1f0b782
f0b2fd3
 
b64a7de
 
 
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
913b2b2
f0b2fd3
 
 
 
913b2b2
f0b2fd3
 
913b2b2
f0b2fd3
 
 
 
913b2b2
f0b2fd3
913b2b2
2c9a3d3
 
 
 
 
 
 
 
f0b2fd3
 
 
 
 
 
 
913b2b2
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ed7a89
 
 
 
 
 
 
 
 
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2da3de7
 
 
 
 
 
 
f0b2fd3
 
2da3de7
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913b2b2
f0b2fd3
 
 
 
 
 
461f60e
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461f60e
913b2b2
f0b2fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461f60e
f0b2fd3
 
 
 
 
 
// Copyright 2010-2016, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/language_aware_rewriter.h"

#include <string>

#include "base/logging.h"
#include "base/util.h"
#include "composer/composer.h"
#include "config/config_handler.h"
#include "converter/segments.h"
#include "dictionary/dictionary_interface.h"
#include "dictionary/pos_matcher.h"
#include "protocol/commands.pb.h"
#include "protocol/config.pb.h"
#include "request/conversion_request.h"
#include "usage_stats/usage_stats.h"

using mozc::dictionary::DictionaryInterface;
using mozc::dictionary::POSMatcher;

namespace mozc {

LanguageAwareRewriter::LanguageAwareRewriter(
    const POSMatcher &pos_matcher,
    const DictionaryInterface *dictionary)
    : unknown_id_(pos_matcher.GetUnknownId()),
      dictionary_(dictionary) {}

LanguageAwareRewriter::~LanguageAwareRewriter() {}

namespace {

bool IsEnabled(const ConversionRequest &request) {
  // The current default value of language_aware_input is
  // NO_LANGUAGE_AWARE_INPUT and only unittests set LANGUAGE_AWARE_SUGGESTION
  // at this moment.  Thus, FillRawText is not performed in the productions
  // yet.
  if (request.request().language_aware_input() ==
      mozc::commands::Request::NO_LANGUAGE_AWARE_INPUT) {
    return false;
  } else if (request.request().language_aware_input() ==
             mozc::commands::Request::LANGUAGE_AWARE_SUGGESTION) {
    return true;
  }
  DCHECK_EQ(mozc::commands::Request::DEFAULT_LANGUAGE_AWARE_BEHAVIOR,
            request.request().language_aware_input());

  if (!request.config().use_spelling_correction()) {
    return false;
  }

#ifdef OS_ANDROID
  return false;
#else  // OS_ANDROID
  return true;
#endif  // OS_ANDROID
}

}  // namespace

int LanguageAwareRewriter::capability(
    const ConversionRequest &request) const {
  // Language aware input is performed only on suggestion or prediction.
  if (!IsEnabled(request)) {
    return RewriterInterface::NOT_AVAILABLE;
  }

  return (RewriterInterface::SUGGESTION | RewriterInterface::PREDICTION);
}

namespace {
bool IsRawQuery(const composer::Composer &composer,
                const DictionaryInterface *dictionary,
                int *rank) {
  string raw_text;
  composer.GetRawString(&raw_text);

  // Check if the length of text is less than or equal to three.
  // For example, "cat" is not treated as a raw query so far to avoid
  // false negative cases.
  if (raw_text.size() <= 3) {
    return false;
  }

  // If the composition string is same with the raw_text, there is no
  // need to add the candidate to suggestions.
  string composition;
  composer.GetStringForPreedit(&composition);
  if (composition == raw_text) {
    return false;
  }

  // If the composition string is the full width form of the raw_text,
  // there is no need to add the candidate to suggestions.
  string composition_in_half_width_ascii;
  Util::FullWidthAsciiToHalfWidthAscii(composition,
                                       &composition_in_half_width_ascii);
  if (composition_in_half_width_ascii == raw_text) {
    return false;
  }

  // If alphabet characters are in the middle of the composition, it is
  // probably a raw query.  For example, "えぁmpぇ" (example) contains
  // "m" and "p" in the middle.  So it is treated as a raw query.  On the
  // other hand, "くえry" (query) contains alphabet characters, but they
  // are at the end of the string, so it cannot be determined here.
  //
  // Note, GetQueryForPrediction omits the trailing alphabet characters of
  // the composition string and returns it.
  string key;
  composer.GetQueryForPrediction(&key);
  if (Util::ContainsScriptType(key, Util::ALPHABET)) {
    *rank = 0;
    return true;
  }

  // If the composition is storead as a key in the dictionary like
  // "はな" (hana), "たけ" (take), the query is not handled as a raw query.
  // It is a little conservative, but a safer way.
  if (dictionary->HasKey(key)) {
    return false;
  }

  // If the input text is stored in the dictionary, it is perhaps a raw query.
  // For example, the input characters of "れもヴぇ" (remove) is in the
  // dictionary, so it is treated as a raw text.
  if (dictionary->HasValue(raw_text)) {
    *rank = 2;
    return true;
  }

  return false;
}

// Get T13n candidate ids from existing candidates.
void GetAlphabetIds(const Segment &segment, uint16 *lid, uint16 *rid) {
  DCHECK(lid);
  DCHECK(rid);

  for (int i = 0; i < segment.candidates_size(); ++i) {
    const Segment::Candidate &candidate = segment.candidate(i);
    const Util::ScriptType type = Util::GetScriptType(candidate.value);
    if (type == Util::ALPHABET) {
      *lid = candidate.lid;
      *rid = candidate.rid;
      return;
    }
  }
}
}  // namespace

// Note: This function seemed slow, but the benchmark tests
// resulted that it was only less than 0.1% point penalty.
// = session_handler_benchmark_test
// BM_PerformanceForRandomKeyEvents: 891944807 -> 892740748 (1.00089)
// = converter_benchmark_test
// BM_DesktopAnthyCorpusConversion 25062440090 -> 25101542382 (1.002)
// BM_DesktopStationPredictionCorpusPrediction 8695341697 -> 8672187681 (0.997)
// BM_DesktopStationPredictionCorpusSuggestion 6149502840 -> 6152393270 (1.000)
bool LanguageAwareRewriter::FillRawText(
    const ConversionRequest &request, Segments *segments) const {
  if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
    return false;
  }

  int rank = 0;
  if (!IsRawQuery(request.composer(), dictionary_, &rank)) {
    return false;
  }

  Segment *segment = segments->mutable_conversion_segment(0);

  string raw_string;
  request.composer().GetRawString(&raw_string);

  uint16 lid = unknown_id_;
  uint16 rid = unknown_id_;
  GetAlphabetIds(*segment, &lid, &rid);

  // Create a candidate.
  if (rank > segment->candidates_size()) {
    rank = segment->candidates_size();
  }
  Segment::Candidate *candidate = segment->insert_candidate(rank);
  candidate->Init();
  candidate->value = raw_string;
  candidate->key = raw_string;
  candidate->content_value = raw_string;
  candidate->content_key = raw_string;
  candidate->lid = lid;
  candidate->rid = rid;

  candidate->attributes |= (Segment::Candidate::NO_VARIANTS_EXPANSION |
                            Segment::Candidate::NO_EXTRA_DESCRIPTION);
  candidate->prefix = "\xE2\x86\x92 ";  // "→ "
  candidate->description =
    // "もしかして"
    "\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6";

  // Set usage stats
  usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionTriggered");

  return true;
}

bool LanguageAwareRewriter::Rewrite(
    const ConversionRequest &request, Segments *segments) const {
  if (!IsEnabled(request)) {
    return false;
  }
  return FillRawText(request, segments);
}

namespace {
bool IsLanguageAwareInputCandidate(const composer::Composer &composer,
                                   const Segment::Candidate &candidate) {
  // Check candidate.prefix to filter if the candidate is probably generated
  // from LanguangeAwareInput or not.
  //
  // "→ "
  if (candidate.prefix != "\xE2\x86\x92 ") {
    return false;
  }

  string raw_string;
  composer.GetRawString(&raw_string);
  if (raw_string != candidate.value) {
    return false;
  }
  return true;
}
}  // namespace

void LanguageAwareRewriter::Finish(const ConversionRequest &request,
                                   Segments *segments) {
  if (!IsEnabled(request)) {
    return;
  }

  if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
    return;
  }

  // Update usage stats
  const Segment &segment = segments->conversion_segment(0);
  // Ignores segments which are not converted or not committed.
  if (segment.candidates_size() == 0 ||
      segment.segment_type() != Segment::FIXED_VALUE) {
    return;
  }

  if (IsLanguageAwareInputCandidate(request.composer(),
                                    segment.candidate(0))) {
    usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionCommitted");
  }
}

}  // namespace mozc