Codebase list mozc / lintian-fixes/main src / rewriter / symbol_rewriter.cc
lintian-fixes/main

Tree @lintian-fixes/main (Download .tar.gz)

symbol_rewriter.cc @lintian-fixes/main

a1dcada
0fdb7a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9455e2
 
0fdb7a7
3e6ea8c
1f0b782
0fdb7a7
 
f9455e2
247b13e
0fdb7a7
 
247b13e
0fdb7a7
 
cdcee81
dbe8005
9070e4f
1f0b782
247b13e
a1dcada
0fdb7a7
 
 
 
 
 
 
 
 
 
 
f9455e2
a1dcada
 
f9455e2
 
a40d5db
0fdb7a7
a1dcada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0fdb7a7
 
f9455e2
 
0fdb7a7
f9455e2
a1dcada
 
 
3e6ea8c
0fdb7a7
 
a1dcada
f9455e2
3e6ea8c
 
3314dee
3e6ea8c
f9455e2
a8fdb0c
0fdb7a7
 
 
f9455e2
a1dcada
8348279
 
a1fae21
0fdb7a7
 
 
 
 
 
f9455e2
 
0fdb7a7
 
 
 
a1dcada
e3417ae
d82f515
 
0fdb7a7
e3417ae
0fdb7a7
 
 
a8fdb0c
d82f515
 
0fdb7a7
 
 
 
 
f9455e2
 
3e6ea8c
 
f9455e2
 
3e6ea8c
f9455e2
 
 
 
 
3e6ea8c
 
 
f9455e2
 
3e6ea8c
f9455e2
 
3e6ea8c
b19ee59
a1dcada
 
f9455e2
 
0fdb7a7
f9455e2
3e6ea8c
a1dcada
 
0fdb7a7
 
 
 
 
 
 
 
 
 
 
 
 
 
3e6ea8c
0fdb7a7
a1dcada
 
a1fae21
 
 
 
e3417ae
a1fae21
 
 
 
 
a1dcada
a1fae21
a1dcada
a1fae21
 
 
 
 
 
0fdb7a7
 
 
 
 
3e6ea8c
f9455e2
 
a1fae21
3e6ea8c
a8fdb0c
 
 
 
3e6ea8c
 
a8fdb0c
 
a3981c7
 
a1fae21
 
0fdb7a7
 
a8fdb0c
0fdb7a7
 
8436464
 
e3417ae
8436464
a8fdb0c
0fdb7a7
 
a1dcada
 
a8fdb0c
 
f9455e2
 
 
3e6ea8c
a1dcada
f9455e2
3e6ea8c
f9455e2
a8fdb0c
3e6ea8c
f9455e2
 
0fdb7a7
 
 
 
a1fae21
 
3e6ea8c
a1fae21
 
a1dcada
a1fae21
 
 
3e6ea8c
 
 
 
a1fae21
a1dcada
3e6ea8c
a1fae21
 
 
 
 
 
a1dcada
 
a8fdb0c
0fdb7a7
a1dcada
3e6ea8c
 
0fdb7a7
 
 
 
 
 
a1dcada
0fdb7a7
 
a8fdb0c
0fdb7a7
 
a8fdb0c
0fdb7a7
 
8348279
a40d5db
a1dcada
0fdb7a7
 
 
 
3e6ea8c
 
0fdb7a7
 
 
 
664029b
0fdb7a7
 
 
 
 
 
 
 
 
 
a40d5db
0fdb7a7
a8fdb0c
a1dcada
 
a8fdb0c
0fdb7a7
 
 
 
 
cdcee81
 
a40d5db
 
a1dcada
3e6ea8c
 
a1dcada
 
a40d5db
0fdb7a7
 
 
cdcee81
 
3812565
 
8885b4a
 
 
a40d5db
 
913b2b2
0fdb7a7
 
 
 
 
 
 
8348279
a1dcada
0fdb7a7
8436464
0fdb7a7
// Copyright 2010-2020, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/symbol_rewriter.h"

#include <algorithm>
#include <cstring>
#include <set>
#include <string>
#include <vector>

#include "base/logging.h"
#include "base/singleton.h"
#include "base/util.h"
#include "config/config_handler.h"
#include "converter/converter_interface.h"
#include "converter/segments.h"
#include "data_manager/data_manager_interface.h"
#include "protocol/commands.pb.h"
#include "protocol/config.pb.h"
#include "request/conversion_request.h"
#include "rewriter/rewriter_interface.h"
#include "absl/strings/string_view.h"

// SymbolRewriter:
// When updating the rule
// 1. Export the spreadsheet into TEXT (TSV)
// 2. Copy the TSV to mozc/data/symbol/symbol.tsv
// 3. Run symbol_rewriter_dictionary_generator_main in this directory
// 4. Make sure symbol_rewriter_data.h is correct

namespace mozc {

namespace {
// Default candidate index from which symbol insertion is attempted.
constexpr size_t kDefaultOffset = 3;
// Insertion index used instead of the default when the key itself is a
// single symbol character (see SymbolRewriter::GetOffset).
constexpr size_t kOffsetForSymbolKey = 1;
// Upper bound on the number of symbols placed in the first (leading) part of
// the candidate list before the remainder may be moved to the end.
constexpr size_t kMaxInsertToMedium = 15;
}  // namespace

// Returns the candidate index at which symbol insertion should start for
// |key|. A one-character key of unknown script type is treated as a "symbol
// key"; for such keys under mixed conversion the smaller offset is returned.
size_t SymbolRewriter::GetOffset(const ConversionRequest &request,
                                 absl::string_view key) {
  bool key_is_single_symbol = false;
  if (Util::CharsLen(key) == 1) {
    key_is_single_symbol = Util::IsScriptType(key, Util::UNKNOWN_SCRIPT);
  }

  // Some software keyboard layouts have very limited space for candidates.
  // For symbol key input we want to surface as many symbol variants as
  // possible; otherwise the list might be filled with prediction results and
  // users would not be able to find the symbol candidates.
  const bool prefer_symbols =
      request.request().mixed_conversion() && key_is_single_symbol;
  return prefer_symbols ? kOffsetForSymbolKey : kDefaultOffset;
}

// Builds the description text shown for a symbol candidate.
// - Returns an empty string when |description| is empty.
// - Returns |description| alone when |additional_description| is empty.
// - Otherwise returns "description(additional_description)".
// |value| is currently not consulted.
// TODO(taku): allow us to define two descriptions in *.tsv file
// static function
const std::string SymbolRewriter::GetDescription(
    const std::string &value, absl::string_view description,
    absl::string_view additional_description) {
  std::string merged;
  if (description.empty()) {
    return merged;
  }
  merged.assign(description.data(), description.size());
  if (additional_description.empty()) {
    return merged;
  }
  // Append the additional description wrapped in parentheses.
  merged += '(';
  merged.append(additional_description.data(), additional_description.size());
  merged += ')';
  return merged;
}

// return true key has no-hiragana
// static function
bool SymbolRewriter::IsSymbol(const std::string &key) {
  for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) {
    const char32 ucs4 = iter.Get();
    if (ucs4 >= 0x3041 && ucs4 <= 0x309F) {  // hiragana
      return false;
    }
  }
  return true;
}

// Finds the first candidate whose value is a half-width space (U+0020) or a
// full-width space (U+3000) and inserts, right after it, a copy of that
// candidate carrying the other width. At most one candidate is inserted; if no
// space candidate exists the segment is left untouched.
// static function
void SymbolRewriter::ExpandSpace(Segment *segment) {
  // The full-width space is spelled with explicit UTF-8 byte escapes. A raw
  // U+3000 literal is visually indistinguishable from an ASCII space and is
  // easily collapsed into one by editors or text normalization, which would
  // make both branches below test the same value.
  static constexpr char kHalfWidthSpace[] = " ";
  static constexpr char kFullWidthSpace[] = "\xE3\x80\x80";  // U+3000

  for (size_t i = 0; i < segment->candidates_size(); ++i) {
    const std::string &value = segment->candidate(i).value;
    const char *counterpart = nullptr;
    if (value == kHalfWidthSpace) {
      counterpart = kFullWidthSpace;
    } else if (value == kFullWidthSpace) {
      counterpart = kHalfWidthSpace;
    } else {
      continue;
    }

    // Clone the matched candidate and switch the clone to the other width.
    Segment::Candidate *c = segment->insert_candidate(i + 1);
    *c = segment->candidate(i);
    c->value = counterpart;
    c->content_value = counterpart;
    // Boundary is invalidated and unnecessary for space.
    c->inner_segment_boundary.clear();
    return;
  }
}

// Returns true when the entry's value is non-empty and its character set, as
// reported by Util::GetCharacterSet, compares >= Util::JISX0212.
// TODO(toshiyuki): Should we move this under Util module?
bool SymbolRewriter::IsPlatformDependent(
    SerializedDictionary::const_iterator iter) {
  return !iter.value().empty() &&
         Util::GetCharacterSet(iter.value()) >= Util::JISX0212;
}

// Returns true if the two dictionary entries belong to the same symbol group,
// i.e. both have a non-empty description and the descriptions are identical.
// Examples:
//   "矢印記号" vs "矢印記号"                 -> same group
//   "ギリシャ(大文字)" vs "ギリシャ(小文字)" -> different groups
// static function
bool SymbolRewriter::InSameSymbolGroup(
    SerializedDictionary::const_iterator lhs,
    SerializedDictionary::const_iterator rhs) {
  const absl::string_view lhs_description = lhs.description();
  const absl::string_view rhs_description = rhs.description();
  if (lhs_description.empty() || rhs_description.empty()) {
    return false;
  }
  // Compare as string views (length + bytes). A strncmp() bounded by the
  // longer of the two lengths may read past the end of the shorter view and
  // depends on the underlying storage being NUL-terminated.
  return lhs_description == rhs_description;
}

// Inserts every dictionary entry in |range| into |segment| as a new candidate.
//
// |default_offset|    : preferred index at which insertion starts; it is
//                       clamped to the current number of candidates.
// |range|             : [first, second) iterator pair of symbol entries.
// |context_sensitive| : when true, each inserted candidate gets the
//                       CONTEXT_SENSITIVE attribute.
// |segment|           : segment to modify. If it has no candidates, a warning
//                       is logged and nothing is inserted.
//
// Before inserting, space candidates are expanded (ExpandSpace) and existing
// candidates that match an entry in |range| receive a description
// (AddDescForCurrentCandidates).
// static function
void SymbolRewriter::InsertCandidates(
    size_t default_offset, const SerializedDictionary::IterRange &range,
    bool context_sensitive, Segment *segment) {
  if (segment->candidates_size() == 0) {
    LOG(WARNING) << "candiadtes_size is 0";
    return;
  }

  // Workaround for space.
  // Space is not expanded in ExpandAlternative because it is not registered in
  // CharacterFormManager.
  // We do not want to make the form of spaces configurable, so we do not
  // register space to CharacterFormManager.
  ExpandSpace(segment);

  // If the original candidates given by ImmutableConverter already
  // include the target symbols, do assign description to these candidates.
  AddDescForCurrentCandidates(range, segment);

  // Key assigned to every inserted candidate: the segment key, or the key of
  // the first candidate when the segment key is empty.
  const std::string &candidate_key =
      ((!segment->key().empty()) ? segment->key() : segment->candidate(0).key);
  size_t offset = 0;

  // If the key is "かおもじ", set the insert position at the bottom,
  // giving priority to emoticons inserted by EmoticonRewriter.
  if (candidate_key == "かおもじ") {
    offset = segment->candidates_size();
  } else {
    // Find the position where we start to insert the symbols.
    // We want to skip the single-kanji we inserted by single-kanji rewriter.
    // We also skip transliterated key candidates.
    // Starting at the clamped default offset, advance past each consecutive
    // candidate that is a one-character KANJI value, or a HIRAGANA or KATAKANA
    // value, and stop at the first candidate that is none of these.
    offset = std::min(default_offset, segment->candidates_size());
    for (size_t i = offset; i < segment->candidates_size(); ++i) {
      const std::string &target_value = segment->candidate(i).value;
      if ((Util::CharsLen(target_value) == 1 &&
           Util::IsScriptType(target_value, Util::KANJI)) ||
          Util::IsScriptType(target_value, Util::HIRAGANA) ||
          Util::IsScriptType(target_value, Util::KATAKANA)) {
        ++offset;
      } else {
        break;
      }
    }
  }

  const size_t range_size = range.second - range.first;
  size_t inserted_count = 0;
  // Becomes true once the insertion point has been moved to the end of the
  // candidate list; the move happens at most once.
  bool finish_first_part = false;
  // Cost and structure cost of the inserted candidates are copied from the
  // first candidate of the segment.
  const Segment::Candidate &base_candidate = segment->candidate(0);
  for (auto iter = range.first; iter != range.second; ++iter) {
    Segment::Candidate *candidate = segment->insert_candidate(offset);
    DCHECK(candidate);

    candidate->Init();
    candidate->lid = iter.lid();
    candidate->rid = iter.rid();
    candidate->cost = base_candidate.cost;
    candidate->structure_cost = base_candidate.structure_cost;
    candidate->value.assign(iter.value().data(), iter.value().size());
    candidate->content_value.assign(iter.value().data(), iter.value().size());
    candidate->key = candidate_key;
    candidate->content_key = candidate_key;

    if (context_sensitive) {
      candidate->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
    }

    // The first two consist of two characters, one of which does not have an
    // alternative character.
    if (candidate->value == "“”" || candidate->value == "‘’" ||
        candidate->value == "w" || candidate->value == "www") {
      candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
    }

    candidate->description = GetDescription(
        candidate->value, iter.description(), iter.additional_description());
    // The next symbol goes right after the one just inserted.
    ++offset;
    ++inserted_count;

    // Insert the remaining symbols at the end of the candidate list once
    // kMaxInsertToMedium symbols have been placed in the first part.
    // If the number of remaining symbols is small (fewer than 5), keep
    // inserting at the current position instead.
    const auto next = iter + 1;
    if (next != range.second && !finish_first_part &&
        inserted_count >= kMaxInsertToMedium &&
        range_size - inserted_count >= 5 &&
        // Do not divide symbols which seem to be in the same group
        // providing that they are not platform dependent characters.
        (!InSameSymbolGroup(iter, next) || IsPlatformDependent(next))) {
      offset = segment->candidates_size();
      finish_first_part = true;
    }
  }
}

// For each candidate already in |segment|, looks for the first entry in
// |range| whose value equals the candidate's value, its full-width form, or
// its half-width form, and assigns that entry's description to the candidate.
// Candidates without a matching entry are left unchanged.
// static
void SymbolRewriter::AddDescForCurrentCandidates(
    const SerializedDictionary::IterRange &range, Segment *segment) {
  for (size_t index = 0; index < segment->candidates_size(); ++index) {
    Segment::Candidate *target = segment->mutable_candidate(index);

    // Width-converted variants of the candidate value.
    std::string as_full_width;
    std::string as_half_width;
    Util::HalfWidthToFullWidth(target->value, &as_full_width);
    Util::FullWidthToHalfWidth(target->value, &as_half_width);

    for (auto entry = range.first; entry != range.second; ++entry) {
      const bool matches = target->value == entry.value() ||
                           as_full_width == entry.value() ||
                           as_half_width == entry.value();
      if (!matches) {
        continue;
      }
      target->description = GetDescription(
          target->value, entry.description(), entry.additional_description());
      break;  // Only the first matching entry is used.
    }
  }
}

bool SymbolRewriter::RewriteEachCandidate(const ConversionRequest &request,
                                          Segments *segments) const {
  bool modified = false;
  for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
    const std::string &key = segments->conversion_segment(i).key();
    const SerializedDictionary::IterRange range = dictionary_->equal_range(key);
    if (range.first == range.second) {
      continue;
    }

    // if key is symbol, no need to see the context
    const bool context_sensitive = !IsSymbol(key);

    InsertCandidates(GetOffset(request, key), range, context_sensitive,
                     segments->mutable_conversion_segment(i));

    modified = true;
  }

  return modified;
}

bool SymbolRewriter::RewriteEntireCandidate(const ConversionRequest &request,
                                            Segments *segments) const {
  std::string key;
  for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
    key += segments->conversion_segment(i).key();
  }

  const SerializedDictionary::IterRange range = dictionary_->equal_range(key);
  if (range.first == range.second) {
    return false;
  }

  if (segments->conversion_segments_size() > 1) {
    if (segments->resized()) {
      // the given segments are resized by user
      // so don't modify anymore
      return false;
    }
    // need to resize
    const size_t all_length = Util::CharsLen(key);
    const size_t first_length =
        Util::CharsLen(segments->conversion_segment(0).key());
    const int diff = static_cast<int>(all_length - first_length);
    if (diff > 0) {
      parent_converter_->ResizeSegment(segments, request, 0, diff);
    }
  } else {
    InsertCandidates(GetOffset(request, key), range,
                     false,  // not context sensitive
                     segments->mutable_conversion_segment(0));
  }

  return true;
}

// Stores |parent_converter| and builds the symbol dictionary from the
// token/string array data obtained from |data_manager|.
SymbolRewriter::SymbolRewriter(const ConverterInterface *parent_converter,
                               const DataManagerInterface *data_manager)
    : parent_converter_(parent_converter) {
  DCHECK(parent_converter_);

  // Filled in by the data manager.
  absl::string_view token_array_data;
  absl::string_view string_array_data;
  data_manager->GetSymbolRewriterData(&token_array_data, &string_array_data);

  // The data check is wrapped in DCHECK, as in the rest of this constructor.
  DCHECK(SerializedDictionary::VerifyData(token_array_data, string_array_data));
  dictionary_.reset(
      new SerializedDictionary(token_array_data, string_array_data));
}

// Nothing to release explicitly; members clean up after themselves.
SymbolRewriter::~SymbolRewriter() = default;

// Reports when this rewriter applies: RewriterInterface::ALL for requests
// with mixed conversion enabled, RewriterInterface::CONVERSION otherwise.
int SymbolRewriter::capability(const ConversionRequest &request) const {
  return request.request().mixed_conversion() ? RewriterInterface::ALL
                                              : RewriterInterface::CONVERSION;
}

// Entry point of the rewriter. Does nothing and returns false when symbol
// conversion is disabled in the config. Otherwise returns true if either the
// whole-key rewrite or, failing that, the per-segment rewrite made a change.
bool SymbolRewriter::Rewrite(const ConversionRequest &request,
                             Segments *segments) const {
  if (request.config().use_symbol_conversion()) {
    // Apply the entire-candidate rewrite first, as we want to find character
    // combinations first, e.g., "->" -> "→". The per-segment rewrite is only
    // attempted when the entire-candidate rewrite reports no change.
    if (RewriteEntireCandidate(request, segments)) {
      return true;
    }
    return RewriteEachCandidate(request, segments);
  }

  VLOG(2) << "no use_symbol_conversion";
  return false;
}

}  // namespace mozc