// Copyright 2010, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/number_rewriter.h"
#include <stdio.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "converter/segments.h"
#include "converter/pos_matcher.h"
#include "session/config_handler.h"
#include "session/config.pb.h"
namespace mozc {
namespace {
// Kanji numerals for the digits 0-9, indexed by digit value. The array ends
// with a NULL entry.
const char* const kNumKanjiDigits[] = {
"\xe3\x80\x87", "\xe4\xb8\x80", "\xe4\xba\x8c", "\xe4\xb8\x89",
"\xe5\x9b\x9b", "\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83",
"\xe5\x85\xab", "\xe4\xb9\x9d", NULL
// "〇", "一", "二", "三", "四", "五", "六", "七", "八", "九", NULL
};
// Full-width Arabic digits 0-9, indexed by digit value. The array ends with a
// NULL entry.
const char* const kNumWideDigits[] = {
"\xef\xbc\x90", "\xef\xbc\x91", "\xef\xbc\x92", "\xef\xbc\x93",
"\xef\xbc\x94", "\xef\xbc\x95", "\xef\xbc\x96", "\xef\xbc\x97",
"\xef\xbc\x98", "\xef\xbc\x99", NULL
// "０", "１", "２", "３", "４", "５", "６", "７", "８", "９", NULL
};
// Half-width (ASCII) digits 0-9, indexed by digit value. The array ends with a
// NULL entry.
const char* const kNumHalfDigits[] = {
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", NULL
};
// Old-style kanji digits, indexed by digit value. Only 1-3 differ from
// kNumKanjiDigits. Index 0 is NULL (this table has no zero) and, unlike the
// tables above, there is no trailing NULL entry.
const char* const kNumKanjiOldDigits[] = {
NULL, "\xe5\xa3\xb1", "\xe5\xbc\x90", "\xe5\x8f\x82", "\xe5\x9b\x9b",
"\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83", "\xe5\x85\xab",
"\xe4\xb9\x9d"
// NULL, "壱", "弐", "参", "四", "五", "六", "七", "八", "九"
};
// Rank characters inside one 4-digit group, indexed by the 1-based position
// of the digit counted from the right of the group (1 = ones, which has no
// rank character; 2 = tens; 3 = hundreds; 4 = thousands). Index 0 is NULL.
const char* const kNumKanjiRanks[] = {
NULL, "", "\xe5\x8d\x81", "\xe7\x99\xbe", "\xe5\x8d\x83"
// NULL, "", "十", "百", "千"
};
// Rank characters appended after a whole 4-digit group, indexed by the
// 0-based group number counted from the right (group 0 has no rank
// character).
const char* const kNumKanjiBiggerRanks[] = {
"", "\xe4\xb8\x87", "\xe5\x84\x84", "\xe5\x85\x86", "\xe4\xba\xac"
// "", "万", "億", "兆", "京"
};
// NULL-terminated list of the digit tables that ArabicToKanji iterates over:
// the regular kanji digits and the old-style kanji digits.
const char* const* const kKanjiDigitsVariations[] = {
kNumKanjiDigits, kNumKanjiOldDigits, NULL
};
// NULL-terminated list of the digit tables that ArabicToWideArabic uses for
// digit-by-digit conversion: kanji digits and full-width Arabic digits.
const char* const* const kSingleDigitsVariations[] = {
kNumKanjiDigits, kNumWideDigits, NULL
};
// NULL-terminated list of the digit tables that ArabicToSeparatedArabic uses:
// half-width and full-width Arabic digits.
const char* const* const kNumDigitsVariations[] = {
kNumHalfDigits, kNumWideDigits, NULL
};
// Capital Roman numeral characters for 1-12, indexed by value. Index 0 and
// the last entry are NULL, so a lookup for a value without a dedicated
// character yields NULL.
const char* const kRomanNumbersCapital[] = {
  NULL, "\xe2\x85\xa0", "\xe2\x85\xa1", "\xe2\x85\xa2", "\xe2\x85\xa3",
  "\xe2\x85\xa4", "\xe2\x85\xa5", "\xe2\x85\xa6", "\xe2\x85\xa7",
  "\xe2\x85\xa8", "\xe2\x85\xa9", "\xe2\x85\xaa", "\xe2\x85\xab", NULL
  // NULL, "Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ", "Ⅹ", "Ⅺ", "Ⅻ", NULL
};
// Small Roman numeral characters for 1-12, indexed by value. Index 0 and the
// last entry are NULL.
const char* const kRomanNumbersSmall[] = {
  NULL, "\xe2\x85\xb0", "\xe2\x85\xb1", "\xe2\x85\xb2", "\xe2\x85\xb3",
  "\xe2\x85\xb4", "\xe2\x85\xb5", "\xe2\x85\xb6", "\xe2\x85\xb7",
  "\xe2\x85\xb8", "\xe2\x85\xb9", "\xe2\x85\xba", "\xe2\x85\xbb", NULL
  // NULL, "ⅰ", "ⅱ", "ⅲ", "ⅳ", "ⅴ", "ⅵ", "ⅶ", "ⅷ", "ⅸ", "ⅹ", "ⅺ", "ⅻ", NULL
};
// Circled number characters for 1-20, indexed by value. Index 0 and the last
// entry are NULL.
const char* const kCircledNumbers[] = {
  NULL, "\xe2\x91\xa0", "\xe2\x91\xa1", "\xe2\x91\xa2", "\xe2\x91\xa3",
  "\xe2\x91\xa4", "\xe2\x91\xa5", "\xe2\x91\xa6", "\xe2\x91\xa7",
  "\xe2\x91\xa8", "\xe2\x91\xa9",
  // NULL, "①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩",
  "\xe2\x91\xaa", "\xe2\x91\xab", "\xe2\x91\xac", "\xe2\x91\xad",
  "\xe2\x91\xae", "\xe2\x91\xaf", "\xe2\x91\xb0", "\xe2\x91\xb1",
  "\xe2\x91\xb2", "\xe2\x91\xb3",
  // "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳",
  NULL
};
// NULL-terminated list of the special single-character number tables that
// ArabicToOtherForms looks a value up in.
const char* const* const kSpecialNumericVariations[] = {
kRomanNumbersCapital, kRomanNumbersSmall, kCircledNumbers,
NULL
};
// Element counts of the tables above, in the same order (the counts include
// the NULL entries stored in each table). ArabicToOtherForms compares the
// value against the count before indexing. The final -1 lines up with the
// NULL terminator of kSpecialNumericVariations.
const int kSpecialNumericSizes[] = {
arraysize(kRomanNumbersCapital),
arraysize(kRomanNumbersSmall),
arraysize(kCircledNumbers),
-1
};
// Candidate value that ArabicToKanji emits when the input is all zeros.
const char* const kNumZero = "\xe9\x9b\xb6";
// const char* const kNumZero = "零";
// Rank character used for the tens position in old-style kanji output.
const char* const kNumOldTen = "\xe6\x8b\xbe";
// const char* const kNumOldTen = "拾";
// Old-style replacement for the two-character sequence "弐拾" (twenty).
const char* const kNumOldTwenty = "\xe5\xbb\xbf";
// const char* const kNumOldTwenty = "廿";
// Old-style replacement for "千" (thousand).
const char* const kNumOldThousand = "\xe9\x98\xa1";
// const char* const kNumOldThousand = "阡";
// Old-style replacement for "万" (ten thousand).
const char* const kNumOldTenThousand = "\xe8\x90\xac";
// const char* const kNumOldTenThousand = "萬";
// Digit string for which ArabicToOtherForms adds the candidate "Googol".
const char* const kNumGoogol =
"100000000000000000000000000000000000000000000000000"
"00000000000000000000000000000000000000000000000000";
// Helper functions.
// Appends |s| to the string half of every pair stored in |out|. The uint16
// half of each pair is left as it is.
void AppendToEachElement(const string& s,
                         vector<pair<string, uint16> >* out) {
  const size_t count = out->size();
  for (size_t index = 0; index < count; ++index) {
    (*out)[index].first += s;
  }
}
// For every entry of |texts| whose string contains |before|, appends one new
// entry in which each occurrence of |before| has been replaced with |after|.
// Existing entries are not modified. The appended entries are tagged
// Segment::Candidate::NUMBER_OLD_KANJI, because this helper is used to build
// old-style kanji variants.
//
// Only the entries present when the function is called are examined, and
// text inserted by a replacement is never searched again, so the function
// terminates even if |after| contains |before|. An empty |before| is a no-op.
void ReplaceElement(const string& before, const string& after,
                    vector<pair<string, uint16> >* texts) {
  if (before.empty()) {
    // find("") matches at every position; there is nothing sensible to
    // replace.
    return;
  }
  const size_t original_size = texts->size();
  for (size_t i = 0; i < original_size; ++i) {
    size_t tpos = (*texts)[i].first.find(before);
    if (tpos == string::npos) {
      continue;
    }
    // Work on a copy: push_back below may reallocate |texts|.
    string replaced = (*texts)[i].first;
    do {
      replaced.replace(tpos, before.size(), after);
      // Resume the search after the text that was just inserted.
      tpos += after.size();
    } while ((tpos = replaced.find(before, tpos)) != string::npos);
    // ReplaceElement is used for old kanji
    texts->push_back(make_pair(replaced,
                               Segment::Candidate::NUMBER_OLD_KANJI));
  }
}
void PushBackCandidate(const string& value, const string& desc,
uint16 style,
vector<Segment::Candidate>* results) {
bool found = false;
for (vector<Segment::Candidate>::const_iterator it = results->begin();
it != results->end(); ++it) {
if (it->value == value) {
found = true;
break;
}
}
if (!found) {
Segment::Candidate cand;
cand.value = value;
cand.description = desc;
cand.style = style;
results->push_back(cand);
}
}
// Number converters' main functions.
// Each of them receives two arguments:
// - input_num: a string consisting of Arabic numeric characters
// - output: a pointer to a vector of candidates, which receives the
// converted number representations.
// If input_num is invalid or cannot be represented in the target form, the
// function does nothing. If it finds more than one representation, it
// pushes all of the candidates into the output.
// Converts the decimal string |input_num| into kanji numerals with rank
// characters (十, 百, 千, 万, ...) and appends the results to |output|.
//
// The conversion runs once per table in kKanjiDigitsVariations, i.e. once
// with the regular kanji digits and once with the old-style digits:
// - Regular style: candidates are tagged NUMBER_KANJI and described as
//   "漢数字". A leading 1 in the thousands position yields both the form
//   without and the form with "一" (千 / 一千), except in a group that is
//   followed by a bigger rank and is exactly "1000", which is always
//   written "一千".
// - Old style: candidates are tagged NUMBER_OLD_KANJI and described as
//   "大字". Additional variants using 阡, 廿 and 萬 are generated through
//   ReplaceElement.
//
// Leading zeros are ignored. If nothing but zeros remains (including an empty
// input), the single candidate kNumZero is emitted. Numbers that need more
// 4-digit groups than kNumKanjiBiggerRanks provides produce no candidate.
void ArabicToKanji(const string& input_num,
                   vector<Segment::Candidate>* output) {
  for (const char* const* const* digits_ptr = kKanjiDigitsVariations;
       *digits_ptr; digits_ptr++) {
    bool is_old = (*digits_ptr == kNumKanjiOldDigits);
    const char* const* const digits = *digits_ptr;
    const char* input_ptr = input_num.data();
    int input_len = static_cast<int>(input_num.size());
    // Leading zeros carry no value; skip them.
    while (input_len > 0 && *input_ptr == '0') {
      ++input_ptr;
      --input_len;
    }
    if (input_len == 0) {
      // The value is zero. The result does not depend on the digit table,
      // so emit it once and leave the variation loop.
      // "大字"
      // http://ja.wikipedia.org/wiki/%E5%A4%A7%E5%AD%97_(%E6%95%B0%E5%AD%97)
      PushBackCandidate(kNumZero, "\xE5\xA4\xA7\xE5\xAD\x97",
                        Segment::Candidate::NUMBER_OLD_KANJI, output);
      break;
    }
    // Index of the most significant 4-digit group (0 when the number has at
    // most four digits).
    int bigger_ranks = input_len / 4;
    if (bigger_ranks * 4 == input_len) {
      --bigger_ranks;
    }
    if (bigger_ranks < static_cast<int>(arraysize(kNumKanjiBiggerRanks))) {
      vector<pair<string, uint16> > results;  // pair of value and type
      const uint16 kStyle = is_old ? Segment::Candidate::NUMBER_OLD_KANJI :
          Segment::Candidate::NUMBER_KANJI;
      results.push_back(make_pair("", kStyle));
      // Walk the groups from the most significant one down to group 0.
      for (; bigger_ranks >= 0; --bigger_ranks) {
        // Becomes true once the current group has a non-zero digit; only
        // then is the group's rank character (万, 億, ...) written.
        bool is_printed = false;
        // Number of digits that belong to the current group.
        int smaller_rank_len = input_len - bigger_ranks * 4;
        for (int i = smaller_rank_len; i > 0; --i, ++input_ptr, --input_len) {
          uint32 n = *input_ptr - '0';
          if (n != 0) {
            is_printed = true;
          }
          if (!is_old && i == 4 && bigger_ranks > 0 &&
              strncmp(input_ptr, "1000", 4) == 0) {
            // A group that is exactly "1000" and is followed by a bigger
            // rank is always written with the explicit "一" ("一千万").
            AppendToEachElement(digits[n], &results);
            AppendToEachElement(kNumKanjiRanks[i], &results);
            // The whole group has been consumed. |break| skips the loop's
            // own increment, so advance by the full group here.
            input_ptr += 4;
            input_len -= 4;
            break;
          }
          if (n == 1) {
            if (is_old || i == 1) {
              // Old style always spells out the 1; regular style spells it
              // out in the ones position.
              AppendToEachElement(digits[n], &results);
            } else if (i == 4) {
              // Regular style, thousands position: keep the strings built so
              // far (they become "...千") and add copies with "一" appended
              // (they become "...一千").
              const size_t len = results.size();
              for (size_t j = 0; j < len; ++j) {
                results.push_back(make_pair(results[j].first + digits[n],
                                            kStyle));
              }
            }
            // Regular style, tens or hundreds position: the 1 is implied by
            // the rank character alone.
          } else if (n >= 2 && n <= 9) {
            AppendToEachElement(digits[n], &results);
          }
          if (n > 0 && n <= 9) {
            AppendToEachElement(
                (i == 2 && is_old) ? kNumOldTen : kNumKanjiRanks[i], &results);
          }
        }
        if (is_printed) {
          AppendToEachElement(kNumKanjiBiggerRanks[bigger_ranks], &results);
        }
      }
      if (is_old) {
        ReplaceElement("\xe5\x8d\x83", kNumOldThousand, &results);
        // ReplaceElement("千", kNumOldThousand, &results);
        ReplaceElement("\xe5\xbc\x90\xe6\x8b\xbe", kNumOldTwenty, &results);
        // ReplaceElement("弐拾", kNumOldTwenty, &results);
        ReplaceElement(kNumKanjiBiggerRanks[1], kNumOldTenThousand, &results);
      }
      for (vector<pair<string, uint16> >::const_iterator it = results.begin();
           it != results.end(); ++it) {
        if (it->second == Segment::Candidate::NUMBER_OLD_KANJI) {
          // "大字"
          // http://ja.wikipedia.org/wiki/%E5%A4%A7%E5%AD%97_(%E6%95%B0%E5%AD%97)
          PushBackCandidate(it->first, "\xE5\xA4\xA7\xE5\xAD\x97",
                            it->second, output);
        } else {
          // "漢数字"
          PushBackCandidate(it->first, "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",
                            it->second, output);
        }
      }
    }
  }
}
// Appends to |output| the number |input_num| written with a separator
// between every group of three digits counted from the right, once with
// half-width digits and "," and once with full-width digits and "，".
// Both candidates carry the description "数字".
//
// Nothing is appended when |input_num| is empty or starts with '0'.
// Characters of |input_num| that are not decimal digits are dropped from the
// result.
void ArabicToSeparatedArabic(const string& input_num,
                             vector<Segment::Candidate>* output) {
  if (input_num.empty() || input_num[0] == '0') {
    // There is nothing to convert for an empty string, and we don't add
    // separators to a number starting with '0'.
    return;
  }
  // kSeparators and kStyles are parallel to kNumDigitsVariations.
  const char* const kSeparators[] = {",", "\xef\xbc\x8c", NULL};
  const uint16 kStyles[] = {
    Segment::Candidate::NUMBER_SEPARATED_ARABIC_HALFWIDTH,
    Segment::Candidate::NUMBER_SEPARATED_ARABIC_FULLWIDTH,
    0,
  };
  const size_t num_chars = input_num.size();
  for (size_t i = 0; kNumDigitsVariations[i] != NULL; ++i) {
    string result;
    for (size_t j = 0; j < num_chars; ++j) {
      // A separator precedes every character that has a multiple of three
      // characters (itself included) remaining to its right, except the
      // very first character.
      if (j != 0 && (num_chars - j) % 3 == 0 && kSeparators[i]) {
        result.append(kSeparators[i]);
      }
      const uint32 d = input_num[j] - '0';
      if (d <= 9 && kNumDigitsVariations[i][d]) {
        result.append(kNumDigitsVariations[i][d]);
      }
    }
    // "数字"
    PushBackCandidate(result, "\xE6\x95\xB0\xE5\xAD\x97",
                      kStyles[i], output);
  }
}
// Converts |input_num| digit by digit, without rank characters, using each
// table in kSingleDigitsVariations: kanji digits (style NUMBER_KANJI_ARABIC,
// description "漢数字") and full-width Arabic digits (style DEFAULT,
// description "数字"). Conversion stops at the first character that has no
// entry in the table; whatever was converted up to that point is appended to
// |output| if it is non-empty.
void ArabicToWideArabic(const string& input_num,
                        vector<Segment::Candidate>* output) {
  // kStyles and kStylesName are parallel to kSingleDigitsVariations.
  const uint16 kStyles[] = {
    Segment::Candidate::NUMBER_KANJI_ARABIC,
    // The default style is used for wide Arabic digits, because the
    // half/full width of plain numbers is learned by the character form
    // manager.
    Segment::Candidate::DEFAULT,
    0,
  };
  const char *kStylesName[] = {
    "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",  // "漢数字"
    "\xE6\x95\xB0\xE5\xAD\x97",  // "数字"
    NULL
  };
  size_t variation = 0;
  while (kSingleDigitsVariations[variation] != NULL) {
    const char* const* const table = kSingleDigitsVariations[variation];
    string converted;
    for (string::const_iterator ch = input_num.begin();
         ch != input_num.end(); ++ch) {
      const uint32 digit = *ch - '0';
      if (digit > 9 || !table[digit]) {
        break;
      }
      converted.append(table[digit]);
    }
    if (!converted.empty()) {
      PushBackCandidate(converted, kStylesName[variation],
                        kStyles[variation], output);
    }
    ++variation;
  }
}
// Appends special single-character forms of |input_num| to |output|:
// - "Googol" when |input_num| equals kNumGoogol.
// - The capital Roman numeral, small Roman numeral and circled number
//   character for the value, for each table in kSpecialNumericVariations
//   that has an entry for it.
// The value is parsed from the leading decimal digits of |input_num|. As
// soon as it exceeds 99 the function returns without looking at the tables.
void ArabicToOtherForms(const string& input_num,
                        vector<Segment::Candidate>* output) {
  if (input_num == kNumGoogol) {
    PushBackCandidate("Googol", "", Segment::Candidate::DEFAULT, output);
  }

  // kStyles and kStylesName are parallel to kSpecialNumericVariations.
  const uint16 kStyles[] = {
    Segment::Candidate::NUMBER_ROMAN_CAPITAL,
    Segment::Candidate::NUMBER_ROMAN_SMALL,
    Segment::Candidate::NUMBER_CIRCLED,
    0,
  };
  // "ローマ数字(大文字)",
  // "ローマ数字(小文字)",
  // "丸数字"
  const char *kStylesName[] = {
    "\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
    "\xE5\xAD\x97(\xE5\xA4\xA7\xE6\x96\x87\xE5\xAD\x97)",
    "\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
    "\xE5\xAD\x97(\xE5\xB0\x8F\xE6\x96\x87\xE5\xAD\x97)",
    "\xE4\xB8\xB8\xE6\x95\xB0\xE5\xAD\x97"
  };

  int32 number = 0;
  for (string::const_iterator ch = input_num.begin();
       ch != input_num.end(); ++ch) {
    const uint32 digit = *ch - '0';
    if (digit > 9) {
      break;  // stop at the first non-digit and use what was parsed so far
    }
    number = number * 10 + static_cast<int32>(digit);
    if (number > 99) {
      return;
    }
  }

  int table_index = 0;
  while (kSpecialNumericVariations[table_index]) {
    const char* const* const table = kSpecialNumericVariations[table_index];
    if (number < kSpecialNumericSizes[table_index] && table[number]) {
      PushBackCandidate(table[number], kStylesName[table_index],
                        kStyles[table_index], output);
    }
    ++table_index;
  }
}
// Appends hexadecimal ("0x..."), octal ("0...") and binary ("0b...")
// spellings of the decimal number |input_num| to |output|:
// - hexadecimal when the value is greater than 9,
// - octal when the value is greater than 7,
// - binary when the value is greater than 1.
// Nothing is appended when |input_num| has 20 or more characters or contains
// a character that is not a decimal digit.
void ArabicToOtherRadixes(const string& input_num,
                          vector<Segment::Candidate>* output) {
  // Any decimal number with fewer than 20 digits fits into uint64. The same
  // bound is large enough for the hexadecimal text of a uint64 (at most 16
  // digits plus the terminating NUL).
  const size_t kMaxInt64Size = 20;
  if (input_num.size() >= kMaxInt64Size) {
    return;
  }
  uint64 n = 0;
  for (string::const_iterator i = input_num.begin();
       i != input_num.end(); ++i) {
    const uint32 d = *i - '0';
    if (d > 9) {
      // Not a decimal number: do nothing rather than convert a garbage value.
      return;
    }
    n = 10 * n + d;
  }
  if (n > 9) {
    // Hexadecimal
    string hexadecimal("0x");
    char buf[kMaxInt64Size];
    // The cast keeps the argument in line with "%llx" regardless of how
    // uint64 is defined on the platform.
    const int len = snprintf(buf, sizeof(buf), "%llx",
                             static_cast<unsigned long long>(n));
    hexadecimal.append(buf, len);
    // 16\xE9\x80\xB2\xE6\x95\xB0 is "16進数"
    PushBackCandidate(hexadecimal, "16\xE9\x80\xB2\xE6\x95\xB0",
                      Segment::Candidate::NUMBER_HEX, output);
  }
  if (n > 1) {
    // octal and binary
    string octal;
    string binary;
    bool put_octal = (n > 7);
    // Both strings are built least significant digit first and reversed
    // afterwards. Each pass emits one octal digit and up to three binary
    // digits; the inner loop stops early so that no leading zero bits are
    // produced.
    while (n > 0) {
      octal.push_back('0' + static_cast<char>(n & 0x7));
      for (int i = 0; i < 3 && n > 0; ++i) {
        binary.push_back('0' + static_cast<char>(n & 0x1));
        n >>= 1;
      }
    }
    if (put_octal) {
      reverse(octal.begin(), octal.end());
      // 8\xE9\x80\xB2\xE6\x95\xB0 is "8進数"
      PushBackCandidate(string("0") + octal, "8\xE9\x80\xB2\xE6\x95\xB0",
                        Segment::Candidate::NUMBER_OCT, output);
    }
    reverse(binary.begin(), binary.end());
    // 2\xE9\x80\xB2\xE6\x95\xB0 is "2進数"
    PushBackCandidate(string("0b") + binary, "2\xE9\x80\xB2\xE6\x95\xB0",
                      Segment::Candidate::NUMBER_BIN, output);
  }
}
// Returns true if the value of |cand| is non-empty and every character of it
// is accepted by isdigit(), i.e. the candidate is a pure Arabic number.
bool IsArabicNumericCandidate(const Segment::Candidate &cand) {
  const size_t length = cand.value.size();
  if (length == 0) {
    return false;
  }
  size_t index = 0;
  while (index < length &&
         isdigit(static_cast<unsigned char>(cand.value[index]))) {
    ++index;
  }
  // Every character was a digit only if the scan reached the end.
  return index == length;
}
// Returns the index of the first candidate of |seg| that is a pure Arabic
// number, or -1 if there is none.
//
// When the segment holds exactly one candidate, that candidate is not an
// Arabic number, and POSMatcher::IsNumber() accepts its lid, the segment is
// first asked for more candidates via GetCandidates() so that an Arabic
// number candidate has a chance to show up.
int GetNumericCandidate(Segment* seg) {
  if (seg->candidates_size() == 0) {
    return -1;
  }

  const Segment::Candidate &first = seg->candidate(0);
  if (IsArabicNumericCandidate(first)) {
    return 0;
  }

  if (seg->candidates_size() == 1) {
    if (!POSMatcher::IsNumber(first.lid)) {
      return -1;
    }
    // The only candidate looks numeric: expand the candidate list.
    const size_t kExpandSize = 5;
    seg->GetCandidates(kExpandSize);
  }

  // Scan all candidates for an Arabic number.
  size_t index = 0;
  while (index < seg->candidates_size()) {
    const Segment::Candidate& current = seg->candidate(index);
    if (!current.value.empty() && IsArabicNumericCandidate(current)) {
      return static_cast<int>(index);
    }
    ++index;
  }
  return -1;
}
} // namespace
// The constructor and destructor bodies are empty: nothing is set up or torn
// down here.
NumberRewriter::NumberRewriter() {}
NumberRewriter::~NumberRewriter() {}
// Adds alternative spellings of numbers to the non-history segments of
// |segments|.
//
// For every such segment in which GetNumericCandidate() finds a pure Arabic
// number candidate, the number is converted to kanji/full-width digits,
// separated Arabic, kanji with ranks, and Roman/circled forms. When there is
// exactly one conversion segment, hexadecimal/octal/binary forms are added
// as well. The new candidates are inserted right after the Arabic number
// candidate, in the order in which they were generated, and inherit its lid,
// rid and cost.
//
// Returns true if at least one segment was processed this way. Returns false
// without touching anything when the use_number_conversion config is off.
bool NumberRewriter::Rewrite(Segments *segments) const {
  if (!GET_CONFIG(use_number_conversion)) {
    VLOG(2) << "no use_number_conversion";
    return false;
  }
  bool modified = false;
  for (size_t i = segments->history_segments_size();
       i < segments->segments_size(); ++i) {
    Segment *seg = segments->mutable_segment(i);
    if (seg == NULL) {
      // Stop here, but still report the segments that were already
      // rewritten.
      return modified;
    }
    int pos = GetNumericCandidate(seg);
    if (pos < 0) {
      continue;
    }
    // Take a copy rather than a reference: the candidate list of |seg| is
    // modified below, and whether insert_candidate() keeps references to
    // existing candidates valid is not visible from this file.
    const Segment::Candidate base_cand = seg->candidate(pos);
    modified = true;
    vector<Segment::Candidate> converted_numbers;
    ArabicToWideArabic(base_cand.value, &converted_numbers);
    ArabicToSeparatedArabic(base_cand.value, &converted_numbers);
    ArabicToKanji(base_cand.value, &converted_numbers);
    ArabicToOtherForms(base_cand.value, &converted_numbers);
    if (segments->conversion_segments_size() == 1) {
      ArabicToOtherRadixes(base_cand.value, &converted_numbers);
    }
    for (vector<Segment::Candidate>::const_iterator iter =
             converted_numbers.begin();
         iter != converted_numbers.end(); ++iter) {
      // ++pos places each new candidate directly after the previous one.
      Segment::Candidate* c = seg->insert_candidate(++pos);
      if (c != NULL) {
        c->lid = base_cand.lid;
        c->rid = base_cand.rid;
        c->cost = base_cand.cost;
        c->value = iter->value;
        c->style = iter->style;
        c->SetDescription(Segment::Candidate::PLATFORM_DEPENDENT_CHARACTER |
                          Segment::Candidate::CHARACTER_FORM |
                          Segment::Candidate::FULL_HALF_WIDTH,
                          iter->description);
      }
    }
  }
  return modified;
}
} // namespace mozc