diff --git a/src/data_manager/data_manager.cc b/src/data_manager/data_manager.cc index c3fe22d..b1ceeaf 100644 --- a/src/data_manager/data_manager.cc +++ b/src/data_manager/data_manager.cc @@ -209,6 +209,19 @@ LOG(ERROR) << "Symbol dictionary data is broken"; return false; } + if (!reader.Get("emoticon_token", &emoticon_token_array_data_)) { + LOG(ERROR) << "Cannot find an emoticon token array"; + return false; + } + if (!reader.Get("emoticon_string", &emoticon_string_array_data_)) { + LOG(ERROR) << "Cannot find an emoticon string array or data is broken"; + return false; + } + if (!SerializedDictionary::VerifyData(emoticon_token_array_data_, + emoticon_string_array_data_)) { + LOG(ERROR) << "Emoticon dictionary data is broken"; + return false; + } if (!reader.Get("usage_item_array", &usage_items_data_)) { VLOG(2) << "Usage dictionary is not provided"; @@ -337,6 +350,12 @@ *string_array_data = symbol_string_array_data_; } +void DataManager::GetEmoticonRewriterData( + StringPiece *token_array_data, StringPiece *string_array_data) const { + *token_array_data = emoticon_token_array_data_; + *string_array_data = emoticon_string_array_data_; +} + void DataManager::GetCounterSuffixSortedArray(const char **array, size_t *size) const { *array = counter_suffix_data_.data(); diff --git a/src/data_manager/data_manager.gypi b/src/data_manager/data_manager.gypi index 1a08f73..4d0ff8b 100644 --- a/src/data_manager/data_manager.gypi +++ b/src/data_manager/data_manager.gypi @@ -117,6 +117,7 @@ 'gen_separate_suffix_data_for_<(dataset_tag)#host', 'gen_separate_reading_correction_data_for_<(dataset_tag)#host', 'gen_separate_symbol_rewriter_data_for_<(dataset_tag)#host', + 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host', ], 'actions': [ { @@ -146,6 +147,8 @@ 'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data', 'symbol_token': '<(gen_out_dir)/symbol_token.data', 'symbol_string': '<(gen_out_dir)/symbol_string.data', + 'emoticon_token': 
'<(gen_out_dir)/emoticon_token.data', + 'emoticon_string': '<(gen_out_dir)/emoticon_string.data', }, 'inputs': [ '<(pos_matcher)', @@ -171,6 +174,8 @@ '<(reading_correction_correction)', '<(symbol_token)', '<(symbol_string)', + '<(emoticon_token)', + '<(emoticon_string)', ], 'outputs': [ '<(gen_out_dir)/<(out_mozc_data)', @@ -202,6 +207,8 @@ 'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data', 'symbol_token:32:<(gen_out_dir)/symbol_token.data', 'symbol_string:32:<(gen_out_dir)/symbol_string.data', + 'emoticon_token:32:<(gen_out_dir)/emoticon_token.data', + 'emoticon_string:32:<(gen_out_dir)/emoticon_string.data', ], 'conditions': [ ['target_platform!="Android"', { @@ -698,6 +705,40 @@ ], }, { + 'target_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)', + 'type': 'none', + 'toolsets': ['host'], + 'dependencies': [ + '../../rewriter/rewriter_base.gyp:gen_emoticon_rewriter_data_main', + ], + 'actions': [ + { + 'action_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)', + 'variables': { + 'generator': '<(PRODUCT_DIR)/gen_emoticon_rewriter_data_main<(EXECUTABLE_SUFFIX)', + 'input_files': [ + '<(mozc_dir)/data/emoticon/emoticon.tsv', + ], + }, + 'inputs': [ + '<(generator)', + '<@(input_files)', + ], + 'outputs': [ + '<(gen_out_dir)/emoticon_token.data', + '<(gen_out_dir)/emoticon_string.data', + ], + 'action': [ + '<(generator)', + '--input=<(mozc_dir)/data/emoticon/emoticon.tsv', + '--output_token_array=<(gen_out_dir)/emoticon_token.data', + '--output_string_array=<(gen_out_dir)/emoticon_string.data', + ], + 'message': '[<(dataset_tag)] Generating emoticon data', + }, + ], + }, + { 'target_name': 'gen_separate_counter_suffix_data_for_<(dataset_tag)', 'type': 'none', 'toolsets': ['host'], diff --git a/src/data_manager/data_manager.h b/src/data_manager/data_manager.h index 91a3faf..9f7d47f 100644 --- a/src/data_manager/data_manager.h +++ b/src/data_manager/data_manager.h @@ -84,6 +84,8 @@ StringPiece 
*correction_array_data) const override; void GetSymbolRewriterData(StringPiece *token_array_data, StringPiece *string_array_data) const override; + void GetEmoticonRewriterData(StringPiece *token_array_data, + StringPiece *string_array_data) const override; #ifndef NO_USAGE_REWRITER void GetUsageRewriterData( @@ -120,6 +122,8 @@ StringPiece reading_correction_correction_array_data_; StringPiece symbol_token_array_data_; StringPiece symbol_string_array_data_; + StringPiece emoticon_token_array_data_; + StringPiece emoticon_string_array_data_; StringPiece usage_base_conjugation_suffix_data_; StringPiece usage_conjugation_suffix_data_; StringPiece usage_conjugation_index_data_; diff --git a/src/data_manager/data_manager_interface.h b/src/data_manager/data_manager_interface.h index 19214d3..d2560fc 100644 --- a/src/data_manager/data_manager_interface.h +++ b/src/data_manager/data_manager_interface.h @@ -90,6 +90,10 @@ virtual void GetSymbolRewriterData(StringPiece *token_array_data, StringPiece *string_array_data) const = 0; + // Gets the token array and string array data of the emoticon rewriter. + virtual void GetEmoticonRewriterData( + StringPiece *token_array_data, StringPiece *string_array_data) const = 0; + #ifndef NO_USAGE_REWRITER // Gets the usage rewriter data. 
virtual void GetUsageRewriterData( diff --git a/src/data_manager/packed/packed_data_manager.cc b/src/data_manager/packed/packed_data_manager.cc index 6ff63da..5b6d4a2 100644 --- a/src/data_manager/packed/packed_data_manager.cc +++ b/src/data_manager/packed/packed_data_manager.cc @@ -89,6 +89,8 @@ void GetSuggestionFilterData(const char **data, size_t *size) const; void GetSymbolRewriterData(StringPiece *token_array_data, StringPiece *string_array_data) const; + void GetEmoticonRewriterData(StringPiece *token_array_data, + StringPiece *string_array_data) const; #ifndef NO_USAGE_REWRITER void GetUsageRewriterData(StringPiece *base_conjugation_suffix_data, StringPiece *conjugation_suffix_data, @@ -237,6 +239,11 @@ void PackedDataManager::Impl::GetSymbolRewriterData( StringPiece *token_array_data, StringPiece *string_array_data) const { manager_.GetSymbolRewriterData(token_array_data, string_array_data); +} + +void PackedDataManager::Impl::GetEmoticonRewriterData( + StringPiece *token_array_data, StringPiece *string_array_data) const { + manager_.GetEmoticonRewriterData(token_array_data, string_array_data); } #ifndef NO_USAGE_REWRITER @@ -392,6 +399,11 @@ manager_impl_->GetSymbolRewriterData(token_array_data, string_array_data); } +void PackedDataManager::GetEmoticonRewriterData( + StringPiece *token_array_data, StringPiece *string_array_data) const { + manager_impl_->GetEmoticonRewriterData(token_array_data, string_array_data); +} + #ifndef NO_USAGE_REWRITER void PackedDataManager::GetUsageRewriterData( StringPiece *base_conjugation_suffix_data, diff --git a/src/data_manager/packed/packed_data_manager.h b/src/data_manager/packed/packed_data_manager.h index 38d271a..5eaa836 100644 --- a/src/data_manager/packed/packed_data_manager.h +++ b/src/data_manager/packed/packed_data_manager.h @@ -73,6 +73,8 @@ void GetSuggestionFilterData(const char **data, size_t *size) const override; void GetSymbolRewriterData(StringPiece *token_array_data, StringPiece *string_array_data) 
const override; + void GetEmoticonRewriterData(StringPiece *token_array_data, + StringPiece *string_array_data) const override; #ifndef NO_USAGE_REWRITER void GetUsageRewriterData( StringPiece *base_conjugation_suffix_data, diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt index 207c8e4..e0bf994 100644 --- a/src/mozc_version_template.txt +++ b/src/mozc_version_template.txt @@ -1,6 +1,6 @@ MAJOR=2 MINOR=17 -BUILD=2532 +BUILD=2533 REVISION=102 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be # downloaded by NaCl Mozc. diff --git a/src/rewriter/emoticon_rewriter.cc b/src/rewriter/emoticon_rewriter.cc index 21b76a0..7008757 100644 --- a/src/rewriter/emoticon_rewriter.cc +++ b/src/rewriter/emoticon_rewriter.cc @@ -43,51 +43,33 @@ #include "protocol/commands.pb.h" #include "protocol/config.pb.h" #include "request/conversion_request.h" -#include "rewriter/embedded_dictionary.h" #include "rewriter/rewriter_interface.h" +#include "rewriter/serialized_dictionary.h" namespace mozc { namespace { -#include "rewriter/emoticon_rewriter_data.h" - -class EmoticonDictionary { - public: - EmoticonDictionary() - : dic_(new EmbeddedDictionary(kEmoticonData_token_data, - kEmoticonData_token_size)) {} - - ~EmoticonDictionary() {} - - EmbeddedDictionary *GetDictionary() const { - return dic_.get(); - } - - private: - std::unique_ptr dic_; -}; - class ValueCostCompare { public: - bool operator() (const EmbeddedDictionary::Value *a, - const EmbeddedDictionary::Value *b) const { - return a->cost < b->cost; + bool operator() (SerializedDictionary::const_iterator a, + SerializedDictionary::const_iterator b) const { + return a.cost() < b.cost(); } }; class IsEqualValue { public: - bool operator() (const EmbeddedDictionary::Value *a, - const EmbeddedDictionary::Value *b) const { - return strcmp(a->value, b->value) == 0; + bool operator() (const SerializedDictionary::const_iterator a, + const SerializedDictionary::const_iterator b) const { + 
return a.value() == b.value(); } }; // Insert Emoticon into the |segment| // Top |initial_insert_size| candidates are inserted from |initial_insert_pos|. // Remained candidates are added to the buttom. -void InsertCandidates(const EmbeddedDictionary::Value *value, - size_t value_size, +void InsertCandidates(SerializedDictionary::const_iterator begin, + SerializedDictionary::const_iterator end, size_t initial_insert_pos, size_t initial_insert_size, bool is_no_learning, @@ -101,9 +83,9 @@ size_t offset = min(initial_insert_pos, segment->candidates_size()); // Sort values by cost just in case - vector sorted_value; - for (size_t i = 0; i < value_size; ++i) { - sorted_value.push_back(&value[i]); + vector sorted_value; + for (auto iter = begin; iter != end; ++iter) { + sorted_value.push_back(iter); } std::sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare()); @@ -116,7 +98,7 @@ sorted_value.end()); for (size_t i = 0; i < sorted_value.size(); ++i) { - Segment::Candidate *c = NULL; + Segment::Candidate *c = nullptr; if (i < initial_insert_size) { c = segment->insert_candidate(offset); @@ -125,18 +107,18 @@ c = segment->push_back_candidate(); } - if (c == NULL) { + if (c == nullptr) { LOG(ERROR) << "cannot insert candidate at " << offset; continue; } c->Init(); // TODO(taku): set an appropriate POS here. 
- c->lid = sorted_value[i]->lid; - c->rid = sorted_value[i]->rid; + c->lid = sorted_value[i].lid(); + c->rid = sorted_value[i].rid(); c->cost = base_candidate.cost; - c->value = sorted_value[i]->value; - c->content_value = sorted_value[i]->value; + sorted_value[i].value().CopyToString(&c->value); + c->content_value = c->value; c->key = base_candidate.key; c->content_key = base_candidate.content_key; // no full/half width normalizations @@ -151,18 +133,20 @@ const char kBaseEmoticonDescription[] = "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97"; - if (sorted_value[i]->description == NULL) { + if (sorted_value[i].description().empty()) { c->description = kBaseEmoticonDescription; } else { string description = kBaseEmoticonDescription; description.append(" "); - description.append(sorted_value[i]->description); + sorted_value[i].description().AppendToString(&description); c->description = description; } } } -bool RewriteCandidate(Segments *segments) { +} // namespace + +bool EmoticonRewriter::RewriteCandidate(Segments *segments) const { bool modified = false; for (size_t i = 0; i < segments->conversion_segments_size(); ++i) { const string &key = segments->conversion_segment(i).key(); @@ -171,8 +155,8 @@ continue; } bool is_no_learning = false; - const EmbeddedDictionary::Value *value = NULL; - size_t value_size = 0; + SerializedDictionary::const_iterator begin; + SerializedDictionary::const_iterator end = dic_.end(); size_t initial_insert_size = 0; size_t initial_insert_pos = 0; @@ -184,58 +168,50 @@ if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") { // When key is "かおもじ", default candidate size should be small enough. // It is safe to expand all candidates at this time. 
- const EmbeddedDictionary::Token *token - = Singleton::get()->GetDictionary()->AllToken(); - CHECK(token); + begin = dic_.begin(); + CHECK(begin != dic_.end()); + end = dic_.end(); // set large value(100) so that all candidates are pushed to the bottom - value = token->value; - value_size = token->value_size; initial_insert_pos = 100; - initial_insert_size = token->value_size; + initial_insert_size = dic_.size(); // "かお" } else if (key == "\xE3\x81\x8B\xE3\x81\x8A") { // When key is "かお", expand all candidates in conservative way. - const EmbeddedDictionary::Token *token - = Singleton::get()->GetDictionary()->AllToken(); - CHECK(token); + begin = dic_.begin(); + CHECK(begin != dic_.end()); // first 6 candidates are inserted at 4 th position. // Other candidates are pushed to the buttom. - value = token->value; - value_size = token->value_size; initial_insert_pos = 4; initial_insert_size = 6; } else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F" "\xE3\x82\x89\xE3\x81\x84") { // "ふくわらい" // Choose one emoticon randomly from the dictionary. // TODO(taku): want to make it "generate" more funny emoticon. - const EmbeddedDictionary::Token *token - = Singleton::get()->GetDictionary()->AllToken(); - CHECK(token); + begin = dic_.begin(); + CHECK(begin != dic_.end()); uint32 n = 0; // use secure random not to predict the next emoticon. Util::GetRandomSequence(reinterpret_cast(&n), sizeof(n)); - value = token->value + n % token->value_size; - value_size = 1; + begin += n % dic_.size(); + end = begin + 1; initial_insert_pos = 4; initial_insert_size = 1; is_no_learning = true; // do not learn this candidate. } else { - const EmbeddedDictionary::Token *token - = Singleton::get()->GetDictionary()->Lookup(key); - // by default, insert canidate at 7 th position. 
- if (token != NULL) { - value = token->value; - value_size = token->value_size; + const auto range = dic_.equal_range(key); + begin = range.first; + end = range.second; + if (begin != end) { initial_insert_pos = 6; - initial_insert_size = token == NULL ? 0 : token->value_size; + initial_insert_size = std::distance(begin, end); } } - if (value == NULL || value_size == 0) { + if (begin == end) { continue; } - InsertCandidates(value, value_size, + InsertCandidates(begin, end, initial_insert_pos, initial_insert_size, is_no_learning, @@ -245,11 +221,20 @@ return modified; } -} // namespace - -EmoticonRewriter::EmoticonRewriter() {} - -EmoticonRewriter::~EmoticonRewriter() {} + +std::unique_ptr EmoticonRewriter::CreateFromDataManager( + const DataManagerInterface &data_manager) { + StringPiece token_array_data, string_array_data; + data_manager.GetEmoticonRewriterData(&token_array_data, &string_array_data); + return std::unique_ptr( + new EmoticonRewriter(token_array_data, string_array_data)); +} + +EmoticonRewriter::EmoticonRewriter(StringPiece token_array_data, + StringPiece string_array_data) + : dic_(token_array_data, string_array_data) {} + +EmoticonRewriter::~EmoticonRewriter() = default; int EmoticonRewriter::capability(const ConversionRequest &request) const { if (request.request().mixed_conversion()) { diff --git a/src/rewriter/emoticon_rewriter.h b/src/rewriter/emoticon_rewriter.h index ee3e0d9..a1254fb 100644 --- a/src/rewriter/emoticon_rewriter.h +++ b/src/rewriter/emoticon_rewriter.h @@ -30,7 +30,11 @@ #ifndef MOZC_REWRITER_EMOTICON_REWRITER_H_ #define MOZC_REWRITER_EMOTICON_REWRITER_H_ +#include + +#include "data_manager/data_manager_interface.h" #include "rewriter/rewriter_interface.h" +#include "rewriter/serialized_dictionary.h" namespace mozc { @@ -39,13 +43,21 @@ class EmoticonRewriter : public RewriterInterface { public: - EmoticonRewriter(); - virtual ~EmoticonRewriter(); + static std::unique_ptr CreateFromDataManager( + const DataManagerInterface 
&data_manager); - virtual int capability(const ConversionRequest &request) const; + EmoticonRewriter(StringPiece token_array_data, StringPiece string_array_data); + ~EmoticonRewriter() override; - virtual bool Rewrite(const ConversionRequest &request, - Segments *segments) const; + int capability(const ConversionRequest &request) const override; + + bool Rewrite(const ConversionRequest &request, + Segments *segments) const override; + + private: + bool RewriteCandidate(Segments *segments) const; + + SerializedDictionary dic_; }; } // namespace mozc diff --git a/src/rewriter/emoticon_rewriter_test.cc b/src/rewriter/emoticon_rewriter_test.cc index ce575f8..eb2aefd 100644 --- a/src/rewriter/emoticon_rewriter_test.cc +++ b/src/rewriter/emoticon_rewriter_test.cc @@ -30,6 +30,7 @@ #include "rewriter/emoticon_rewriter.h" #include +#include #include #include "base/logging.h" @@ -37,16 +38,17 @@ #include "base/util.h" #include "config/config_handler.h" #include "converter/segments.h" +#include "data_manager/testing/mock_data_manager.h" #include "protocol/commands.pb.h" #include "protocol/config.pb.h" #include "request/conversion_request.h" +#include "testing/base/public/googletest.h" #include "testing/base/public/gunit.h" - -DECLARE_string(test_tmpdir); +#include "testing/base/public/mozctest.h" namespace mozc { +namespace { -namespace { void AddSegment(const string &key, const string &value, Segments *segments) { segments->Clear(); @@ -71,22 +73,19 @@ } return false; } -} // namespace -class EmoticonRewriterTest : public testing::Test { +class EmoticonRewriterTest : public ::testing::Test { protected: - EmoticonRewriterTest() {} - ~EmoticonRewriterTest() {} + testing::MockDataManager mock_data_manager_; - virtual void SetUp() { - SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir); - } - - virtual void TearDown() {} + private: + testing::ScopedTmpUserProfileDirectory scoped_profile_dir_; }; TEST_F(EmoticonRewriterTest, BasicTest) { - EmoticonRewriter emoticon_rewriter; 
+ std::unique_ptr emoticon_rewriter = + EmoticonRewriter::CreateFromDataManager(mock_data_manager_); + config::Config config; config::ConfigHandler::GetDefaultConfig(&config); ConversionRequest request; @@ -96,30 +95,30 @@ Segments segments; AddSegment("test", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); // "かお" AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_TRUE(HasEmoticon(segments)); // "かおもじ" AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_TRUE(HasEmoticon(segments)); // "にこにこ" AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_TRUE(HasEmoticon(segments)); // "ふくわらい" AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_TRUE(HasEmoticon(segments)); } @@ -128,49 +127,52 @@ Segments segments; AddSegment("test", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); // "かお" AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); // "かおもじ" AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); // "にこにこ" 
AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); // "ふくわらい" AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84", "test", &segments); - emoticon_rewriter.Rewrite(request, &segments); + emoticon_rewriter->Rewrite(request, &segments); EXPECT_FALSE(HasEmoticon(segments)); } } TEST_F(EmoticonRewriterTest, MobileEnvironmentTest) { - EmoticonRewriter rewriter; + std::unique_ptr rewriter = + EmoticonRewriter::CreateFromDataManager(mock_data_manager_); + commands::Request request; ConversionRequest convreq; convreq.set_request(&request); { request.set_mixed_conversion(true); - EXPECT_EQ(RewriterInterface::ALL, rewriter.capability(convreq)); + EXPECT_EQ(RewriterInterface::ALL, rewriter->capability(convreq)); } { request.set_mixed_conversion(false); - EXPECT_EQ(RewriterInterface::CONVERSION, rewriter.capability(convreq)); + EXPECT_EQ(RewriterInterface::CONVERSION, rewriter->capability(convreq)); } } +} // namespace } // namespace mozc diff --git a/src/rewriter/gen_emoticon_rewriter_data.cc b/src/rewriter/gen_emoticon_rewriter_data.cc new file mode 100644 index 0000000..6b8ffba --- /dev/null +++ b/src/rewriter/gen_emoticon_rewriter_data.cc @@ -0,0 +1,135 @@ +// Copyright 2010-2016, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include +#include +#include + +#include "base/file_stream.h" +#include "base/flags.h" +#include "base/init_mozc.h" +#include "base/logging.h" +#include "base/string_piece.h" +#include "base/util.h" +#include "rewriter/serialized_dictionary.h" + +DEFINE_string(input, "", "Emoticon dictionary file"); +DEFINE_string(output_token_array, "", "Output token array"); +DEFINE_string(output_string_array, "", "Output string array"); + +namespace mozc { +namespace { + +using KeyList = vector; +using CompilerToken = SerializedDictionary::CompilerToken; +using TokenList = SerializedDictionary::TokenList; + +int LookupCount(const std::unordered_map &key_count, + const string &key) { + const auto iter = key_count.find(key); + return (iter == key_count.end()) ? 
0 : iter->second; +} + +string GetDescription(const KeyList &key_list, + const std::unordered_map &key_count) { + if (key_list.size() == 1) { + return key_list[0]; + } + KeyList sorted_key_list(key_list); + sort(sorted_key_list.begin(), sorted_key_list.end(), + [&key_count](const string &x, const string &y) { + const int x_count = LookupCount(key_count, x); + const int y_count = LookupCount(key_count, y); + if (x_count == y_count) { + return x < y; + } + return x_count < y_count; + }); + return Util::StringPrintf("%s %s", sorted_key_list.back().c_str(), + sorted_key_list.front().c_str()); +} + +map ReadEmoticonTsv(const string &path) { + InputFileStream ifs(path.c_str()); + + string line; + getline(ifs, line); // Skip header + + vector> data; + std::unordered_map key_count; + while (getline(ifs, line)) { + vector field_list; + Util::SplitStringUsing(line, "\t", &field_list); + CHECK_GE(field_list.size(), 2) << "Format error: " << line; + LOG_IF(WARNING, field_list.size() > 3) << "Ignore extra columns: " << line; + // Normalize full-width spaces (U+3000) so they also separate keys. + string replaced; + Util::StringReplace(field_list[1], "\xE3\x80\x80", " ", true, &replaced); + KeyList key_list; + Util::SplitStringUsing(replaced, " ", &key_list); + // Count keys before key_list is moved into |data| below. + for (const auto &key : key_list) { + ++key_count[key]; + } + data.emplace_back(field_list[0].as_string(), std::move(key_list)); + } + + map input_data; + int16 cost = 10; + for (const auto &kv : data) { + const string &value = kv.first; + const KeyList &key_list = kv.second; + const string &description = GetDescription(key_list, key_count); + for (const string &key : key_list) { + std::unique_ptr token(new CompilerToken()); + token->value = value; + token->description = description; + token->lid = 0; + token->rid = 0; + token->cost = cost; + input_data[key].push_back(std::move(token)); + cost += 10; + } + } + + return input_data; +} + +} // namespace +} // namespace mozc + +int main(int argc, char **argv) { + mozc::InitMozc(argv[0], &argc, &argv, true); + const auto &input_data = 
mozc::ReadEmoticonTsv(FLAGS_input); + mozc::SerializedDictionary::CompileToFiles( + input_data, FLAGS_output_token_array, FLAGS_output_string_array); + return 0; +} diff --git a/src/rewriter/gen_emoticon_rewriter_data.py b/src/rewriter/gen_emoticon_rewriter_data.py deleted file mode 100644 index 2ecc77f..0000000 --- a/src/rewriter/gen_emoticon_rewriter_data.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2010-2016, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""Converter from emoticon data to embedded_dictionary. - -Usage: - python gen_emoticon_rewriter_data.py --input=input.tsv --output=output_header -""" - -__author__ = "hidehiko" - -from collections import defaultdict -import logging -import optparse -import re -import sys -from rewriter import embedded_dictionary_compiler - - -def ParseOptions(): - parser = optparse.OptionParser() - parser.add_option('--input', dest='input', help='emoticon dictionary file') - parser.add_option('--output', dest='output', help='output header file') - return parser.parse_args()[0] - - -def GetDescription(key_list, key_count): - """Generates a description from readings. - - We simply add 1) the most general reading and 2) the most specific reading. - 1) and 2) are simply approximated by checking the frequency of the readings. - - Args: - key_list: a list of key strings. - key_count: a dictionary of key to the number of key's occurence in the data - file. - Returns: - the description string. - """ - if len(key_list) == 1: - return key_list[0] - - sorted_key_list = sorted(key_list, key=lambda key: (key_count[key], key)) - return '%s %s' % (sorted_key_list[-1], sorted_key_list[0]) - - -def ReadEmoticonTsv(stream): - """Read lines from stream to a Token dictionary for a embedded dictionary.""" - # Skip the first line (header). - stream.next() - - data = [] - key_count = defaultdict(int) - for line in stream: - # The file format is: - # value readings(space delimitered) - field_list = line.rstrip('\n').split('\t') - # Check the size of columns. 
- if len(field_list) < 2: - logging.critical('format error: %s', line) - sys.exit(1) - if len(field_list) > 3: - logging.warning('ignore extra columns: %s', line) - - # \xE3\x80\x80 is full width space - key_list = re.split(r'(?: |\xE3\x80\x80)+', field_list[1].strip()) - data.append((field_list[0], key_list)) - for key in key_list: - key_count[key] += 1 - - input_data = defaultdict(list) - cost = 10 - for value, key_list in data: - input_value = value - if input_value == "": - input_value = None - description = GetDescription(key_list, key_count) - if description == "": - description = None - - for key in key_list: - input_data[key].append(embedded_dictionary_compiler.Token( - key, input_value, description, None, 0, 0, cost)) - cost += 10 - - return input_data - - -def main(): - options = ParseOptions() - with open(options.input, 'r') as input_stream: - input_data = ReadEmoticonTsv(input_stream) - - with open(options.output, 'w') as output_stream: - embedded_dictionary_compiler.Compile( - 'EmoticonData', input_data, output_stream) - - -if __name__ == '__main__': - main() diff --git a/src/rewriter/rewriter.cc b/src/rewriter/rewriter.cc index 89a5c90..5e4f26a 100644 --- a/src/rewriter/rewriter.cc +++ b/src/rewriter/rewriter.cc @@ -108,7 +108,7 @@ kEmojiDataList, arraysize(kEmojiDataList), kEmojiTokenList, arraysize(kEmojiTokenList), kEmojiValueList)); - AddRewriter(new EmoticonRewriter); + AddRewriter(EmoticonRewriter::CreateFromDataManager(*data_manager).release()); AddRewriter(new CalculatorRewriter(parent_converter)); AddRewriter(new SymbolRewriter(parent_converter, data_manager)); AddRewriter(new UnicodeRewriter(parent_converter)); diff --git a/src/rewriter/rewriter_base.gyp b/src/rewriter/rewriter_base.gyp index 58bce8f..30d9fdd 100644 --- a/src/rewriter/rewriter_base.gyp +++ b/src/rewriter/rewriter_base.gyp @@ -68,26 +68,6 @@ ], }, { - 'action_name': 'gen_emoticon_rewriter_data', - 'variables': { - 'input_file': '../data/emoticon/emoticon.tsv', - 
'output_file': '<(gen_out_dir)/emoticon_rewriter_data.h', - }, - 'inputs': [ - 'embedded_dictionary_compiler.py', - 'gen_emoticon_rewriter_data.py', - '<(input_file)', - ], - 'outputs': [ - '<(output_file)' - ], - 'action': [ - 'python', 'gen_emoticon_rewriter_data.py', - '--input=<(input_file)', - '--output=<(output_file)', - ], - }, - { 'action_name': 'gen_emoji_rewriter_data', 'variables': { 'input_file': '../data/emoji/emoji_data.tsv', @@ -214,5 +194,17 @@ '../base/base.gyp:serialized_string_array', ], }, + { + 'target_name': 'gen_emoticon_rewriter_data_main', + 'type': 'executable', + 'toolsets': ['host'], + 'sources': [ + 'gen_emoticon_rewriter_data.cc', + ], + 'dependencies': [ + '../base/base.gyp:base', + 'rewriter_serialized_dictionary.gyp:serialized_dictionary', + ], + }, ], } diff --git a/src/rewriter/serialized_dictionary.cc b/src/rewriter/serialized_dictionary.cc index 8beb5b4..689c4cf 100644 --- a/src/rewriter/serialized_dictionary.cc +++ b/src/rewriter/serialized_dictionary.cc @@ -49,16 +49,8 @@ namespace mozc { namespace { -struct CompilerToken { - string value; - string description; - string additional_description; - uint16 lid; - uint16 rid; - int16 cost; -}; - -using TokenList = vector>; +using CompilerToken = SerializedDictionary::CompilerToken; +using TokenList = SerializedDictionary::TokenList; struct CompareByCost { bool operator()(const std::unique_ptr &t1, @@ -113,10 +105,16 @@ std::istream *input, std::unique_ptr *output_token_array_buf, std::unique_ptr *output_string_array_buf) { - CHECK(SystemUtil::IsLittleEndian()); - map dic; LoadTokens(input, &dic); + return Compile(dic, output_token_array_buf, output_string_array_buf); +} + +pair SerializedDictionary::Compile( + const map &dic, + std::unique_ptr *output_token_array_buf, + std::unique_ptr *output_string_array_buf) { + CHECK(SystemUtil::IsLittleEndian()); // Build a mapping from string to its index in a serialized string array. 
// Note that duplicate keys share the same index, so data is slightly @@ -189,9 +187,16 @@ const string &output_string_array) { InputFileStream ifs(input.c_str()); CHECK(ifs.good()); - + map dic; + LoadTokens(&ifs, &dic); + CompileToFiles(dic, output_token_array, output_string_array); +} + +void SerializedDictionary::CompileToFiles(const map &dic, + const string &output_token_array, + const string &output_string_array) { std::unique_ptr buf1, buf2; - const pair data = Compile(&ifs, &buf1, &buf2); + const pair data = Compile(dic, &buf1, &buf2); CHECK(VerifyData(data.first, data.second)); OutputFileStream token_ofs(output_token_array.c_str(), diff --git a/src/rewriter/serialized_dictionary.h b/src/rewriter/serialized_dictionary.h index 50dd05f..d4da60a 100644 --- a/src/rewriter/serialized_dictionary.h +++ b/src/rewriter/serialized_dictionary.h @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -106,6 +107,17 @@ // array by index. class SerializedDictionary { public: + struct CompilerToken { + string value; + string description; + string additional_description; + uint16 lid; + uint16 rid; + int16 cost; + }; + + using TokenList = vector>; + static const size_t kTokenByteLength = 24; class iterator : public std::iterator *output_token_array_buf, std::unique_ptr *output_string_array_buf); + static pair Compile( + const map &dic, + std::unique_ptr *output_token_array_buf, + std::unique_ptr *output_string_array_buf); // Creates serialized data and writes them to files. static void CompileToFiles(const string &input, const string &output_token_array, const string &output_string_array); + static void CompileToFiles(const map &dic, + const string &output_token_array, + const string &output_string_array); // Validates the serialized data. static bool VerifyData(StringPiece token_array_data, @@ -295,6 +314,10 @@ // boundary. 
SerializedDictionary(StringPiece token_array, StringPiece string_array_data); ~SerializedDictionary(); + + std::size_t size() const { + return token_array_.size() / kTokenByteLength; + } iterator begin() { return iterator(token_array_.data(), &string_array_); } const_iterator begin() const {