Codebase list mozc / 8f7ff0e
Move emoticon rewriter data to data set file This CL moves C++-embedded data in EmoticonRewriter to the data set file. To this end, gen_emoticon_rewriter_data.py is rewritten in C++ to reuse SerializedDictionary::CompileToFiles. BUG= TEST= REF_BUG=26841123 REF_CL=117306701,118327510 REF_TIME=2016-03-16T12:42:47+09:00 REF_TIME_RAW=1458099767 +0900 Noriyuki Takahashi 8 years ago
16 changed file(s) with 374 addition(s) and 265 deletion(s). Raw diff Collapse all Expand all
208208 LOG(ERROR) << "Symbol dictionary data is broken";
209209 return false;
210210 }
211 if (!reader.Get("emoticon_token", &emoticon_token_array_data_)) {
212 LOG(ERROR) << "Cannot find an emoticon token array";
213 return false;
214 }
215 if (!reader.Get("emoticon_string", &emoticon_string_array_data_)) {
216 LOG(ERROR) << "Cannot find an emoticon string array or data is broken";
217 return false;
218 }
219 if (!SerializedDictionary::VerifyData(emoticon_token_array_data_,
220 emoticon_string_array_data_)) {
221 LOG(ERROR) << "Emoticon dictionary data is broken";
222 return false;
223 }
211224
212225 if (!reader.Get("usage_item_array", &usage_items_data_)) {
213226 VLOG(2) << "Usage dictionary is not provided";
336349 *string_array_data = symbol_string_array_data_;
337350 }
338351
// Hands out the emoticon dictionary data held by this manager by copying the
// token-array and string-array views into the two output parameters.
// Both output pointers are dereferenced unconditionally, so neither may be
// null.
352 void DataManager::GetEmoticonRewriterData(
353 StringPiece *token_array_data, StringPiece *string_array_data) const {
354 *token_array_data = emoticon_token_array_data_;
355 *string_array_data = emoticon_string_array_data_;
356 }
357
339358 void DataManager::GetCounterSuffixSortedArray(const char **array,
340359 size_t *size) const {
341360 *array = counter_suffix_data_.data();
116116 'gen_separate_suffix_data_for_<(dataset_tag)#host',
117117 'gen_separate_reading_correction_data_for_<(dataset_tag)#host',
118118 'gen_separate_symbol_rewriter_data_for_<(dataset_tag)#host',
119 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host',
119120 ],
120121 'actions': [
121122 {
145146 'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data',
146147 'symbol_token': '<(gen_out_dir)/symbol_token.data',
147148 'symbol_string': '<(gen_out_dir)/symbol_string.data',
149 'emoticon_token': '<(gen_out_dir)/emoticon_token.data',
150 'emoticon_string': '<(gen_out_dir)/emoticon_string.data',
148151 },
149152 'inputs': [
150153 '<(pos_matcher)',
170173 '<(reading_correction_correction)',
171174 '<(symbol_token)',
172175 '<(symbol_string)',
176 '<(emoticon_token)',
177 '<(emoticon_string)',
173178 ],
174179 'outputs': [
175180 '<(gen_out_dir)/<(out_mozc_data)',
201206 'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data',
202207 'symbol_token:32:<(gen_out_dir)/symbol_token.data',
203208 'symbol_string:32:<(gen_out_dir)/symbol_string.data',
209 'emoticon_token:32:<(gen_out_dir)/emoticon_token.data',
210 'emoticon_string:32:<(gen_out_dir)/emoticon_string.data',
204211 ],
205212 'conditions': [
206213 ['target_platform!="Android"', {
697704 ],
698705 },
699706 {
707 'target_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)',
708 'type': 'none',
709 'toolsets': ['host'],
710 'dependencies': [
711 '../../rewriter/rewriter_base.gyp:gen_emoticon_rewriter_data_main',
712 ],
713 'actions': [
714 {
715 'action_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)',
716 'variables': {
717 'generator': '<(PRODUCT_DIR)/gen_emoticon_rewriter_data_main<(EXECUTABLE_SUFFIX)',
718 'input_files': [
719 '<(mozc_dir)/data/emoticon/emoticon.tsv',
720 ],
721 },
722 'inputs': [
723 '<(generator)',
724 '<@(input_files)',
725 ],
726 'outputs': [
727 '<(gen_out_dir)/emoticon_token.data',
728 '<(gen_out_dir)/emoticon_string.data',
729 ],
730 'action': [
731 '<(generator)',
732 '--input=<(mozc_dir)/data/emoticon/emoticon.tsv',
733 '--output_token_array=<(gen_out_dir)/emoticon_token.data',
734 '--output_string_array=<(gen_out_dir)/emoticon_string.data',
735 ],
736 'message': '[<(dataset_tag)] Generating emoticon data',
737 },
738 ],
739 },
740 {
700741 'target_name': 'gen_separate_counter_suffix_data_for_<(dataset_tag)',
701742 'type': 'none',
702743 'toolsets': ['host'],
8383 StringPiece *correction_array_data) const override;
8484 void GetSymbolRewriterData(StringPiece *token_array_data,
8585 StringPiece *string_array_data) const override;
86 void GetEmoticonRewriterData(StringPiece *token_array_data,
87 StringPiece *string_array_data) const override;
8688
8789 #ifndef NO_USAGE_REWRITER
8890 void GetUsageRewriterData(
119121 StringPiece reading_correction_correction_array_data_;
120122 StringPiece symbol_token_array_data_;
121123 StringPiece symbol_string_array_data_;
124 StringPiece emoticon_token_array_data_;
125 StringPiece emoticon_string_array_data_;
122126 StringPiece usage_base_conjugation_suffix_data_;
123127 StringPiece usage_conjugation_suffix_data_;
124128 StringPiece usage_conjugation_index_data_;
8989 virtual void GetSymbolRewriterData(StringPiece *token_array_data,
9090 StringPiece *string_array_data) const = 0;
9191
92 // Gets the token array and string array data used by the emoticon rewriter.
93 virtual void GetEmoticonRewriterData(
94 StringPiece *token_array_data, StringPiece *string_array_data) const = 0;
95
9296 #ifndef NO_USAGE_REWRITER
9397 // Gets the usage rewriter data.
9498 virtual void GetUsageRewriterData(
8888 void GetSuggestionFilterData(const char **data, size_t *size) const;
8989 void GetSymbolRewriterData(StringPiece *token_array_data,
9090 StringPiece *string_array_data) const;
91 void GetEmoticonRewriterData(StringPiece *token_array_data,
92 StringPiece *string_array_data) const;
9193 #ifndef NO_USAGE_REWRITER
9294 void GetUsageRewriterData(StringPiece *base_conjugation_suffix_data,
9395 StringPiece *conjugation_suffix_data,
236238 void PackedDataManager::Impl::GetSymbolRewriterData(
237239 StringPiece *token_array_data, StringPiece *string_array_data) const {
238240 manager_.GetSymbolRewriterData(token_array_data, string_array_data);
241 }
242
243 void PackedDataManager::Impl::GetEmoticonRewriterData(
244 StringPiece *token_array_data, StringPiece *string_array_data) const {
245 manager_.GetEmoticonRewriterData(token_array_data, string_array_data);
239246 }
240247
241248 #ifndef NO_USAGE_REWRITER
391398 manager_impl_->GetSymbolRewriterData(token_array_data, string_array_data);
392399 }
393400
401 void PackedDataManager::GetEmoticonRewriterData(
402 StringPiece *token_array_data, StringPiece *string_array_data) const {
403 manager_impl_->GetEmoticonRewriterData(token_array_data, string_array_data);
404 }
405
394406 #ifndef NO_USAGE_REWRITER
395407 void PackedDataManager::GetUsageRewriterData(
396408 StringPiece *base_conjugation_suffix_data,
7272 void GetSuggestionFilterData(const char **data, size_t *size) const override;
7373 void GetSymbolRewriterData(StringPiece *token_array_data,
7474 StringPiece *string_array_data) const override;
75 void GetEmoticonRewriterData(StringPiece *token_array_data,
76 StringPiece *string_array_data) const override;
7577 #ifndef NO_USAGE_REWRITER
7678 void GetUsageRewriterData(
7779 StringPiece *base_conjugation_suffix_data,
00 MAJOR=2
11 MINOR=17
2 BUILD=2532
2 BUILD=2533
33 REVISION=102
44 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
55 # downloaded by NaCl Mozc.
4242 #include "protocol/commands.pb.h"
4343 #include "protocol/config.pb.h"
4444 #include "request/conversion_request.h"
45 #include "rewriter/embedded_dictionary.h"
4645 #include "rewriter/rewriter_interface.h"
46 #include "rewriter/serialized_dictionary.h"
4747
4848 namespace mozc {
4949 namespace {
5050
51 #include "rewriter/emoticon_rewriter_data.h"
52
53 class EmoticonDictionary {
54 public:
55 EmoticonDictionary()
56 : dic_(new EmbeddedDictionary(kEmoticonData_token_data,
57 kEmoticonData_token_size)) {}
58
59 ~EmoticonDictionary() {}
60
61 EmbeddedDictionary *GetDictionary() const {
62 return dic_.get();
63 }
64
65 private:
66 std::unique_ptr<EmbeddedDictionary> dic_;
67 };
68
6951 class ValueCostCompare {
7052 public:
71 bool operator() (const EmbeddedDictionary::Value *a,
72 const EmbeddedDictionary::Value *b) const {
73 return a->cost < b->cost;
53 bool operator() (SerializedDictionary::const_iterator a,
54 SerializedDictionary::const_iterator b) const {
55 return a.cost() < b.cost();
7456 }
7557 };
7658
7759 class IsEqualValue {
7860 public:
79 bool operator() (const EmbeddedDictionary::Value *a,
80 const EmbeddedDictionary::Value *b) const {
81 return strcmp(a->value, b->value) == 0;
61 bool operator() (const SerializedDictionary::const_iterator a,
62 const SerializedDictionary::const_iterator b) const {
63 return a.value() == b.value();
8264 }
8365 };
8466
8567 // Insert Emoticon into the |segment|
8668 // Top |initial_insert_size| candidates are inserted from |initial_insert_pos|.
8769 // Remaining candidates are added to the bottom.
88 void InsertCandidates(const EmbeddedDictionary::Value *value,
89 size_t value_size,
70 void InsertCandidates(SerializedDictionary::const_iterator begin,
71 SerializedDictionary::const_iterator end,
9072 size_t initial_insert_pos,
9173 size_t initial_insert_size,
9274 bool is_no_learning,
10082 size_t offset = min(initial_insert_pos, segment->candidates_size());
10183
10284 // Sort values by cost just in case
103 vector<const EmbeddedDictionary::Value *> sorted_value;
104 for (size_t i = 0; i < value_size; ++i) {
105 sorted_value.push_back(&value[i]);
85 vector<SerializedDictionary::const_iterator> sorted_value;
86 for (auto iter = begin; iter != end; ++iter) {
87 sorted_value.push_back(iter);
10688 }
10789
10890 std::sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare());
11597 sorted_value.end());
11698
11799 for (size_t i = 0; i < sorted_value.size(); ++i) {
118 Segment::Candidate *c = NULL;
100 Segment::Candidate *c = nullptr;
119101
120102 if (i < initial_insert_size) {
121103 c = segment->insert_candidate(offset);
124106 c = segment->push_back_candidate();
125107 }
126108
127 if (c == NULL) {
109 if (c == nullptr) {
128110 LOG(ERROR) << "cannot insert candidate at " << offset;
129111 continue;
130112 }
131113
132114 c->Init();
133115 // TODO(taku): set an appropriate POS here.
134 c->lid = sorted_value[i]->lid;
135 c->rid = sorted_value[i]->rid;
116 c->lid = sorted_value[i].lid();
117 c->rid = sorted_value[i].rid();
136118 c->cost = base_candidate.cost;
137 c->value = sorted_value[i]->value;
138 c->content_value = sorted_value[i]->value;
119 sorted_value[i].value().CopyToString(&c->value);
120 c->content_value = c->value;
139121 c->key = base_candidate.key;
140122 c->content_key = base_candidate.content_key;
141123 // no full/half width normalizations
150132 const char kBaseEmoticonDescription[]
151133 = "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97";
152134
153 if (sorted_value[i]->description == NULL) {
135 if (sorted_value[i].description().empty()) {
154136 c->description = kBaseEmoticonDescription;
155137 } else {
156138 string description = kBaseEmoticonDescription;
157139 description.append(" ");
158 description.append(sorted_value[i]->description);
140 sorted_value[i].description().AppendToString(&description);
159141 c->description = description;
160142 }
161143 }
162144 }
163145
164 bool RewriteCandidate(Segments *segments) {
146 } // namespace
147
148 bool EmoticonRewriter::RewriteCandidate(Segments *segments) const {
165149 bool modified = false;
166150 for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
167151 const string &key = segments->conversion_segment(i).key();
170154 continue;
171155 }
172156 bool is_no_learning = false;
173 const EmbeddedDictionary::Value *value = NULL;
174 size_t value_size = 0;
157 SerializedDictionary::const_iterator begin;
158 SerializedDictionary::const_iterator end = dic_.end();
175159 size_t initial_insert_size = 0;
176160 size_t initial_insert_pos = 0;
177161
183167 if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") {
184168 // When key is "かおもじ", default candidate size should be small enough.
185169 // It is safe to expand all candidates at this time.
186 const EmbeddedDictionary::Token *token
187 = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
188 CHECK(token);
170 begin = dic_.begin();
171 CHECK(begin != dic_.end());
172 end = dic_.end();
189173 // set large value(100) so that all candidates are pushed to the bottom
190 value = token->value;
191 value_size = token->value_size;
192174 initial_insert_pos = 100;
193 initial_insert_size = token->value_size;
175 initial_insert_size = dic_.size();
194176 // "かお"
195177 } else if (key == "\xE3\x81\x8B\xE3\x81\x8A") {
196178 // When key is "かお", expand all candidates in conservative way.
197 const EmbeddedDictionary::Token *token
198 = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
199 CHECK(token);
179 begin = dic_.begin();
180 CHECK(begin != dic_.end());
200181 // first 6 candidates are inserted at 4 th position.
201182 // Other candidates are pushed to the bottom.
202 value = token->value;
203 value_size = token->value_size;
204183 initial_insert_pos = 4;
205184 initial_insert_size = 6;
206185 } else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F"
207186 "\xE3\x82\x89\xE3\x81\x84") { // "ふくわらい"
208187 // Choose one emoticon randomly from the dictionary.
209188 // TODO(taku): want to make it "generate" more funny emoticon.
210 const EmbeddedDictionary::Token *token
211 = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
212 CHECK(token);
189 begin = dic_.begin();
190 CHECK(begin != dic_.end());
213191 uint32 n = 0;
214192 // use secure random not to predict the next emoticon.
215193 Util::GetRandomSequence(reinterpret_cast<char *>(&n), sizeof(n));
216 value = token->value + n % token->value_size;
217 value_size = 1;
194 begin += n % dic_.size();
195 end = begin + 1;
218196 initial_insert_pos = 4;
219197 initial_insert_size = 1;
220198 is_no_learning = true; // do not learn this candidate.
221199 } else {
222 const EmbeddedDictionary::Token *token
223 = Singleton<EmoticonDictionary>::get()->GetDictionary()->Lookup(key);
224 // by default, insert canidate at 7 th position.
225 if (token != NULL) {
226 value = token->value;
227 value_size = token->value_size;
200 const auto range = dic_.equal_range(key);
201 begin = range.first;
202 end = range.second;
203 if (begin != end) {
228204 initial_insert_pos = 6;
229 initial_insert_size = token == NULL ? 0 : token->value_size;
205 initial_insert_size = std::distance(begin, end);
230206 }
231207 }
232208
233 if (value == NULL || value_size == 0) {
209 if (begin == end) {
234210 continue;
235211 }
236212
237 InsertCandidates(value, value_size,
213 InsertCandidates(begin, end,
238214 initial_insert_pos,
239215 initial_insert_size,
240216 is_no_learning,
244220
245221 return modified;
246222 }
247 } // namespace
248
249 EmoticonRewriter::EmoticonRewriter() {}
250
251 EmoticonRewriter::~EmoticonRewriter() {}
223
// Factory: asks |data_manager| for the emoticon token-array and string-array
// data and returns a newly allocated EmoticonRewriter constructed from them.
// NOTE(review): the rewriter is built from StringPiece views, so presumably
// the data owned by |data_manager| must outlive the returned object - confirm.
224 std::unique_ptr<EmoticonRewriter> EmoticonRewriter::CreateFromDataManager(
225 const DataManagerInterface &data_manager) {
226 StringPiece token_array_data, string_array_data;
227 data_manager.GetEmoticonRewriterData(&token_array_data, &string_array_data);
228 return std::unique_ptr<EmoticonRewriter>(
229 new EmoticonRewriter(token_array_data, string_array_data));
230 }
231
232 EmoticonRewriter::EmoticonRewriter(StringPiece token_array_data,
233 StringPiece string_array_data)
234 : dic_(token_array_data, string_array_data) {}
235
236 EmoticonRewriter::~EmoticonRewriter() = default;
252237
253238 int EmoticonRewriter::capability(const ConversionRequest &request) const {
254239 if (request.request().mixed_conversion()) {
2929 #ifndef MOZC_REWRITER_EMOTICON_REWRITER_H_
3030 #define MOZC_REWRITER_EMOTICON_REWRITER_H_
3131
32 #include <memory>
33
34 #include "data_manager/data_manager_interface.h"
3235 #include "rewriter/rewriter_interface.h"
36 #include "rewriter/serialized_dictionary.h"
3337
3438 namespace mozc {
3539
3842
3943 class EmoticonRewriter : public RewriterInterface {
4044 public:
41 EmoticonRewriter();
42 virtual ~EmoticonRewriter();
45 static std::unique_ptr<EmoticonRewriter> CreateFromDataManager(
46 const DataManagerInterface &data_manager);
4347
44 virtual int capability(const ConversionRequest &request) const;
48 EmoticonRewriter(StringPiece token_array_data, StringPiece string_array_data);
49 ~EmoticonRewriter() override;
4550
46 virtual bool Rewrite(const ConversionRequest &request,
47 Segments *segments) const;
51 int capability(const ConversionRequest &request) const override;
52
53 bool Rewrite(const ConversionRequest &request,
54 Segments *segments) const override;
55
56 private:
57 bool RewriteCandidate(Segments *segments) const;
58
59 SerializedDictionary dic_;
4860 };
4961
5062 } // namespace mozc
2929 #include "rewriter/emoticon_rewriter.h"
3030
3131 #include <cstddef>
32 #include <memory>
3233 #include <string>
3334
3435 #include "base/logging.h"
3637 #include "base/util.h"
3738 #include "config/config_handler.h"
3839 #include "converter/segments.h"
40 #include "data_manager/testing/mock_data_manager.h"
3941 #include "protocol/commands.pb.h"
4042 #include "protocol/config.pb.h"
4143 #include "request/conversion_request.h"
44 #include "testing/base/public/googletest.h"
4245 #include "testing/base/public/gunit.h"
43
44 DECLARE_string(test_tmpdir);
46 #include "testing/base/public/mozctest.h"
4547
4648 namespace mozc {
49 namespace {
4750
48 namespace {
4951 void AddSegment(const string &key, const string &value,
5052 Segments *segments) {
5153 segments->Clear();
7072 }
7173 return false;
7274 }
73 } // namespace
7475
75 class EmoticonRewriterTest : public testing::Test {
76 class EmoticonRewriterTest : public ::testing::Test {
7677 protected:
77 EmoticonRewriterTest() {}
78 ~EmoticonRewriterTest() {}
78 testing::MockDataManager mock_data_manager_;
7979
80 virtual void SetUp() {
81 SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir);
82 }
83
84 virtual void TearDown() {}
80 private:
81 testing::ScopedTmpUserProfileDirectory scoped_profile_dir_;
8582 };
8683
8784 TEST_F(EmoticonRewriterTest, BasicTest) {
88 EmoticonRewriter emoticon_rewriter;
85 std::unique_ptr<EmoticonRewriter> emoticon_rewriter =
86 EmoticonRewriter::CreateFromDataManager(mock_data_manager_);
87
8988 config::Config config;
9089 config::ConfigHandler::GetDefaultConfig(&config);
9190 ConversionRequest request;
9594
9695 Segments segments;
9796 AddSegment("test", "test", &segments);
98 emoticon_rewriter.Rewrite(request, &segments);
97 emoticon_rewriter->Rewrite(request, &segments);
9998 EXPECT_FALSE(HasEmoticon(segments));
10099
101100 // "かお"
102101 AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments);
103 emoticon_rewriter.Rewrite(request, &segments);
102 emoticon_rewriter->Rewrite(request, &segments);
104103 EXPECT_TRUE(HasEmoticon(segments));
105104
106105 // "かおもじ"
107106 AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98",
108107 "test", &segments);
109 emoticon_rewriter.Rewrite(request, &segments);
108 emoticon_rewriter->Rewrite(request, &segments);
110109 EXPECT_TRUE(HasEmoticon(segments));
111110
112111 // "にこにこ"
113112 AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93",
114113 "test", &segments);
115 emoticon_rewriter.Rewrite(request, &segments);
114 emoticon_rewriter->Rewrite(request, &segments);
116115 EXPECT_TRUE(HasEmoticon(segments));
117116
118117 // "ふくわらい"
119118 AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84",
120119 "test", &segments);
121 emoticon_rewriter.Rewrite(request, &segments);
120 emoticon_rewriter->Rewrite(request, &segments);
122121 EXPECT_TRUE(HasEmoticon(segments));
123122 }
124123
127126
128127 Segments segments;
129128 AddSegment("test", "test", &segments);
130 emoticon_rewriter.Rewrite(request, &segments);
129 emoticon_rewriter->Rewrite(request, &segments);
131130 EXPECT_FALSE(HasEmoticon(segments));
132131
133132 // "かお"
134133 AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments);
135 emoticon_rewriter.Rewrite(request, &segments);
134 emoticon_rewriter->Rewrite(request, &segments);
136135 EXPECT_FALSE(HasEmoticon(segments));
137136
138137 // "かおもじ"
139138 AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98",
140139 "test", &segments);
141 emoticon_rewriter.Rewrite(request, &segments);
140 emoticon_rewriter->Rewrite(request, &segments);
142141 EXPECT_FALSE(HasEmoticon(segments));
143142
144143 // "にこにこ"
145144 AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93",
146145 "test", &segments);
147 emoticon_rewriter.Rewrite(request, &segments);
146 emoticon_rewriter->Rewrite(request, &segments);
148147 EXPECT_FALSE(HasEmoticon(segments));
149148
150149 // "ふくわらい"
151150 AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84",
152151 "test", &segments);
153 emoticon_rewriter.Rewrite(request, &segments);
152 emoticon_rewriter->Rewrite(request, &segments);
154153 EXPECT_FALSE(HasEmoticon(segments));
155154 }
156155 }
157156
158157 TEST_F(EmoticonRewriterTest, MobileEnvironmentTest) {
159 EmoticonRewriter rewriter;
158 std::unique_ptr<EmoticonRewriter> rewriter =
159 EmoticonRewriter::CreateFromDataManager(mock_data_manager_);
160
160161 commands::Request request;
161162 ConversionRequest convreq;
162163 convreq.set_request(&request);
163164
164165 {
165166 request.set_mixed_conversion(true);
166 EXPECT_EQ(RewriterInterface::ALL, rewriter.capability(convreq));
167 EXPECT_EQ(RewriterInterface::ALL, rewriter->capability(convreq));
167168 }
168169
169170 {
170171 request.set_mixed_conversion(false);
171 EXPECT_EQ(RewriterInterface::CONVERSION, rewriter.capability(convreq));
172 EXPECT_EQ(RewriterInterface::CONVERSION, rewriter->capability(convreq));
172173 }
173174 }
174175
176 } // namespace
175177 } // namespace mozc
0 // Copyright 2010-2016, Google Inc.
1 // All rights reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 #include <algorithm>
30 #include <memory>
31 #include <string>
32 #include <unordered_map>
33 #include <vector>
34
35 #include "base/file_stream.h"
36 #include "base/flags.h"
37 #include "base/init_mozc.h"
38 #include "base/logging.h"
39 #include "base/string_piece.h"
40 #include "base/util.h"
41 #include "rewriter/serialized_dictionary.h"
42
43 DEFINE_string(input, "", "Emoticon dictionary file");
44 DEFINE_string(output_token_array, "", "Output token array");
45 DEFINE_string(output_string_array, "", "Output string array");
46
47 namespace mozc {
48 namespace {
49
50 using KeyList = vector<string>;
51 using CompilerToken = SerializedDictionary::CompilerToken;
52 using TokenList = SerializedDictionary::TokenList;
53
// Returns the occurrence count recorded for |key| in |key_count|, or 0 when
// |key| has no entry.  Unlike operator[], this never inserts into the map.
int LookupCount(const std::unordered_map<string, int> &key_count,
                const string &key) {
  const auto iter = key_count.find(key);
  return (iter == key_count.end()) ? 0 : iter->second;
}

// Generates a description from the readings in |key_list|.
//
// The description is "<most frequent reading> <least frequent reading>",
// where frequency is looked up in |key_count| and ties are broken by
// comparing the readings themselves.  A single reading is returned as is,
// and an empty |key_list| yields an empty description (previously this case
// accessed the ends of an empty vector).
string GetDescription(const KeyList &key_list,
                      const std::unordered_map<string, int> &key_count) {
  if (key_list.empty()) {
    return string();
  }
  if (key_list.size() == 1) {
    return key_list[0];
  }

  // Order readings by (count, reading).  Only the two extremes are needed,
  // so a single min/max scan replaces copying and sorting the whole list.
  const auto by_count_then_key = [&key_count](const string &x,
                                              const string &y) {
    const int x_count = LookupCount(key_count, x);
    const int y_count = LookupCount(key_count, y);
    if (x_count != y_count) {
      return x_count < y_count;
    }
    return x < y;
  };
  const auto extremes = std::minmax_element(key_list.begin(), key_list.end(),
                                            by_count_then_key);

  string description = *extremes.second;  // Most frequent reading.
  description += ' ';
  description += *extremes.first;  // Least frequent reading.
  return description;
}
78
79 map<string, TokenList> ReadEmoticonTsv(const string &path) {
80 InputFileStream ifs(path.c_str());
81
82 string line;
83 getline(ifs, line); // Skip header
84
85 vector<pair<string, KeyList>> data;
86 std::unordered_map<string, int> key_count;
87 while (getline(ifs, line)) {
88 vector<StringPiece> field_list;
89 Util::SplitStringUsing(line, "\t", &field_list);
90 CHECK_GE(field_list.size(), 2) << "Format error: " << line;
91 LOG_IF(WARNING, field_list.size() > 3) << "Ignore extra columns: " << line;
92
93 string replaced;
94 Util::StringReplace(field_list[1], "\xE3\x80\x80", " ", true, &replaced);
95 KeyList key_list;
96 Util::SplitStringUsing(field_list[1], " ", &key_list);
97
98 data.emplace_back(field_list[0].as_string(), std::move(key_list));
99 for (const auto &key : key_list) {
100 ++key_count[key];
101 }
102 }
103
104 map<string, TokenList> input_data;
105 int16 cost = 10;
106 for (const auto &kv : data) {
107 const string &value = kv.first;
108 const KeyList &key_list = kv.second;
109 const string &description = GetDescription(key_list, key_count);
110 for (const string &key : key_list) {
111 std::unique_ptr<CompilerToken> token(new CompilerToken());
112 token->value = value;
113 token->description = description;
114 token->lid = 0;
115 token->rid = 0;
116 token->cost = cost;
117 input_data[key].push_back(std::move(token));
118 cost += 10;
119 }
120 }
121
122 return input_data;
123 }
124
125 } // namespace
126 } // namespace mozc
127
// Entry point of the emoticon data generator: parses the command line flags,
// loads the TSV named by --input, and writes the compiled dictionary to the
// files named by --output_token_array and --output_string_array.
int main(int argc, char **argv) {
  mozc::InitMozc(argv[0], &argc, &argv, true);

  const auto &tokens_by_key = mozc::ReadEmoticonTsv(FLAGS_input);
  mozc::SerializedDictionary::CompileToFiles(tokens_by_key,
                                             FLAGS_output_token_array,
                                             FLAGS_output_string_array);
  return 0;
}
+0
-127
src/rewriter/gen_emoticon_rewriter_data.py less more
0 # -*- coding: utf-8 -*-
1 # Copyright 2010-2016, Google Inc.
2 # All rights reserved.
3 #
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
6 # met:
7 #
8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the
13 # distribution.
14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Converter from emoticon data to embedded_dictionary.
31
32 Usage:
33 python gen_emoticon_rewriter_data.py --input=input.tsv --output=output_header
34 """
35
36 __author__ = "hidehiko"
37
38 from collections import defaultdict
39 import logging
40 import optparse
41 import re
42 import sys
43 from rewriter import embedded_dictionary_compiler
44
45
46 def ParseOptions():
47 parser = optparse.OptionParser()
48 parser.add_option('--input', dest='input', help='emoticon dictionary file')
49 parser.add_option('--output', dest='output', help='output header file')
50 return parser.parse_args()[0]
51
52
53 def GetDescription(key_list, key_count):
54 """Generates a description from readings.
55
56 We simply add 1) the most general reading and 2) the most specific reading.
57 1) and 2) are simply approximated by checking the frequency of the readings.
58
59 Args:
60 key_list: a list of key strings.
61 key_count: a dictionary of key to the number of key's occurence in the data
62 file.
63 Returns:
64 the description string.
65 """
66 if len(key_list) == 1:
67 return key_list[0]
68
69 sorted_key_list = sorted(key_list, key=lambda key: (key_count[key], key))
70 return '%s %s' % (sorted_key_list[-1], sorted_key_list[0])
71
72
73 def ReadEmoticonTsv(stream):
74 """Read lines from stream to a Token dictionary for a embedded dictionary."""
75 # Skip the first line (header).
76 stream.next()
77
78 data = []
79 key_count = defaultdict(int)
80 for line in stream:
81 # The file format is:
82 # value <tab> readings(space delimitered)
83 field_list = line.rstrip('\n').split('\t')
84 # Check the size of columns.
85 if len(field_list) < 2:
86 logging.critical('format error: %s', line)
87 sys.exit(1)
88 if len(field_list) > 3:
89 logging.warning('ignore extra columns: %s', line)
90
91 # \xE3\x80\x80 is full width space
92 key_list = re.split(r'(?: |\xE3\x80\x80)+', field_list[1].strip())
93 data.append((field_list[0], key_list))
94 for key in key_list:
95 key_count[key] += 1
96
97 input_data = defaultdict(list)
98 cost = 10
99 for value, key_list in data:
100 input_value = value
101 if input_value == "":
102 input_value = None
103 description = GetDescription(key_list, key_count)
104 if description == "":
105 description = None
106
107 for key in key_list:
108 input_data[key].append(embedded_dictionary_compiler.Token(
109 key, input_value, description, None, 0, 0, cost))
110 cost += 10
111
112 return input_data
113
114
115 def main():
116 options = ParseOptions()
117 with open(options.input, 'r') as input_stream:
118 input_data = ReadEmoticonTsv(input_stream)
119
120 with open(options.output, 'w') as output_stream:
121 embedded_dictionary_compiler.Compile(
122 'EmoticonData', input_data, output_stream)
123
124
125 if __name__ == '__main__':
126 main()
107107 kEmojiDataList, arraysize(kEmojiDataList),
108108 kEmojiTokenList, arraysize(kEmojiTokenList),
109109 kEmojiValueList));
110 AddRewriter(new EmoticonRewriter);
110 AddRewriter(EmoticonRewriter::CreateFromDataManager(*data_manager).release());
111111 AddRewriter(new CalculatorRewriter(parent_converter));
112112 AddRewriter(new SymbolRewriter(parent_converter, data_manager));
113113 AddRewriter(new UnicodeRewriter(parent_converter));
6767 ],
6868 },
6969 {
70 'action_name': 'gen_emoticon_rewriter_data',
71 'variables': {
72 'input_file': '../data/emoticon/emoticon.tsv',
73 'output_file': '<(gen_out_dir)/emoticon_rewriter_data.h',
74 },
75 'inputs': [
76 'embedded_dictionary_compiler.py',
77 'gen_emoticon_rewriter_data.py',
78 '<(input_file)',
79 ],
80 'outputs': [
81 '<(output_file)'
82 ],
83 'action': [
84 'python', 'gen_emoticon_rewriter_data.py',
85 '--input=<(input_file)',
86 '--output=<(output_file)',
87 ],
88 },
89 {
9070 'action_name': 'gen_emoji_rewriter_data',
9171 'variables': {
9272 'input_file': '../data/emoji/emoji_data.tsv',
213193 '../base/base.gyp:serialized_string_array',
214194 ],
215195 },
196 {
197 'target_name': 'gen_emoticon_rewriter_data_main',
198 'type': 'executable',
199 'toolsets': ['host'],
200 'sources': [
201 'gen_emoticon_rewriter_data.cc',
202 ],
203 'dependencies': [
204 '../base/base.gyp:base',
205 'rewriter_serialized_dictionary.gyp:serialized_dictionary',
206 ],
207 },
216208 ],
217209 }
4848 namespace mozc {
4949 namespace {
5050
51 struct CompilerToken {
52 string value;
53 string description;
54 string additional_description;
55 uint16 lid;
56 uint16 rid;
57 int16 cost;
58 };
59
60 using TokenList = vector<std::unique_ptr<CompilerToken>>;
51 using CompilerToken = SerializedDictionary::CompilerToken;
52 using TokenList = SerializedDictionary::TokenList;
6153
6254 struct CompareByCost {
6355 bool operator()(const std::unique_ptr<CompilerToken> &t1,
112104 std::istream *input,
113105 std::unique_ptr<uint32[]> *output_token_array_buf,
114106 std::unique_ptr<uint32[]> *output_string_array_buf) {
115 CHECK(SystemUtil::IsLittleEndian());
116
117107 map<string, TokenList> dic;
118108 LoadTokens(input, &dic);
109 return Compile(dic, output_token_array_buf, output_string_array_buf);
110 }
111
112 pair<StringPiece, StringPiece> SerializedDictionary::Compile(
113 const map<string, TokenList> &dic,
114 std::unique_ptr<uint32[]> *output_token_array_buf,
115 std::unique_ptr<uint32[]> *output_string_array_buf) {
116 CHECK(SystemUtil::IsLittleEndian());
119117
120118 // Build a mapping from string to its index in a serialized string array.
121119 // Note that duplicate keys share the same index, so data is slightly
188186 const string &output_string_array) {
189187 InputFileStream ifs(input.c_str());
190188 CHECK(ifs.good());
191
189 map<string, TokenList> dic;
190 LoadTokens(&ifs, &dic);
191 CompileToFiles(dic, output_token_array, output_string_array);
192 }
193
194 void SerializedDictionary::CompileToFiles(const map<string, TokenList> &dic,
195 const string &output_token_array,
196 const string &output_string_array) {
192197 std::unique_ptr<uint32[]> buf1, buf2;
193 const pair<StringPiece, StringPiece> data = Compile(&ifs, &buf1, &buf2);
198 const pair<StringPiece, StringPiece> data = Compile(dic, &buf1, &buf2);
194199 CHECK(VerifyData(data.first, data.second));
195200
196201 OutputFileStream token_ofs(output_token_array.c_str(),
3131
3232 #include <istream>
3333 #include <iterator>
34 #include <map>
3435 #include <string>
3536 #include <utility>
3637
105106 // array by index.
106107 class SerializedDictionary {
107108 public:
109 struct CompilerToken {
110 string value;
111 string description;
112 string additional_description;
113 uint16 lid;
114 uint16 rid;
115 int16 cost;
116 };
117
118 using TokenList = vector<std::unique_ptr<CompilerToken>>;
119
108120 static const size_t kTokenByteLength = 24;
109121
110122 class iterator : public std::iterator<std::random_access_iterator_tag,
280292 std::istream *input,
281293 std::unique_ptr<uint32[]> *output_token_array_buf,
282294 std::unique_ptr<uint32[]> *output_string_array_buf);
295 static pair<StringPiece, StringPiece> Compile(
296 const map<string, TokenList> &dic,
297 std::unique_ptr<uint32[]> *output_token_array_buf,
298 std::unique_ptr<uint32[]> *output_string_array_buf);
283299
284300 // Creates serialized data and writes them to files.
285301 static void CompileToFiles(const string &input,
286302 const string &output_token_array,
287303 const string &output_string_array);
304 static void CompileToFiles(const map<string, TokenList> &dic,
305 const string &output_token_array,
306 const string &output_string_array);
288307
289308 // Validates the serialized data.
290309 static bool VerifyData(StringPiece token_array_data,
294313 // boundary.
295314 SerializedDictionary(StringPiece token_array, StringPiece string_array_data);
296315 ~SerializedDictionary();
316
317 std::size_t size() const {
318 return token_array_.size() / kTokenByteLength;
319 }
297320
298321 iterator begin() { return iterator(token_array_.data(), &string_array_); }
299322 const_iterator begin() const {