Move emoticon rewriter data to data set file
This CL moves C++-embedded data in EmoticonRewriter to the data set
file. To this end, gen_emoticon_rewriter_data.py is rewritten in C++ to
reuse SerializedDictionary::CompileToFiles.
BUG=
TEST=
REF_BUG=26841123
REF_CL=117306701,118327510
REF_TIME=2016-03-16T12:42:47+09:00
REF_TIME_RAW=1458099767 +0900
Noriyuki Takahashi
8 years ago
208 | 208 | LOG(ERROR) << "Symbol dictionary data is broken"; |
209 | 209 | return false; |
210 | 210 | } |
211 | if (!reader.Get("emoticon_token", &emoticon_token_array_data_)) { | |
212 | LOG(ERROR) << "Cannot find an emoticon token array"; | |
213 | return false; | |
214 | } | |
215 | if (!reader.Get("emoticon_string", &emoticon_string_array_data_)) { | |
216 | LOG(ERROR) << "Cannot find an emoticon string array or data is broken"; | |
217 | return false; | |
218 | } | |
219 | if (!SerializedDictionary::VerifyData(emoticon_token_array_data_, | |
220 | emoticon_string_array_data_)) { | |
221 | LOG(ERROR) << "Emoticon dictionary data is broken"; | |
222 | return false; | |
223 | } | |
211 | 224 | |
212 | 225 | if (!reader.Get("usage_item_array", &usage_items_data_)) { |
213 | 226 | VLOG(2) << "Usage dictionary is not provided"; |
336 | 349 | *string_array_data = symbol_string_array_data_; |
337 | 350 | } |
338 | 351 | |
352 | void DataManager::GetEmoticonRewriterData( | |
353 | StringPiece *token_array_data, StringPiece *string_array_data) const { | |
354 | *token_array_data = emoticon_token_array_data_; | |
355 | *string_array_data = emoticon_string_array_data_; | |
356 | } | |
357 | ||
339 | 358 | void DataManager::GetCounterSuffixSortedArray(const char **array, |
340 | 359 | size_t *size) const { |
341 | 360 | *array = counter_suffix_data_.data(); |
116 | 116 | 'gen_separate_suffix_data_for_<(dataset_tag)#host', |
117 | 117 | 'gen_separate_reading_correction_data_for_<(dataset_tag)#host', |
118 | 118 | 'gen_separate_symbol_rewriter_data_for_<(dataset_tag)#host', |
119 | 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host', | |
119 | 120 | ], |
120 | 121 | 'actions': [ |
121 | 122 | { |
145 | 146 | 'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data', |
146 | 147 | 'symbol_token': '<(gen_out_dir)/symbol_token.data', |
147 | 148 | 'symbol_string': '<(gen_out_dir)/symbol_string.data', |
149 | 'emoticon_token': '<(gen_out_dir)/emoticon_token.data', | |
150 | 'emoticon_string': '<(gen_out_dir)/emoticon_string.data', | |
148 | 151 | }, |
149 | 152 | 'inputs': [ |
150 | 153 | '<(pos_matcher)', |
170 | 173 | '<(reading_correction_correction)', |
171 | 174 | '<(symbol_token)', |
172 | 175 | '<(symbol_string)', |
176 | '<(emoticon_token)', | |
177 | '<(emoticon_string)', | |
173 | 178 | ], |
174 | 179 | 'outputs': [ |
175 | 180 | '<(gen_out_dir)/<(out_mozc_data)', |
201 | 206 | 'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data', |
202 | 207 | 'symbol_token:32:<(gen_out_dir)/symbol_token.data', |
203 | 208 | 'symbol_string:32:<(gen_out_dir)/symbol_string.data', |
209 | 'emoticon_token:32:<(gen_out_dir)/emoticon_token.data', | |
210 | 'emoticon_string:32:<(gen_out_dir)/emoticon_string.data', | |
204 | 211 | ], |
205 | 212 | 'conditions': [ |
206 | 213 | ['target_platform!="Android"', { |
697 | 704 | ], |
698 | 705 | }, |
699 | 706 | { |
707 | 'target_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)', | |
708 | 'type': 'none', | |
709 | 'toolsets': ['host'], | |
710 | 'dependencies': [ | |
711 | '../../rewriter/rewriter_base.gyp:gen_emoticon_rewriter_data_main', | |
712 | ], | |
713 | 'actions': [ | |
714 | { | |
715 | 'action_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)', | |
716 | 'variables': { | |
717 | 'generator': '<(PRODUCT_DIR)/gen_emoticon_rewriter_data_main<(EXECUTABLE_SUFFIX)', | |
718 | 'input_files': [ | |
719 | '<(mozc_dir)/data/emoticon/emoticon.tsv', | |
720 | ], | |
721 | }, | |
722 | 'inputs': [ | |
723 | '<(generator)', | |
724 | '<@(input_files)', | |
725 | ], | |
726 | 'outputs': [ | |
727 | '<(gen_out_dir)/emoticon_token.data', | |
728 | '<(gen_out_dir)/emoticon_string.data', | |
729 | ], | |
730 | 'action': [ | |
731 | '<(generator)', | |
732 | '--input=<(mozc_dir)/data/emoticon/emoticon.tsv', | |
733 | '--output_token_array=<(gen_out_dir)/emoticon_token.data', | |
734 | '--output_string_array=<(gen_out_dir)/emoticon_string.data', | |
735 | ], | |
736 | 'message': '[<(dataset_tag)] Generating emoticon data', | |
737 | }, | |
738 | ], | |
739 | }, | |
740 | { | |
700 | 741 | 'target_name': 'gen_separate_counter_suffix_data_for_<(dataset_tag)', |
701 | 742 | 'type': 'none', |
702 | 743 | 'toolsets': ['host'], |
83 | 83 | StringPiece *correction_array_data) const override; |
84 | 84 | void GetSymbolRewriterData(StringPiece *token_array_data, |
85 | 85 | StringPiece *string_array_data) const override; |
86 | void GetEmoticonRewriterData(StringPiece *token_array_data, | |
87 | StringPiece *string_array_data) const override; | |
86 | 88 | |
87 | 89 | #ifndef NO_USAGE_REWRITER |
88 | 90 | void GetUsageRewriterData( |
119 | 121 | StringPiece reading_correction_correction_array_data_; |
120 | 122 | StringPiece symbol_token_array_data_; |
121 | 123 | StringPiece symbol_string_array_data_; |
124 | StringPiece emoticon_token_array_data_; | |
125 | StringPiece emoticon_string_array_data_; | |
122 | 126 | StringPiece usage_base_conjugation_suffix_data_; |
123 | 127 | StringPiece usage_conjugation_suffix_data_; |
124 | 128 | StringPiece usage_conjugation_index_data_; |
89 | 89 | virtual void GetSymbolRewriterData(StringPiece *token_array_data, |
90 | 90 | StringPiece *string_array_data) const = 0; |
91 | 91 | |
92 | // Gets the token array and string array data for the emoticon rewriter. | |
93 | virtual void GetEmoticonRewriterData( | |
94 | StringPiece *token_array_data, StringPiece *string_array_data) const = 0; | |
95 | ||
92 | 96 | #ifndef NO_USAGE_REWRITER |
93 | 97 | // Gets the usage rewriter data. |
94 | 98 | virtual void GetUsageRewriterData( |
88 | 88 | void GetSuggestionFilterData(const char **data, size_t *size) const; |
89 | 89 | void GetSymbolRewriterData(StringPiece *token_array_data, |
90 | 90 | StringPiece *string_array_data) const; |
91 | void GetEmoticonRewriterData(StringPiece *token_array_data, | |
92 | StringPiece *string_array_data) const; | |
91 | 93 | #ifndef NO_USAGE_REWRITER |
92 | 94 | void GetUsageRewriterData(StringPiece *base_conjugation_suffix_data, |
93 | 95 | StringPiece *conjugation_suffix_data, |
236 | 238 | void PackedDataManager::Impl::GetSymbolRewriterData( |
237 | 239 | StringPiece *token_array_data, StringPiece *string_array_data) const { |
238 | 240 | manager_.GetSymbolRewriterData(token_array_data, string_array_data); |
241 | } | |
242 | ||
243 | void PackedDataManager::Impl::GetEmoticonRewriterData( | |
244 | StringPiece *token_array_data, StringPiece *string_array_data) const { | |
245 | manager_.GetEmoticonRewriterData(token_array_data, string_array_data); | |
239 | 246 | } |
240 | 247 | |
241 | 248 | #ifndef NO_USAGE_REWRITER |
391 | 398 | manager_impl_->GetSymbolRewriterData(token_array_data, string_array_data); |
392 | 399 | } |
393 | 400 | |
401 | void PackedDataManager::GetEmoticonRewriterData( | |
402 | StringPiece *token_array_data, StringPiece *string_array_data) const { | |
403 | manager_impl_->GetEmoticonRewriterData(token_array_data, string_array_data); | |
404 | } | |
405 | ||
394 | 406 | #ifndef NO_USAGE_REWRITER |
395 | 407 | void PackedDataManager::GetUsageRewriterData( |
396 | 408 | StringPiece *base_conjugation_suffix_data, |
72 | 72 | void GetSuggestionFilterData(const char **data, size_t *size) const override; |
73 | 73 | void GetSymbolRewriterData(StringPiece *token_array_data, |
74 | 74 | StringPiece *string_array_data) const override; |
75 | void GetEmoticonRewriterData(StringPiece *token_array_data, | |
76 | StringPiece *string_array_data) const override; | |
75 | 77 | #ifndef NO_USAGE_REWRITER |
76 | 78 | void GetUsageRewriterData( |
77 | 79 | StringPiece *base_conjugation_suffix_data, |
0 | 0 | MAJOR=2 |
1 | 1 | MINOR=17 |
2 | BUILD=2532 | |
2 | BUILD=2533 | |
3 | 3 | REVISION=102 |
4 | 4 | # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be |
5 | 5 | # downloaded by NaCl Mozc. |
42 | 42 | #include "protocol/commands.pb.h" |
43 | 43 | #include "protocol/config.pb.h" |
44 | 44 | #include "request/conversion_request.h" |
45 | #include "rewriter/embedded_dictionary.h" | |
46 | 45 | #include "rewriter/rewriter_interface.h" |
46 | #include "rewriter/serialized_dictionary.h" | |
47 | 47 | |
48 | 48 | namespace mozc { |
49 | 49 | namespace { |
50 | 50 | |
51 | #include "rewriter/emoticon_rewriter_data.h" | |
52 | ||
53 | class EmoticonDictionary { | |
54 | public: | |
55 | EmoticonDictionary() | |
56 | : dic_(new EmbeddedDictionary(kEmoticonData_token_data, | |
57 | kEmoticonData_token_size)) {} | |
58 | ||
59 | ~EmoticonDictionary() {} | |
60 | ||
61 | EmbeddedDictionary *GetDictionary() const { | |
62 | return dic_.get(); | |
63 | } | |
64 | ||
65 | private: | |
66 | std::unique_ptr<EmbeddedDictionary> dic_; | |
67 | }; | |
68 | ||
69 | 51 | class ValueCostCompare { |
70 | 52 | public: |
71 | bool operator() (const EmbeddedDictionary::Value *a, | |
72 | const EmbeddedDictionary::Value *b) const { | |
73 | return a->cost < b->cost; | |
53 | bool operator() (SerializedDictionary::const_iterator a, | |
54 | SerializedDictionary::const_iterator b) const { | |
55 | return a.cost() < b.cost(); | |
74 | 56 | } |
75 | 57 | }; |
76 | 58 | |
77 | 59 | class IsEqualValue { |
78 | 60 | public: |
79 | bool operator() (const EmbeddedDictionary::Value *a, | |
80 | const EmbeddedDictionary::Value *b) const { | |
81 | return strcmp(a->value, b->value) == 0; | |
61 | bool operator() (const SerializedDictionary::const_iterator a, | |
62 | const SerializedDictionary::const_iterator b) const { | |
63 | return a.value() == b.value(); | |
82 | 64 | } |
83 | 65 | }; |
84 | 66 | |
85 | 67 | // Insert Emoticon into the |segment| |
86 | 68 | // Top |initial_insert_size| candidates are inserted from |initial_insert_pos|. |
87 | 69 | // The remaining candidates are appended to the bottom.
88 | void InsertCandidates(const EmbeddedDictionary::Value *value, | |
89 | size_t value_size, | |
70 | void InsertCandidates(SerializedDictionary::const_iterator begin, | |
71 | SerializedDictionary::const_iterator end, | |
90 | 72 | size_t initial_insert_pos, |
91 | 73 | size_t initial_insert_size, |
92 | 74 | bool is_no_learning, |
100 | 82 | size_t offset = min(initial_insert_pos, segment->candidates_size()); |
101 | 83 | |
102 | 84 | // Sort values by cost just in case |
103 | vector<const EmbeddedDictionary::Value *> sorted_value; | |
104 | for (size_t i = 0; i < value_size; ++i) { | |
105 | sorted_value.push_back(&value[i]); | |
85 | vector<SerializedDictionary::const_iterator> sorted_value; | |
86 | for (auto iter = begin; iter != end; ++iter) { | |
87 | sorted_value.push_back(iter); | |
106 | 88 | } |
107 | 89 | |
108 | 90 | std::sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare()); |
115 | 97 | sorted_value.end()); |
116 | 98 | |
117 | 99 | for (size_t i = 0; i < sorted_value.size(); ++i) { |
118 | Segment::Candidate *c = NULL; | |
100 | Segment::Candidate *c = nullptr; | |
119 | 101 | |
120 | 102 | if (i < initial_insert_size) { |
121 | 103 | c = segment->insert_candidate(offset); |
124 | 106 | c = segment->push_back_candidate(); |
125 | 107 | } |
126 | 108 | |
127 | if (c == NULL) { | |
109 | if (c == nullptr) { | |
128 | 110 | LOG(ERROR) << "cannot insert candidate at " << offset; |
129 | 111 | continue; |
130 | 112 | } |
131 | 113 | |
132 | 114 | c->Init(); |
133 | 115 | // TODO(taku): set an appropriate POS here. |
134 | c->lid = sorted_value[i]->lid; | |
135 | c->rid = sorted_value[i]->rid; | |
116 | c->lid = sorted_value[i].lid(); | |
117 | c->rid = sorted_value[i].rid(); | |
136 | 118 | c->cost = base_candidate.cost; |
137 | c->value = sorted_value[i]->value; | |
138 | c->content_value = sorted_value[i]->value; | |
119 | sorted_value[i].value().CopyToString(&c->value); | |
120 | c->content_value = c->value; | |
139 | 121 | c->key = base_candidate.key; |
140 | 122 | c->content_key = base_candidate.content_key; |
141 | 123 | // no full/half width normalizations |
150 | 132 | const char kBaseEmoticonDescription[] |
151 | 133 | = "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97"; |
152 | 134 | |
153 | if (sorted_value[i]->description == NULL) { | |
135 | if (sorted_value[i].description().empty()) { | |
154 | 136 | c->description = kBaseEmoticonDescription; |
155 | 137 | } else { |
156 | 138 | string description = kBaseEmoticonDescription; |
157 | 139 | description.append(" "); |
158 | description.append(sorted_value[i]->description); | |
140 | sorted_value[i].description().AppendToString(&description); | |
159 | 141 | c->description = description; |
160 | 142 | } |
161 | 143 | } |
162 | 144 | } |
163 | 145 | |
164 | bool RewriteCandidate(Segments *segments) { | |
146 | } // namespace | |
147 | ||
148 | bool EmoticonRewriter::RewriteCandidate(Segments *segments) const { | |
165 | 149 | bool modified = false; |
166 | 150 | for (size_t i = 0; i < segments->conversion_segments_size(); ++i) { |
167 | 151 | const string &key = segments->conversion_segment(i).key(); |
170 | 154 | continue; |
171 | 155 | } |
172 | 156 | bool is_no_learning = false; |
173 | const EmbeddedDictionary::Value *value = NULL; | |
174 | size_t value_size = 0; | |
157 | SerializedDictionary::const_iterator begin; | |
158 | SerializedDictionary::const_iterator end = dic_.end(); | |
175 | 159 | size_t initial_insert_size = 0; |
176 | 160 | size_t initial_insert_pos = 0; |
177 | 161 | |
183 | 167 | if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") { |
184 | 168 | // When key is "かおもじ", default candidate size should be small enough. |
185 | 169 | // It is safe to expand all candidates at this time. |
186 | const EmbeddedDictionary::Token *token | |
187 | = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); | |
188 | CHECK(token); | |
170 | begin = dic_.begin(); | |
171 | CHECK(begin != dic_.end()); | |
172 | end = dic_.end(); | |
189 | 173 | // set large value(100) so that all candidates are pushed to the bottom |
190 | value = token->value; | |
191 | value_size = token->value_size; | |
192 | 174 | initial_insert_pos = 100; |
193 | initial_insert_size = token->value_size; | |
175 | initial_insert_size = dic_.size(); | |
194 | 176 | // "かお" |
195 | 177 | } else if (key == "\xE3\x81\x8B\xE3\x81\x8A") { |
196 | 178 | // When key is "かお", expand all candidates in conservative way. |
197 | const EmbeddedDictionary::Token *token | |
198 | = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); | |
199 | CHECK(token); | |
179 | begin = dic_.begin(); | |
180 | CHECK(begin != dic_.end()); | |
200 | 181 | // first 6 candidates are inserted at 4 th position. |
201 | 182 | // Other candidates are pushed to the bottom.
202 | value = token->value; | |
203 | value_size = token->value_size; | |
204 | 183 | initial_insert_pos = 4; |
205 | 184 | initial_insert_size = 6; |
206 | 185 | } else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F" |
207 | 186 | "\xE3\x82\x89\xE3\x81\x84") { // "ふくわらい" |
208 | 187 | // Choose one emoticon randomly from the dictionary. |
209 | 188 | // TODO(taku): want to make it "generate" more funny emoticon. |
210 | const EmbeddedDictionary::Token *token | |
211 | = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); | |
212 | CHECK(token); | |
189 | begin = dic_.begin(); | |
190 | CHECK(begin != dic_.end()); | |
213 | 191 | uint32 n = 0; |
214 | 192 | // use secure random not to predict the next emoticon. |
215 | 193 | Util::GetRandomSequence(reinterpret_cast<char *>(&n), sizeof(n)); |
216 | value = token->value + n % token->value_size; | |
217 | value_size = 1; | |
194 | begin += n % dic_.size(); | |
195 | end = begin + 1; | |
218 | 196 | initial_insert_pos = 4; |
219 | 197 | initial_insert_size = 1; |
220 | 198 | is_no_learning = true; // do not learn this candidate. |
221 | 199 | } else { |
222 | const EmbeddedDictionary::Token *token | |
223 | = Singleton<EmoticonDictionary>::get()->GetDictionary()->Lookup(key); | |
224 | // by default, insert canidate at 7 th position. | |
225 | if (token != NULL) { | |
226 | value = token->value; | |
227 | value_size = token->value_size; | |
200 | const auto range = dic_.equal_range(key); | |
201 | begin = range.first; | |
202 | end = range.second; | |
203 | if (begin != end) { | |
228 | 204 | initial_insert_pos = 6; |
229 | initial_insert_size = token == NULL ? 0 : token->value_size; | |
205 | initial_insert_size = std::distance(begin, end); | |
230 | 206 | } |
231 | 207 | } |
232 | 208 | |
233 | if (value == NULL || value_size == 0) { | |
209 | if (begin == end) { | |
234 | 210 | continue; |
235 | 211 | } |
236 | 212 | |
237 | InsertCandidates(value, value_size, | |
213 | InsertCandidates(begin, end, | |
238 | 214 | initial_insert_pos, |
239 | 215 | initial_insert_size, |
240 | 216 | is_no_learning, |
244 | 220 | |
245 | 221 | return modified; |
246 | 222 | } |
247 | } // namespace | |
248 | ||
249 | EmoticonRewriter::EmoticonRewriter() {} | |
250 | ||
251 | EmoticonRewriter::~EmoticonRewriter() {} | |
223 | ||
224 | std::unique_ptr<EmoticonRewriter> EmoticonRewriter::CreateFromDataManager( | |
225 | const DataManagerInterface &data_manager) { | |
226 | StringPiece token_array_data, string_array_data; | |
227 | data_manager.GetEmoticonRewriterData(&token_array_data, &string_array_data); | |
228 | return std::unique_ptr<EmoticonRewriter>( | |
229 | new EmoticonRewriter(token_array_data, string_array_data)); | |
230 | } | |
231 | ||
232 | EmoticonRewriter::EmoticonRewriter(StringPiece token_array_data, | |
233 | StringPiece string_array_data) | |
234 | : dic_(token_array_data, string_array_data) {} | |
235 | ||
236 | EmoticonRewriter::~EmoticonRewriter() = default; | |
252 | 237 | |
253 | 238 | int EmoticonRewriter::capability(const ConversionRequest &request) const { |
254 | 239 | if (request.request().mixed_conversion()) { |
29 | 29 | #ifndef MOZC_REWRITER_EMOTICON_REWRITER_H_ |
30 | 30 | #define MOZC_REWRITER_EMOTICON_REWRITER_H_ |
31 | 31 | |
32 | #include <memory> | |
33 | ||
34 | #include "data_manager/data_manager_interface.h" | |
32 | 35 | #include "rewriter/rewriter_interface.h" |
36 | #include "rewriter/serialized_dictionary.h" | |
33 | 37 | |
34 | 38 | namespace mozc { |
35 | 39 | |
38 | 42 | |
39 | 43 | class EmoticonRewriter : public RewriterInterface { |
40 | 44 | public: |
41 | EmoticonRewriter(); | |
42 | virtual ~EmoticonRewriter(); | |
45 | static std::unique_ptr<EmoticonRewriter> CreateFromDataManager( | |
46 | const DataManagerInterface &data_manager); | |
43 | 47 | |
44 | virtual int capability(const ConversionRequest &request) const; | |
48 | EmoticonRewriter(StringPiece token_array_data, StringPiece string_array_data); | |
49 | ~EmoticonRewriter() override; | |
45 | 50 | |
46 | virtual bool Rewrite(const ConversionRequest &request, | |
47 | Segments *segments) const; | |
51 | int capability(const ConversionRequest &request) const override; | |
52 | ||
53 | bool Rewrite(const ConversionRequest &request, | |
54 | Segments *segments) const override; | |
55 | ||
56 | private: | |
57 | bool RewriteCandidate(Segments *segments) const; | |
58 | ||
59 | SerializedDictionary dic_; | |
48 | 60 | }; |
49 | 61 | |
50 | 62 | } // namespace mozc |
29 | 29 | #include "rewriter/emoticon_rewriter.h" |
30 | 30 | |
31 | 31 | #include <cstddef> |
32 | #include <memory> | |
32 | 33 | #include <string> |
33 | 34 | |
34 | 35 | #include "base/logging.h" |
36 | 37 | #include "base/util.h" |
37 | 38 | #include "config/config_handler.h" |
38 | 39 | #include "converter/segments.h" |
40 | #include "data_manager/testing/mock_data_manager.h" | |
39 | 41 | #include "protocol/commands.pb.h" |
40 | 42 | #include "protocol/config.pb.h" |
41 | 43 | #include "request/conversion_request.h" |
44 | #include "testing/base/public/googletest.h" | |
42 | 45 | #include "testing/base/public/gunit.h" |
43 | ||
44 | DECLARE_string(test_tmpdir); | |
46 | #include "testing/base/public/mozctest.h" | |
45 | 47 | |
46 | 48 | namespace mozc { |
49 | namespace { | |
47 | 50 | |
48 | namespace { | |
49 | 51 | void AddSegment(const string &key, const string &value, |
50 | 52 | Segments *segments) { |
51 | 53 | segments->Clear(); |
70 | 72 | } |
71 | 73 | return false; |
72 | 74 | } |
73 | } // namespace | |
74 | 75 | |
75 | class EmoticonRewriterTest : public testing::Test { | |
76 | class EmoticonRewriterTest : public ::testing::Test { | |
76 | 77 | protected: |
77 | EmoticonRewriterTest() {} | |
78 | ~EmoticonRewriterTest() {} | |
78 | testing::MockDataManager mock_data_manager_; | |
79 | 79 | |
80 | virtual void SetUp() { | |
81 | SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir); | |
82 | } | |
83 | ||
84 | virtual void TearDown() {} | |
80 | private: | |
81 | testing::ScopedTmpUserProfileDirectory scoped_profile_dir_; | |
85 | 82 | }; |
86 | 83 | |
87 | 84 | TEST_F(EmoticonRewriterTest, BasicTest) { |
88 | EmoticonRewriter emoticon_rewriter; | |
85 | std::unique_ptr<EmoticonRewriter> emoticon_rewriter = | |
86 | EmoticonRewriter::CreateFromDataManager(mock_data_manager_); | |
87 | ||
89 | 88 | config::Config config; |
90 | 89 | config::ConfigHandler::GetDefaultConfig(&config); |
91 | 90 | ConversionRequest request; |
95 | 94 | |
96 | 95 | Segments segments; |
97 | 96 | AddSegment("test", "test", &segments); |
98 | emoticon_rewriter.Rewrite(request, &segments); | |
97 | emoticon_rewriter->Rewrite(request, &segments); | |
99 | 98 | EXPECT_FALSE(HasEmoticon(segments)); |
100 | 99 | |
101 | 100 | // "かお" |
102 | 101 | AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments); |
103 | emoticon_rewriter.Rewrite(request, &segments); | |
102 | emoticon_rewriter->Rewrite(request, &segments); | |
104 | 103 | EXPECT_TRUE(HasEmoticon(segments)); |
105 | 104 | |
106 | 105 | // "かおもじ" |
107 | 106 | AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98", |
108 | 107 | "test", &segments); |
109 | emoticon_rewriter.Rewrite(request, &segments); | |
108 | emoticon_rewriter->Rewrite(request, &segments); | |
110 | 109 | EXPECT_TRUE(HasEmoticon(segments)); |
111 | 110 | |
112 | 111 | // "にこにこ" |
113 | 112 | AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93", |
114 | 113 | "test", &segments); |
115 | emoticon_rewriter.Rewrite(request, &segments); | |
114 | emoticon_rewriter->Rewrite(request, &segments); | |
116 | 115 | EXPECT_TRUE(HasEmoticon(segments)); |
117 | 116 | |
118 | 117 | // "ふくわらい" |
119 | 118 | AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84", |
120 | 119 | "test", &segments); |
121 | emoticon_rewriter.Rewrite(request, &segments); | |
120 | emoticon_rewriter->Rewrite(request, &segments); | |
122 | 121 | EXPECT_TRUE(HasEmoticon(segments)); |
123 | 122 | } |
124 | 123 | |
127 | 126 | |
128 | 127 | Segments segments; |
129 | 128 | AddSegment("test", "test", &segments); |
130 | emoticon_rewriter.Rewrite(request, &segments); | |
129 | emoticon_rewriter->Rewrite(request, &segments); | |
131 | 130 | EXPECT_FALSE(HasEmoticon(segments)); |
132 | 131 | |
133 | 132 | // "かお" |
134 | 133 | AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments); |
135 | emoticon_rewriter.Rewrite(request, &segments); | |
134 | emoticon_rewriter->Rewrite(request, &segments); | |
136 | 135 | EXPECT_FALSE(HasEmoticon(segments)); |
137 | 136 | |
138 | 137 | // "かおもじ" |
139 | 138 | AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98", |
140 | 139 | "test", &segments); |
141 | emoticon_rewriter.Rewrite(request, &segments); | |
140 | emoticon_rewriter->Rewrite(request, &segments); | |
142 | 141 | EXPECT_FALSE(HasEmoticon(segments)); |
143 | 142 | |
144 | 143 | // "にこにこ" |
145 | 144 | AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93", |
146 | 145 | "test", &segments); |
147 | emoticon_rewriter.Rewrite(request, &segments); | |
146 | emoticon_rewriter->Rewrite(request, &segments); | |
148 | 147 | EXPECT_FALSE(HasEmoticon(segments)); |
149 | 148 | |
150 | 149 | // "ふくわらい" |
151 | 150 | AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84", |
152 | 151 | "test", &segments); |
153 | emoticon_rewriter.Rewrite(request, &segments); | |
152 | emoticon_rewriter->Rewrite(request, &segments); | |
154 | 153 | EXPECT_FALSE(HasEmoticon(segments)); |
155 | 154 | } |
156 | 155 | } |
157 | 156 | |
158 | 157 | TEST_F(EmoticonRewriterTest, MobileEnvironmentTest) { |
159 | EmoticonRewriter rewriter; | |
158 | std::unique_ptr<EmoticonRewriter> rewriter = | |
159 | EmoticonRewriter::CreateFromDataManager(mock_data_manager_); | |
160 | ||
160 | 161 | commands::Request request; |
161 | 162 | ConversionRequest convreq; |
162 | 163 | convreq.set_request(&request); |
163 | 164 | |
164 | 165 | { |
165 | 166 | request.set_mixed_conversion(true); |
166 | EXPECT_EQ(RewriterInterface::ALL, rewriter.capability(convreq)); | |
167 | EXPECT_EQ(RewriterInterface::ALL, rewriter->capability(convreq)); | |
167 | 168 | } |
168 | 169 | |
169 | 170 | { |
170 | 171 | request.set_mixed_conversion(false); |
171 | EXPECT_EQ(RewriterInterface::CONVERSION, rewriter.capability(convreq)); | |
172 | EXPECT_EQ(RewriterInterface::CONVERSION, rewriter->capability(convreq)); | |
172 | 173 | } |
173 | 174 | } |
174 | 175 | |
176 | } // namespace | |
175 | 177 | } // namespace mozc |
0 | // Copyright 2010-2016, Google Inc. | |
1 | // All rights reserved. | |
2 | // | |
3 | // Redistribution and use in source and binary forms, with or without | |
4 | // modification, are permitted provided that the following conditions are | |
5 | // met: | |
6 | // | |
7 | // * Redistributions of source code must retain the above copyright | |
8 | // notice, this list of conditions and the following disclaimer. | |
9 | // * Redistributions in binary form must reproduce the above | |
10 | // copyright notice, this list of conditions and the following disclaimer | |
11 | // in the documentation and/or other materials provided with the | |
12 | // distribution. | |
13 | // * Neither the name of Google Inc. nor the names of its | |
14 | // contributors may be used to endorse or promote products derived from | |
15 | // this software without specific prior written permission. | |
16 | // | |
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ||
29 | #include <algorithm> | |
30 | #include <memory> | |
31 | #include <string> | |
32 | #include <unordered_map> | |
33 | #include <vector> | |
34 | ||
35 | #include "base/file_stream.h" | |
36 | #include "base/flags.h" | |
37 | #include "base/init_mozc.h" | |
38 | #include "base/logging.h" | |
39 | #include "base/string_piece.h" | |
40 | #include "base/util.h" | |
41 | #include "rewriter/serialized_dictionary.h" | |
42 | ||
43 | DEFINE_string(input, "", "Emoticon dictionary file"); | |
44 | DEFINE_string(output_token_array, "", "Output token array"); | |
45 | DEFINE_string(output_string_array, "", "Output string array"); | |
46 | ||
47 | namespace mozc { | |
48 | namespace { | |
49 | ||
50 | using KeyList = vector<string>; | |
51 | using CompilerToken = SerializedDictionary::CompilerToken; | |
52 | using TokenList = SerializedDictionary::TokenList; | |
53 | // Returns the count stored for |key| in |key_count|, or 0 if |key| is absent. | |
54 | int LookupCount(const std::unordered_map<string, int> &key_count, | |
55 | const string &key) { | |
56 | const auto found = key_count.find(key); | |
57 | return (found != key_count.end()) ? found->second : 0; | |
58 | } | |
59 | // Describes a value by its highest- and lowest-count keys: "<highest> <lowest>". | |
60 | string GetDescription(const KeyList &key_list, | |
61 | const std::unordered_map<string, int> &key_count) { | |
62 | if (key_list.size() <= 1) {  // Empty list: back()/front() below would be UB. | |
63 | return key_list.empty() ? string() : key_list[0]; | |
64 | } | |
65 | KeyList sorted_key_list(key_list); | |
66 | sort(sorted_key_list.begin(), sorted_key_list.end(), | |
67 | [&key_count](const string &x, const string &y) { | |
68 | const int x_count = LookupCount(key_count, x); | |
69 | const int y_count = LookupCount(key_count, y); | |
70 | if (x_count == y_count) { | |
71 | return x < y;  // Equal counts: fall back to string order. | |
72 | } | |
73 | return x_count < y_count;  // Ascending, so back() has the highest count. | |
74 | }); | |
75 | return Util::StringPrintf("%s %s", sorted_key_list.back().c_str(), | |
76 | sorted_key_list.front().c_str()); | |
77 | } | |
78 | // Parses the emoticon TSV at |path| into a key -> token list map for compiling. | |
79 | map<string, TokenList> ReadEmoticonTsv(const string &path) { | |
80 | InputFileStream ifs(path.c_str()); | |
81 | ||
82 | string line; | |
83 | getline(ifs, line); // Skip header | |
84 | // First pass: collect (value, keys) pairs and count how often each key occurs. | |
85 | vector<pair<string, KeyList>> data; | |
86 | std::unordered_map<string, int> key_count; | |
87 | while (getline(ifs, line)) { | |
88 | vector<StringPiece> field_list; | |
89 | Util::SplitStringUsing(line, "\t", &field_list); | |
90 | CHECK_GE(field_list.size(), 2) << "Format error: " << line; | |
91 | LOG_IF(WARNING, field_list.size() > 3) << "Ignore extra columns: " << line; | |
92 | // Turn U+3000 (full-width space) into ASCII space so one split handles both. | |
93 | string replaced; | |
94 | Util::StringReplace(field_list[1], "\xE3\x80\x80", " ", true, &replaced); | |
95 | KeyList key_list; | |
96 | Util::SplitStringUsing(replaced, " ", &key_list);  // Split the normalized text. | |
97 | // Count keys before |key_list| is moved from; it must not be read afterwards. | |
98 | for (const auto &key : key_list) { | |
99 | ++key_count[key]; | |
100 | } | |
101 | data.emplace_back(field_list[0].as_string(), std::move(key_list)); | |
102 | } | |
103 | // Second pass: emit one token per (key, value) pair, in file order. | |
104 | map<string, TokenList> input_data; | |
105 | int16 cost = 10; | |
106 | for (const auto &kv : data) { | |
107 | const string &value = kv.first; | |
108 | const KeyList &key_list = kv.second; | |
109 | const string &description = GetDescription(key_list, key_count); | |
110 | for (const string &key : key_list) { | |
111 | std::unique_ptr<CompilerToken> token(new CompilerToken()); | |
112 | token->value = value; | |
113 | token->description = description; | |
114 | token->lid = 0; | |
115 | token->rid = 0; | |
116 | token->cost = cost; | |
117 | input_data[key].push_back(std::move(token)); | |
118 | cost += 10;  // NOTE(review): per-token step on int16; confirm it cannot overflow. | |
119 | } | |
120 | } | |
121 | ||
122 | return input_data; | |
123 | } | |
124 | ||
125 | } // namespace | |
126 | } // namespace mozc | |
127 | // Reads the TSV named by --input and writes the token and string array files. | |
128 | int main(int argc, char **argv) { | |
129 | mozc::InitMozc(argv[0], &argc, &argv, true); | |
130 | const auto &input_data = mozc::ReadEmoticonTsv(FLAGS_input); | |
131 | mozc::SerializedDictionary::CompileToFiles( | |
132 | input_data, FLAGS_output_token_array, FLAGS_output_string_array); | |
133 | return 0; | |
134 | } |
0 | # -*- coding: utf-8 -*- | |
1 | # Copyright 2010-2016, Google Inc. | |
2 | # All rights reserved. | |
3 | # | |
4 | # Redistribution and use in source and binary forms, with or without | |
5 | # modification, are permitted provided that the following conditions are | |
6 | # met: | |
7 | # | |
8 | # * Redistributions of source code must retain the above copyright | |
9 | # notice, this list of conditions and the following disclaimer. | |
10 | # * Redistributions in binary form must reproduce the above | |
11 | # copyright notice, this list of conditions and the following disclaimer | |
12 | # in the documentation and/or other materials provided with the | |
13 | # distribution. | |
14 | # * Neither the name of Google Inc. nor the names of its | |
15 | # contributors may be used to endorse or promote products derived from | |
16 | # this software without specific prior written permission. | |
17 | # | |
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | ||
30 | """Converter from emoticon data to embedded_dictionary. | |
31 | ||
32 | Usage: | |
33 | python gen_emoticon_rewriter_data.py --input=input.tsv --output=output_header | |
34 | """ | |
35 | ||
36 | __author__ = "hidehiko" | |
37 | ||
38 | from collections import defaultdict | |
39 | import logging | |
40 | import optparse | |
41 | import re | |
42 | import sys | |
43 | from rewriter import embedded_dictionary_compiler | |
44 | ||
45 | ||
46 | def ParseOptions(): | |
47 | parser = optparse.OptionParser() | |
48 | parser.add_option('--input', dest='input', help='emoticon dictionary file') | |
49 | parser.add_option('--output', dest='output', help='output header file') | |
50 | return parser.parse_args()[0] | |
51 | ||
52 | ||
53 | def GetDescription(key_list, key_count): | |
54 | """Generates a description from readings. | |
55 | ||
56 | We simply add 1) the most general reading and 2) the most specific reading. | |
57 | 1) and 2) are simply approximated by checking the frequency of the readings. | |
58 | ||
59 | Args: | |
60 | key_list: a list of key strings. | |
61 | key_count: a dictionary of key to the number of key's occurrence in the data | |
62 | file. | |
63 | Returns: | |
64 | the description string. | |
65 | """ | |
66 | if len(key_list) == 1: | |
67 | return key_list[0] | |
68 | ||
69 | sorted_key_list = sorted(key_list, key=lambda key: (key_count[key], key)) | |
70 | return '%s %s' % (sorted_key_list[-1], sorted_key_list[0]) | |
71 | ||
72 | ||
73 | def ReadEmoticonTsv(stream): | |
74 | """Read lines from stream to a Token dictionary for an embedded dictionary.""" | |
75 | # Skip the first line (header). | |
76 | stream.next() | |
77 | ||
78 | data = [] | |
79 | key_count = defaultdict(int) | |
80 | for line in stream: | |
81 | # The file format is: | |
82 | # value <tab> readings (space-delimited) | |
83 | field_list = line.rstrip('\n').split('\t') | |
84 | # Check the size of columns. | |
85 | if len(field_list) < 2: | |
86 | logging.critical('format error: %s', line) | |
87 | sys.exit(1) | |
88 | if len(field_list) > 3: | |
89 | logging.warning('ignore extra columns: %s', line) | |
90 | ||
91 | # \xE3\x80\x80 is the UTF-8 encoding of the full-width space (U+3000) | |
92 | key_list = re.split(r'(?: |\xE3\x80\x80)+', field_list[1].strip()) | |
93 | data.append((field_list[0], key_list)) | |
94 | for key in key_list: | |
95 | key_count[key] += 1 | |
96 | ||
97 | input_data = defaultdict(list) | |
98 | cost = 10 | |
99 | for value, key_list in data: | |
100 | input_value = value | |
101 | if input_value == "": | |
102 | input_value = None | |
103 | description = GetDescription(key_list, key_count) | |
104 | if description == "": | |
105 | description = None | |
106 | ||
107 | for key in key_list: | |
108 | input_data[key].append(embedded_dictionary_compiler.Token( | |
109 | key, input_value, description, None, 0, 0, cost)) | |
110 | cost += 10 | |
111 | ||
112 | return input_data | |
113 | ||
114 | ||
115 | def main(): | |
116 | options = ParseOptions() | |
117 | with open(options.input, 'r') as input_stream: | |
118 | input_data = ReadEmoticonTsv(input_stream) | |
119 | ||
120 | with open(options.output, 'w') as output_stream: | |
121 | embedded_dictionary_compiler.Compile( | |
122 | 'EmoticonData', input_data, output_stream) | |
123 | ||
124 | ||
125 | if __name__ == '__main__': | |
126 | main() |
107 | 107 | kEmojiDataList, arraysize(kEmojiDataList), |
108 | 108 | kEmojiTokenList, arraysize(kEmojiTokenList), |
109 | 109 | kEmojiValueList)); |
110 | AddRewriter(new EmoticonRewriter); | |
110 | AddRewriter(EmoticonRewriter::CreateFromDataManager(*data_manager).release()); | |
111 | 111 | AddRewriter(new CalculatorRewriter(parent_converter)); |
112 | 112 | AddRewriter(new SymbolRewriter(parent_converter, data_manager)); |
113 | 113 | AddRewriter(new UnicodeRewriter(parent_converter)); |
67 | 67 | ], |
68 | 68 | }, |
69 | 69 | { |
70 | 'action_name': 'gen_emoticon_rewriter_data', | |
71 | 'variables': { | |
72 | 'input_file': '../data/emoticon/emoticon.tsv', | |
73 | 'output_file': '<(gen_out_dir)/emoticon_rewriter_data.h', | |
74 | }, | |
75 | 'inputs': [ | |
76 | 'embedded_dictionary_compiler.py', | |
77 | 'gen_emoticon_rewriter_data.py', | |
78 | '<(input_file)', | |
79 | ], | |
80 | 'outputs': [ | |
81 | '<(output_file)' | |
82 | ], | |
83 | 'action': [ | |
84 | 'python', 'gen_emoticon_rewriter_data.py', | |
85 | '--input=<(input_file)', | |
86 | '--output=<(output_file)', | |
87 | ], | |
88 | }, | |
89 | { | |
90 | 70 | 'action_name': 'gen_emoji_rewriter_data', |
91 | 71 | 'variables': { |
92 | 72 | 'input_file': '../data/emoji/emoji_data.tsv', |
213 | 193 | '../base/base.gyp:serialized_string_array', |
214 | 194 | ], |
215 | 195 | }, |
196 | { | |
197 | 'target_name': 'gen_emoticon_rewriter_data_main', | |
198 | 'type': 'executable', | |
199 | 'toolsets': ['host'], | |
200 | 'sources': [ | |
201 | 'gen_emoticon_rewriter_data.cc', | |
202 | ], | |
203 | 'dependencies': [ | |
204 | '../base/base.gyp:base', | |
205 | 'rewriter_serialized_dictionary.gyp:serialized_dictionary', | |
206 | ], | |
207 | }, | |
216 | 208 | ], |
217 | 209 | } |
48 | 48 | namespace mozc { |
49 | 49 | namespace { |
50 | 50 | |
51 | struct CompilerToken { | |
52 | string value; | |
53 | string description; | |
54 | string additional_description; | |
55 | uint16 lid; | |
56 | uint16 rid; | |
57 | int16 cost; | |
58 | }; | |
59 | ||
60 | using TokenList = vector<std::unique_ptr<CompilerToken>>; | |
51 | using CompilerToken = SerializedDictionary::CompilerToken; | |
52 | using TokenList = SerializedDictionary::TokenList; | |
61 | 53 | |
62 | 54 | struct CompareByCost { |
63 | 55 | bool operator()(const std::unique_ptr<CompilerToken> &t1, |
112 | 104 | std::istream *input, |
113 | 105 | std::unique_ptr<uint32[]> *output_token_array_buf, |
114 | 106 | std::unique_ptr<uint32[]> *output_string_array_buf) { |
115 | CHECK(SystemUtil::IsLittleEndian()); | |
116 | ||
117 | 107 | map<string, TokenList> dic; |
118 | 108 | LoadTokens(input, &dic); |
109 | return Compile(dic, output_token_array_buf, output_string_array_buf); | |
110 | } | |
111 | ||
112 | pair<StringPiece, StringPiece> SerializedDictionary::Compile( | |
113 | const map<string, TokenList> &dic, | |
114 | std::unique_ptr<uint32[]> *output_token_array_buf, | |
115 | std::unique_ptr<uint32[]> *output_string_array_buf) { | |
116 | CHECK(SystemUtil::IsLittleEndian()); | |
119 | 117 | |
120 | 118 | // Build a mapping from string to its index in a serialized string array. |
121 | 119 | // Note that duplicate keys share the same index, so data is slightly |
188 | 186 | const string &output_string_array) { |
189 | 187 | InputFileStream ifs(input.c_str()); |
190 | 188 | CHECK(ifs.good()); |
191 | ||
189 | map<string, TokenList> dic; | |
190 | LoadTokens(&ifs, &dic); | |
191 | CompileToFiles(dic, output_token_array, output_string_array); | |
192 | } | |
193 | ||
194 | void SerializedDictionary::CompileToFiles(const map<string, TokenList> &dic, | |
195 | const string &output_token_array, | |
196 | const string &output_string_array) { | |
192 | 197 | std::unique_ptr<uint32[]> buf1, buf2; |
193 | const pair<StringPiece, StringPiece> data = Compile(&ifs, &buf1, &buf2); | |
198 | const pair<StringPiece, StringPiece> data = Compile(dic, &buf1, &buf2); | |
194 | 199 | CHECK(VerifyData(data.first, data.second)); |
195 | 200 | |
196 | 201 | OutputFileStream token_ofs(output_token_array.c_str(), |
31 | 31 | |
32 | 32 | #include <istream> |
33 | 33 | #include <iterator> |
34 | #include <map> | |
34 | 35 | #include <string> |
35 | 36 | #include <utility> |
36 | 37 | |
105 | 106 | // array by index. |
106 | 107 | class SerializedDictionary { |
107 | 108 | public: |
109 | struct CompilerToken { | |
110 | string value; | |
111 | string description; | |
112 | string additional_description; | |
113 | uint16 lid; | |
114 | uint16 rid; | |
115 | int16 cost; | |
116 | }; | |
117 | ||
118 | using TokenList = vector<std::unique_ptr<CompilerToken>>; | |
119 | ||
108 | 120 | static const size_t kTokenByteLength = 24; |
109 | 121 | |
110 | 122 | class iterator : public std::iterator<std::random_access_iterator_tag, |
280 | 292 | std::istream *input, |
281 | 293 | std::unique_ptr<uint32[]> *output_token_array_buf, |
282 | 294 | std::unique_ptr<uint32[]> *output_string_array_buf); |
295 | static pair<StringPiece, StringPiece> Compile( | |
296 | const map<string, TokenList> &dic, | |
297 | std::unique_ptr<uint32[]> *output_token_array_buf, | |
298 | std::unique_ptr<uint32[]> *output_string_array_buf); | |
283 | 299 | |
284 | 300 | // Creates serialized data and writes them to files. |
285 | 301 | static void CompileToFiles(const string &input, |
286 | 302 | const string &output_token_array, |
287 | 303 | const string &output_string_array); |
304 | static void CompileToFiles(const map<string, TokenList> &dic, | |
305 | const string &output_token_array, | |
306 | const string &output_string_array); | |
288 | 307 | |
289 | 308 | // Validates the serialized data. |
290 | 309 | static bool VerifyData(StringPiece token_array_data, |
294 | 313 | // boundary. |
295 | 314 | SerializedDictionary(StringPiece token_array, StringPiece string_array_data); |
296 | 315 | ~SerializedDictionary(); |
316 | ||
317 | std::size_t size() const { | |
318 | return token_array_.size() / kTokenByteLength; | |
319 | } | |
297 | 320 | |
298 | 321 | iterator begin() { return iterator(token_array_.data(), &string_array_); } |
299 | 322 | const_iterator begin() const { |