Stop embedding reading correction data as C++ code
This CL moves the embedded reading correction data to a new data set
file.
BUG=
TEST=
REF_BUG=26841123
REF_CL=115039689
REF_TIME=2016-02-19T16:55:30+09:00
REF_TIME_RAW=1455868530 +0900
Noriyuki Takahashi
8 years ago
115 | 115 | : array_(array), index_(index) {} |
116 | 116 | iterator(const iterator &x) = default; |
117 | 117 | |
118 | size_t index() const { return index_; } | |
118 | 119 | StringPiece operator*() { return (*array_)[index_]; } |
119 | 120 | StringPiece operator*() const { return (*array_)[index_]; } |
120 | 121 | StringPiece operator[](difference_type n) const { |
35 | 35 | #include "base/singleton.h" |
36 | 36 | #include "converter/boundary_struct.h" |
37 | 37 | #include "dictionary/pos_matcher.h" |
38 | #include "rewriter/correction_rewriter.h" | |
39 | 38 | #ifndef NO_USAGE_REWRITER |
40 | 39 | #include "rewriter/usage_rewriter_data_structs.h" |
41 | 40 | #endif // NO_USAGE_REWRITER |
102 | 101 | manager_.GetSuffixDictionaryData(key_array, value_array, token_array); |
103 | 102 | } |
104 | 103 | |
105 | namespace { | |
106 | // Include kReadingCorrections. | |
107 | #include "data_manager/chromeos/reading_correction_data.h" | |
108 | } // namespace | |
109 | ||
110 | 104 | void ChromeOsDataManager::GetReadingCorrectionData( |
111 | const ReadingCorrectionItem **array, | |
112 | size_t *size) const { | |
113 | *array = kReadingCorrections; | |
114 | *size = arraysize(kReadingCorrections); | |
105 | StringPiece *value_array_data, StringPiece *error_array_data, | |
106 | StringPiece *correction_array_data) const { | |
107 | manager_.GetReadingCorrectionData(value_array_data, error_array_data, | |
108 | correction_array_data); | |
115 | 109 | } |
116 | 110 | |
117 | 111 | void ChromeOsDataManager::GetCollocationData(const char **array, |
50 | 50 | void GetSystemDictionaryData(const char **data, int *size) const override; |
51 | 51 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
52 | 52 | const uint32 **token_array) const override; |
53 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
54 | size_t *size) const override; | |
53 | void GetReadingCorrectionData( | |
54 | StringPiece *value_array_data, StringPiece *error_array_data, | |
55 | StringPiece *correction_array_data) const override; | |
55 | 56 | void GetCollocationData(const char **array, size_t *size) const override; |
56 | 57 | void GetCollocationSuppressionData(const char **array, |
57 | 58 | size_t *size) const override; |
57 | 57 | void GetSystemDictionaryData(const char **data, int *size) const override {} |
58 | 58 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
59 | 59 | const uint32 **token_array) const override {} |
60 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
61 | size_t *size) const override {} | |
60 | void GetReadingCorrectionData( | |
61 | StringPiece *value_array_data, StringPiece *error_array_data, | |
62 | StringPiece *correction_array_data) const override {} | |
62 | 63 | void GetCollocationData(const char **array, size_t *size) const override {} |
63 | 64 | void GetCollocationSuppressionData(const char **array, |
64 | 65 | size_t *size) const override {} |
131 | 131 | return false; |
132 | 132 | } |
133 | 133 | } |
134 | if (!reader.Get("reading_correction_value", | |
135 | &reading_correction_value_array_data_)) { | |
136 | LOG(ERROR) << "Cannot find reading correction value array"; | |
137 | return false; | |
138 | } | |
139 | if (!reader.Get("reading_correction_error", | |
140 | &reading_correction_error_array_data_)) { | |
141 | LOG(ERROR) << "Cannot find reading correction error array"; | |
142 | return false; | |
143 | } | |
144 | if (!reader.Get("reading_correction_correction", | |
145 | &reading_correction_correction_array_data_)) { | |
146 | LOG(ERROR) << "Cannot find reading correction correction array"; | |
147 | return false; | |
148 | } | |
149 | { | |
150 | SerializedStringArray value_array, error_array, correction_array; | |
151 | if (!value_array.Init(reading_correction_value_array_data_) || | |
152 | !error_array.Init(reading_correction_error_array_data_) || | |
153 | !correction_array.Init(reading_correction_correction_array_data_) || | |
154 | value_array.size() != error_array.size() || | |
155 | value_array.size() != correction_array.size()) { | |
156 | LOG(ERROR) << "Reading correction data is broken"; | |
157 | return false; | |
158 | } | |
159 | } | |
134 | 160 | return true; |
135 | 161 | } |
136 | 162 | |
197 | 223 | reinterpret_cast<const uint32 *>(suffix_token_array_data_.data()); |
198 | 224 | } |
199 | 225 | |
200 | void DataManager::GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
201 | size_t *size) const { | |
202 | LOG(FATAL) << "Not implemented"; | |
226 | void DataManager::GetReadingCorrectionData( | |
227 | StringPiece *value_array_data, StringPiece *error_array_data, | |
228 | StringPiece *correction_array_data) const { | |
229 | *value_array_data = reading_correction_value_array_data_; | |
230 | *error_array_data = reading_correction_error_array_data_; | |
231 | *correction_array_data = reading_correction_correction_array_data_; | |
203 | 232 | } |
204 | 233 | |
205 | 234 | void DataManager::GetSymbolRewriterData(const EmbeddedDictionary::Token **data, |
128 | 128 | 'gen_separate_boundary_data_for_<(dataset_tag)#host', |
129 | 129 | 'gen_separate_counter_suffix_data_for_<(dataset_tag)#host', |
130 | 130 | 'gen_separate_suffix_data_for_<(dataset_tag)#host', |
131 | 'gen_separate_reading_correction_data_for_<(dataset_tag)#host', | |
131 | 132 | ], |
132 | 133 | 'actions': [ |
133 | 134 | { |
149 | 150 | 'suffix_key': '<(gen_out_dir)/suffix_key.data', |
150 | 151 | 'suffix_value': '<(gen_out_dir)/suffix_value.data', |
151 | 152 | 'suffix_token': '<(gen_out_dir)/suffix_token.data', |
153 | 'reading_correction_value': '<(gen_out_dir)/reading_correction_value.data', | |
154 | 'reading_correction_error': '<(gen_out_dir)/reading_correction_error.data', | |
155 | 'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data', | |
152 | 156 | }, |
153 | 157 | 'inputs': [ |
154 | 158 | '<(dictionary)', |
166 | 170 | '<(suffix_key)', |
167 | 171 | '<(suffix_value)', |
168 | 172 | '<(suffix_token)', |
173 | '<(reading_correction_value)', | |
174 | '<(reading_correction_error)', | |
175 | '<(reading_correction_correction)', | |
169 | 176 | ], |
170 | 177 | 'outputs': [ |
171 | 178 | '<(gen_out_dir)/<(out_mozc_data)', |
189 | 196 | 'suffix_key:32:<(gen_out_dir)/suffix_key.data', |
190 | 197 | 'suffix_value:32:<(gen_out_dir)/suffix_value.data', |
191 | 198 | 'suffix_token:32:<(gen_out_dir)/suffix_token.data', |
199 | 'reading_correction_value:32:<(gen_out_dir)/reading_correction_value.data', | |
200 | 'reading_correction_error:32:<(gen_out_dir)/reading_correction_error.data', | |
201 | 'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data', | |
192 | 202 | ], |
193 | 203 | }, |
194 | 204 | ], |
203 | 213 | 'gen_embedded_collocation_suppression_data_for_<(dataset_tag)#host', |
204 | 214 | 'gen_embedded_connection_data_for_<(dataset_tag)#host', |
205 | 215 | 'gen_embedded_dictionary_data_for_<(dataset_tag)#host', |
206 | 'gen_embedded_reading_correction_data_for_<(dataset_tag)#host', | |
207 | 216 | 'gen_embedded_suggestion_filter_data_for_<(dataset_tag)#host', |
208 | 217 | 'gen_embedded_symbol_rewriter_data_for_<(dataset_tag)#host', |
209 | 218 | ], |
651 | 660 | ], |
652 | 661 | }, |
653 | 662 | { |
654 | 'target_name': 'gen_embedded_reading_correction_data_for_<(dataset_tag)', | |
663 | 'target_name': 'gen_separate_reading_correction_data_for_<(dataset_tag)', | |
655 | 664 | 'type': 'none', |
656 | 665 | 'toolsets': ['host'], |
657 | 666 | 'actions': [ |
666 | 675 | '<(platform_data_dir)/reading_correction.tsv', |
667 | 676 | ], |
668 | 677 | 'outputs': [ |
669 | '<(gen_out_dir)/reading_correction_data.h', | |
678 | '<(gen_out_dir)/reading_correction_value.data', | |
679 | '<(gen_out_dir)/reading_correction_error.data', | |
680 | '<(gen_out_dir)/reading_correction_correction.data', | |
670 | 681 | ], |
671 | 682 | 'action': [ |
672 | 683 | 'python', '<(mozc_dir)/rewriter/gen_reading_correction_data.py', |
673 | '--output=<(gen_out_dir)/reading_correction_data.h', | |
674 | 684 | '--input=<@(input_files)', |
675 | ], | |
676 | 'message': ('[<(dataset_tag)] Generating ' + | |
677 | '<(gen_out_dir)/reading_correction_data.h'), | |
685 | '--output_value_array=<(gen_out_dir)/reading_correction_value.data', | |
686 | '--output_error_array=<(gen_out_dir)/reading_correction_error.data', | |
687 | '--output_correction_array=<(gen_out_dir)/reading_correction_correction.data', | |
688 | ], | |
689 | 'message': ('[<(dataset_tag)] Generating ' + | |
690 | '<(gen_out_dir)/reading_correction*'), | |
678 | 691 | }, |
679 | 692 | ], |
680 | 693 | }, |
63 | 63 | void GetSuffixDictionaryData(StringPiece *key_array_data, |
64 | 64 | StringPiece *value_array_data, |
65 | 65 | const uint32 **token_array) const override; |
66 | void GetReadingCorrectionData( | |
67 | StringPiece *value_array_data, StringPiece *error_array_data, | |
68 | StringPiece *correction_array_data) const override; | |
66 | 69 | |
67 | 70 | // The following interfaces are not yet implemented. |
68 | 71 | // TODO(noriyukit): Implement all the interfaces by migrating embedded C++ |
70 | 73 | const dictionary::UserPOS::POSToken *GetUserPOSData() const override; |
71 | 74 | const dictionary::POSMatcher *GetPOSMatcher() const override; |
72 | 75 | |
73 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
74 | size_t *size) const override; | |
75 | 76 | void GetSymbolRewriterData(const EmbeddedDictionary::Token **data, |
76 | 77 | size_t *size) const override; |
77 | 78 | #ifndef NO_USAGE_REWRITER |
99 | 100 | StringPiece suffix_key_array_data_; |
100 | 101 | StringPiece suffix_value_array_data_; |
101 | 102 | StringPiece suffix_token_array_data_; |
103 | StringPiece reading_correction_value_array_data_; | |
104 | StringPiece reading_correction_error_array_data_; | |
105 | StringPiece reading_correction_correction_array_data_; | |
102 | 106 | |
103 | 107 | DISALLOW_COPY_AND_ASSIGN(DataManager); |
104 | 108 | }; |
36 | 36 | |
37 | 37 | namespace mozc { |
38 | 38 | |
39 | struct ReadingCorrectionItem; | |
40 | 39 | #ifndef NO_USAGE_REWRITER |
41 | 40 | struct ConjugationSuffix; |
42 | 41 | struct UsageDictItem; |
81 | 80 | const uint32 **token_array) const = 0; |
82 | 81 | |
83 | 82 | // Gets the value, error and correction array data for reading correction. |
84 | virtual void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
85 | size_t *size) const = 0; | |
83 | virtual void GetReadingCorrectionData( | |
84 | StringPiece *value_array_data, StringPiece *error_array_data, | |
85 | StringPiece *correction_array_data) const = 0; | |
86 | 86 | |
87 | 87 | // Gets the address of collocation data array and its size. |
88 | 88 | virtual void GetCollocationData(const char **array, size_t *size) const = 0; |
32 | 32 | #include "base/logging.h" |
33 | 33 | #include "base/port.h" |
34 | 34 | #include "dictionary/pos_matcher.h" |
35 | #include "rewriter/correction_rewriter.h" | |
36 | 35 | #ifndef NO_USAGE_REWRITER |
37 | 36 | #include "rewriter/usage_rewriter_data_structs.h" |
38 | 37 | #endif // NO_USAGE_REWRITER |
114 | 113 | manager_.GetSuffixDictionaryData(key_array, value_array, token_array); |
115 | 114 | } |
116 | 115 | |
117 | namespace { | |
118 | // Include kReadingCorrections. | |
119 | #include "data_manager/oss/reading_correction_data.h" | |
120 | } // namespace | |
121 | ||
122 | 116 | void OssDataManager::GetReadingCorrectionData( |
123 | const ReadingCorrectionItem **array, | |
124 | size_t *size) const { | |
125 | *array = kReadingCorrections; | |
126 | *size = arraysize(kReadingCorrections); | |
117 | StringPiece *value_array_data, StringPiece *error_array_data, | |
118 | StringPiece *correction_array_data) const { | |
119 | manager_.GetReadingCorrectionData(value_array_data, error_array_data, | |
120 | correction_array_data); | |
127 | 121 | } |
128 | 122 | |
129 | 123 | void OssDataManager::GetCollocationData(const char **array, |
52 | 52 | void GetSystemDictionaryData(const char **data, int *size) const override; |
53 | 53 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
54 | 54 | const uint32 **token_array) const override; |
55 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
56 | size_t *size) const override; | |
55 | void GetReadingCorrectionData( | |
56 | StringPiece *value_array_data, StringPiece *error_array_data, | |
57 | StringPiece *correction_array_data) const override; | |
57 | 58 | void GetCollocationData(const char **array, size_t *size) const override; |
58 | 59 | void GetCollocationSuppressionData(const char **array, |
59 | 60 | size_t *size) const override; |
58 | 58 | void GetSystemDictionaryData(const char **data, int *size) const override {} |
59 | 59 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
60 | 60 | const uint32 **token_array) const override {} |
61 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
62 | size_t *size) const override {} | |
61 | void GetReadingCorrectionData( | |
62 | StringPiece *value_array_data, StringPiece *error_array_data, | |
63 | StringPiece *correction_array_data) const override {} | |
63 | 64 | void GetCollocationData(const char **array, size_t *size) const override {} |
64 | 65 | void GetCollocationSuppressionData(const char **array, |
65 | 66 | size_t *size) const override {} |
38 | 38 | #include "dictionary/pos_group.h" |
39 | 39 | #include "dictionary/pos_matcher.h" |
40 | 40 | #include "dictionary/user_pos.h" |
41 | #include "rewriter/correction_rewriter.h" | |
42 | 41 | #include "rewriter/embedded_dictionary.h" |
43 | 42 | #ifndef NO_USAGE_REWRITER |
44 | 43 | #include "rewriter/usage_rewriter_data_structs.h" |
55 | 54 | namespace { |
56 | 55 | |
57 | 56 | #include "data_manager/@DIR@/pos_matcher_data.h" |
58 | #include "data_manager/@DIR@/reading_correction_data.h" | |
59 | 57 | #include "data_manager/@DIR@/symbol_rewriter_data.h" |
60 | 58 | #include "data_manager/@DIR@/user_pos_data.h" |
61 | 59 | #ifndef NO_USAGE_REWRITER |
76 | 74 | // elements are not required at runtime. |
77 | 75 | packer.SetPosMatcherData(kRuleIdTable, arraysize(kRuleIdTable) - 1, |
78 | 76 | kRangeTables, arraysize(kRangeTables) - 1); |
79 | packer.SetReadingCorretions(kReadingCorrections, | |
80 | arraysize(kReadingCorrections)); | |
81 | 77 | packer.SetSymbolRewriterData(kSymbolData_token_data, kSymbolData_token_size); |
82 | 78 | #ifndef NO_USAGE_REWRITER |
83 | 79 | packer.SetUsageRewriterData(kConjugationNum, |
41 | 41 | #include "data_manager/packed/system_dictionary_data.pb.h" |
42 | 42 | #include "data_manager/packed/system_dictionary_format_version.h" |
43 | 43 | #include "dictionary/pos_matcher.h" |
44 | #include "rewriter/correction_rewriter.h" | |
45 | 44 | #include "rewriter/embedded_dictionary.h" |
46 | 45 | #ifndef NO_USAGE_REWRITER |
47 | 46 | #include "rewriter/usage_rewriter_data_structs.h" |
95 | 94 | void GetSystemDictionaryData(const char **data, int *size) const; |
96 | 95 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
97 | 96 | const uint32 **token_array) const; |
98 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
99 | size_t *size) const; | |
97 | void GetReadingCorrectionData( | |
98 | StringPiece *value_array_data, StringPiece *error_array_data, | |
99 | StringPiece *correction_array_data) const; | |
100 | 100 | void GetCollocationData(const char **array, size_t *size) const; |
101 | 101 | void GetCollocationSuppressionData(const char **array, |
102 | 102 | size_t *size) const; |
128 | 128 | unique_ptr<uint16[]> rule_id_table_; |
129 | 129 | unique_ptr<POSMatcher::Range *[]> range_tables_; |
130 | 130 | unique_ptr<Range[]> range_table_items_; |
131 | unique_ptr<ReadingCorrectionItem[]> reading_corrections_; | |
132 | 131 | unique_ptr<EmbeddedDictionary::Value[]> symbol_data_values_; |
133 | 132 | size_t symbol_data_token_size_; |
134 | 133 | unique_ptr<EmbeddedDictionary::Token[]> symbol_data_tokens_; |
270 | 269 | range_table_items_[range_index].lower = static_cast<uint16>(0xFFFF); |
271 | 270 | range_table_items_[range_index].upper = static_cast<uint16>(0xFFFF); |
272 | 271 | ++range_index; |
273 | } | |
274 | ||
275 | // Makes reading correction data. | |
276 | reading_corrections_.reset( | |
277 | new ReadingCorrectionItem[ | |
278 | system_dictionary_data_->reading_corrections_size()]); | |
279 | for (size_t i = 0; | |
280 | i < system_dictionary_data_->reading_corrections_size(); | |
281 | ++i) { | |
282 | const SystemDictionaryData::ReadingCorrectionItem &item = | |
283 | system_dictionary_data_->reading_corrections(i); | |
284 | if (item.has_value()) { | |
285 | reading_corrections_[i].value = item.value().data(); | |
286 | } else { | |
287 | reading_corrections_[i].value = NULL; | |
288 | } | |
289 | if (item.has_error()) { | |
290 | reading_corrections_[i].error = item.error().data(); | |
291 | } else { | |
292 | reading_corrections_[i].error = NULL; | |
293 | } | |
294 | if (item.has_correction()) { | |
295 | reading_corrections_[i].correction = item.correction().data(); | |
296 | } else { | |
297 | reading_corrections_[i].correction = NULL; | |
298 | } | |
299 | 272 | } |
300 | 273 | |
301 | 274 | // Makes symbol dictionary data. |
464 | 437 | } |
465 | 438 | |
466 | 439 | void PackedDataManager::Impl::GetReadingCorrectionData( |
467 | const ReadingCorrectionItem **array, | |
468 | size_t *size) const { | |
469 | *array = reading_corrections_.get(); | |
470 | *size = system_dictionary_data_->reading_corrections().size(); | |
440 | StringPiece *value_array_data, StringPiece *error_array_data, | |
441 | StringPiece *correction_array_data) const { | |
442 | manager_.GetReadingCorrectionData(value_array_data, error_array_data, | |
443 | correction_array_data); | |
471 | 444 | } |
472 | 445 | |
473 | 446 | void PackedDataManager::Impl::GetCollocationData( |
624 | 597 | } |
625 | 598 | |
626 | 599 | void PackedDataManager::GetReadingCorrectionData( |
627 | const ReadingCorrectionItem **array, | |
628 | size_t *size) const { | |
629 | manager_impl_->GetReadingCorrectionData(array, size); | |
600 | StringPiece *value_array_data, StringPiece *error_array_data, | |
601 | StringPiece *correction_array_data) const { | |
602 | manager_impl_->GetReadingCorrectionData(value_array_data, error_array_data, | |
603 | correction_array_data); | |
630 | 604 | } |
631 | 605 | |
632 | 606 | void PackedDataManager::GetCollocationData( |
62 | 62 | void GetSystemDictionaryData(const char **data, int *size) const override; |
63 | 63 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
64 | 64 | const uint32 **token_array) const override; |
65 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
66 | size_t *size) const override; | |
65 | void GetReadingCorrectionData( | |
66 | StringPiece *value_array_data, StringPiece *error_array_data, | |
67 | StringPiece *correction_array_data) const override; | |
67 | 68 | void GetCollocationData(const char **array, size_t *size) const override; |
68 | 69 | void GetCollocationSuppressionData(const char **array, |
69 | 70 | size_t *size) const override; |
66 | 66 | |
67 | 67 | reserved 7; // DEPRECATED: repeated SuffixToken suffix_tokens = 7; |
68 | 68 | |
69 | message ReadingCorrectionItem { | |
70 | optional string value = 1; | |
71 | optional string error = 2; | |
72 | optional string correction = 3; | |
73 | }; | |
74 | repeated ReadingCorrectionItem reading_corrections = 8; | |
69 | // DEPRECATED: repeated ReadingCorrectionItem reading_corrections = 8; | |
70 | reserved 8; | |
75 | 71 | |
76 | 72 | reserved 9; // DEPRECATED: optional SegmenterData segmenter_data = 9; |
77 | 73 |
42 | 42 | #include "dictionary/pos_group.h" |
43 | 43 | #include "dictionary/pos_matcher.h" |
44 | 44 | #include "dictionary/user_pos.h" |
45 | #include "rewriter/correction_rewriter.h" | |
46 | 45 | #include "rewriter/embedded_dictionary.h" |
47 | 46 | #ifndef NO_USAGE_REWRITER |
48 | 47 | #include "rewriter/usage_rewriter_data_structs.h" |
109 | 108 | = range_table->add_ranges(); |
110 | 109 | range->set_lower(range_tables[i][j].lower); |
111 | 110 | range->set_upper(range_tables[i][j].upper); |
112 | } | |
113 | } | |
114 | } | |
115 | ||
116 | void SystemDictionaryDataPacker::SetReadingCorretions( | |
117 | const ReadingCorrectionItem *reading_corrections, | |
118 | size_t reading_corrections_count) { | |
119 | for (size_t i = 0; i < reading_corrections_count; ++i) { | |
120 | SystemDictionaryData::ReadingCorrectionItem *item = | |
121 | system_dictionary_->add_reading_corrections(); | |
122 | if (reading_corrections[i].value) { | |
123 | item->set_value(reading_corrections[i].value); | |
124 | } | |
125 | if (reading_corrections[i].error) { | |
126 | item->set_error(reading_corrections[i].error); | |
127 | } | |
128 | if (reading_corrections[i].correction) { | |
129 | item->set_correction(reading_corrections[i].correction); | |
130 | 111 | } |
131 | 112 | } |
132 | 113 | } |
34 | 34 | #include "base/port.h" |
35 | 35 | #include "dictionary/pos_matcher.h" |
36 | 36 | #include "dictionary/user_pos.h" |
37 | #include "rewriter/correction_rewriter.h" | |
38 | 37 | #include "rewriter/embedded_dictionary.h" |
39 | 38 | |
40 | 39 | namespace mozc { |
60 | 59 | size_t rule_id_table_count, |
61 | 60 | const dictionary::POSMatcher::Range *const *range_tables, |
62 | 61 | size_t range_tables_count); |
63 | void SetReadingCorretions( | |
64 | const ReadingCorrectionItem *reading_corrections, | |
65 | size_t reading_corrections_count); | |
66 | 62 | void SetSuggestionFilterData( |
67 | 63 | const void *suggestion_filter_data, |
68 | 64 | size_t suggestion_filter_data_size); |
32 | 32 | namespace mozc { |
33 | 33 | namespace packed { |
34 | 34 | |
35 | const int kSystemDictionaryFormatVersion = 15; | |
35 | const int kSystemDictionaryFormatVersion = 16; | |
36 | 36 | |
37 | 37 | } // namespace packed |
38 | 38 | } // namespace mozc |
32 | 32 | #include "base/logging.h" |
33 | 33 | #include "base/port.h" |
34 | 34 | #include "dictionary/pos_matcher.h" |
35 | #include "rewriter/correction_rewriter.h" | |
36 | 35 | #ifndef NO_USAGE_REWRITER |
37 | 36 | #include "rewriter/usage_rewriter_data_structs.h" |
38 | 37 | #endif // NO_USAGE_REWRITER |
90 | 89 | manager_.GetSuffixDictionaryData(key_array, value_array, token_array); |
91 | 90 | } |
92 | 91 | |
93 | namespace { | |
94 | // Include kReadingCorrections. | |
95 | #include "data_manager/testing/reading_correction_data.h" | |
96 | } // namespace | |
97 | ||
98 | 92 | void MockDataManager::GetReadingCorrectionData( |
99 | const ReadingCorrectionItem **array, | |
100 | size_t *size) const { | |
101 | *array = kReadingCorrections; | |
102 | *size = arraysize(kReadingCorrections); | |
93 | StringPiece *value_array_data, StringPiece *error_array_data, | |
94 | StringPiece *correction_array_data) const { | |
95 | manager_.GetReadingCorrectionData(value_array_data, error_array_data, | |
96 | correction_array_data); | |
103 | 97 | } |
104 | 98 | |
105 | 99 | void MockDataManager::GetCollocationData(const char **array, |
50 | 50 | void GetSystemDictionaryData(const char **data, int *size) const override; |
51 | 51 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
52 | 52 | const uint32 **token_array) const override; |
53 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
54 | size_t *size) const override; | |
53 | void GetReadingCorrectionData( | |
54 | StringPiece *value_array_data, StringPiece *error_array_data, | |
55 | StringPiece *correction_array_data) const override; | |
55 | 56 | void GetCollocationData(const char **array, size_t *size) const override; |
56 | 57 | void GetCollocationSuppressionData(const char **array, |
57 | 58 | size_t *size) const override; |
57 | 57 | void GetSystemDictionaryData(const char **data, int *size) const override {} |
58 | 58 | void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array, |
59 | 59 | const uint32 **token_array) const override {} |
60 | void GetReadingCorrectionData(const ReadingCorrectionItem **array, | |
61 | size_t *size) const override {} | |
60 | void GetReadingCorrectionData( | |
61 | StringPiece *value_array_data, StringPiece *error_array_data, | |
62 | StringPiece *correction_array_data) const override {} | |
62 | 63 | void GetCollocationData(const char **array, size_t *size) const override {} |
63 | 64 | void GetCollocationSuppressionData(const char **array, |
64 | 65 | size_t *size) const override {} |
0 | 0 | MAJOR=2 |
1 | 1 | MINOR=17 |
2 | BUILD=2500 | |
2 | BUILD=2501 | |
3 | 3 | REVISION=102 |
4 | 4 | # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be |
5 | 5 | # downloaded by NaCl Mozc. |
6 | NACL_DICTIONARY_VERSION=15 | |
6 | NACL_DICTIONARY_VERSION=16 |
41 | 41 | #include "request/conversion_request.h" |
42 | 42 | |
43 | 43 | namespace mozc { |
44 | namespace { | |
45 | 44 | |
46 | void SetCandidate(const ReadingCorrectionItem *item, | |
47 | Segment::Candidate *candidate) { | |
48 | DCHECK(item); | |
45 | void CorrectionRewriter::SetCandidate(const ReadingCorrectionItem &item, | |
46 | Segment::Candidate *candidate) { | |
49 | 47 | candidate->prefix = "\xE2\x86\x92 "; // "→ " |
50 | 48 | candidate->attributes |= Segment::Candidate::SPELLING_CORRECTION; |
49 | ||
51 | 50 | candidate->description = |
52 | 51 | // "もしかして" |
53 | "<\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6: " + | |
54 | string(item->correction) + ">"; | |
52 | "<\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6: "; | |
53 | item.correction.AppendToString(&candidate->description); | |
54 | candidate->description.append(1, '>'); | |
55 | ||
55 | 56 | DCHECK(candidate->IsValid()); |
56 | 57 | } |
57 | ||
58 | struct ReadingCorrectionItemCompare { | |
59 | bool operator()(const ReadingCorrectionItem &s1, | |
60 | const ReadingCorrectionItem &s2) const { | |
61 | return (strcmp(s1.error, s2.error) < 0); | |
62 | } | |
63 | }; | |
64 | } // namespace | |
65 | 58 | |
66 | 59 | bool CorrectionRewriter::LookupCorrection( |
67 | 60 | const string &key, |
68 | 61 | const string &value, |
69 | vector<const ReadingCorrectionItem *> *results) const { | |
62 | vector<ReadingCorrectionItem> *results) const { | |
70 | 63 | CHECK(results); |
71 | 64 | results->clear(); |
72 | ReadingCorrectionItem key_item; | |
73 | key_item.error = key.c_str(); | |
74 | const ReadingCorrectionItem *result = | |
75 | std::lower_bound(reading_corrections_, reading_corrections_ + size_, | |
76 | key_item, ReadingCorrectionItemCompare()); | |
77 | if (result == (reading_corrections_ + size_) || | |
78 | key != result->error) { | |
79 | return false; | |
80 | } | |
81 | 65 | |
82 | for (; result != (reading_corrections_ + size_); ++result) { | |
83 | if (key != result->error) { | |
84 | break; | |
85 | } | |
86 | if (value.empty() || value == result->value) { | |
87 | results->push_back(result); | |
66 | using Iter = SerializedStringArray::const_iterator; | |
67 | pair<Iter, Iter> range = std::equal_range(error_array_.begin(), | |
68 | error_array_.end(), | |
69 | key); | |
70 | for (; range.first != range.second; ++range.first) { | |
71 | const StringPiece v = value_array_[range.first.index()]; | |
72 | if (value.empty() || value == v) { | |
73 | results->emplace_back(v, *range.first, | |
74 | correction_array_[range.first.index()]); | |
88 | 75 | } |
89 | 76 | } |
90 | ||
91 | 77 | return !results->empty(); |
92 | 78 | } |
93 | 79 | |
94 | CorrectionRewriter::CorrectionRewriter( | |
95 | const ReadingCorrectionItem *reading_corrections, const size_t array_size) : | |
96 | reading_corrections_(reading_corrections), size_(array_size) {} | |
80 | CorrectionRewriter::CorrectionRewriter(StringPiece value_array_data, | |
81 | StringPiece error_array_data, | |
82 | StringPiece correction_array_data) { | |
83 | DCHECK(SerializedStringArray::VerifyData(value_array_data)); | |
84 | DCHECK(SerializedStringArray::VerifyData(error_array_data)); | |
85 | DCHECK(SerializedStringArray::VerifyData(correction_array_data)); | |
86 | value_array_.Set(value_array_data); | |
87 | error_array_.Set(error_array_data); | |
88 | correction_array_.Set(correction_array_data); | |
89 | DCHECK_EQ(value_array_.size(), error_array_.size()); | |
90 | DCHECK_EQ(value_array_.size(), correction_array_.size()); | |
91 | } | |
97 | 92 | |
98 | 93 | // static |
99 | 94 | CorrectionRewriter *CorrectionRewriter::CreateCorrectionRewriter( |
100 | 95 | const DataManagerInterface *data_manager) { |
101 | const ReadingCorrectionItem *array = NULL; | |
102 | size_t array_size = 0; | |
103 | data_manager->GetReadingCorrectionData(&array, &array_size); | |
104 | return new CorrectionRewriter(array, array_size); | |
96 | StringPiece value_array_data, error_array_data, correction_array_data; | |
97 | data_manager->GetReadingCorrectionData(&value_array_data, | |
98 | &error_array_data, | |
99 | &correction_array_data); | |
100 | return new CorrectionRewriter(value_array_data, error_array_data, | |
101 | correction_array_data); | |
105 | 102 | } |
106 | 103 | |
107 | 104 | CorrectionRewriter::~CorrectionRewriter() {} |
113 | 110 | } |
114 | 111 | |
115 | 112 | bool modified = false; |
116 | vector<const ReadingCorrectionItem *> results; | |
113 | vector<ReadingCorrectionItem> results; | |
117 | 114 | |
118 | 115 | for (size_t i = 0; i < segments->conversion_segments_size(); ++i) { |
119 | 116 | Segment *segment = segments->mutable_conversion_segment(i); |
155 | 152 | segment->insert_candidate(kInsertPostion); |
156 | 153 | DCHECK(mutable_candidate); |
157 | 154 | mutable_candidate->CopyFrom(top_candidate); |
158 | Util::ConcatStrings(results[k]->error, | |
155 | Util::ConcatStrings(results[k].error, | |
159 | 156 | top_candidate.functional_key(), |
160 | 157 | &mutable_candidate->key); |
161 | Util::ConcatStrings(results[k]->value, | |
158 | Util::ConcatStrings(results[k].value, | |
162 | 159 | top_candidate.functional_value(), |
163 | 160 | &mutable_candidate->value); |
164 | 161 | mutable_candidate->inner_segment_boundary.clear(); |
33 | 33 | #include <string> |
34 | 34 | #include <vector> |
35 | 35 | |
36 | #include "base/serialized_string_array.h" | |
37 | #include "base/string_piece.h" | |
36 | 38 | #include "rewriter/rewriter_interface.h" |
37 | 39 | |
38 | 40 | namespace mozc { |
39 | ||
40 | struct ReadingCorrectionItem { | |
41 | // ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき") | |
42 | const char *value; | |
43 | const char *error; | |
44 | const char *correction; | |
45 | }; | |
46 | 41 | |
47 | 42 | class ConversionRequest; |
48 | 43 | class DataManagerInterface; |
56 | 51 | static CorrectionRewriter *CreateCorrectionRewriter( |
57 | 52 | const DataManagerInterface *data_manager); |
58 | 53 | |
59 | CorrectionRewriter(const ReadingCorrectionItem *reading_corrections, | |
60 | size_t array_size); | |
61 | virtual ~CorrectionRewriter(); | |
62 | virtual bool Rewrite(const ConversionRequest &request, | |
63 | Segments *segments) const; | |
54 | CorrectionRewriter(StringPiece value_array_data, StringPiece error_array_data, | |
55 | StringPiece correction_array_data); | |
56 | ~CorrectionRewriter() override; | |
64 | 57 | |
65 | virtual int capability(const ConversionRequest &request) const { | |
58 | bool Rewrite(const ConversionRequest &request, | |
59 | Segments *segments) const override; | |
60 | ||
61 | int capability(const ConversionRequest &request) const override { | |
66 | 62 | return RewriterInterface::ALL; |
67 | 63 | } |
68 | 64 | |
69 | 65 | private: |
70 | const ReadingCorrectionItem *reading_corrections_; | |
71 | size_t size_; | |
66 | struct ReadingCorrectionItem { | |
67 | ReadingCorrectionItem(StringPiece v, StringPiece e, StringPiece c) | |
68 | : value(v), error(e), correction(c) {} | |
69 | ||
70 | // ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき") | |
71 | StringPiece value; | |
72 | StringPiece error; | |
73 | StringPiece correction; | |
74 | }; | |
75 | ||
76 | // Sets |candidate| fields from |item|. | |
77 | static void SetCandidate(const ReadingCorrectionItem &item, | |
78 | Segment::Candidate *candidate); | |
72 | 79 | |
73 | 80 | // Looks up corrections with key and value. Returns true if at least |
74 | 81 | // one correction is found in the internal dictionary. |
78 | 85 | bool LookupCorrection( |
79 | 86 | const string &key, |
80 | 87 | const string &value, |
81 | vector<const ReadingCorrectionItem *> *results) const; | |
88 | vector<ReadingCorrectionItem> *results) const; | |
89 | ||
90 | SerializedStringArray value_array_; | |
91 | SerializedStringArray error_array_; | |
92 | SerializedStringArray correction_array_; | |
82 | 93 | }; |
83 | 94 | |
84 | 95 | } // namespace mozc |
31 | 31 | #include <memory> |
32 | 32 | #include <string> |
33 | 33 | |
34 | #include "base/port.h" | |
35 | #include "base/serialized_string_array.h" | |
34 | 36 | #include "config/config_handler.h" |
35 | 37 | #include "converter/segments.h" |
36 | 38 | #include "protocol/commands.pb.h" |
40 | 42 | |
41 | 43 | namespace mozc { |
42 | 44 | namespace { |
43 | static const ReadingCorrectionItem kReadingCorrectionTestItems[] = { | |
44 | { "TSUKIGIME", "gekkyoku", "tsukigime" }, | |
45 | }; | |
46 | 45 | |
47 | 46 | Segment *AddSegment(const string &key, Segments *segments) { |
48 | 47 | Segment *segment = segments->push_back_segment(); |
71 | 70 | convreq_.set_config(&config_); |
72 | 71 | } |
73 | 72 | |
74 | virtual void SetUp() { | |
73 | void SetUp() override { | |
74 | // Create a rewriter with one entry: (TSUKIGIME, gekkyoku, tsukigime) | |
75 | const vector<StringPiece> values = {"TSUKIGIME"}; | |
76 | const vector<StringPiece> errors = {"gekkyoku"}; | |
77 | const vector<StringPiece> corrections = {"tsukigime"}; | |
75 | 78 | rewriter_.reset(new CorrectionRewriter( |
76 | kReadingCorrectionTestItems, | |
77 | arraysize(kReadingCorrectionTestItems))); | |
79 | SerializedStringArray::SerializeToBuffer(values, &values_buf_), | |
80 | SerializedStringArray::SerializeToBuffer(errors, &errors_buf_), | |
81 | SerializedStringArray::SerializeToBuffer(corrections, | |
82 | &corrections_buf_))); | |
78 | 83 | config::ConfigHandler::GetDefaultConfig(&config_); |
79 | 84 | config_.set_use_spelling_correction(true); |
80 | 85 | } |
83 | 88 | ConversionRequest convreq_; |
84 | 89 | commands::Request request_; |
85 | 90 | config::Config config_; |
91 | ||
92 | private: | |
93 | std::unique_ptr<uint32[]> values_buf_; | |
94 | std::unique_ptr<uint32[]> errors_buf_; | |
95 | std::unique_ptr<uint32[]> corrections_buf_; | |
86 | 96 | }; |
87 | 97 | |
88 | 98 | TEST_F(CorrectionRewriterTest, CapabilityTest) { |
27 | 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | 29 | |
30 | """Converter of reading correction data from TSV to C++ code. | |
30 | """Converter of reading correction data from TSV to binary format. | |
31 | 31 | |
32 | 32 | Usage: |
33 | python gen_reading_correction_data.py --input=input.tsv --output=output.h | |
33 | python gen_reading_correction_data.py | |
34 | --input=input.tsv | |
35 | --output_value_array=value_array.data | |
36 | --output_error_array=error_array.data | |
37 | --output_correction_array=correction_array.data | |
34 | 38 | """ |
35 | 39 | |
36 | 40 | __author__ = "komatsu" |
37 | 41 | |
38 | 42 | import logging |
39 | 43 | import optparse |
44 | ||
40 | 45 | from build_tools import code_generator_util |
46 | from build_tools import serialized_string_array_builder | |
41 | 47 | |
42 | 48 | |
43 | 49 | def ParseOptions(): |
44 | 50 | """Parse command line options.""" |
45 | 51 | parser = optparse.OptionParser() |
46 | parser.add_option('--input', dest='input', help='input TSV file path.') | |
47 | parser.add_option('--output', dest='output', help='output .h file path.') | |
52 | parser.add_option('--input', dest='input', help='Input TSV file path.') | |
53 | parser.add_option('--output_value_array', dest='output_value_array', | |
54 | help='Output serialized string array for values.') | |
55 | parser.add_option('--output_error_array', dest='output_error_array', | |
56 | help='Output serialized string array for errors.') | |
57 | parser.add_option('--output_correction_array', dest='output_correction_array', | |
58 | help='Output serialized string array for corrections.') | |
48 | 59 | return parser.parse_args()[0] |
49 | 60 | |
50 | 61 | |
51 | def WriteData(input_path, output_path): | |
62 | def WriteData(input_path, output_value_array_path, output_error_array_path, | |
63 | output_correction_array_path): | |
52 | 64 | outputs = [] |
53 | 65 | with open(input_path) as input_stream: |
54 | 66 | input_stream = code_generator_util.SkipLineComment(input_stream) |
59 | 71 | outputs.append([value, error, correction]) |
60 | 72 | |
61 | 73 | # In order to look up the entries via |error| with binary search, |
62 | # sort outputs here. | |
74 | # sort outputs here. | |
63 | 75 | outputs.sort(lambda x, y: cmp(x[1], y[1]) or cmp(x[0], y[0])) |
64 | 76 | |
65 | with open(output_path, 'w') as output_stream: | |
66 | output_stream.write('static const ReadingCorrectionItem ' | |
67 | 'kReadingCorrections[] = {\n') | |
68 | for output in outputs: | |
69 | (value, error, correction) = output | |
70 | output_stream.write(' // %s, %s, %s\n' % (value, error, correction)) | |
71 | output_stream.write( | |
72 | code_generator_util.FormatWithCppEscape( | |
73 | ' { %s, %s, %s },\n', value, error, correction)) | |
74 | ||
75 | output_stream.write('};\n') | |
77 | serialized_string_array_builder.SerializeToFile( | |
78 | [value for (value, _, _) in outputs], output_value_array_path) | |
79 | serialized_string_array_builder.SerializeToFile( | |
80 | [error for (_, error, _) in outputs], output_error_array_path) | |
81 | serialized_string_array_builder.SerializeToFile( | |
82 | [correction for (_, _, correction) in outputs], | |
83 | output_correction_array_path) | |
76 | 84 | |
77 | 85 | |
78 | 86 | def main(): |
79 | 87 | options = ParseOptions() |
80 | WriteData(options.input, options.output) | |
88 | WriteData(options.input, options.output_value_array, | |
89 | options.output_error_array, options.output_correction_array) | |
81 | 90 | |
82 | 91 | |
83 | 92 | if __name__ == "__main__": |
47 | 47 | '<(gen_out_dir)/embedded_collocation_suppression_data.h', |
48 | 48 | '<(gen_out_dir)/emoji_rewriter_data.h', |
49 | 49 | '<(gen_out_dir)/emoticon_rewriter_data.h', |
50 | '<(gen_out_dir)/reading_correction_data.h', | |
51 | 50 | '<(gen_out_dir)/single_kanji_rewriter_data.h', |
52 | 51 | '<(gen_out_dir)/symbol_rewriter_data.h', |
53 | 52 | '<(gen_out_dir)/usage_rewriter_data.h', |