Codebase list mozc / 306dbb8
Stop embedding reading correction data as C++ code This CL moves the embedded reading correction data to a new data set file. BUG= TEST= REF_BUG=26841123 REF_CL=115039689 REF_TIME=2016-02-19T16:55:30+09:00 REF_TIME_RAW=1455868530 +0900 Noriyuki Takahashi 8 years ago
28 changed file(s) with 220 addition(s) and 214 deletion(s). Raw diff Collapse all Expand all
115115 : array_(array), index_(index) {}
116116 iterator(const iterator &x) = default;
117117
118 size_t index() const { return index_; }
118119 StringPiece operator*() { return (*array_)[index_]; }
119120 StringPiece operator*() const { return (*array_)[index_]; }
120121 StringPiece operator[](difference_type n) const {
3535 #include "base/singleton.h"
3636 #include "converter/boundary_struct.h"
3737 #include "dictionary/pos_matcher.h"
38 #include "rewriter/correction_rewriter.h"
3938 #ifndef NO_USAGE_REWRITER
4039 #include "rewriter/usage_rewriter_data_structs.h"
4140 #endif // NO_USAGE_REWRITER
102101 manager_.GetSuffixDictionaryData(key_array, value_array, token_array);
103102 }
104103
105 namespace {
106 // Include kReadingCorrections.
107 #include "data_manager/chromeos/reading_correction_data.h"
108 } // namespace
109
110104 void ChromeOsDataManager::GetReadingCorrectionData(
111 const ReadingCorrectionItem **array,
112 size_t *size) const {
113 *array = kReadingCorrections;
114 *size = arraysize(kReadingCorrections);
105 StringPiece *value_array_data, StringPiece *error_array_data,
106 StringPiece *correction_array_data) const {
107 manager_.GetReadingCorrectionData(value_array_data, error_array_data,
108 correction_array_data);
115109 }
116110
117111 void ChromeOsDataManager::GetCollocationData(const char **array,
5050 void GetSystemDictionaryData(const char **data, int *size) const override;
5151 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
5252 const uint32 **token_array) const override;
53 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
54 size_t *size) const override;
53 void GetReadingCorrectionData(
54 StringPiece *value_array_data, StringPiece *error_array_data,
55 StringPiece *correction_array_data) const override;
5556 void GetCollocationData(const char **array, size_t *size) const override;
5657 void GetCollocationSuppressionData(const char **array,
5758 size_t *size) const override;
5757 void GetSystemDictionaryData(const char **data, int *size) const override {}
5858 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
5959 const uint32 **token_array) const override {}
60 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
61 size_t *size) const override {}
60 void GetReadingCorrectionData(
61 StringPiece *value_array_data, StringPiece *error_array_data,
62 StringPiece *correction_array_data) const override {}
6263 void GetCollocationData(const char **array, size_t *size) const override {}
6364 void GetCollocationSuppressionData(const char **array,
6465 size_t *size) const override {}
131131 return false;
132132 }
133133 }
134 if (!reader.Get("reading_correction_value",
135 &reading_correction_value_array_data_)) {
136 LOG(ERROR) << "Cannot find reading correction value array";
137 return false;
138 }
139 if (!reader.Get("reading_correction_error",
140 &reading_correction_error_array_data_)) {
141 LOG(ERROR) << "Cannot find reading correction error array";
142 return false;
143 }
144 if (!reader.Get("reading_correction_correction",
145 &reading_correction_correction_array_data_)) {
146 LOG(ERROR) << "Cannot find reading correction correction array";
147 return false;
148 }
149 {
150 SerializedStringArray value_array, error_array, correction_array;
151 if (!value_array.Init(reading_correction_value_array_data_) ||
152 !error_array.Init(reading_correction_error_array_data_) ||
153 !correction_array.Init(reading_correction_correction_array_data_) ||
154 value_array.size() != error_array.size() ||
155 value_array.size() != correction_array.size()) {
156 LOG(ERROR) << "Reading correction data is broken";
157 return false;
158 }
159 }
134160 return true;
135161 }
136162
197223 reinterpret_cast<const uint32 *>(suffix_token_array_data_.data());
198224 }
199225
200 void DataManager::GetReadingCorrectionData(const ReadingCorrectionItem **array,
201 size_t *size) const {
202 LOG(FATAL) << "Not implemented";
226 void DataManager::GetReadingCorrectionData(
227 StringPiece *value_array_data, StringPiece *error_array_data,
228 StringPiece *correction_array_data) const {
229 *value_array_data = reading_correction_value_array_data_;
230 *error_array_data = reading_correction_error_array_data_;
231 *correction_array_data = reading_correction_correction_array_data_;
203232 }
204233
205234 void DataManager::GetSymbolRewriterData(const EmbeddedDictionary::Token **data,
128128 'gen_separate_boundary_data_for_<(dataset_tag)#host',
129129 'gen_separate_counter_suffix_data_for_<(dataset_tag)#host',
130130 'gen_separate_suffix_data_for_<(dataset_tag)#host',
131 'gen_separate_reading_correction_data_for_<(dataset_tag)#host',
131132 ],
132133 'actions': [
133134 {
149150 'suffix_key': '<(gen_out_dir)/suffix_key.data',
150151 'suffix_value': '<(gen_out_dir)/suffix_value.data',
151152 'suffix_token': '<(gen_out_dir)/suffix_token.data',
153 'reading_correction_value': '<(gen_out_dir)/reading_correction_value.data',
154 'reading_correction_error': '<(gen_out_dir)/reading_correction_error.data',
155 'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data',
152156 },
153157 'inputs': [
154158 '<(dictionary)',
166170 '<(suffix_key)',
167171 '<(suffix_value)',
168172 '<(suffix_token)',
173 '<(reading_correction_value)',
174 '<(reading_correction_error)',
175 '<(reading_correction_correction)',
169176 ],
170177 'outputs': [
171178 '<(gen_out_dir)/<(out_mozc_data)',
189196 'suffix_key:32:<(gen_out_dir)/suffix_key.data',
190197 'suffix_value:32:<(gen_out_dir)/suffix_value.data',
191198 'suffix_token:32:<(gen_out_dir)/suffix_token.data',
199 'reading_correction_value:32:<(gen_out_dir)/reading_correction_value.data',
200 'reading_correction_error:32:<(gen_out_dir)/reading_correction_error.data',
201 'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data',
192202 ],
193203 },
194204 ],
203213 'gen_embedded_collocation_suppression_data_for_<(dataset_tag)#host',
204214 'gen_embedded_connection_data_for_<(dataset_tag)#host',
205215 'gen_embedded_dictionary_data_for_<(dataset_tag)#host',
206 'gen_embedded_reading_correction_data_for_<(dataset_tag)#host',
207216 'gen_embedded_suggestion_filter_data_for_<(dataset_tag)#host',
208217 'gen_embedded_symbol_rewriter_data_for_<(dataset_tag)#host',
209218 ],
651660 ],
652661 },
653662 {
654 'target_name': 'gen_embedded_reading_correction_data_for_<(dataset_tag)',
663 'target_name': 'gen_separate_reading_correction_data_for_<(dataset_tag)',
655664 'type': 'none',
656665 'toolsets': ['host'],
657666 'actions': [
666675 '<(platform_data_dir)/reading_correction.tsv',
667676 ],
668677 'outputs': [
669 '<(gen_out_dir)/reading_correction_data.h',
678 '<(gen_out_dir)/reading_correction_value.data',
679 '<(gen_out_dir)/reading_correction_error.data',
680 '<(gen_out_dir)/reading_correction_correction.data',
670681 ],
671682 'action': [
672683 'python', '<(mozc_dir)/rewriter/gen_reading_correction_data.py',
673 '--output=<(gen_out_dir)/reading_correction_data.h',
674684 '--input=<@(input_files)',
675 ],
676 'message': ('[<(dataset_tag)] Generating ' +
677 '<(gen_out_dir)/reading_correction_data.h'),
685 '--output_value_array=<(gen_out_dir)/reading_correction_value.data',
686 '--output_error_array=<(gen_out_dir)/reading_correction_error.data',
687 '--output_correction_array=<(gen_out_dir)/reading_correction_correction.data',
688 ],
689 'message': ('[<(dataset_tag)] Generating ' +
690 '<(gen_out_dir)/reading_correction*'),
678691 },
679692 ],
680693 },
6363 void GetSuffixDictionaryData(StringPiece *key_array_data,
6464 StringPiece *value_array_data,
6565 const uint32 **token_array) const override;
66 void GetReadingCorrectionData(
67 StringPiece *value_array_data, StringPiece *error_array_data,
68 StringPiece *correction_array_data) const override;
6669
6770 // The following interfaces are not yet implemented.
6871 // TODO(noriyukit): Implements all the interfaces by migrating embedded C++
7073 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
7174 const dictionary::POSMatcher *GetPOSMatcher() const override;
7275
73 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
74 size_t *size) const override;
7576 void GetSymbolRewriterData(const EmbeddedDictionary::Token **data,
7677 size_t *size) const override;
7778 #ifndef NO_USAGE_REWRITER
99100 StringPiece suffix_key_array_data_;
100101 StringPiece suffix_value_array_data_;
101102 StringPiece suffix_token_array_data_;
103 StringPiece reading_correction_value_array_data_;
104 StringPiece reading_correction_error_array_data_;
105 StringPiece reading_correction_correction_array_data_;
102106
103107 DISALLOW_COPY_AND_ASSIGN(DataManager);
104108 };
3636
3737 namespace mozc {
3838
39 struct ReadingCorrectionItem;
4039 #ifndef NO_USAGE_REWRITER
4140 struct ConjugationSuffix;
4241 struct UsageDictItem;
8180 const uint32 **token_array) const = 0;
8281
8382 // Gets the serialized string array data (values, errors and corrections) for reading correction.
84 virtual void GetReadingCorrectionData(const ReadingCorrectionItem **array,
85 size_t *size) const = 0;
83 virtual void GetReadingCorrectionData(
84 StringPiece *value_array_data, StringPiece *error_array_data,
85 StringPiece *correction_array_data) const = 0;
8686
8787 // Gets the address of collocation data array and its size.
8888 virtual void GetCollocationData(const char **array, size_t *size) const = 0;
3232 #include "base/logging.h"
3333 #include "base/port.h"
3434 #include "dictionary/pos_matcher.h"
35 #include "rewriter/correction_rewriter.h"
3635 #ifndef NO_USAGE_REWRITER
3736 #include "rewriter/usage_rewriter_data_structs.h"
3837 #endif // NO_USAGE_REWRITER
114113 manager_.GetSuffixDictionaryData(key_array, value_array, token_array);
115114 }
116115
117 namespace {
118 // Include kReadingCorrections.
119 #include "data_manager/oss/reading_correction_data.h"
120 } // namespace
121
122116 void OssDataManager::GetReadingCorrectionData(
123 const ReadingCorrectionItem **array,
124 size_t *size) const {
125 *array = kReadingCorrections;
126 *size = arraysize(kReadingCorrections);
117 StringPiece *value_array_data, StringPiece *error_array_data,
118 StringPiece *correction_array_data) const {
119 manager_.GetReadingCorrectionData(value_array_data, error_array_data,
120 correction_array_data);
127121 }
128122
129123 void OssDataManager::GetCollocationData(const char **array,
5252 void GetSystemDictionaryData(const char **data, int *size) const override;
5353 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
5454 const uint32 **token_array) const override;
55 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
56 size_t *size) const override;
55 void GetReadingCorrectionData(
56 StringPiece *value_array_data, StringPiece *error_array_data,
57 StringPiece *correction_array_data) const override;
5758 void GetCollocationData(const char **array, size_t *size) const override;
5859 void GetCollocationSuppressionData(const char **array,
5960 size_t *size) const override;
5858 void GetSystemDictionaryData(const char **data, int *size) const override {}
5959 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
6060 const uint32 **token_array) const override {}
61 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
62 size_t *size) const override {}
61 void GetReadingCorrectionData(
62 StringPiece *value_array_data, StringPiece *error_array_data,
63 StringPiece *correction_array_data) const override {}
6364 void GetCollocationData(const char **array, size_t *size) const override {}
6465 void GetCollocationSuppressionData(const char **array,
6566 size_t *size) const override {}
3838 #include "dictionary/pos_group.h"
3939 #include "dictionary/pos_matcher.h"
4040 #include "dictionary/user_pos.h"
41 #include "rewriter/correction_rewriter.h"
4241 #include "rewriter/embedded_dictionary.h"
4342 #ifndef NO_USAGE_REWRITER
4443 #include "rewriter/usage_rewriter_data_structs.h"
5554 namespace {
5655
5756 #include "data_manager/@DIR@/pos_matcher_data.h"
58 #include "data_manager/@DIR@/reading_correction_data.h"
5957 #include "data_manager/@DIR@/symbol_rewriter_data.h"
6058 #include "data_manager/@DIR@/user_pos_data.h"
6159 #ifndef NO_USAGE_REWRITER
7674 // elements are not required at runtime.
7775 packer.SetPosMatcherData(kRuleIdTable, arraysize(kRuleIdTable) - 1,
7876 kRangeTables, arraysize(kRangeTables) - 1);
79 packer.SetReadingCorretions(kReadingCorrections,
80 arraysize(kReadingCorrections));
8177 packer.SetSymbolRewriterData(kSymbolData_token_data, kSymbolData_token_size);
8278 #ifndef NO_USAGE_REWRITER
8379 packer.SetUsageRewriterData(kConjugationNum,
4141 #include "data_manager/packed/system_dictionary_data.pb.h"
4242 #include "data_manager/packed/system_dictionary_format_version.h"
4343 #include "dictionary/pos_matcher.h"
44 #include "rewriter/correction_rewriter.h"
4544 #include "rewriter/embedded_dictionary.h"
4645 #ifndef NO_USAGE_REWRITER
4746 #include "rewriter/usage_rewriter_data_structs.h"
9594 void GetSystemDictionaryData(const char **data, int *size) const;
9695 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
9796 const uint32 **token_array) const;
98 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
99 size_t *size) const;
97 void GetReadingCorrectionData(
98 StringPiece *value_array_data, StringPiece *error_array_data,
99 StringPiece *correction_array_data) const;
100100 void GetCollocationData(const char **array, size_t *size) const;
101101 void GetCollocationSuppressionData(const char **array,
102102 size_t *size) const;
128128 unique_ptr<uint16[]> rule_id_table_;
129129 unique_ptr<POSMatcher::Range *[]> range_tables_;
130130 unique_ptr<Range[]> range_table_items_;
131 unique_ptr<ReadingCorrectionItem[]> reading_corrections_;
132131 unique_ptr<EmbeddedDictionary::Value[]> symbol_data_values_;
133132 size_t symbol_data_token_size_;
134133 unique_ptr<EmbeddedDictionary::Token[]> symbol_data_tokens_;
270269 range_table_items_[range_index].lower = static_cast<uint16>(0xFFFF);
271270 range_table_items_[range_index].upper = static_cast<uint16>(0xFFFF);
272271 ++range_index;
273 }
274
275 // Makes reading correction data.
276 reading_corrections_.reset(
277 new ReadingCorrectionItem[
278 system_dictionary_data_->reading_corrections_size()]);
279 for (size_t i = 0;
280 i < system_dictionary_data_->reading_corrections_size();
281 ++i) {
282 const SystemDictionaryData::ReadingCorrectionItem &item =
283 system_dictionary_data_->reading_corrections(i);
284 if (item.has_value()) {
285 reading_corrections_[i].value = item.value().data();
286 } else {
287 reading_corrections_[i].value = NULL;
288 }
289 if (item.has_error()) {
290 reading_corrections_[i].error = item.error().data();
291 } else {
292 reading_corrections_[i].error = NULL;
293 }
294 if (item.has_correction()) {
295 reading_corrections_[i].correction = item.correction().data();
296 } else {
297 reading_corrections_[i].correction = NULL;
298 }
299272 }
300273
301274 // Makes symbol dictionary data.
464437 }
465438
466439 void PackedDataManager::Impl::GetReadingCorrectionData(
467 const ReadingCorrectionItem **array,
468 size_t *size) const {
469 *array = reading_corrections_.get();
470 *size = system_dictionary_data_->reading_corrections().size();
440 StringPiece *value_array_data, StringPiece *error_array_data,
441 StringPiece *correction_array_data) const {
442 manager_.GetReadingCorrectionData(value_array_data, error_array_data,
443 correction_array_data);
471444 }
472445
473446 void PackedDataManager::Impl::GetCollocationData(
624597 }
625598
626599 void PackedDataManager::GetReadingCorrectionData(
627 const ReadingCorrectionItem **array,
628 size_t *size) const {
629 manager_impl_->GetReadingCorrectionData(array, size);
600 StringPiece *value_array_data, StringPiece *error_array_data,
601 StringPiece *correction_array_data) const {
602 manager_impl_->GetReadingCorrectionData(value_array_data, error_array_data,
603 correction_array_data);
630604 }
631605
632606 void PackedDataManager::GetCollocationData(
6262 void GetSystemDictionaryData(const char **data, int *size) const override;
6363 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
6464 const uint32 **token_array) const override;
65 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
66 size_t *size) const override;
65 void GetReadingCorrectionData(
66 StringPiece *value_array_data, StringPiece *error_array_data,
67 StringPiece *correction_array_data) const override;
6768 void GetCollocationData(const char **array, size_t *size) const override;
6869 void GetCollocationSuppressionData(const char **array,
6970 size_t *size) const override;
6666
6767 reserved 7; // DEPRECATED: repeated SuffixToken suffix_tokens = 7;
6868
69 message ReadingCorrectionItem {
70 optional string value = 1;
71 optional string error = 2;
72 optional string correction = 3;
73 };
74 repeated ReadingCorrectionItem reading_corrections = 8;
69 // DEPRECATED: repeated ReadingCorrectionItem reading_corrections = 8;
70 reserved 8;
7571
7672 reserved 9; // DEPRECATED: optional SegmenterData segmenter_data = 9;
7773
4242 #include "dictionary/pos_group.h"
4343 #include "dictionary/pos_matcher.h"
4444 #include "dictionary/user_pos.h"
45 #include "rewriter/correction_rewriter.h"
4645 #include "rewriter/embedded_dictionary.h"
4746 #ifndef NO_USAGE_REWRITER
4847 #include "rewriter/usage_rewriter_data_structs.h"
109108 = range_table->add_ranges();
110109 range->set_lower(range_tables[i][j].lower);
111110 range->set_upper(range_tables[i][j].upper);
112 }
113 }
114 }
115
116 void SystemDictionaryDataPacker::SetReadingCorretions(
117 const ReadingCorrectionItem *reading_corrections,
118 size_t reading_corrections_count) {
119 for (size_t i = 0; i < reading_corrections_count; ++i) {
120 SystemDictionaryData::ReadingCorrectionItem *item =
121 system_dictionary_->add_reading_corrections();
122 if (reading_corrections[i].value) {
123 item->set_value(reading_corrections[i].value);
124 }
125 if (reading_corrections[i].error) {
126 item->set_error(reading_corrections[i].error);
127 }
128 if (reading_corrections[i].correction) {
129 item->set_correction(reading_corrections[i].correction);
130111 }
131112 }
132113 }
3434 #include "base/port.h"
3535 #include "dictionary/pos_matcher.h"
3636 #include "dictionary/user_pos.h"
37 #include "rewriter/correction_rewriter.h"
3837 #include "rewriter/embedded_dictionary.h"
3938
4039 namespace mozc {
6059 size_t rule_id_table_count,
6160 const dictionary::POSMatcher::Range *const *range_tables,
6261 size_t range_tables_count);
63 void SetReadingCorretions(
64 const ReadingCorrectionItem *reading_corrections,
65 size_t reading_corrections_count);
6662 void SetSuggestionFilterData(
6763 const void *suggestion_filter_data,
6864 size_t suggestion_filter_data_size);
3232 namespace mozc {
3333 namespace packed {
3434
35 const int kSystemDictionaryFormatVersion = 15;
35 const int kSystemDictionaryFormatVersion = 16;
3636
3737 } // namespace packed
3838 } // namespace mozc
3232 #include "base/logging.h"
3333 #include "base/port.h"
3434 #include "dictionary/pos_matcher.h"
35 #include "rewriter/correction_rewriter.h"
3635 #ifndef NO_USAGE_REWRITER
3736 #include "rewriter/usage_rewriter_data_structs.h"
3837 #endif // NO_USAGE_REWRITER
9089 manager_.GetSuffixDictionaryData(key_array, value_array, token_array);
9190 }
9291
93 namespace {
94 // Include kReadingCorrections.
95 #include "data_manager/testing/reading_correction_data.h"
96 } // namespace
97
9892 void MockDataManager::GetReadingCorrectionData(
99 const ReadingCorrectionItem **array,
100 size_t *size) const {
101 *array = kReadingCorrections;
102 *size = arraysize(kReadingCorrections);
93 StringPiece *value_array_data, StringPiece *error_array_data,
94 StringPiece *correction_array_data) const {
95 manager_.GetReadingCorrectionData(value_array_data, error_array_data,
96 correction_array_data);
10397 }
10498
10599 void MockDataManager::GetCollocationData(const char **array,
5050 void GetSystemDictionaryData(const char **data, int *size) const override;
5151 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
5252 const uint32 **token_array) const override;
53 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
54 size_t *size) const override;
53 void GetReadingCorrectionData(
54 StringPiece *value_array_data, StringPiece *error_array_data,
55 StringPiece *correction_array_data) const override;
5556 void GetCollocationData(const char **array, size_t *size) const override;
5657 void GetCollocationSuppressionData(const char **array,
5758 size_t *size) const override;
5757 void GetSystemDictionaryData(const char **data, int *size) const override {}
5858 void GetSuffixDictionaryData(StringPiece *key_array, StringPiece *value_array,
5959 const uint32 **token_array) const override {}
60 void GetReadingCorrectionData(const ReadingCorrectionItem **array,
61 size_t *size) const override {}
60 void GetReadingCorrectionData(
61 StringPiece *value_array_data, StringPiece *error_array_data,
62 StringPiece *correction_array_data) const override {}
6263 void GetCollocationData(const char **array, size_t *size) const override {}
6364 void GetCollocationSuppressionData(const char **array,
6465 size_t *size) const override {}
00 MAJOR=2
11 MINOR=17
2 BUILD=2500
2 BUILD=2501
33 REVISION=102
44 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
55 # downloaded by NaCl Mozc.
6 NACL_DICTIONARY_VERSION=15
6 NACL_DICTIONARY_VERSION=16
4141 #include "request/conversion_request.h"
4242
4343 namespace mozc {
44 namespace {
4544
46 void SetCandidate(const ReadingCorrectionItem *item,
47 Segment::Candidate *candidate) {
48 DCHECK(item);
45 void CorrectionRewriter::SetCandidate(const ReadingCorrectionItem &item,
46 Segment::Candidate *candidate) {
4947 candidate->prefix = "\xE2\x86\x92 "; // "→ "
5048 candidate->attributes |= Segment::Candidate::SPELLING_CORRECTION;
49
5150 candidate->description =
5251 // "もしかして"
53 "<\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6: " +
54 string(item->correction) + ">";
52 "<\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6: ";
53 item.correction.AppendToString(&candidate->description);
54 candidate->description.append(1, '>');
55
5556 DCHECK(candidate->IsValid());
5657 }
57
58 struct ReadingCorrectionItemCompare {
59 bool operator()(const ReadingCorrectionItem &s1,
60 const ReadingCorrectionItem &s2) const {
61 return (strcmp(s1.error, s2.error) < 0);
62 }
63 };
64 } // namespace
6558
6659 bool CorrectionRewriter::LookupCorrection(
6760 const string &key,
6861 const string &value,
69 vector<const ReadingCorrectionItem *> *results) const {
62 vector<ReadingCorrectionItem> *results) const {
7063 CHECK(results);
7164 results->clear();
72 ReadingCorrectionItem key_item;
73 key_item.error = key.c_str();
74 const ReadingCorrectionItem *result =
75 std::lower_bound(reading_corrections_, reading_corrections_ + size_,
76 key_item, ReadingCorrectionItemCompare());
77 if (result == (reading_corrections_ + size_) ||
78 key != result->error) {
79 return false;
80 }
8165
82 for (; result != (reading_corrections_ + size_); ++result) {
83 if (key != result->error) {
84 break;
85 }
86 if (value.empty() || value == result->value) {
87 results->push_back(result);
66 using Iter = SerializedStringArray::const_iterator;
67 pair<Iter, Iter> range = std::equal_range(error_array_.begin(),
68 error_array_.end(),
69 key);
70 for (; range.first != range.second; ++range.first) {
71 const StringPiece v = value_array_[range.first.index()];
72 if (value.empty() || value == v) {
73 results->emplace_back(v, *range.first,
74 correction_array_[range.first.index()]);
8875 }
8976 }
90
9177 return !results->empty();
9278 }
9379
94 CorrectionRewriter::CorrectionRewriter(
95 const ReadingCorrectionItem *reading_corrections, const size_t array_size) :
96 reading_corrections_(reading_corrections), size_(array_size) {}
80 CorrectionRewriter::CorrectionRewriter(StringPiece value_array_data,
81 StringPiece error_array_data,
82 StringPiece correction_array_data) {
83 DCHECK(SerializedStringArray::VerifyData(value_array_data));
84 DCHECK(SerializedStringArray::VerifyData(error_array_data));
85 DCHECK(SerializedStringArray::VerifyData(correction_array_data));
86 value_array_.Set(value_array_data);
87 error_array_.Set(error_array_data);
88 correction_array_.Set(correction_array_data);
89 DCHECK_EQ(value_array_.size(), error_array_.size());
90 DCHECK_EQ(value_array_.size(), correction_array_.size());
91 }
9792
9893 // static
9994 CorrectionRewriter *CorrectionRewriter::CreateCorrectionRewriter(
10095 const DataManagerInterface *data_manager) {
101 const ReadingCorrectionItem *array = NULL;
102 size_t array_size = 0;
103 data_manager->GetReadingCorrectionData(&array, &array_size);
104 return new CorrectionRewriter(array, array_size);
96 StringPiece value_array_data, error_array_data, correction_array_data;
97 data_manager->GetReadingCorrectionData(&value_array_data,
98 &error_array_data,
99 &correction_array_data);
100 return new CorrectionRewriter(value_array_data, error_array_data,
101 correction_array_data);
105102 }
106103
107104 CorrectionRewriter::~CorrectionRewriter() {}
113110 }
114111
115112 bool modified = false;
116 vector<const ReadingCorrectionItem *> results;
113 vector<ReadingCorrectionItem> results;
117114
118115 for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
119116 Segment *segment = segments->mutable_conversion_segment(i);
155152 segment->insert_candidate(kInsertPostion);
156153 DCHECK(mutable_candidate);
157154 mutable_candidate->CopyFrom(top_candidate);
158 Util::ConcatStrings(results[k]->error,
155 Util::ConcatStrings(results[k].error,
159156 top_candidate.functional_key(),
160157 &mutable_candidate->key);
161 Util::ConcatStrings(results[k]->value,
158 Util::ConcatStrings(results[k].value,
162159 top_candidate.functional_value(),
163160 &mutable_candidate->value);
164161 mutable_candidate->inner_segment_boundary.clear();
3333 #include <string>
3434 #include <vector>
3535
36 #include "base/serialized_string_array.h"
37 #include "base/string_piece.h"
3638 #include "rewriter/rewriter_interface.h"
3739
3840 namespace mozc {
39
40 struct ReadingCorrectionItem {
41 // ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき")
42 const char *value;
43 const char *error;
44 const char *correction;
45 };
4641
4742 class ConversionRequest;
4843 class DataManagerInterface;
5651 static CorrectionRewriter *CreateCorrectionRewriter(
5752 const DataManagerInterface *data_manager);
5853
59 CorrectionRewriter(const ReadingCorrectionItem *reading_corrections,
60 size_t array_size);
61 virtual ~CorrectionRewriter();
62 virtual bool Rewrite(const ConversionRequest &request,
63 Segments *segments) const;
54 CorrectionRewriter(StringPiece value_array_data, StringPiece error_array_data,
55 StringPiece correction_array_data);
56 ~CorrectionRewriter() override;
6457
65 virtual int capability(const ConversionRequest &request) const {
58 bool Rewrite(const ConversionRequest &request,
59 Segments *segments) const override;
60
61 int capability(const ConversionRequest &request) const override {
6662 return RewriterInterface::ALL;
6763 }
6864
6965 private:
70 const ReadingCorrectionItem *reading_corrections_;
71 size_t size_;
66 struct ReadingCorrectionItem {
67 ReadingCorrectionItem(StringPiece v, StringPiece e, StringPiece c)
68 : value(v), error(e), correction(c) {}
69
70 // ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき")
71 StringPiece value;
72 StringPiece error;
73 StringPiece correction;
74 };
75
76 // Sets |candidate| fields from |item|.
77 static void SetCandidate(const ReadingCorrectionItem &item,
78 Segment::Candidate *candidate);
7279
7380 // Looks up corrections with key and value. Return true if at least
7481 // one correction is found in the internal dictionary.
7885 bool LookupCorrection(
7986 const string &key,
8087 const string &value,
81 vector<const ReadingCorrectionItem *> *results) const;
88 vector<ReadingCorrectionItem> *results) const;
89
90 SerializedStringArray value_array_;
91 SerializedStringArray error_array_;
92 SerializedStringArray correction_array_;
8293 };
8394
8495 } // namespace mozc
3131 #include <memory>
3232 #include <string>
3333
34 #include "base/port.h"
35 #include "base/serialized_string_array.h"
3436 #include "config/config_handler.h"
3537 #include "converter/segments.h"
3638 #include "protocol/commands.pb.h"
4042
4143 namespace mozc {
4244 namespace {
43 static const ReadingCorrectionItem kReadingCorrectionTestItems[] = {
44 { "TSUKIGIME", "gekkyoku", "tsukigime" },
45 };
4645
4746 Segment *AddSegment(const string &key, Segments *segments) {
4847 Segment *segment = segments->push_back_segment();
7170 convreq_.set_config(&config_);
7271 }
7372
74 virtual void SetUp() {
73 void SetUp() override {
74 // Create a rewriter with one entry: (TSUKIGIME, gekkyoku, tsukigime)
75 const vector<StringPiece> values = {"TSUKIGIME"};
76 const vector<StringPiece> errors = {"gekkyoku"};
77 const vector<StringPiece> corrections = {"tsukigime"};
7578 rewriter_.reset(new CorrectionRewriter(
76 kReadingCorrectionTestItems,
77 arraysize(kReadingCorrectionTestItems)));
79 SerializedStringArray::SerializeToBuffer(values, &values_buf_),
80 SerializedStringArray::SerializeToBuffer(errors, &errors_buf_),
81 SerializedStringArray::SerializeToBuffer(corrections,
82 &corrections_buf_)));
7883 config::ConfigHandler::GetDefaultConfig(&config_);
7984 config_.set_use_spelling_correction(true);
8085 }
8388 ConversionRequest convreq_;
8489 commands::Request request_;
8590 config::Config config_;
91
92 private:
93 std::unique_ptr<uint32[]> values_buf_;
94 std::unique_ptr<uint32[]> errors_buf_;
95 std::unique_ptr<uint32[]> corrections_buf_;
8696 };
8797
8898 TEST_F(CorrectionRewriterTest, CapabilityTest) {
2727 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2828 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929
30 """Converter of reading correction data from TSV to C++ code.
30 """Converter of reading correction data from TSV to binary format.
3131
3232 Usage:
33 python gen_reading_correction_data.py --input=input.tsv --output=output.h
33 python gen_reading_correction_data.py
34 --input=input.tsv
35 --output_value_array=value_array.data
36 --output_error_array=error_array.data
37 --output_correction_array=correction_array.data
3438 """
3539
3640 __author__ = "komatsu"
3741
3842 import logging
3943 import optparse
44
4045 from build_tools import code_generator_util
46 from build_tools import serialized_string_array_builder
4147
4248
4349 def ParseOptions():
4450 """Parse command line options."""
4551 parser = optparse.OptionParser()
46 parser.add_option('--input', dest='input', help='input TSV file path.')
47 parser.add_option('--output', dest='output', help='output .h file path.')
52 parser.add_option('--input', dest='input', help='Input TSV file path.')
53 parser.add_option('--output_value_array', dest='output_value_array',
54 help='Output serialized string array for values.')
55 parser.add_option('--output_error_array', dest='output_error_array',
56 help='Output serialized string array for errors.')
57 parser.add_option('--output_correction_array', dest='output_correction_array',
58 help='Output serialized string array for corrections.')
4859 return parser.parse_args()[0]
4960
5061
51 def WriteData(input_path, output_path):
62 def WriteData(input_path, output_value_array_path, output_error_array_path,
63 output_correction_array_path):
5264 outputs = []
5365 with open(input_path) as input_stream:
5466 input_stream = code_generator_util.SkipLineComment(input_stream)
5971 outputs.append([value, error, correction])
6072
6173 # In order to lookup the entries via |error| with binary search,
62 # sort outputs here.
74 # sort outputs here.
6375 outputs.sort(lambda x, y: cmp(x[1], y[1]) or cmp(x[0], y[0]))
6476
65 with open(output_path, 'w') as output_stream:
66 output_stream.write('static const ReadingCorrectionItem '
67 'kReadingCorrections[] = {\n')
68 for output in outputs:
69 (value, error, correction) = output
70 output_stream.write(' // %s, %s, %s\n' % (value, error, correction))
71 output_stream.write(
72 code_generator_util.FormatWithCppEscape(
73 ' { %s, %s, %s },\n', value, error, correction))
74
75 output_stream.write('};\n')
77 serialized_string_array_builder.SerializeToFile(
78 [value for (value, _, _) in outputs], output_value_array_path)
79 serialized_string_array_builder.SerializeToFile(
80 [error for (_, error, _) in outputs], output_error_array_path)
81 serialized_string_array_builder.SerializeToFile(
82 [correction for (_, _, correction) in outputs],
83 output_correction_array_path)
7684
7785
7886 def main():
7987 options = ParseOptions()
80 WriteData(options.input, options.output)
88 WriteData(options.input, options.output_value_array,
89 options.output_error_array, options.output_correction_array)
8190
8291
8392 if __name__ == "__main__":
4747 '<(gen_out_dir)/embedded_collocation_suppression_data.h',
4848 '<(gen_out_dir)/emoji_rewriter_data.h',
4949 '<(gen_out_dir)/emoticon_rewriter_data.h',
50 '<(gen_out_dir)/reading_correction_data.h',
5150 '<(gen_out_dir)/single_kanji_rewriter_data.h',
5251 '<(gen_out_dir)/symbol_rewriter_data.h',
5352 '<(gen_out_dir)/usage_rewriter_data.h',
6666 ],
6767 'dependencies': [
6868 '../base/base.gyp:base',
69 '../base/base.gyp:serialized_string_array',
6970 '../base/base_test.gyp:clock_mock',
7071 '../converter/converter.gyp:converter',
7172 '../converter/converter_base.gyp:converter_mock',