Stop embedding segmenter data as C++ code
This CL moves the segmenter data into a data set file for all
platforms.
BUG=
TEST=
REF_BUG=26841123
REF_CL=114514812
REF_TIME=2016-02-12T17:13:08+09:00
REF_TIME_RAW=1455264788 +0900
Noriyuki Takahashi
8 years ago
153 | 153 | ], |
154 | 154 | 'dependencies' : [ |
155 | 155 | '../base/base.gyp:base', |
156 | '../protocol/protocol.gyp:segmenter_data_proto', | |
156 | 157 | ] |
157 | 158 | }, |
158 | 159 | { |
38 | 38 | #include <vector> |
39 | 39 | |
40 | 40 | #include "base/bitarray.h" |
41 | #include "base/codegen_bytearray_stream.h" | |
42 | 41 | #include "base/file_stream.h" |
43 | 42 | #include "base/logging.h" |
44 | 43 | #include "base/port.h" |
44 | #include "base/system_util.h" | |
45 | #include "protocol/segmenter_data.pb.h" | |
45 | 46 | |
46 | 47 | namespace mozc { |
47 | 48 | |
89 | 90 | return compressed_table_[id]; |
90 | 91 | } |
91 | 92 | |
92 | size_t compressed_size() const { | |
93 | return compressed_size_; | |
94 | } | |
95 | ||
96 | void Output(const string &name, ostream *os) { | |
97 | // Disable following compression for simplifying the implementation. | |
98 | // As CompressedTable have <3000 entries, this affects at most | |
99 | // (16-8)(bit) * 3000 (entries) * 2 (L and R) = 6KB | |
100 | // | |
101 | // TODO(toshiyuki): Enable this compression again if possible or needed | |
102 | // | |
103 | // if (compressed_size_ < 256) { | |
104 | // // trivial compression -- use uint8 if possible | |
105 | // *os << "const uint8 " << name << "[] = {" << std::endl; | |
106 | // } else { | |
107 | // *os << "const uint16 " << name << "[] = {" << std::endl; | |
108 | // } | |
109 | ||
110 | *os << "const uint16 " << name << "[] = {" << std::endl; | |
111 | for (size_t i = 0; i < compressed_table_.size(); ++i) { | |
112 | *os << compressed_table_[i]; | |
113 | if (i < compressed_table_.size() - 1) { | |
114 | *os << ","; | |
115 | } | |
116 | *os << std::endl; | |
117 | } | |
118 | *os << "};" << std::endl; | |
93 | size_t compressed_size() const { return compressed_size_; } | |
94 | ||
95 | void Output(ostream *os) { | |
96 | const char* data = reinterpret_cast<const char*>(compressed_table_.data()); | |
97 | const size_t bytelen = compressed_table_.size() * sizeof(uint16); | |
98 | os->write(data, bytelen); | |
119 | 99 | } |
120 | 100 | |
121 | 101 | private: |
127 | 107 | }; |
128 | 108 | } // namespace |
129 | 109 | |
130 | void SegmenterBitarrayGenerator::GenerateBitarray(int lsize, int rsize, | |
131 | IsBoundaryFunc func, | |
132 | const string &output_file) { | |
110 | void SegmenterBitarrayGenerator::GenerateBitarray( | |
111 | int lsize, int rsize, IsBoundaryFunc func, const string &output_size_info, | |
112 | const string &output_ltable, const string &output_rtable, | |
113 | const string &output_bitarray) { | |
133 | 114 | // Load the original matrix into an array |
134 | vector<uint8> array((lsize + 1) * (rsize + 1)); | |
115 | vector<uint8> array((lsize + 1) * (rsize + 1)); | |
135 | 116 | |
136 | 117 | for (size_t rid = 0; rid <= lsize; ++rid) { |
137 | 118 | for (size_t lid = 0; lid <= rsize; ++lid) { |
198 | 179 | // verify the table |
199 | 180 | for (size_t rid = 0; rid <= lsize; ++rid) { |
200 | 181 | for (size_t lid = 0; lid <= rsize; ++lid) { |
201 | const int index= rid + lsize * lid; | |
182 | const int index = rid + lsize * lid; | |
202 | 183 | const uint32 cindex = ltable.id(rid) + kCompressedLSize * rtable.id(lid); |
203 | 184 | CHECK_EQ(barray.get(cindex), (array[index] != 0)); |
204 | 185 | } |
207 | 188 | CHECK(barray.array()); |
208 | 189 | CHECK_GT(barray.size(), 0); |
209 | 190 | |
210 | mozc::OutputFileStream ofs(output_file.c_str()); | |
211 | CHECK(ofs); | |
212 | ||
213 | ofs << "const size_t kCompressedLSize = " << kCompressedLSize << ";" | |
214 | << std::endl; | |
215 | ofs << "const size_t kCompressedRSize = " << kCompressedRSize << ";" | |
216 | << std::endl; | |
217 | ltable.Output("kCompressedLIDTable", &ofs); | |
218 | rtable.Output("kCompressedRIDTable", &ofs); | |
219 | ||
220 | mozc::CodeGenByteArrayOutputStream codegen_stream( | |
221 | &ofs, mozc::codegenstream::NOT_OWN_STREAM); | |
222 | codegen_stream.OpenVarDef("SegmenterBitArrayData"); | |
223 | codegen_stream.write(barray.array(), barray.array_size()); | |
224 | codegen_stream.CloseVarDef(); | |
191 | CHECK(SystemUtil::IsLittleEndian()) | |
192 | << "Architecture must be little endian"; | |
193 | { | |
194 | mozc::converter::SegmenterDataSizeInfo pb; | |
195 | pb.set_compressed_lsize(kCompressedLSize); | |
196 | pb.set_compressed_rsize(kCompressedRSize); | |
197 | mozc::OutputFileStream ofs(output_size_info.c_str(), | |
198 | ios_base::out | ios_base::binary); | |
199 | CHECK(ofs); | |
200 | CHECK(pb.SerializeToOstream(&ofs)); | |
201 | ofs.close(); | |
202 | } | |
203 | { | |
204 | mozc::OutputFileStream ofs(output_ltable.c_str(), | |
205 | ios_base::out | ios_base::binary); | |
206 | CHECK(ofs); | |
207 | ltable.Output(&ofs); | |
208 | ofs.close(); | |
209 | } | |
210 | { | |
211 | mozc::OutputFileStream ofs(output_rtable.c_str(), | |
212 | ios_base::out | ios_base::binary); | |
213 | CHECK(ofs); | |
214 | rtable.Output(&ofs); | |
215 | ofs.close(); | |
216 | } | |
217 | { | |
218 | mozc::OutputFileStream ofs(output_bitarray.c_str(), | |
219 | ios_base::out | ios_base::binary); | |
220 | CHECK(ofs); | |
221 | ofs.write(barray.array(), barray.array_size()); | |
222 | ofs.close(); | |
223 | } | |
225 | 224 | } |
226 | 225 | |
227 | 226 | } // namespace mozc |
39 | 39 | public: |
40 | 40 | typedef bool (*IsBoundaryFunc)(uint16 rid, uint16 lid); |
41 | 41 | static void GenerateBitarray(int lsize, int rsize, IsBoundaryFunc func, |
42 | const string &output_file); | |
42 | const string &output_size_info, | |
43 | const string &output_ltable, | |
44 | const string &output_rtable, | |
45 | const string &output_bitarray); | |
43 | 46 | |
44 | 47 | private: |
45 | 48 | DISALLOW_COPY_AND_ASSIGN(SegmenterBitarrayGenerator); |
90 | 90 | manager_.GetSystemDictionaryData(data, size); |
91 | 91 | } |
92 | 92 | |
93 | namespace { | |
94 | // Automatically generated headers containing data set for segmenter. | |
95 | #include "data_manager/chromeos/segmenter_data.h" | |
96 | } // namespace | |
97 | ||
98 | 93 | void ChromeOsDataManager::GetSegmenterData( |
99 | 94 | size_t *l_num_elements, size_t *r_num_elements, |
100 | 95 | const uint16 **l_table, const uint16 **r_table, |
101 | 96 | size_t *bitarray_num_bytes, const char **bitarray_data, |
102 | 97 | const uint16 **boundary_data) const { |
103 | *l_num_elements = kCompressedLSize; | |
104 | *r_num_elements = kCompressedRSize; | |
105 | *l_table = kCompressedLIDTable; | |
106 | *r_table = kCompressedRIDTable; | |
107 | *bitarray_num_bytes = kSegmenterBitArrayData_size; | |
108 | *bitarray_data = kSegmenterBitArrayData_data; | |
109 | *boundary_data = manager_.GetBoundaryData(); | |
98 | manager_.GetSegmenterData(l_num_elements, r_num_elements, | |
99 | l_table, r_table, bitarray_num_bytes, | |
100 | bitarray_data, boundary_data); | |
110 | 101 | } |
111 | 102 | |
112 | 103 | namespace { |
36 | 36 | namespace chromeos { |
37 | 37 | |
38 | 38 | namespace { |
39 | #include "data_manager/chromeos/chromeos_segmenter_inl.h" | |
39 | #include "data_manager/chromeos/segmenter_inl.h" | |
40 | 40 | } // namespace |
41 | 41 | |
42 | 42 | class ChromeOsDataManagerTest : public DataManagerTestBase { |
31 | 31 | #include "converter/gen_segmenter_bitarray.h" |
32 | 32 | |
33 | 33 | namespace { |
34 | #include "data_manager/chromeos/chromeos_segmenter_inl.h" | |
34 | #include "data_manager/chromeos/segmenter_inl.h" | |
35 | 35 | } |
36 | 36 | |
37 | DEFINE_string(output, "", "header filename for chromeos segmenter"); | |
37 | DEFINE_string(output_size_info, "", "Serialized SegmenterDataSizeInfo"); | |
38 | DEFINE_string(output_ltable, "", "LTable array"); | |
39 | DEFINE_string(output_rtable, "", "RTable array"); | |
40 | DEFINE_string(output_bitarray, "", "Segmenter bitarray"); | |
38 | 41 | |
39 | 42 | int main(int argc, char **argv) { |
40 | 43 | mozc::InitMozc(argv[0], &argc, &argv, true); |
41 | 44 | mozc::SegmenterBitarrayGenerator::GenerateBitarray( |
42 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output); | |
45 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output_size_info, | |
46 | FLAGS_output_ltable, FLAGS_output_rtable, FLAGS_output_bitarray); | |
43 | 47 | return 0; |
44 | 48 | } |
30 | 30 | |
31 | 31 | #include "base/logging.h" |
32 | 32 | #include "data_manager/dataset_reader.h" |
33 | #include "protocol/segmenter_data.pb.h" | |
33 | 34 | |
34 | 35 | namespace mozc { |
35 | 36 | |
68 | 69 | } |
69 | 70 | if (!reader.Get("bdry", &boundary_data_)) { |
70 | 71 | LOG(ERROR) << "Cannot find a boundary data"; |
72 | return false; | |
73 | } | |
74 | { | |
75 | StringPiece memblock; | |
76 | if (!reader.Get("segmenter_sizeinfo", &memblock)) { | |
77 | LOG(ERROR) << "Cannot find a segmenter size info"; | |
78 | return false; | |
79 | } | |
80 | converter::SegmenterDataSizeInfo sizeinfo; | |
81 | if (!sizeinfo.ParseFromArray(memblock.data(), memblock.size())) { | |
82 | LOG(ERROR) << "Failed to parse SegmenterDataSizeInfo"; | |
83 | return false; | |
84 | } | |
85 | segmenter_compressed_lsize_ = sizeinfo.compressed_lsize(); | |
86 | segmenter_compressed_rsize_ = sizeinfo.compressed_rsize(); | |
87 | } | |
88 | if (!reader.Get("segmenter_ltable", &segmenter_ltable_)) { | |
89 | LOG(ERROR) << "Cannot find a segmenter ltable"; | |
90 | return false; | |
91 | } | |
92 | if (!reader.Get("segmenter_rtable", &segmenter_rtable_)) { | |
93 | LOG(ERROR) << "Cannot find a segmenter rtable"; | |
94 | return false; | |
95 | } | |
96 | if (!reader.Get("segmenter_bitarray", &segmenter_bitarray_)) { | |
97 | LOG(ERROR) << "Cannot find a segmenter bit-array"; | |
71 | 98 | return false; |
72 | 99 | } |
73 | 100 | return true; |
118 | 145 | size_t *l_num_elements, size_t *r_num_elements, const uint16 **l_table, |
119 | 146 | const uint16 **r_table, size_t *bitarray_num_bytes, |
120 | 147 | const char **bitarray_data, const uint16 **boundary_data) const { |
121 | LOG(FATAL) << "Not implemented"; | |
148 | *l_num_elements = segmenter_compressed_lsize_; | |
149 | *r_num_elements = segmenter_compressed_rsize_; | |
150 | *l_table = reinterpret_cast<const uint16 *>(segmenter_ltable_.data()); | |
151 | *r_table = reinterpret_cast<const uint16 *>(segmenter_rtable_.data()); | |
152 | *bitarray_num_bytes = segmenter_bitarray_.size(); | |
153 | *bitarray_data = segmenter_bitarray_.data(); | |
154 | *boundary_data = reinterpret_cast<const uint16 *>(boundary_data_.data()); | |
122 | 155 | } |
123 | 156 | |
124 | 157 | void DataManager::GetSuffixDictionaryData(const dictionary::SuffixToken **data, |
59 | 59 | 'target_name': '<(dataset_tag)_data_manager', |
60 | 60 | 'type': 'static_library', |
61 | 61 | 'sources': [ |
62 | '<(gen_out_dir)/embedded_connection_data.h', | |
63 | '<(gen_out_dir)/embedded_dictionary_data.h', | |
64 | '<(gen_out_dir)/segmenter_data.h', | |
65 | 62 | '<(gen_out_dir)/suffix_data.h', |
66 | 63 | '<(gen_out_dir)/symbol_rewriter_data.h', |
67 | 64 | '<(mozc_dir)/dictionary/pos_group.h', |
143 | 140 | 'suggestion_filter': '<(gen_out_dir)/suggestion_filter_data.data', |
144 | 141 | 'pos_group': '<(gen_out_dir)/pos_group.data', |
145 | 142 | 'boundary': '<(gen_out_dir)/boundary.data', |
143 | 'segmenter_sizeinfo': '<(gen_out_dir)/segmenter_sizeinfo.data', | |
144 | 'segmenter_ltable': '<(gen_out_dir)/segmenter_ltable.data', | |
145 | 'segmenter_rtable': '<(gen_out_dir)/segmenter_rtable.data', | |
146 | 'segmenter_bitarray': '<(gen_out_dir)/segmenter_bitarray.data', | |
146 | 147 | }, |
147 | 148 | 'inputs': [ |
148 | 149 | '<(dictionary)', |
152 | 153 | '<(suggestion_filter)', |
153 | 154 | '<(pos_group)', |
154 | 155 | '<(boundary)', |
156 | '<(segmenter_sizeinfo)', | |
157 | '<(segmenter_ltable)', | |
158 | '<(segmenter_rtable)', | |
159 | '<(segmenter_bitarray)', | |
155 | 160 | ], |
156 | 161 | 'outputs': [ |
157 | 162 | '<(gen_out_dir)/<(out_mozc_data)', |
165 | 170 | 'conn:32:<(gen_out_dir)/connection.data', |
166 | 171 | 'dict:32:<(gen_out_dir)/system.dictionary', |
167 | 172 | 'sugg:32:<(gen_out_dir)/suggestion_filter_data.data', |
168 | 'posg:8:<(gen_out_dir)/pos_group.data', | |
169 | 'bdry:16:<(gen_out_dir)/boundary.data', | |
173 | 'posg:32:<(gen_out_dir)/pos_group.data', | |
174 | 'bdry:32:<(gen_out_dir)/boundary.data', | |
175 | 'segmenter_sizeinfo:32:<(gen_out_dir)/segmenter_sizeinfo.data', | |
176 | 'segmenter_ltable:32:<(gen_out_dir)/segmenter_ltable.data', | |
177 | 'segmenter_rtable:32:<(gen_out_dir)/segmenter_rtable.data', | |
178 | 'segmenter_bitarray:32:<(gen_out_dir)/segmenter_bitarray.data' | |
170 | 179 | ], |
171 | 180 | }, |
172 | 181 | ], |
183 | 192 | 'gen_embedded_counter_suffix_data_for_<(dataset_tag)#host', |
184 | 193 | 'gen_embedded_dictionary_data_for_<(dataset_tag)#host', |
185 | 194 | 'gen_embedded_reading_correction_data_for_<(dataset_tag)#host', |
186 | 'gen_embedded_segmenter_data_for_<(dataset_tag)#host', | |
187 | 195 | 'gen_embedded_suffix_data_for_<(dataset_tag)#host', |
188 | 196 | 'gen_embedded_suggestion_filter_data_for_<(dataset_tag)#host', |
189 | 197 | 'gen_embedded_symbol_rewriter_data_for_<(dataset_tag)#host', |
501 | 509 | '<@(input_files)', |
502 | 510 | ], |
503 | 511 | 'outputs': [ |
504 | '<(gen_out_dir)/<(dataset_tag)_segmenter_inl.h', | |
512 | '<(gen_out_dir)/segmenter_inl.h', | |
505 | 513 | ], |
506 | 514 | 'action': [ |
507 | 515 | 'python', '<(mozc_dir)/build_tools/redirect.py', |
508 | '<(gen_out_dir)/<(dataset_tag)_segmenter_inl.h', | |
516 | '<(gen_out_dir)/segmenter_inl.h', | |
509 | 517 | '<(mozc_dir)/converter/gen_segmenter_code.py', |
510 | 518 | '<@(input_files)', |
511 | 519 | ], |
512 | 520 | 'message': ('[<(dataset_tag)] Generating ' + |
513 | '<(gen_out_dir)/<(dataset_tag)_segmenter_inl.h.'), | |
521 | '<(gen_out_dir)/segmenter_inl.h.'), | |
514 | 522 | }, |
515 | 523 | ], |
516 | 524 | }, |
530 | 538 | ], |
531 | 539 | }, |
532 | 540 | { |
533 | 'target_name': 'gen_embedded_segmenter_data_for_<(dataset_tag)', | |
541 | 'target_name': 'gen_separate_segmenter_data_for_<(dataset_tag)', | |
534 | 542 | 'type': 'none', |
535 | 543 | 'toolsets': ['host'], |
536 | 544 | 'dependencies': [ |
538 | 546 | ], |
539 | 547 | 'actions': [ |
540 | 548 | { |
541 | 'action_name': 'gen_embedded_segmenter_data_for_<(dataset_tag)', | |
549 | 'action_name': 'gen_separate_segmenter_data_for_<(dataset_tag)', | |
542 | 550 | 'variables': { |
543 | 551 | 'generator': '<(PRODUCT_DIR)/gen_<(dataset_tag)_sbm<(EXECUTABLE_SUFFIX)' |
544 | 552 | }, |
546 | 554 | '<(generator)', |
547 | 555 | ], |
548 | 556 | 'outputs': [ |
549 | '<(gen_out_dir)/segmenter_data.h', | |
550 | ], | |
551 | 'action': [ | |
552 | '<(generator)', | |
553 | '--output=<(gen_out_dir)/segmenter_data.h', | |
554 | ], | |
555 | 'message': ('[<(dataset_tag)] Generating ' + | |
556 | '<(gen_out_dir)/segmenter_data.h.'), | |
557 | '<(gen_out_dir)/segmenter_sizeinfo.data', | |
558 | '<(gen_out_dir)/segmenter_ltable.data', | |
559 | '<(gen_out_dir)/segmenter_rtable.data', | |
560 | '<(gen_out_dir)/segmenter_bitarray.data', | |
561 | ], | |
562 | 'action': [ | |
563 | '<(generator)', | |
564 | '--output_size_info=<(gen_out_dir)/segmenter_sizeinfo.data', | |
565 | '--output_ltable=<(gen_out_dir)/segmenter_ltable.data', | |
566 | '--output_rtable=<(gen_out_dir)/segmenter_rtable.data', | |
567 | '--output_bitarray=<(gen_out_dir)/segmenter_bitarray.data', | |
568 | ], | |
569 | 'message': ('[<(dataset_tag)] Generating segmenter data files'), | |
557 | 570 | }, |
558 | 571 | ], |
559 | 572 | }, |
81 | 81 | void GetCounterSuffixSortedArray(const CounterSuffixEntry **array, |
82 | 82 | size_t *size) const override; |
83 | 83 | |
84 | // TODO(noriyukit): This function gives boundary data, which is a partial | |
85 | // result of GetSegmenterData() above. Implement the full GetSegmenterData() | |
86 | // and remove this function. | |
87 | const uint16 *GetBoundaryData() const { | |
88 | return reinterpret_cast<const uint16*>(boundary_data_.data()); | |
89 | } | |
90 | ||
91 | 84 | private: |
92 | 85 | StringPiece connection_data_; |
93 | 86 | StringPiece dictionary_data_; |
96 | 89 | StringPiece collocation_suppression_data_; |
97 | 90 | StringPiece pos_group_data_; |
98 | 91 | StringPiece boundary_data_; |
92 | size_t segmenter_compressed_lsize_; | |
93 | size_t segmenter_compressed_rsize_; | |
94 | StringPiece segmenter_ltable_; | |
95 | StringPiece segmenter_rtable_; | |
96 | StringPiece segmenter_bitarray_; | |
99 | 97 | |
100 | 98 | DISALLOW_COPY_AND_ASSIGN(DataManager); |
101 | 99 | }; |
41 | 41 | ], |
42 | 42 | 'dependencies': [ |
43 | 43 | '../base/base.gyp:base', |
44 | '../protocol/protocol.gyp:segmenter_data_proto', | |
44 | 45 | 'dataset_reader', |
45 | 46 | ], |
46 | 47 | }, |
31 | 31 | #include "converter/gen_segmenter_bitarray.h" |
32 | 32 | |
33 | 33 | namespace { |
34 | #include "data_manager/oss/oss_segmenter_inl.h" | |
34 | #include "data_manager/oss/segmenter_inl.h" | |
35 | 35 | } |
36 | 36 | |
37 | DEFINE_string(output, "", "header filename for google segmenter"); | |
37 | DEFINE_string(output_size_info, "", "Serialized SegmenterDataSizeInfo"); | |
38 | DEFINE_string(output_ltable, "", "LTable array"); | |
39 | DEFINE_string(output_rtable, "", "RTable array"); | |
40 | DEFINE_string(output_bitarray, "", "Segmenter bitarray"); | |
38 | 41 | |
39 | 42 | int main(int argc, char **argv) { |
40 | 43 | mozc::InitMozc(argv[0], &argc, &argv, true); |
41 | 44 | mozc::SegmenterBitarrayGenerator::GenerateBitarray( |
42 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output); | |
45 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output_size_info, | |
46 | FLAGS_output_ltable, FLAGS_output_rtable, FLAGS_output_bitarray); | |
43 | 47 | return 0; |
44 | 48 | } |
102 | 102 | manager_.GetSystemDictionaryData(data, size); |
103 | 103 | } |
104 | 104 | |
105 | namespace { | |
106 | // Automatically generated headers containing data set for segmenter. | |
107 | #include "data_manager/oss/segmenter_data.h" | |
108 | } // namespace | |
109 | ||
110 | 105 | void OssDataManager::GetSegmenterData( |
111 | 106 | size_t *l_num_elements, size_t *r_num_elements, |
112 | 107 | const uint16 **l_table, const uint16 **r_table, |
113 | 108 | size_t *bitarray_num_bytes, const char **bitarray_data, |
114 | 109 | const uint16 **boundary_data) const { |
115 | *l_num_elements = kCompressedLSize; | |
116 | *r_num_elements = kCompressedRSize; | |
117 | *l_table = kCompressedLIDTable; | |
118 | *r_table = kCompressedRIDTable; | |
119 | *bitarray_num_bytes = kSegmenterBitArrayData_size; | |
120 | *bitarray_data = kSegmenterBitArrayData_data; | |
121 | *boundary_data = manager_.GetBoundaryData(); | |
110 | manager_.GetSegmenterData(l_num_elements, r_num_elements, | |
111 | l_table, r_table, bitarray_num_bytes, | |
112 | bitarray_data, boundary_data); | |
122 | 113 | } |
123 | 114 | |
124 | 115 | namespace { |
36 | 36 | namespace oss { |
37 | 37 | |
38 | 38 | namespace { |
39 | #include "data_manager/oss/oss_segmenter_inl.h" | |
39 | #include "data_manager/oss/segmenter_inl.h" | |
40 | 40 | } // namespace |
41 | 41 | |
42 | 42 | class OssDataManagerTest : public DataManagerTestBase { |
58 | 58 | |
59 | 59 | #include "data_manager/@DIR@/pos_matcher_data.h" |
60 | 60 | #include "data_manager/@DIR@/reading_correction_data.h" |
61 | #include "data_manager/@DIR@/segmenter_data.h" | |
62 | 61 | #include "data_manager/@DIR@/suffix_data.h" |
63 | 62 | #include "data_manager/@DIR@/symbol_rewriter_data.h" |
64 | 63 | #include "data_manager/@DIR@/user_pos_data.h" |
84 | 83 | packer.SetSuffixTokens(kSuffixTokens, arraysize(kSuffixTokens)); |
85 | 84 | packer.SetReadingCorretions(kReadingCorrections, |
86 | 85 | arraysize(kReadingCorrections)); |
87 | packer.SetSegmenterData(kCompressedLSize, | |
88 | kCompressedRSize, | |
89 | kCompressedLIDTable, | |
90 | arraysize(kCompressedLIDTable), | |
91 | kCompressedRIDTable, | |
92 | arraysize(kCompressedRIDTable), | |
93 | kSegmenterBitArrayData_data, | |
94 | kSegmenterBitArrayData_size); | |
95 | 86 | packer.SetSymbolRewriterData(kSymbolData_token_data, kSymbolData_token_size); |
96 | 87 | #ifndef NO_USAGE_REWRITER |
97 | 88 | packer.SetUsageRewriterData(kConjugationNum, |
134 | 134 | unique_ptr<Range[]> range_table_items_; |
135 | 135 | unique_ptr<SuffixToken[]> suffix_tokens_; |
136 | 136 | unique_ptr<ReadingCorrectionItem[]> reading_corrections_; |
137 | size_t compressed_l_size_; | |
138 | size_t compressed_r_size_; | |
139 | unique_ptr<uint16[]> compressed_lid_table_; | |
140 | unique_ptr<uint16[]> compressed_rid_table_; | |
141 | 137 | unique_ptr<EmbeddedDictionary::Value[]> symbol_data_values_; |
142 | 138 | size_t symbol_data_token_size_; |
143 | 139 | unique_ptr<EmbeddedDictionary::Token[]> symbol_data_tokens_; |
154 | 150 | }; |
155 | 151 | |
156 | 152 | PackedDataManager::Impl::Impl() |
157 | : compressed_l_size_(0), | |
158 | compressed_r_size_(0), | |
159 | symbol_data_token_size_(0) { | |
153 | : symbol_data_token_size_(0) { | |
160 | 154 | } |
161 | 155 | |
162 | 156 | PackedDataManager::Impl::~Impl() { |
331 | 325 | } else { |
332 | 326 | reading_corrections_[i].correction = NULL; |
333 | 327 | } |
334 | } | |
335 | ||
336 | // Makes segment data. | |
337 | const SystemDictionaryData::SegmenterData &segmenter_data = | |
338 | system_dictionary_data_->segmenter_data(); | |
339 | compressed_l_size_ = segmenter_data.compressed_l_size(); | |
340 | compressed_r_size_ = segmenter_data.compressed_r_size(); | |
341 | compressed_lid_table_.reset( | |
342 | new uint16[segmenter_data.compressed_lid_table_size()]); | |
343 | for (size_t i = 0; i < segmenter_data.compressed_lid_table_size(); ++i) { | |
344 | compressed_lid_table_[i] = segmenter_data.compressed_lid_table(i); | |
345 | } | |
346 | compressed_rid_table_.reset( | |
347 | new uint16[segmenter_data.compressed_rid_table_size()]); | |
348 | for (size_t i = 0; i < segmenter_data.compressed_rid_table_size(); ++i) { | |
349 | compressed_rid_table_[i] = segmenter_data.compressed_rid_table(i); | |
350 | 328 | } |
351 | 329 | |
352 | 330 | // Makes symbol dictionary data. |
513 | 491 | const uint16 **l_table, const uint16 **r_table, |
514 | 492 | size_t *bitarray_num_bytes, const char **bitarray_data, |
515 | 493 | const uint16 **boundary_data) const { |
516 | *l_num_elements = compressed_l_size_; | |
517 | *r_num_elements = compressed_r_size_; | |
518 | *l_table = compressed_lid_table_.get(); | |
519 | *r_table = compressed_rid_table_.get(); | |
520 | *bitarray_num_bytes = | |
521 | system_dictionary_data_->segmenter_data().bit_array_data().size(); | |
522 | *bitarray_data = | |
523 | system_dictionary_data_->segmenter_data().bit_array_data().data(); | |
524 | *boundary_data = manager_.GetBoundaryData(); | |
494 | manager_.GetSegmenterData(l_num_elements, r_num_elements, l_table, r_table, | |
495 | bitarray_num_bytes, bitarray_data, boundary_data); | |
525 | 496 | } |
526 | 497 | |
527 | 498 | void PackedDataManager::Impl::GetSystemDictionaryData( |
80 | 80 | }; |
81 | 81 | repeated ReadingCorrectionItem reading_corrections = 8; |
82 | 82 | |
83 | message SegmenterData { | |
84 | optional uint32 compressed_l_size = 1; | |
85 | optional uint32 compressed_r_size = 2; | |
86 | repeated uint32 compressed_lid_table = 3; | |
87 | repeated uint32 compressed_rid_table = 4; | |
88 | optional bytes bit_array_data = 5; | |
89 | } | |
90 | optional SegmenterData segmenter_data = 9; | |
83 | reserved 9; // DEPRECATED: optional SegmenterData segmenter_data = 9; | |
91 | 84 | |
92 | 85 | message EmbeddedDictionary { |
93 | 86 | message Value { |
152 | 152 | } |
153 | 153 | } |
154 | 154 | |
155 | void SystemDictionaryDataPacker::SetSegmenterData( | |
156 | size_t compressed_l_size, | |
157 | size_t compressed_r_size, | |
158 | const uint16 *compressed_lid_table, | |
159 | size_t compressed_lid_table_size, | |
160 | const uint16 *compressed_rid_table, | |
161 | size_t compressed_rid_table_size, | |
162 | const char *segmenter_bit_array_data, | |
163 | size_t segmenter_bit_array_data_size) { | |
164 | SystemDictionaryData::SegmenterData *segmenter = | |
165 | system_dictionary_->mutable_segmenter_data(); | |
166 | segmenter->set_compressed_l_size(compressed_l_size); | |
167 | segmenter->set_compressed_r_size(compressed_r_size); | |
168 | for (size_t i = 0; i < compressed_lid_table_size; ++i) { | |
169 | segmenter->add_compressed_lid_table(compressed_lid_table[i]); | |
170 | } | |
171 | for (size_t i = 0; i < compressed_rid_table_size; ++i) { | |
172 | segmenter->add_compressed_rid_table(compressed_rid_table[i]); | |
173 | } | |
174 | segmenter->set_bit_array_data(segmenter_bit_array_data, | |
175 | segmenter_bit_array_data_size); | |
176 | } | |
177 | ||
178 | 155 | void SystemDictionaryDataPacker::SetSymbolRewriterData( |
179 | 156 | const mozc::EmbeddedDictionary::Token *token_data, |
180 | 157 | size_t token_size) { |
68 | 68 | void SetReadingCorretions( |
69 | 69 | const ReadingCorrectionItem *reading_corrections, |
70 | 70 | size_t reading_corrections_count); |
71 | void SetSegmenterData( | |
72 | size_t compressed_l_size, | |
73 | size_t compressed_r_size, | |
74 | const uint16 *compressed_lid_table, | |
75 | size_t compressed_lid_table_size, | |
76 | const uint16 *compressed_rid_table, | |
77 | size_t compressed_rid_table_size, | |
78 | const char *segmenter_bit_array_data, | |
79 | size_t segmenter_bit_array_data_size); | |
80 | 71 | void SetSuggestionFilterData( |
81 | 72 | const void *suggestion_filter_data, |
82 | 73 | size_t suggestion_filter_data_size); |
32 | 32 | namespace mozc { |
33 | 33 | namespace packed { |
34 | 34 | |
35 | const int kSystemDictionaryFormatVersion = 12; | |
35 | const int kSystemDictionaryFormatVersion = 13; | |
36 | 36 | |
37 | 37 | } // namespace packed |
38 | 38 | } // namespace mozc |
31 | 31 | #include "converter/gen_segmenter_bitarray.h" |
32 | 32 | |
33 | 33 | namespace { |
34 | #include "data_manager/testing/mock_segmenter_inl.h" | |
34 | #include "data_manager/testing/segmenter_inl.h" | |
35 | 35 | } |
36 | 36 | |
37 | DEFINE_string(output, "", "header filename for mock segmenter"); | |
37 | DEFINE_string(output_size_info, "", "Serialized SegmenterDataSizeInfo"); | |
38 | DEFINE_string(output_ltable, "", "LTable array"); | |
39 | DEFINE_string(output_rtable, "", "RTable array"); | |
40 | DEFINE_string(output_bitarray, "", "Segmenter bitarray"); | |
38 | 41 | |
39 | 42 | int main(int argc, char **argv) { |
40 | 43 | mozc::InitMozc(argv[0], &argc, &argv, true); |
41 | 44 | mozc::SegmenterBitarrayGenerator::GenerateBitarray( |
42 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output); | |
45 | kLSize, kRSize, &IsBoundaryInternal, FLAGS_output_size_info, | |
46 | FLAGS_output_ltable, FLAGS_output_rtable, FLAGS_output_bitarray); | |
43 | 47 | return 0; |
44 | 48 | } |
78 | 78 | manager_.GetSystemDictionaryData(data, size); |
79 | 79 | } |
80 | 80 | |
81 | namespace { | |
82 | // Automatically generated headers containing data set for segmenter. | |
83 | #include "data_manager/testing/segmenter_data.h" | |
84 | } // namespace | |
85 | ||
86 | 81 | void MockDataManager::GetSegmenterData( |
87 | 82 | size_t *l_num_elements, size_t *r_num_elements, |
88 | 83 | const uint16 **l_table, const uint16 **r_table, |
89 | 84 | size_t *bitarray_num_bytes, const char **bitarray_data, |
90 | 85 | const uint16 **boundary_data) const { |
91 | *l_num_elements = kCompressedLSize; | |
92 | *r_num_elements = kCompressedRSize; | |
93 | *l_table = kCompressedLIDTable; | |
94 | *r_table = kCompressedRIDTable; | |
95 | *bitarray_num_bytes = kSegmenterBitArrayData_size; | |
96 | *bitarray_data = kSegmenterBitArrayData_data; | |
97 | *boundary_data = manager_.GetBoundaryData(); | |
86 | manager_.GetSegmenterData(l_num_elements, r_num_elements, | |
87 | l_table, r_table, bitarray_num_bytes, | |
88 | bitarray_data, boundary_data); | |
98 | 89 | } |
99 | 90 | |
100 | 91 | namespace { |
36 | 36 | namespace testing { |
37 | 37 | |
38 | 38 | namespace { |
39 | #include "data_manager/testing/mock_segmenter_inl.h" | |
39 | #include "data_manager/testing/segmenter_inl.h" | |
40 | 40 | } // namespace |
41 | 41 | |
42 | 42 | class MockDataManagerTest : public DataManagerTestBase { |
0 | 0 | MAJOR=2 |
1 | 1 | MINOR=17 |
2 | BUILD=2472 | |
2 | BUILD=2473 | |
3 | 3 | REVISION=102 |
4 | 4 | # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be |
5 | 5 | # downloaded by NaCl Mozc. |
6 | NACL_DICTIONARY_VERSION=12 | |
6 | NACL_DICTIONARY_VERSION=13 |
208 | 208 | 'genproto_user_dictionary_storage_proto#host', |
209 | 209 | ], |
210 | 210 | }, |
211 | { | |
212 | 'target_name': 'genproto_segmenter_data_proto', | |
213 | 'type': 'none', | |
214 | 'toolsets': ['host'], | |
215 | 'sources': [ | |
216 | 'segmenter_data.proto', | |
217 | ], | |
218 | 'includes': [ | |
219 | '../protobuf/genproto.gypi', | |
220 | ], | |
221 | }, | |
222 | { | |
223 | 'target_name': 'segmenter_data_proto', | |
224 | 'type': 'static_library', | |
225 | 'toolsets': ['target', 'host'], | |
226 | 'hard_dependency': 1, | |
227 | 'sources': [ | |
228 | '<(proto_out_dir)/<(relative_dir)/segmenter_data.pb.cc', | |
229 | ], | |
230 | 'dependencies': [ | |
231 | '../protobuf/protobuf.gyp:protobuf', | |
232 | 'genproto_segmenter_data_proto#host', | |
233 | ], | |
234 | 'export_dependent_settings': [ | |
235 | 'genproto_segmenter_data_proto#host', | |
236 | ], | |
237 | }, | |
211 | 238 | ], |
212 | 239 | } |
0 | // Copyright 2010-2016, Google Inc. | |
1 | // All rights reserved. | |
2 | // | |
3 | // Redistribution and use in source and binary forms, with or without | |
4 | // modification, are permitted provided that the following conditions are | |
5 | // met: | |
6 | // | |
7 | // * Redistributions of source code must retain the above copyright | |
8 | // notice, this list of conditions and the following disclaimer. | |
9 | // * Redistributions in binary form must reproduce the above | |
10 | // copyright notice, this list of conditions and the following disclaimer | |
11 | // in the documentation and/or other materials provided with the | |
12 | // distribution. | |
13 | // * Neither the name of Google Inc. nor the names of its | |
14 | // contributors may be used to endorse or promote products derived from | |
15 | // this software without specific prior written permission. | |
16 | // | |
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ||
29 | syntax = "proto2"; | |
30 | ||
31 | package mozc.converter; | |
32 | ||
33 | message SegmenterDataSizeInfo { | |
34 | optional uint64 compressed_lsize = 1; | |
35 | optional uint64 compressed_rsize = 2; | |
36 | } |