Codebase list mozc / ad2e131
Stop embedding user POS data as C++ structures This CL defines binary format for user POS data and converts C++-embedded data to a new data set file. BUG= TEST= REF_BUG=26841123 REF_CL=116321468,116327229 REF_TIME=2016-03-04T14:08:55+09:00 REF_TIME_RAW=1457068135 +0900 Noriyuki Takahashi 8 years ago
36 changed file(s) with 528 addition(s) and 279 deletion(s). Raw diff Collapse all Expand all
198198 PepperFileUtil::Initialize(instance_, kFileIoFileSystemExpectedSize);
199199 LoadDictionary();
200200 #endif // GOOGLE_JAPANESE_INPUT_BUILD
201 user_pos_.reset(new dictionary::UserPOS(
202 packed::PackedDataManager::GetUserPosManager()->GetUserPOSData()));
201 user_pos_.reset(dictionary::UserPOS::CreateFromDataManager(
202 *packed::PackedDataManager::GetUserPosManager()));
203203
204204 engine_.reset(mozc::EngineFactory::Create());
205205 handler_.reset(new SessionHandler(engine_.get()));
5959 #include "dictionary/system/value_dictionary.h"
6060 #include "dictionary/user_dictionary.h"
6161 #include "dictionary/user_dictionary_stub.h"
62 #include "dictionary/user_pos.h"
6263 #include "engine/engine.h"
6364 #include "engine/engine_interface.h"
6465 #include "engine/mock_data_engine_factory.h"
345346 SuppressionDictionary *suppression_dictionary = new SuppressionDictionary;
346347 dictionary::UserDictionary *user_dictionary =
347348 new dictionary::UserDictionary(
348 new dictionary::UserPOS(user_pos_manager.GetUserPOSData()),
349 dictionary::UserPOS::CreateFromDataManager(user_pos_manager),
349350 user_pos_manager.GetPOSMatcher(),
350351 suppression_dictionary);
351352 InitConverterAndData(
5454 '../dictionary/dictionary.gyp:dictionary_mock',
5555 '../dictionary/dictionary.gyp:suffix_dictionary',
5656 '../dictionary/dictionary_base.gyp:user_dictionary',
57 '../dictionary/dictionary_base.gyp:user_pos',
5758 '../dictionary/system/system_dictionary.gyp:system_dictionary',
5859 '../dictionary/system/system_dictionary.gyp:value_dictionary',
5960 '../engine/engine.gyp:engine_factory',
2828
2929 #include "data_manager/chromeos/chromeos_user_pos_manager.h"
3030
31 #include "base/embedded_file.h"
3132 #include "base/logging.h"
3233 #include "base/singleton.h"
33 #include "dictionary/pos_group.h"
3434 #include "dictionary/pos_matcher.h"
35 #include "dictionary/user_pos.h"
3635
3736 namespace mozc {
3837 namespace chromeos {
4342
4443 namespace {
4544
46 // The following header file is automatically generated and contains the
47 // definition of variable, kPOSToken, of type UserPOSImpl::POSToken.
48 #include "data_manager/chromeos/user_pos_data.h"
45 // Embedded file kUserPosManagerData is defined in this header file.
46 #include "data_manager/chromeos/user_pos_manager_data.h"
4947
5048 } // namespace
5149
52 const dictionary::UserPOS::POSToken *
53 ChromeOsUserPosManager::GetUserPOSData() const {
54 DCHECK(kPOSToken != NULL);
55 return kPOSToken;
50 ChromeOsUserPosManager::ChromeOsUserPosManager() {
51 const StringPiece data = LoadEmbeddedFile(kUserPosManagerData);
52 const char *kMagicNumber = ""; // Magic number is not present.
53 CHECK(manager_.InitUserPosManagerDataFromArray(data, kMagicNumber))
54 << "Embedded user_pos_manager_data.h is broken";
55 }
56
57 ChromeOsUserPosManager::~ChromeOsUserPosManager() = default;
58
59 void ChromeOsUserPosManager::GetUserPOSData(
60 StringPiece *token_array_data, StringPiece *string_array_data) const {
61 manager_.GetUserPOSData(token_array_data, string_array_data);
5662 }
5763
5864 namespace {
3030 #define MOZC_DATA_MANAGER_CHROMEOS_CHROMEOS_USER_POS_MANAGER_H_
3131
3232 #include "base/port.h"
33 #include "data_manager/data_manager.h"
3334 #include "data_manager/data_manager_interface.h"
3435
3536 namespace mozc {
3738
3839 class ChromeOsUserPosManager : public DataManagerInterface {
3940 public:
40 ChromeOsUserPosManager() {}
41 ~ChromeOsUserPosManager() override {}
41 ChromeOsUserPosManager();
42 ~ChromeOsUserPosManager() override;
4243
4344 static ChromeOsUserPosManager *GetUserPosManager();
4445
4546 // Partially implement the interface because some binaries only require the
4647 // following embedded data.
47 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
48 void GetUserPOSData(StringPiece *token_array_data,
49 StringPiece *string_array_data) const override;
4850 const dictionary::POSMatcher *GetPOSMatcher() const override;
4951
5052 // The following are implemented in ChromeOsDataManager.
8082 size_t *size) const override {}
8183
8284 private:
85 DataManager manager_;
8386 DISALLOW_COPY_AND_ASSIGN(ChromeOsUserPosManager);
8487 };
8588
3535 #include "rewriter/serialized_dictionary.h"
3636
3737 namespace mozc {
38 namespace {
39
40 bool InitUserPosManagerDataFromReader(const DataSetReader &reader,
41 StringPiece *user_pos_token_array_data,
42 StringPiece *user_pos_string_array_data) {
43 if (!reader.Get("user_pos_token", user_pos_token_array_data)) {
44 LOG(ERROR) << "Cannot find a user POS token array";
45 return false;
46 }
47 if (!reader.Get("user_pos_string", user_pos_string_array_data)) {
48 LOG(ERROR) << "Cannot find a user POS string array";
49 return false;
50 }
51 if (user_pos_token_array_data->size() % 8 != 0 ||
52 !SerializedStringArray::VerifyData(*user_pos_string_array_data)) {
53 LOG(ERROR) << "User POS data is broken: token array data size = "
54 << user_pos_token_array_data->size() << ", string array size = "
55 << user_pos_string_array_data->size();
56 return false;
57 }
58 return true;
59 }
60
61 } // namespace
3862
3963 DataManager::DataManager() = default;
4064 DataManager::~DataManager() = default;
4367 DataSetReader reader;
4468 if (!reader.Init(array, magic)) {
4569 LOG(ERROR) << "Binary data of size " << array.size() << " is broken";
70 return false;
71 }
72 if (!InitUserPosManagerDataFromReader(reader,
73 &user_pos_token_array_data_,
74 &user_pos_string_array_data_)) {
75 LOG(ERROR) << "User POS manager data is broken";
4676 return false;
4777 }
4878 if (!reader.Get("conn", &connection_data_)) {
195225 return true;
196226 }
197227
228 bool DataManager::InitUserPosManagerDataFromArray(StringPiece array,
229 StringPiece magic) {
230 DataSetReader reader;
231 if (!reader.Init(array, magic)) {
232 LOG(ERROR) << "Binary data of size " << array.size() << " is broken";
233 return false;
234 }
235 if (!InitUserPosManagerDataFromReader(reader,
236 &user_pos_token_array_data_,
237 &user_pos_string_array_data_)) {
238 LOG(ERROR) << "User POS manager data is broken";
239 return false;
240 }
241 return true;
242 }
243
198244 void DataManager::GetConnectorData(const char **data, size_t *size) const {
199245 *data = connection_data_.data();
200246 *size = connection_data_.size();
222268 *size = suggestion_filter_data_.size();
223269 }
224270
225 const dictionary::UserPOS::POSToken *DataManager::GetUserPOSData() const {
226 LOG(FATAL) << "Not implemented";
227 return nullptr;
271 void DataManager::GetUserPOSData(StringPiece *token_array_data,
272 StringPiece *string_array_data) const {
273 *token_array_data = user_pos_token_array_data_;
274 *string_array_data = user_pos_string_array_data_;
228275 }
229276
230277 const dictionary::POSMatcher *DataManager::GetPOSMatcher() const {
119119 'dependencies': [
120120 '../data_manager_base.gyp:dataset_writer_main',
121121 '../../rewriter/rewriter_base.gyp:gen_rewriter_files#host',
122 '<(dataset_tag)_data_manager_base.gyp:gen_separate_user_pos_data_for_<(dataset_tag)#host',
122123 'gen_separate_connection_data_for_<(dataset_tag)#host',
123124 'gen_separate_dictionary_data_for_<(dataset_tag)#host',
124125 'gen_separate_collocation_data_for_<(dataset_tag)#host',
136137 'action_name': 'gen_mozc_dataset_for_<(dataset_tag)',
137138 'variables': {
138139 'generator': '<(PRODUCT_DIR)/dataset_writer_main<(EXECUTABLE_SUFFIX)',
140 'user_pos_token': '<(gen_out_dir)/user_pos_token_array.data',
141 'user_pos_string': '<(gen_out_dir)/user_pos_string_array.data',
139142 'dictionary': '<(gen_out_dir)/system.dictionary',
140143 'connection': '<(gen_out_dir)/connection.data',
141144 'collocation': '<(gen_out_dir)/collocation_data.data',
158161 'symbol_string': '<(gen_out_dir)/symbol_string.data',
159162 },
160163 'inputs': [
164 '<(user_pos_token)',
165 '<(user_pos_string)',
161166 '<(dictionary)',
162167 '<(connection)',
163168 '<(collocation)',
186191 '<(generator)',
187192 '--magic=<(magic_number)',
188193 '--output=<(gen_out_dir)/<(out_mozc_data)',
194 'user_pos_token:32:<(user_pos_token)',
195 'user_pos_string:32:<(user_pos_string)',
189196 'coll:32:<(gen_out_dir)/collocation_data.data',
190197 'cols:32:<(gen_out_dir)/collocation_suppression_data.data',
191198 'conn:32:<(gen_out_dir)/connection.data',
4444 DataManager();
4545 ~DataManager() override;
4646
47 // Parses |array| and extracts byte blocks of data set.
4748 bool InitFromArray(StringPiece array, StringPiece magic);
4849
50 // The same as above InitFromArray() but only parses data set for user pos
51 // manager. For mozc runtime modules, use InitFromArray() because this method
52 // is only for build tools, e.g., rewriter/dictionary_generator.cc (some build
53 // tools depend on user pos data to create outputs, so we need to handle
54 // partial data set).
55 bool InitUserPosManagerDataFromArray(StringPiece array, StringPiece magic);
56
4957 // The following interfaces are implemented.
58 void GetUserPOSData(StringPiece *token_array_data,
59 StringPiece *string_array_data) const override;
5060 void GetConnectorData(const char **data, size_t *size) const override;
5161 void GetSystemDictionaryData(const char **data, int *size) const override;
5262 void GetCollocationData(const char **array, size_t *size) const override;
8191 // The following interfaces are not yet implemented.
8292 // TODO(noriyukit): Implements all the interfaces by migrating embedded C++
8393 // structures to a data set file.
84 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
8594 const dictionary::POSMatcher *GetPOSMatcher() const override;
8695
8796 private:
97 StringPiece user_pos_token_array_data_;
98 StringPiece user_pos_string_array_data_;
8899 StringPiece connection_data_;
89100 StringPiece dictionary_data_;
90101 StringPiece suggestion_filter_data_;
3838 'dependencies': [
3939 '<(mozc_dir)/base/base.gyp:base',
4040 '<(mozc_dir)/dictionary/dictionary_base.gyp:pos_matcher',
41 '<(mozc_dir)/dictionary/dictionary_base.gyp:user_pos',
4241 'gen_embedded_pos_matcher_data_for_<(dataset_tag)#host',
43 'gen_embedded_user_pos_data_for_<(dataset_tag)#host',
42 'gen_user_pos_manager_data_header_for_<(dataset_tag)#host',
43 '../data_manager_base.gyp:data_manager',
4444 ],
4545 },
4646 {
4949 'toolsets': ['host'],
5050 'dependencies': [
5151 'gen_embedded_pos_matcher_data_for_<(dataset_tag)#host',
52 'gen_embedded_user_pos_data_for_<(dataset_tag)#host',
5352 ],
5453 },
5554 {
5756 'type': 'none',
5857 'toolsets': ['host'],
5958 'dependencies': [
60 'gen_embedded_user_pos_data_for_<(dataset_tag)#host',
59 'gen_separate_user_pos_data_for_<(dataset_tag)#host',
6160 ],
6261 'actions': [
6362 {
8180 ],
8281 },
8382 {
84 'target_name': 'gen_embedded_user_pos_data_for_<(dataset_tag)',
83 'target_name': 'gen_user_pos_manager_data_header_for_<(dataset_tag)',
84 'type': 'none',
85 'toolsets': ['host'],
86 'dependencies': [
87 'gen_user_pos_manager_data_for_<(dataset_tag)#host',
88 ],
89 'actions': [
90 {
91 'action_name': 'gen_user_pos_manager_data_header_for_<(dataset_tag)',
92 'variables': {
93 'user_pos_manager_data': '<(gen_out_dir)/user_pos_manager.data',
94 },
95 'inputs': [
96 '<(user_pos_manager_data)',
97 ],
98 'outputs': [
99 '<(gen_out_dir)/user_pos_manager_data.h',
100 ],
101 'action': [
102 'python', '<(mozc_dir)/build_tools/embed_file.py',
103 '--input=<(user_pos_manager_data)',
104 '--name=kUserPosManagerData',
105 '--output=<(gen_out_dir)/user_pos_manager_data.h',
106 ],
107 },
108 ],
109 },
110 {
111 'target_name': 'gen_user_pos_manager_data_for_<(dataset_tag)',
112 'type': 'none',
113 'toolsets': ['host'],
114 'dependencies': [
115 '../data_manager_base.gyp:dataset_writer_main',
116 'gen_separate_user_pos_data_for_<(dataset_tag)#host',
117 ],
118 'actions': [
119 {
120 'action_name': 'gen_user_pos_manager_data_for_<(dataset_tag)',
121 'variables': {
122 'generator': '<(PRODUCT_DIR)/dataset_writer_main<(EXECUTABLE_SUFFIX)',
123 'user_pos_token': '<(gen_out_dir)/user_pos_token_array.data',
124 'user_pos_string': '<(gen_out_dir)/user_pos_string_array.data',
125 },
126 'inputs': [
127 '<(user_pos_token)',
128 '<(user_pos_string)',
129 ],
130 'outputs': [
131 '<(gen_out_dir)/user_pos_manager.data',
132 ],
133 'action': [
134 '<(generator)',
135 '--output=<(gen_out_dir)/user_pos_manager.data',
136 'user_pos_token:32:<(user_pos_token)',
137 'user_pos_string:32:<(user_pos_string)',
138 ],
139 },
140 ],
141 },
142 {
143 'target_name': 'gen_separate_user_pos_data_for_<(dataset_tag)',
85144 'type': 'none',
86145 'toolsets': ['host'],
87146 'dependencies': [
89148 ],
90149 'actions': [
91150 {
92 'action_name': 'gen_embedded_user_pos_data_for_<(dataset_tag)',
151 'action_name': 'gen_separate_user_pos_data_for_<(dataset_tag)',
93152 'variables': {
94153 'id_def': '<(platform_data_dir)/id.def',
95154 'special_pos': '<(common_data_dir)/rules/special_pos.def',
96155 'user_pos': '<(common_data_dir)/rules/user_pos.def',
97156 'cforms': '<(common_data_dir)/rules/cforms.def',
98 'user_pos_data': '<(gen_out_dir)/user_pos_data.h',
157 'token_array_data': '<(gen_out_dir)/user_pos_token_array.data',
158 'string_array_data': '<(gen_out_dir)/user_pos_string_array.data',
99159 'pos_list': '<(gen_out_dir)/pos_list.data',
100160 },
101161 'inputs': [
106166 '<(cforms)',
107167 ],
108168 'outputs': [
109 '<(user_pos_data)',
169 '<(token_array_data)',
170 '<(string_array_data)',
110171 '<(pos_list)',
111172 ],
112173 'action': [
115176 '--special_pos_file=<(special_pos)',
116177 '--user_pos_file=<(user_pos)',
117178 '--cforms_file=<(cforms)',
118 '--output=<(user_pos_data)',
179 '--output_token_array=<(token_array_data)',
180 '--output_string_array=<(string_array_data)',
119181 '--output_pos_list=<(pos_list)',
120182 ],
121 'message': '[<(dataset_tag)] Generating <(user_pos_data).',
183 'message': '[<(dataset_tag)] Generating user pos data.',
122184 },
123185 ],
124186 },
3131
3232 #include "base/port.h"
3333 #include "base/string_piece.h"
34 #include "dictionary/user_pos.h"
3534
3635 namespace mozc {
3736
5049 public:
5150 virtual ~DataManagerInterface() {}
5251
53 // Returns the address of an array of UserPOS::POSToken.
54 virtual const dictionary::UserPOS::POSToken *GetUserPOSData() const = 0;
52 // Returns data set for UserPOS.
53 virtual void GetUserPOSData(StringPiece *token_array_data,
54 StringPiece *string_array_data) const = 0;
5555
5656 // Returns a reference to POSMatcher class handling POS rules. Don't
5757 // delete the returned pointer, which is owned by the manager.
2828
2929 #include "data_manager/oss/oss_user_pos_manager.h"
3030
31 #include "base/embedded_file.h"
3132 #include "base/logging.h"
3233 #include "base/singleton.h"
3334 #include "dictionary/pos_group.h"
3435 #include "dictionary/pos_matcher.h"
35 #include "dictionary/user_pos.h"
3636
3737 namespace mozc {
3838 namespace oss {
4343
4444 namespace {
4545
46 // The following header file is automatically generated and contains the
47 // definition of variable, kPOSToken, of type UserPOSImpl::POSToken.
48 #include "data_manager/oss/user_pos_data.h"
46 // Embedded file kUserPosManagerData is defined in this header file.
47 #include "data_manager/oss/user_pos_manager_data.h"
4948
5049 } // namespace
5150
52 const dictionary::UserPOS::POSToken *OssUserPosManager::GetUserPOSData() const {
53 DCHECK(kPOSToken != NULL);
54 return kPOSToken;
51 OssUserPosManager::OssUserPosManager() {
52 const StringPiece data = LoadEmbeddedFile(kUserPosManagerData);
53 const char *kMagicNumber = ""; // Magic number is not present.
54 CHECK(manager_.InitUserPosManagerDataFromArray(data, kMagicNumber))
55 << "Embedded user_pos_manager_data.h is broken";
56 }
57
58 OssUserPosManager::~OssUserPosManager() = default;
59
60 void OssUserPosManager::GetUserPOSData(
61 StringPiece *token_array_data, StringPiece *string_array_data) const {
62 manager_.GetUserPOSData(token_array_data, string_array_data);
5563 }
5664
5765 namespace {
3030 #define MOZC_DATA_MANAGER_OSS_OSS_USER_POS_MANAGER_H_
3131
3232 #include "base/port.h"
33 #include "data_manager/data_manager.h"
3334 #include "data_manager/data_manager_interface.h"
3435
3536 namespace mozc {
3738
3839 class OssUserPosManager : public DataManagerInterface {
3940 public:
40 OssUserPosManager() {}
41 ~OssUserPosManager() override {}
41 OssUserPosManager();
42 ~OssUserPosManager() override;
4243
4344 static OssUserPosManager *GetUserPosManager();
4445
4546 // Partially implement the interface because some binaries only require the
4647 // following embedded data.
4748 // Outputs the token array data and string array data for UserPOS.
48 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
49 void GetUserPOSData(StringPiece *token_array_data,
50 StringPiece *string_array_data) const override;
4951 const dictionary::POSMatcher *GetPOSMatcher() const override;
5052
5153 // The following are implemented in OssDataManager.
8082 size_t *size) const override {}
8183
8284 private:
85 DataManager manager_;
8386 DISALLOW_COPY_AND_ASSIGN(OssUserPosManager);
8487 };
8588
2828
2929 #include <string>
3030
31 #include "base/file_stream.h"
3132 #include "base/flags.h"
3233 #include "base/init_mozc.h"
3334 #include "base/logging.h"
3738 #include "dictionary/pos_matcher.h"
3839 #include "dictionary/user_pos.h"
3940
41 DEFINE_string(user_pos_manager_data, "", "Input user pos manager data");
4042 DEFINE_string(output, "", "Output data file name");
4143
4244 namespace mozc {
4345 namespace {
4446
4547 #include "data_manager/@DIR@/pos_matcher_data.h"
46 #include "data_manager/@DIR@/user_pos_data.h"
4748
4849 } // namespace
4950
5051 bool OutputData(const string &file_path) {
52 const char* kMagicNumber = ""; // No magic number.
5153 packed::SystemDictionaryDataPacker packer(Version::GetMozcVersion());
52 packer.SetPosTokens(kPOSToken, arraysize(kPOSToken));
54 packer.SetMozcData(InputFileStream(FLAGS_user_pos_manager_data.c_str(),
55 ios_base::in | ios_base::binary).Read(),
56 kMagicNumber);
5357 // The following two arrays contain sentinel elements but the packer doesn't
5458 // expect them. So, pass the shrunk ranges of the arrays. Note that
5559 // sentinel elements are not necessary at runtime.
6367 int main(int argc, char **argv) {
6468 mozc::InitMozc(argv[0], &argc, &argv, false);
6569
66 if (FLAGS_output.empty()) {
67 LOG(FATAL) << "output flag is needed";
70 if (FLAGS_user_pos_manager_data.empty() || FLAGS_output.empty()) {
71 LOG(FATAL) << "input and output flags are needed";
6872 return 1;
6973 }
7074 if (!mozc::OutputData(FLAGS_output)) {
5151 namespace {
5252
5353 #include "data_manager/@DIR@/pos_matcher_data.h"
54 #include "data_manager/@DIR@/user_pos_data.h"
5554
5655 } // namespace
5756
6160 dictionary_version = FLAGS_dictionary_version;
6261 }
6362 packed::SystemDictionaryDataPacker packer(dictionary_version);
64 packer.SetPosTokens(kPOSToken, arraysize(kPOSToken));
6563 // The following two arrays contain sentinel elements but the packer doesn't
6664 // expect them. So pass the shrunk ranges of the arrays. Note that sentinel
6765 // elements are not required at runtime.
4949 using std::unique_ptr;
5050
5151 using mozc::dictionary::POSMatcher;
52 using mozc::dictionary::UserPOS;
5352
5453 namespace mozc {
5554 namespace packed {
7877 bool InitWithZippedData(const string &zipped_system_dictionary_data);
7978 string GetDictionaryVersion();
8079
81 const UserPOS::POSToken *GetUserPOSData() const;
80 void GetUserPOSData(StringPiece *token_array_data,
81 StringPiece *string_array_data) const;
8282 const POSMatcher *GetPOSMatcher() const;
8383 const uint8 *GetPosGroupData() const;
8484 void GetConnectorData(const char **data, size_t *size) const;
119119 };
120120 bool InitializeWithSystemDictionaryData();
121121
122 unique_ptr<UserPOS::POSToken[]> pos_token_;
123 unique_ptr<UserPOS::ConjugationType[]> conjugation_array_;
124122 unique_ptr<uint16[]> rule_id_table_;
125123 unique_ptr<POSMatcher::Range *[]> range_tables_;
126124 unique_ptr<Range[]> range_table_items_;
171169 << " expected:" << kSystemDictionaryFormatVersion
172170 << " actual:" << system_dictionary_data_->format_version();
173171 return false;
174 }
175 // Makes UserPOS data.
176 pos_token_.reset(
177 new UserPOS::POSToken[system_dictionary_data_->pos_tokens_size()]);
178 size_t conjugation_count = 0;
179 for (size_t i = 0; i < system_dictionary_data_->pos_tokens_size(); ++i) {
180 conjugation_count +=
181 system_dictionary_data_->pos_tokens(i).conjugation_forms_size();
182 }
183 conjugation_array_.reset(new UserPOS::ConjugationType[conjugation_count]);
184 size_t conjugation_index = 0;
185 for (size_t i = 0; i < system_dictionary_data_->pos_tokens_size(); ++i) {
186 const SystemDictionaryData::PosToken &pos_token =
187 system_dictionary_data_->pos_tokens(i);
188 if (pos_token.has_pos()) {
189 pos_token_[i].pos = pos_token.pos().data();
190 } else {
191 pos_token_[i].pos = NULL;
192 }
193 pos_token_[i].conjugation_size =
194 pos_token.conjugation_forms_size();
195 pos_token_[i].conjugation_form = &conjugation_array_[conjugation_index];
196 if (pos_token.conjugation_forms_size() == 0) {
197 pos_token_[i].conjugation_form = NULL;
198 }
199 for (size_t j = 0; j < pos_token.conjugation_forms_size(); ++j) {
200 const SystemDictionaryData::PosToken::ConjugationType &conjugation_form =
201 pos_token.conjugation_forms(j);
202 if (conjugation_form.has_key_suffix()) {
203 conjugation_array_[conjugation_index].key_suffix =
204 conjugation_form.key_suffix().data();
205 } else {
206 conjugation_array_[conjugation_index].key_suffix = NULL;
207 }
208 if (conjugation_form.has_value_suffix()) {
209 conjugation_array_[conjugation_index].value_suffix =
210 conjugation_form.value_suffix().data();
211 } else {
212 conjugation_array_[conjugation_index].value_suffix = NULL;
213 }
214 conjugation_array_[conjugation_index].id = conjugation_form.id();
215 ++conjugation_index;
216 }
217172 }
218173
219174 // Makes POSMatcher data.
262217 if (system_dictionary_data_->has_mozc_data() &&
263218 !manager_.InitFromArray(system_dictionary_data_->mozc_data(),
264219 system_dictionary_data_->mozc_data_magic())) {
265 LOG(ERROR) << "Failed to initialize mozc data";
266 return false;
267 }
268
220 VLOG(1) << "Data set is incomplete. Assume this is user pos manager data.";
221 // The data set containing only user pos manager data is used in build
222 // tools.
223 // TODO(noriyukit): Fix this hard-to-understand behavior by removing
224 // PackedDataManager.
225 if (!manager_.InitUserPosManagerDataFromArray(
226 system_dictionary_data_->mozc_data(),
227 system_dictionary_data_->mozc_data_magic())) {
228 LOG(ERROR) << "Failed to initialize mozc data";
229 return false;
230 }
231 }
269232 return true;
270233 }
271234
272 const UserPOS::POSToken *PackedDataManager::Impl::GetUserPOSData() const {
273 return pos_token_.get();
235 void PackedDataManager::Impl::GetUserPOSData(
236 StringPiece *token_array_data, StringPiece *string_array_data) const {
237 manager_.GetUserPOSData(token_array_data, string_array_data);
274238 }
275239
276240 const POSMatcher *PackedDataManager::Impl::GetPOSMatcher() const {
401365 return manager_impl_->GetDictionaryVersion();
402366 }
403367
404 const UserPOS::POSToken *PackedDataManager::GetUserPOSData() const {
405 return manager_impl_->GetUserPOSData();
368 void PackedDataManager::GetUserPOSData(
369 StringPiece *token_array_data, StringPiece *string_array_data) const {
370 manager_impl_->GetUserPOSData(token_array_data, string_array_data);
406371 }
407372
408373 PackedDataManager *PackedDataManager::GetUserPosManager() {
5151
5252 static PackedDataManager *GetUserPosManager();
5353
54 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
54 void GetUserPOSData(StringPiece *token_array_data,
55 StringPiece *string_array_data) const override;
5556 const dictionary::POSMatcher *GetPOSMatcher() const override;
5657 const uint8 *GetPosGroupData() const override;
5758 void GetConnectorData(const char **data, size_t *size) const override;
8181 ],
8282 'action': [
8383 '<(PRODUCT_DIR)/gen_packed_data_light_main_<(dataset_tag)<(EXECUTABLE_SUFFIX)',
84 '--user_pos_manager_data=<(gen_out_dir)/../<(dataset_dir)/user_pos_manager.data',
8485 '--output=<(gen_out_dir)/packed_data_light_<(dataset_tag)',
8586 ],
8687 },
8788 ],
8889 'dependencies': [
8990 'gen_packed_data_light_main_<(dataset_tag)',
91 '../<(dataset_dir)/<(dataset_tag)_data_manager_base.gyp:gen_user_pos_manager_data_for_<(dataset_tag)',
9092 ],
9193 },
9294 ],
3636 optional string product_version = 1 [ default = "0.0.0.0" ];
3737 optional uint32 format_version = 2;
3838
39 message PosToken {
40 optional string pos = 1;
41 message ConjugationType {
42 optional string key_suffix = 1;
43 optional string value_suffix = 2;
44 optional uint32 id = 3;
45 };
46 repeated ConjugationType conjugation_forms = 2;
47 };
48 repeated PosToken pos_tokens = 3;
39 reserved 3; // DEPRECATED: repeated PosToken pos_tokens = 3;
4940
5041 message PosMatcherData {
5142 repeated uint32 rule_id_table = 1;
5656 }
5757
5858 SystemDictionaryDataPacker::~SystemDictionaryDataPacker() {
59 }
60
61 void SystemDictionaryDataPacker::SetPosTokens(
62 const UserPOS::POSToken *pos_token_data,
63 size_t token_count) {
64 for (size_t i = 0; i < token_count; ++i) {
65 SystemDictionaryData::PosToken *pos_token =
66 system_dictionary_->add_pos_tokens();
67 if (pos_token_data[i].pos) {
68 pos_token->set_pos(pos_token_data[i].pos);
69 }
70 for (size_t j = 0; j < pos_token_data[i].conjugation_size; ++j) {
71 SystemDictionaryData::PosToken::ConjugationType *conjugation_form
72 = pos_token->add_conjugation_forms();
73 if (pos_token_data[i].conjugation_form[j].key_suffix) {
74 conjugation_form->set_key_suffix(
75 pos_token_data[i].conjugation_form[j].key_suffix);
76 }
77 if (pos_token_data[i].conjugation_form[j].value_suffix) {
78 conjugation_form->set_value_suffix(
79 pos_token_data[i].conjugation_form[j].value_suffix);
80 }
81 conjugation_form->set_id(
82 pos_token_data[i].conjugation_form[j].id);
83 }
84 }
8559 }
8660
8761 void SystemDictionaryDataPacker::SetPosMatcherData(
3333
3434 #include "base/port.h"
3535 #include "dictionary/pos_matcher.h"
36 #include "dictionary/user_pos.h"
3736
3837 namespace mozc {
3938 namespace packed {
4443 public:
4544 explicit SystemDictionaryDataPacker(const string &product_version);
4645 ~SystemDictionaryDataPacker();
47 void SetPosTokens(
48 const dictionary::UserPOS::POSToken *pos_token_data,
49 size_t token_count);
5046 void SetPosMatcherData(
5147 const uint16 *rule_id_table,
5248 size_t rule_id_table_count,
3232 namespace mozc {
3333 namespace packed {
3434
35 const int kSystemDictionaryFormatVersion = 20;
35 const int kSystemDictionaryFormatVersion = 21;
3636
3737 } // namespace packed
3838 } // namespace mozc
2828
2929 #include "data_manager/testing/mock_user_pos_manager.h"
3030
31 #include "base/embedded_file.h"
3132 #include "base/logging.h"
3233 #include "base/singleton.h"
33 #include "dictionary/pos_group.h"
3434 #include "dictionary/pos_matcher.h"
35 #include "dictionary/user_pos.h"
3635
3736 namespace mozc {
3837 namespace testing {
4342
4443 namespace {
4544
46 // The following header file is automatically generated and contains the
47 // definition of variable, kPOSToken, of type UserPOSImpl::POSToken.
48 #include "data_manager/testing/user_pos_data.h"
45 // Embedded file kUserPosManagerData is defined in this header file.
46 #include "data_manager/testing/user_pos_manager_data.h"
4947
5048 } // namespace
5149
52 const dictionary::UserPOS::POSToken *
53 MockUserPosManager::GetUserPOSData() const {
54 DCHECK(kPOSToken != NULL);
55 return kPOSToken;
50 MockUserPosManager::MockUserPosManager() {
51 const StringPiece data = LoadEmbeddedFile(kUserPosManagerData);
52 const char *kMagicNumber = ""; // Magic number is not present.
53 CHECK(manager_.InitUserPosManagerDataFromArray(data, kMagicNumber))
54 << "Embedded user_pos_manager_data.h is broken";
55 }
56
57 MockUserPosManager::~MockUserPosManager() = default;
58
59 void MockUserPosManager::GetUserPOSData(
60 StringPiece *token_array_data, StringPiece *string_array_data) const {
61 manager_.GetUserPOSData(token_array_data, string_array_data);
5662 }
5763
5864 namespace {
3030 #define MOZC_DATA_MANAGER_TESTING_MOCK_USER_POS_MANAGER_H_
3131
3232 #include "base/port.h"
33 #include "data_manager/data_manager.h"
3334 #include "data_manager/data_manager_interface.h"
3435
3536 namespace mozc {
3738
3839 class MockUserPosManager : public DataManagerInterface {
3940 public:
40 MockUserPosManager() {}
41 ~MockUserPosManager() override {}
41 MockUserPosManager();
42 ~MockUserPosManager() override;
4243
4344 static MockUserPosManager *GetUserPosManager();
4445
4546 // Partially implement the interface because some binaries only require the
4647 // following embedded data.
47 const dictionary::UserPOS::POSToken *GetUserPOSData() const override;
48 void GetUserPOSData(StringPiece *token_array_data,
49 StringPiece *string_array_data) const override;
4850 const dictionary::POSMatcher *GetPOSMatcher() const override;
4951
5052 // The following are implemented in MockDataManager.
7981 size_t *size) const override {}
8082
8183 private:
84 DataManager manager_;
8285 DISALLOW_COPY_AND_ASSIGN(MockUserPosManager);
8386 };
8487
6363 'dictionary_base.gyp:pos_matcher',
6464 'dictionary_base.gyp:suppression_dictionary',
6565 'dictionary_base.gyp:user_dictionary',
66 'dictionary_base.gyp:user_pos',
6667 ],
6768 'variables': {
6869 'test_size': 'small',
2727 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2828 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929
30 """Utility to generate user_pos_data.h."""
30 """Utility to generate User POS binary data."""
3131
3232 __author__ = "hidehiko"
3333
34 from collections import defaultdict
35 import logging
3634 import optparse
35 import struct
3736
38 from build_tools import code_generator_util
3937 from build_tools import serialized_string_array_builder
4038 from dictionary import pos_util
4139
4240
43 def OutputUserPosDataHeader(user_pos_data, output):
44 """Prints user_pos_data.h to output."""
45 # Output kConjugation
46 for index, (_, conjugation_list) in enumerate(user_pos_data):
47 output.write(
48 'static const ::mozc::dictionary::UserPOS::ConjugationType '
49 'kConjugation%d[] = {\n' % (index))
50 for value_suffix, key_suffix, pos_id in conjugation_list:
51 output.write(' { %s, %s, %d },\n' % (
52 code_generator_util.ToCppStringLiteral(value_suffix),
53 code_generator_util.ToCppStringLiteral(key_suffix),
54 pos_id))
55 output.write('};\n')
41 def ToString(s):
42 return s or ''
5643
57 # Output PosToken
58 output.write('const ::mozc::dictionary::UserPOS::POSToken kPOSToken[] = {\n')
59 for index, (user_pos, conjunction_list) in enumerate(user_pos_data):
60 output.write(' { %s, %d, kConjugation%d },\n' % (
61 code_generator_util.ToCppStringLiteral(user_pos),
62 len(conjunction_list),
63 index))
64 # Also output the sentinal.
65 output.write(' { NULL, 0, NULL },\n'
66 '};\n')
44
def OutputUserPosData(user_pos_data, output_token_array, output_string_array):
  """Serializes user POS data into a token array file and a string array file.

  Every distinct string (POS name, value suffix, key suffix) is identified by
  its rank in the sorted list of all strings.  Each (POS, conjugation) pair is
  written as one 8-byte token made of four little-endian uint16 values: POS
  string index, value suffix string index, key suffix string index and
  conjugation ID.  Tokens are emitted in sorted order of |user_pos_data|.

  Args:
    user_pos_data: Sequence of (user_pos, conjugation_list) pairs, where each
      element of conjugation_list is a
      (value_suffix, key_suffix, conjugation_id) tuple.
    output_token_array: Path of the token array binary file to write.
    output_string_array: Path of the serialized string array file to write.
  """
  # Collect the distinct strings; ToString() maps falsy values to ''.
  strings = set()
  for user_pos, conjugation_list in user_pos_data:
    strings.add(ToString(user_pos))
    for value_suffix, key_suffix, _ in conjugation_list:
      strings.add(ToString(value_suffix))
      strings.add(ToString(key_suffix))

  # The sorted list is both the content of the string array and the source of
  # the indices stored in the tokens, so compute it exactly once.
  sorted_strings = sorted(strings)
  string_index = dict((s, index) for index, s in enumerate(sorted_strings))

  with open(output_token_array, 'wb') as f:
    for user_pos, conjugation_list in sorted(user_pos_data):
      user_pos_index = string_index[ToString(user_pos)]
      for value_suffix, key_suffix, conjugation_id in conjugation_list:
        # One entry is serialized to 8 bytes (four little-endian uint16
        # components).  struct.pack raises struct.error if a value does not
        # fit in 16 bits.
        f.write(struct.pack('<4H',
                            user_pos_index,
                            string_index[ToString(value_suffix)],
                            string_index[ToString(key_suffix)],
                            conjugation_id))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  output_string_array)
6767
6868
6969 def ParseOptions():
7070 parser = optparse.OptionParser()
7171 # Input: id.def, special_pos.def, user_pos.def, cforms.def
72 # Output: user_pos_data.h
7372 parser.add_option('--id_file', dest='id_file', help='Path to id.def.')
7473 parser.add_option('--special_pos_file', dest='special_pos_file',
7574 help='Path to special_pos.def')
7776 help='Path to cforms.def')
7877 parser.add_option('--user_pos_file', dest='user_pos_file',
7978 help='Path to user_pos,def')
80 parser.add_option('--output', dest='output',
81 help='Path to output user_pos_data.h')
79 parser.add_option('--output_token_array', dest='output_token_array',
80 help='Path to output token array binary data')
81 parser.add_option('--output_string_array', dest='output_string_array',
82 help='Path to output string array data')
8283 parser.add_option('--output_pos_list', dest='output_pos_list',
8384 help='Path to output POS list binary file')
8485 return parser.parse_args()[0]
9394 user_pos = pos_util.UserPos(pos_database, inflection_map)
9495 user_pos.Parse(options.user_pos_file)
9596
96 with open(options.output, 'w') as stream:
97 OutputUserPosDataHeader(user_pos.data, stream)
97 OutputUserPosData(user_pos.data,
98 options.output_token_array, options.output_string_array)
9899
99100 if options.output_pos_list:
100101 serialized_string_array_builder.SerializeToFile(
115115 } // namespace
116116
117117 TextDictionaryLoader::TextDictionaryLoader(const POSMatcher &pos_matcher)
118 : pos_matcher_(&pos_matcher) {
119 }
118 : zipcode_id_(pos_matcher.GetZipcodeId()),
119 isolated_word_id_(pos_matcher.GetIsolatedWordId()) {}
120
121 TextDictionaryLoader::TextDictionaryLoader(uint16 zipcode_id,
122 uint16 isolated_word_id)
123 : zipcode_id_(zipcode_id), isolated_word_id_(isolated_word_id) {}
120124
121125 TextDictionaryLoader::~TextDictionaryLoader() {
122126 Clear();
133137 return true;
134138 }
135139 if (Util::StartsWith(label, "ZIP_CODE")) {
136 token->lid = pos_matcher_->GetZipcodeId();
137 token->rid = pos_matcher_->GetZipcodeId();
140 token->lid = zipcode_id_;
141 token->rid = zipcode_id_;
138142 return true;
139143 }
140144 if (Util::StartsWith(label, "ENGLISH")) {
141145 // TODO(noriyukit): Might be better to use special POS for english words.
142 token->lid = pos_matcher_->GetIsolatedWordId();
143 token->rid = pos_matcher_->GetIsolatedWordId();
146 token->lid = isolated_word_id_;
147 token->rid = isolated_word_id_;
144148 return true;
145149 }
146150 LOG(ERROR) << "Unknown special label: " << label;
4747 public:
4848 // TODO(noriyukit): Better to pass the pointer of pos_matcher.
4949 explicit TextDictionaryLoader(const POSMatcher& pos_matcher);
50 TextDictionaryLoader(uint16 zipcode_id, uint16 isolated_word_id);
5051 virtual ~TextDictionaryLoader();
5152
5253 // Loads tokens from system dictionary files and reading correction
8586 // Allows derived classes to implement custom filtering rules.
8687 virtual Token *ParseTSV(const vector<StringPiece> &columns) const;
8788
88 const POSMatcher *pos_matcher_;
89
9089 private:
9190 static void LoadReadingCorrectionTokens(
9291 const string &reading_correction_filename,
104103
105104 Token *ParseTSVLine(StringPiece line) const;
106105
106 const uint16 zipcode_id_;
107 const uint16 isolated_word_id_;
107108 vector<Token *> tokens_;
108109
109110 FRIEND_TEST(TextDictionaryLoaderTest, RewriteSpecialTokenTest);
226226 // Creates a user dictionary with actual pos data.
227227 UserDictionary *CreateDictionary() {
228228 const testing::MockUserPosManager user_pos_manager;
229 return new UserDictionary(new UserPOS(user_pos_manager.GetUserPOSData()),
229 return new UserDictionary(UserPOS::CreateFromDataManager(user_pos_manager),
230230 user_pos_manager.GetPOSMatcher(),
231231 Singleton<SuppressionDictionary>::get());
232232 }
2929 #include "dictionary/user_pos.h"
3030
3131 #include <algorithm>
32 #include <map>
32 #include <set>
3333
3434 #include "base/logging.h"
3535 #include "base/util.h"
3737 namespace mozc {
3838 namespace dictionary {
3939
40 UserPOS::UserPOS(const POSToken *pos_token_array)
41 : pos_token_array_(pos_token_array) {
42 DCHECK(pos_token_array_);
43 for (size_t i = 0; pos_token_array_[i].pos != nullptr; ++i) {
44 pos_map_.insert(
45 std::make_pair(string(pos_token_array_[i].pos), &pos_token_array_[i]));
46 }
47 CHECK_GT(pos_map_.size(), 1);
40 UserPOS::UserPOS(StringPiece token_array_data, StringPiece string_array_data)
41 : token_array_data_(token_array_data) {
42 DCHECK_EQ(token_array_data.size() % 8, 0);
43 DCHECK(SerializedStringArray::VerifyData(string_array_data));
44 string_array_.Set(string_array_data);
4845 }
46
47 UserPOS::~UserPOS() = default;
4948
5049 void UserPOS::GetPOSList(vector<string> *pos_list) const {
5150 pos_list->clear();
52 for (size_t i = 0; pos_token_array_[i].pos != nullptr; ++i) {
53 pos_list->push_back(pos_token_array_[i].pos);
51 set<uint16> seen;
52 for (auto iter = begin(); iter != end(); ++iter) {
53 if (!seen.insert(iter.pos_index()).second) {
54 continue;
55 }
56 const StringPiece pos = string_array_[iter.pos_index()];
57 pos_list->emplace_back(pos.data(), pos.size());
5458 }
5559 }
5660
5761 bool UserPOS::IsValidPOS(const string &pos) const {
58 map<string, const POSToken*>::const_iterator it = pos_map_.find(pos);
59 return it != pos_map_.end();
62 const auto iter =
63 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
64 if (iter == string_array_.end()) {
65 return false;
66 }
67 return std::binary_search(begin(), end(), iter.index());
6068 }
6169
6270 bool UserPOS::GetPOSIDs(const string &pos, uint16 *id) const {
63 map<string, const POSToken*>::const_iterator it = pos_map_.find(pos);
64 if (it == pos_map_.end()) {
71 const auto str_iter =
72 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
73 if (str_iter == string_array_.end() || *str_iter != pos) {
6574 return false;
6675 }
67
68 const ConjugationType *conjugation_form = it->second->conjugation_form;
69 CHECK(conjugation_form);
70
71 *id = conjugation_form[0].id;
72
76 const auto token_iter = std::lower_bound(begin(), end(), str_iter.index());
77 if (token_iter == end() || token_iter.pos_index() != str_iter.index()) {
78 return false;
79 }
80 *id = token_iter.conjugation_id();
7381 return true;
7482 }
7583
76 bool UserPOS::GetTokens(const string &key,
77 const string &value,
78 const string &pos,
79 vector<Token> *tokens) const {
80 if (key.empty() ||
81 value.empty() ||
82 pos.empty() ||
83 tokens == nullptr) {
84 bool UserPOS::GetTokens(const string &key, const string &value,
85 const string &pos, vector<Token> *tokens) const {
86 if (key.empty() || value.empty() || pos.empty() || tokens == nullptr) {
8487 return false;
8588 }
8689
8790 tokens->clear();
88 map<string, const POSToken*>::const_iterator it = pos_map_.find(pos);
89 if (it == pos_map_.end()) {
91 const auto str_iter =
92 std::lower_bound(string_array_.begin(), string_array_.end(), pos);
93 if (str_iter == string_array_.end() || *str_iter != pos) {
9094 return false;
9195 }
92
93 const ConjugationType *conjugation_form = it->second->conjugation_form;
94 CHECK(conjugation_form);
95
96 const size_t size = static_cast<size_t>(it->second->conjugation_size);
96 pair<iterator, iterator> range =
97 std::equal_range(begin(), end(), str_iter.index());
98 if (range.first == range.second) {
99 return false;
100 }
101 const size_t size = range.second - range.first;
97102 CHECK_GE(size, 1);
98103 tokens->resize(size);
99104
103108 // Set smaller cost for "短縮よみ" in order to make
104109 // the rank of the word higher than others.
105110 const int16 kIsolatedWordCost = 200;
106 const char kIsolatedWordPOS[]
107 = "\xE7\x9F\xAD\xE7\xB8\xAE\xE3\x82\x88\xE3\x81\xBF";
111 const char kIsolatedWordPOS[] =
112 "\xE7\x9F\xAD\xE7\xB8\xAE\xE3\x82\x88\xE3\x81\xBF";
108113
109114 if (size == 1) { // no conjugation
115 const auto &token_iter = range.first;
110116 (*tokens)[0].key = key;
111117 (*tokens)[0].value = value;
112 (*tokens)[0].id = conjugation_form[0].id;
118 (*tokens)[0].id = token_iter.conjugation_id();
113119 if (pos == kIsolatedWordPOS) {
114 (*tokens)[0].cost= kIsolatedWordCost;
120 (*tokens)[0].cost = kIsolatedWordCost;
115121 } else {
116 (*tokens)[0].cost= kDefaultCost;
122 (*tokens)[0].cost = kDefaultCost;
117123 }
118124 } else {
125 const auto &base_form_token_iter = range.first;
119126 // expand all other forms
120127 string key_stem = key;
121128 string value_stem = value;
122129 // assume that the first token in |range| holds the suffix of "base form".
123 const string base_key_suffix = conjugation_form[0].key_suffix;
124 const string base_value_suffix = conjugation_form[0].value_suffix;
130 const StringPiece base_key_suffix =
131 string_array_[base_form_token_iter.key_suffix_index()];
132 const StringPiece base_value_suffix =
133 string_array_[base_form_token_iter.value_suffix_index()];
134
125135 if (base_key_suffix.size() < key.size() &&
126136 base_value_suffix.size() < value.size() &&
127137 Util::EndsWith(key, base_key_suffix) &&
129139 key_stem.assign(key, 0, key.size() - base_key_suffix.size());
130140 value_stem.assign(value, 0, value.size() - base_value_suffix.size());
131141 }
132 for (size_t i = 0; i < size; ++i) {
133 (*tokens)[i].key = key_stem + conjugation_form[i].key_suffix;
134 (*tokens)[i].value = value_stem + conjugation_form[i].value_suffix;
135 (*tokens)[i].id = conjugation_form[i].id;
136 (*tokens)[i].cost = kDefaultCost;
142 for (size_t i = 0; i < size; ++i, ++range.first) {
143 const auto &token_iter = range.first;
144 const StringPiece key_suffix =
145 string_array_[token_iter.key_suffix_index()];
146 const StringPiece value_suffix =
147 string_array_[token_iter.value_suffix_index()];
148 Util::ConcatStrings(key_stem, key_suffix, &(*tokens)[i].key);
149 Util::ConcatStrings(value_stem, value_suffix, &(*tokens)[i].value);
150 (*tokens)[i].id = token_iter.conjugation_id();
151 (*tokens)[i].cost = kDefaultCost;
137152 }
153 DCHECK(range.first == range.second);
138154 }
139155
140156 return true;
141157 }
142158
159 UserPOS *UserPOS::CreateFromDataManager(const DataManagerInterface &manager) {
160 StringPiece token_array_data, string_array_data;
161 manager.GetUserPOSData(&token_array_data, &string_array_data);
162 return new UserPOS(token_array_data, string_array_data);
163 }
164
143165 } // namespace dictionary
144166 } // namespace mozc
2929 #ifndef MOZC_DICTIONARY_USER_POS_H_
3030 #define MOZC_DICTIONARY_USER_POS_H_
3131
32 #include <map>
32 #include <iterator>
3333 #include <string>
34 #include <utility>
3435 #include <vector>
3536
3637 #include "base/port.h"
38 #include "base/serialized_string_array.h"
39 #include "base/string_piece.h"
40 #include "data_manager/data_manager_interface.h"
3741 #include "dictionary/user_pos_interface.h"
3842
3943 namespace mozc {
4044 namespace dictionary {
4145
46 // This implementation of UserPOSInterface uses a sorted array of tokens to
47 // efficiently lookup required data. There are two required data, string array
48 // and token array, which are generated by ./gen_user_pos_data.py.
49 //
50 // * Prerequisite
51 // Little endian is assumed.
52 //
53 // * Binary format
54 //
55 // ** String array
56 // All the strings, such as key and value suffixes, are serialized into one
57 // array using SerializedStringArray in such a way that array is sorted in
58 // ascending order. In the token array (see below), every string is stored as
59 // an index to this array.
60 //
61 // ** Token array
62 //
63 // The token array is an array of 8 byte blocks each of which has the following
64 // layout:
65 //
66 // Token layout (8 bytes)
67 // +---------------------------------------+
68 // | POS index (2 bytes) |
69 // + - - - - - - - - - - - - - - - - - - - +
70 // | Value suffix index (2 bytes) |
71 // + - - - - - - - - - - - - - - - - - - - +
72 // | Key suffix index (2 bytes) |
73 // + - - - - - - - - - - - - - - - - - - - +
74 // | Conjugation ID (2 bytes) |
75 // +---------------------------------------+
76 //
77 // The array is sorted in ascending order of POS index so that we can use binary
78 // search to lookup necessary information efficiently. Note that there are
79 // tokens having the same POS index.
4280 class UserPOS : public UserPOSInterface {
4381 public:
44 struct ConjugationType {
45 const char *key_suffix;
46 const char *value_suffix;
47 uint16 id;
82 static const size_t kTokenByteLength = 8;
83
84 class iterator
85 : public std::iterator<std::random_access_iterator_tag, uint16> {
86 public:
87 iterator() = default;
88 explicit iterator(const char *ptr) : ptr_(ptr) {}
89 iterator(const iterator &x) = default;
90
91 uint16 pos_index() const {
92 return *reinterpret_cast<const uint16 *>(ptr_);
93 }
94 uint16 value_suffix_index() const {
95 return *reinterpret_cast<const uint16 *>(ptr_ + 2);
96 }
97 uint16 key_suffix_index() const {
98 return *reinterpret_cast<const uint16 *>(ptr_ + 4);
99 }
100 uint16 conjugation_id() const {
101 return *reinterpret_cast<const uint16 *>(ptr_ + 6);
102 }
103
104 uint16 operator*() const { return pos_index(); }
105
106 void swap(iterator &x) {
107 using std::swap;
108 swap(ptr_, x.ptr_);
109 }
110
111 friend void swap(iterator &x, iterator &y) { x.swap(y); }
112
113 iterator &operator++() {
114 ptr_ += kTokenByteLength;
115 return *this;
116 }
117
118 iterator operator++(int) {
119 const char *tmp = ptr_;
120 ptr_ += kTokenByteLength;
121 return iterator(tmp);
122 }
123
124 iterator &operator--() {
125 ptr_ -= kTokenByteLength;
126 return *this;
127 }
128
129 iterator operator--(int) {
130 const char *tmp = ptr_;
131 ptr_ -= kTokenByteLength;
132 return iterator(tmp);
133 }
134
135 iterator &operator+=(difference_type n) {
136 ptr_ += n * kTokenByteLength;
137 return *this;
138 }
139
140 iterator &operator-=(difference_type n) {
141 ptr_ -= n * kTokenByteLength;
142 return *this;
143 }
144
145 friend iterator operator+(iterator x, difference_type n) {
146 return iterator(x.ptr_ + n * kTokenByteLength);
147 }
148
149 friend iterator operator+(difference_type n, iterator x) {
150 return iterator(x.ptr_ + n * kTokenByteLength);
151 }
152
153 friend iterator operator-(iterator x, difference_type n) {
154 return iterator(x.ptr_ - n * kTokenByteLength);
155 }
156
157 friend difference_type operator-(iterator x, iterator y) {
158 return (x.ptr_ - y.ptr_) / kTokenByteLength;
159 }
160
161 friend bool operator==(iterator x, iterator y) { return x.ptr_ == y.ptr_; }
162 friend bool operator!=(iterator x, iterator y) { return x.ptr_ != y.ptr_; }
163 friend bool operator<(iterator x, iterator y) { return x.ptr_ < y.ptr_; }
164 friend bool operator<=(iterator x, iterator y) { return x.ptr_ <= y.ptr_; }
165 friend bool operator>(iterator x, iterator y) { return x.ptr_ > y.ptr_; }
166 friend bool operator>=(iterator x, iterator y) { return x.ptr_ >= y.ptr_; }
167
168 private:
169 const char *ptr_ = nullptr;
48170 };
49171
50 struct POSToken {
51 const char *pos;
52 uint16 conjugation_size;
53 const ConjugationType *conjugation_form;
54 };
55
56 // Initializes the user pos from the given POSToken array. The class doesn't
57 // take the ownership of the array. The caller is responsible for deleting it.
58 explicit UserPOS(const POSToken *pos_token_array);
59 virtual ~UserPOS() {}
172 using const_iterator = iterator;
173
174 static UserPOS *CreateFromDataManager(const DataManagerInterface &manager);
175
176 // Initializes the user pos from the given binary data. The provided byte
177 // data must outlive this instance.
178 UserPOS(StringPiece token_array_data, StringPiece string_array_data);
179 ~UserPOS() override;
60180
61181 // Implementation of UserPOSInterface.
62 virtual void GetPOSList(vector<string> *pos_list) const;
63 virtual bool IsValidPOS(const string &pos) const;
64 virtual bool GetPOSIDs(const string &pos, uint16 *id) const;
65 virtual bool GetTokens(const string &key, const string &value,
66 const string &pos, vector<Token> *tokens) const;
182 void GetPOSList(vector<string> *pos_list) const override;
183 bool IsValidPOS(const string &pos) const override;
184 bool GetPOSIDs(const string &pos, uint16 *id) const override;
185 bool GetTokens(const string &key, const string &value, const string &pos,
186 vector<Token> *tokens) const override;
187
188 iterator begin() const { return iterator(token_array_data_.data()); }
189 iterator end() const {
190 return iterator(token_array_data_.data() + token_array_data_.size());
191 }
67192
68193 private:
69 const POSToken *pos_token_array_;
70 map<string, const POSToken *> pos_map_;
194 StringPiece token_array_data_;
195 SerializedStringArray string_array_;
71196
72197 DISALLOW_COPY_AND_ASSIGN(UserPOS);
73198 };
4444
4545 class UserPOSTest : public ::testing::Test {
4646 protected:
47 virtual void SetUp() {
47 void SetUp() override {
48 StringPiece token_array_data, string_array_data;
4849 const testing::MockUserPosManager user_pos_manager;
49 user_pos_.reset(new UserPOS(user_pos_manager.GetUserPOSData()));
50 user_pos_manager.GetUserPOSData(&token_array_data, &string_array_data);
51 user_pos_.reset(new UserPOS(token_array_data, string_array_data));
5052 CHECK(user_pos_.get());
5153 }
5254
4646 #include "dictionary/system/system_dictionary.h"
4747 #include "dictionary/system/value_dictionary.h"
4848 #include "dictionary/user_dictionary.h"
49 #include "dictionary/user_pos.h"
4950 #include "engine/engine_interface.h"
5051 #include "engine/user_data_manager_interface.h"
5152 #include "prediction/dictionary_predictor.h"
148149 CHECK(suppression_dictionary_.get());
149150
150151 user_dictionary_.reset(
151 new UserDictionary(new UserPOS(data_manager->GetUserPOSData()),
152 new UserDictionary(UserPOS::CreateFromDataManager(*data_manager),
152153 data_manager->GetPOSMatcher(),
153154 suppression_dictionary_.get()));
154155 CHECK(user_dictionary_.get());
00 MAJOR=2
11 MINOR=17
2 BUILD=2517
2 BUILD=2518
33 REVISION=102
44 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
55 # downloaded by NaCl Mozc.
6 NACL_DICTIONARY_VERSION=20
6 NACL_DICTIONARY_VERSION=21
8181 DictionaryGenerator::DictionaryGenerator()
8282 : token_pool_(new ObjectPool<Token>(kTokenSize)),
8383 token_map_(new map<uint64, Token *>),
84 user_pos_(new dictionary::UserPOS(
85 UserPosManager::GetUserPosManager()->GetUserPOSData())),
8684 open_bracket_id_(UserPosManager::GetUserPosManager()->GetPOSMatcher()
8785 ->GetOpenBracketId()),
8886 close_bracket_id_(UserPosManager::GetUserPosManager()->GetPOSMatcher()
89 ->GetCloseBracketId()) {}
87 ->GetCloseBracketId()) {
88 user_pos_.reset(dictionary::UserPOS::CreateFromDataManager(
89 *UserPosManager::GetUserPosManager()));
90 }
9091
9192 DictionaryGenerator::~DictionaryGenerator() {}
9293
7272 '../converter/converter_base.gyp:converter_mock',
7373 '../data_manager/data_manager.gyp:user_pos_manager',
7474 '../data_manager/testing/mock_data_manager.gyp:mock_data_manager',
75 '../dictionary/dictionary_base.gyp:user_pos',
7576 '../engine/engine.gyp:mock_data_engine_factory',
7677 '../protocol/protocol.gyp:commands_proto',
7778 '../session/session_base.gyp:request_test_util',
4242 #include "dictionary/suppression_dictionary.h"
4343 #include "dictionary/user_dictionary.h"
4444 #include "dictionary/user_dictionary_storage.h"
45 #include "dictionary/user_pos.h"
4546 #include "protocol/commands.pb.h"
4647 #include "protocol/config.pb.h"
4748 #include "request/conversion_request.h"
7677 convreq_.set_config(&config_);
7778 }
7879
79 virtual void SetUp() {
80 void SetUp() override {
8081 SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir);
8182 config::ConfigHandler::GetDefaultConfig(&config_);
8283
8485
8586 suppression_dictionary_.reset(new SuppressionDictionary);
8687 user_dictionary_.reset(
87 new UserDictionary(new UserPOS(data_manager_->GetUserPOSData()),
88 new UserDictionary(UserPOS::CreateFromDataManager(*data_manager_),
8889 data_manager_->GetPOSMatcher(),
8990 suppression_dictionary_.get()));
9091 }
9192
92 virtual void TearDown() {
93 void TearDown() override {
9394 // just in case, reset the config
9495 config::ConfigHandler::GetDefaultConfig(&config_);
9596 }