Move zero query data to data set file
BUG=
TEST=
REF_BUG=26841123
REF_CL=123191246
REF_TIME=2016-05-25T17:46:35+09:00
REF_TIME_RAW=1464165995 +0900
Noriyuki Takahashi
7 years ago
186 | 186 | |
187 | 187 | // This struct holds resources used by converter. |
188 | 188 | struct ConverterAndData { |
189 | std::unique_ptr<testing::MockDataManager> data_manager; | |
189 | 190 | std::unique_ptr<DictionaryInterface> user_dictionary; |
190 | 191 | std::unique_ptr<SuppressionDictionary> suppression_dictionary; |
191 | 192 | std::unique_ptr<DictionaryInterface> suffix_dictionary; |
238 | 239 | // Create a predictor with three sub-predictors, dictionary predictor, user |
239 | 240 | // history predictor, and extra predictor. |
240 | 241 | PredictorInterface *dictionary_predictor = |
241 | new DictionaryPredictor(converter_and_data.converter.get(), | |
242 | new DictionaryPredictor(*converter_and_data.data_manager, | |
243 | converter_and_data.converter.get(), | |
242 | 244 | converter_and_data.immutable_converter.get(), |
243 | 245 | converter_and_data.dictionary.get(), |
244 | 246 | converter_and_data.suffix_dictionary.get(), |
271 | 273 | RewriterInterface *rewriter, |
272 | 274 | PredictorType predictor_type, |
273 | 275 | ConverterAndData *converter_and_data) { |
274 | testing::MockDataManager data_manager; | |
276 | converter_and_data->data_manager.reset(new testing::MockDataManager()); | |
277 | const auto &data_manager = *converter_and_data->data_manager; | |
275 | 278 | |
276 | 279 | const char *dictionary_data = nullptr; |
277 | 280 | int dictionary_size = 0; |
1405 | 1408 | suppression_dictionary.get(), |
1406 | 1409 | DefaultPredictor::CreateDefaultPredictor( |
1407 | 1410 | new DictionaryPredictor( |
1411 | data_manager, | |
1408 | 1412 | converter.get(), |
1409 | 1413 | immutable_converter.get(), |
1410 | 1414 | dictionary.get(), |
0 | 0 | MAJOR=2 |
1 | 1 | MINOR=18 |
2 | BUILD=2568 | |
2 | BUILD=2569 | |
3 | 3 | REVISION=102 |
4 | 4 | # CAUTION: NACL_DICTIONARY_VERSION is going to be migrated to ENGINE_VERSION. |
5 | 5 | # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be |
301 | 301 | LOG(ERROR) << "Single Kanji data is broken"; |
302 | 302 | return Status::DATA_BROKEN; |
303 | 303 | } |
304 | if (!reader.Get("zero_query_token_array", | |
305 | &zero_query_token_array_data_) || | |
306 | !reader.Get("zero_query_string_array", | |
307 | &zero_query_string_array_data_) || | |
308 | !reader.Get("zero_query_number_token_array", | |
309 | &zero_query_number_token_array_data_) || | |
310 | !reader.Get("zero_query_number_string_array", | |
311 | &zero_query_number_string_array_data_)) { | |
312 | LOG(ERROR) << "Cannot find zero query data"; | |
313 | return Status::DATA_MISSING; | |
314 | } | |
315 | if (!SerializedStringArray::VerifyData(zero_query_string_array_data_) || | |
316 | !SerializedStringArray::VerifyData( | |
317 | zero_query_number_string_array_data_)) { | |
318 | LOG(ERROR) << "Zero query data is broken"; | |
319 | return Status::DATA_BROKEN; | |
320 | } | |
304 | 321 | |
305 | 322 | if (!reader.Get("usage_item_array", &usage_items_data_)) { |
306 | 323 | VLOG(2) << "Usage dictionary is not provided"; |
495 | 512 | *size = counter_suffix_data_.size(); |
496 | 513 | } |
497 | 514 | |
515 | void DataManager::GetZeroQueryData( | |
516 | StringPiece *zero_query_token_array_data, | |
517 | StringPiece *zero_query_string_array_data, | |
518 | StringPiece *zero_query_number_token_array_data, | |
519 | StringPiece *zero_query_number_string_array_data) const { | |
520 | *zero_query_token_array_data = zero_query_token_array_data_; | |
521 | *zero_query_string_array_data = zero_query_string_array_data_; | |
522 | *zero_query_number_token_array_data = zero_query_number_token_array_data_; | |
523 | *zero_query_number_string_array_data = zero_query_number_string_array_data_; | |
524 | } | |
525 | ||
498 | 526 | #ifndef NO_USAGE_REWRITER |
499 | 527 | void DataManager::GetUsageRewriterData( |
500 | 528 | StringPiece *base_conjugation_suffix_data, |
119 | 119 | 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host', |
120 | 120 | 'gen_separate_emoji_rewriter_data_for_<(dataset_tag)#host', |
121 | 121 | 'gen_separate_single_kanji_rewriter_data_for_<(dataset_tag)#host', |
122 | 'gen_separate_zero_query_data_for_<(dataset_tag)#host', | |
122 | 123 | 'gen_separate_version_data_for_<(dataset_tag)#host', |
123 | 124 | ], |
124 | 125 | 'actions': [ |
160 | 161 | 'single_kanji_variant_string': '<(gen_out_dir)/single_kanji_variant_string.data', |
161 | 162 | 'single_kanji_noun_prefix_token': '<(gen_out_dir)/single_kanji_noun_prefix_token.data', |
162 | 163 | 'single_kanji_noun_prefix_string': '<(gen_out_dir)/single_kanji_noun_prefix_string.data', |
164 | 'zero_query_token_array': '<(gen_out_dir)/zero_query_token.data', | |
165 | 'zero_query_string_array': '<(gen_out_dir)/zero_query_string.data', | |
166 | 'zero_query_number_token_array': '<(gen_out_dir)/zero_query_number_token.data', | |
167 | 'zero_query_number_string_array': '<(gen_out_dir)/zero_query_number_string.data', | |
163 | 168 | 'version': '<(gen_out_dir)/version.data', |
164 | 169 | }, |
165 | 170 | 'inputs': [ |
197 | 202 | '<(single_kanji_variant_string)', |
198 | 203 | '<(single_kanji_noun_prefix_token)', |
199 | 204 | '<(single_kanji_noun_prefix_string)', |
205 | '<(zero_query_token_array)', | |
206 | '<(zero_query_string_array)', | |
207 | '<(zero_query_number_token_array)', | |
208 | '<(zero_query_number_string_array)', | |
200 | 209 | '<(version)', |
201 | 210 | ], |
202 | 211 | 'outputs': [ |
240 | 249 | 'single_kanji_variant_string:32:<(gen_out_dir)/single_kanji_variant_string.data', |
241 | 250 | 'single_kanji_noun_prefix_token:32:<(gen_out_dir)/single_kanji_noun_prefix_token.data', |
242 | 251 | 'single_kanji_noun_prefix_string:32:<(gen_out_dir)/single_kanji_noun_prefix_string.data', |
252 | 'zero_query_token_array:32:<(gen_out_dir)/zero_query_token.data', | |
253 | 'zero_query_string_array:32:<(gen_out_dir)/zero_query_string.data', | |
254 | 'zero_query_number_token_array:32:<(gen_out_dir)/zero_query_number_token.data', | |
255 | 'zero_query_number_string_array:32:<(gen_out_dir)/zero_query_number_string.data', | |
243 | 256 | 'version:32:<(gen_out_dir)/version.data', |
244 | 257 | ], |
245 | 258 | 'conditions': [ |
891 | 904 | ], |
892 | 905 | }, |
893 | 906 | { |
907 | 'target_name': 'gen_separate_zero_query_data_for_<(dataset_tag)', | |
908 | 'type': 'none', | |
909 | 'toolsets': ['host'], | |
910 | 'actions': [ | |
911 | { | |
912 | 'action_name': 'gen_separate_zero_query_data_for_<(dataset_tag)', | |
913 | 'variables': { | |
914 | 'generator': '<(mozc_dir)/prediction/gen_zero_query_data.py', | |
915 | 'input_files': [ | |
916 | '<(mozc_dir)/data/emoji/emoji_data.tsv', | |
917 | '<(mozc_dir)/data/emoticon/categorized.tsv', | |
918 | '<(mozc_dir)/data/symbol/symbol.tsv', | |
919 | '<(mozc_dir)/data/zero_query/zero_query.def', | |
920 | ], | |
921 | }, | |
922 | 'inputs': [ | |
923 | '<(generator)', | |
924 | '<@(input_files)', | |
925 | ], | |
926 | 'outputs': [ | |
927 | '<(gen_out_dir)/zero_query_token.data', | |
928 | '<(gen_out_dir)/zero_query_string.data', | |
929 | ], | |
930 | 'action': [ | |
931 | 'python', '<(generator)', | |
932 | '--input_rule=<(mozc_dir)/data/zero_query/zero_query.def', | |
933 | '--input_symbol=<(mozc_dir)/data/symbol/symbol.tsv', | |
934 | '--input_emoji=<(mozc_dir)/data/emoji/emoji_data.tsv', | |
935 | '--input_emoticon=<(mozc_dir)/data/emoticon/categorized.tsv', | |
936 | '--output_token_array=<(gen_out_dir)/zero_query_token.data', | |
937 | '--output_string_array=<(gen_out_dir)/zero_query_string.data', | |
938 | ], | |
939 | }, | |
940 | { | |
941 | 'action_name': 'gen_separate_zero_query_number_data_for_<(dataset_tag)', | |
942 | 'variables': { | |
943 | 'generator': '<(mozc_dir)/prediction/gen_zero_query_number_data.py', | |
944 | 'input_files': [ | |
945 | '<(mozc_dir)/data/zero_query/zero_query_number.def', | |
946 | ], | |
947 | }, | |
948 | 'inputs': [ | |
949 | '<(generator)', | |
950 | '<@(input_files)', | |
951 | ], | |
952 | 'outputs': [ | |
953 | '<(gen_out_dir)/zero_query_number_token.data', | |
954 | '<(gen_out_dir)/zero_query_number_string.data', | |
955 | ], | |
956 | 'action': [ | |
957 | 'python', '<(generator)', | |
958 | '--input=<(mozc_dir)/data/zero_query/zero_query_number.def', | |
959 | '--output_token_array=<(gen_out_dir)/zero_query_number_token.data', | |
960 | '--output_string_array=<(gen_out_dir)/zero_query_number_string.data', | |
961 | ], | |
962 | }, | |
963 | ], | |
964 | }, | |
965 | { | |
894 | 966 | 'target_name': 'gen_separate_version_data_for_<(dataset_tag)', |
895 | 967 | 'type': 'none', |
896 | 968 | 'toolsets': ['host'], |
118 | 118 | StringPiece *variant_string_array_data, |
119 | 119 | StringPiece *noun_prefix_token_array_data, |
120 | 120 | StringPiece *noun_prefix_string_array_data) const override; |
121 | void GetZeroQueryData( | |
122 | StringPiece *zero_query_token_array_data, | |
123 | StringPiece *zero_query_string_array_data, | |
124 | StringPiece *zero_query_number_token_array_data, | |
125 | StringPiece *zero_query_number_string_array_data) const override; | |
121 | 126 | |
122 | 127 | #ifndef NO_USAGE_REWRITER |
123 | 128 | void GetUsageRewriterData( |
169 | 174 | StringPiece single_kanji_variant_string_array_data_; |
170 | 175 | StringPiece single_kanji_noun_prefix_token_array_data_; |
171 | 176 | StringPiece single_kanji_noun_prefix_string_array_data_; |
177 | StringPiece zero_query_token_array_data_; | |
178 | StringPiece zero_query_string_array_data_; | |
179 | StringPiece zero_query_number_token_array_data_; | |
180 | StringPiece zero_query_number_string_array_data_; | |
172 | 181 | StringPiece usage_base_conjugation_suffix_data_; |
173 | 182 | StringPiece usage_conjugation_suffix_data_; |
174 | 183 | StringPiece usage_conjugation_index_data_; |
38 | 38 | // files in data/dictionary, such as dictionary.txt, id.def, etc. |
39 | 39 | class DataManagerInterface { |
40 | 40 | public: |
41 | virtual ~DataManagerInterface() {} | |
41 | virtual ~DataManagerInterface() = default; | |
42 | 42 | |
43 | 43 | // Returns data set for UserPOS. |
44 | 44 | virtual void GetUserPOSData(StringPiece *token_array_data, |
121 | 121 | virtual void GetCounterSuffixSortedArray(const char **array, |
122 | 122 | size_t *size) const = 0; |
123 | 123 | |
124 | // Gets the zero query prediction data. | |
125 | virtual void GetZeroQueryData( | |
126 | StringPiece *zero_query_token_array_data, | |
127 | StringPiece *zero_query_string_array_data, | |
128 | StringPiece *zero_query_number_token_array_data, | |
129 | StringPiece *zero_query_number_string_array_data) const = 0; | |
130 | ||
124 | 131 | // Gets the data version string. |
125 | 132 | virtual StringPiece GetDataVersion() const = 0; |
126 | 133 | |
127 | 134 | protected: |
128 | DataManagerInterface() {} | |
135 | DataManagerInterface() = default; | |
129 | 136 | |
130 | 137 | private: |
131 | 138 | DISALLOW_COPY_AND_ASSIGN(DataManagerInterface); |
243 | 243 | // Create a predictor with three sub-predictors, dictionary predictor, user |
244 | 244 | // history predictor, and extra predictor. |
245 | 245 | PredictorInterface *dictionary_predictor = |
246 | new DictionaryPredictor(converter_.get(), | |
246 | new DictionaryPredictor(*data_manager, | |
247 | converter_.get(), | |
247 | 248 | immutable_converter_.get(), |
248 | 249 | dictionary_.get(), |
249 | 250 | suffix_dictionary_.get(), |
0 | # -*- coding: utf-8 -*- | |
1 | # Copyright 2010-2016, Google Inc. | |
2 | # All rights reserved. | |
3 | # | |
4 | # Redistribution and use in source and binary forms, with or without | |
5 | # modification, are permitted provided that the following conditions are | |
6 | # met: | |
7 | # | |
8 | # * Redistributions of source code must retain the above copyright | |
9 | # notice, this list of conditions and the following disclaimer. | |
10 | # * Redistributions in binary form must reproduce the above | |
11 | # copyright notice, this list of conditions and the following disclaimer | |
12 | # in the documentation and/or other materials provided with the | |
13 | # distribution. | |
14 | # * Neither the name of Google Inc. nor the names of its | |
15 | # contributors may be used to endorse or promote products derived from | |
16 | # this software without specific prior written permission. | |
17 | # | |
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | ||
30 | """Generate header file for zero query suggestion. | |
31 | ||
32 | Output format: | |
33 | const char *kKey0 = "key0"; | |
34 | const ZeroQueryEntry kValues0[] = { | |
35 | {zero_query_type00, "Cand00", emoji_type00, codepoint00}, | |
36 | {zero_query_type01, "Cand01", emoji_type01, codepoint01}, | |
37 | .. | |
38 | }; | |
39 | const char *kKey1 = "key1"; | |
40 | const ZeroQueryEntry kValues1[] = { | |
41 | {zero_query_type10, "Cand10", emoji_type10, codepoint10}, | |
42 | {zero_query_type11, "Cand11", emoji_type11, codepoint11}, | |
43 | .. | |
44 | }; | |
45 | .. | |
46 | ||
47 | const ZeroQueryList kData_data[] = { | |
48 | {kKey0, kValues0, values0_size}, | |
49 | {kKey1, kValues1, values1_size}, | |
50 | .. | |
51 | }; | |
52 | const size_t kData_size = data_size; | |
53 | ||
54 | ||
55 | Here, (kKey0, kKey1, ...) is sorted so that we can use binary search. | |
56 | """ | |
57 | ||
58 | __author__ = "toshiyuki" | |
59 | ||
60 | import os | |
61 | from build_tools import code_generator_util as cgu | |
62 | ||
63 | ||
64 | _MOZC_DIR_FOR_INCLUDE_GUARD = ( | |
65 | 'MOZC') | |
66 | ||
67 | ZERO_QUERY_TYPE_NONE = 0 | |
68 | ZERO_QUERY_TYPE_NUMBER_SUFFIX = 1 | |
69 | ZERO_QUERY_TYPE_EMOTICON = 2 | |
70 | ZERO_QUERY_TYPE_EMOJI = 3 | |
71 | ||
72 | # bit fields | |
73 | # These values correspond to command::Request::EmojiCarrierType. | |
74 | EMOJI_TYPE_NONE = 0 | |
75 | EMOJI_TYPE_UNICODE = 1 | |
76 | EMOJI_TYPE_DOCOMO = 2 | |
77 | EMOJI_TYPE_SOFTBANK = 4 | |
78 | EMOJI_TYPE_KDDI = 8 | |
79 | ||
80 | ||
81 | class ZeroQueryEntry(object): | |
82 | ||
83 | def __init__(self, entry_type, value, emoji_type, emoji_android_pua): | |
84 | self.entry_type = entry_type | |
85 | self.value = value | |
86 | self.emoji_type = emoji_type | |
87 | self.emoji_android_pua = emoji_android_pua | |
88 | ||
89 | ||
90 | def ZeroQueryTypeToString(zero_query_type): | |
91 | """Returns the C++ zero query type name as a string, or 0 if the type is unknown.""" | |
92 | if zero_query_type == ZERO_QUERY_TYPE_NONE: | |
93 | return 'ZERO_QUERY_NONE' | |
94 | elif zero_query_type == ZERO_QUERY_TYPE_NUMBER_SUFFIX: | |
95 | return 'ZERO_QUERY_NUMBER_SUFFIX' | |
96 | elif zero_query_type == ZERO_QUERY_TYPE_EMOTICON: | |
97 | return 'ZERO_QUERY_EMOTICON' | |
98 | elif zero_query_type == ZERO_QUERY_TYPE_EMOJI: | |
99 | return 'ZERO_QUERY_EMOJI' | |
100 | return 0 | |
101 | ||
102 | ||
103 | def EmojiTypeToString(emoji_type): | |
104 | """Returns a string for C++ code indicating emoji type.""" | |
105 | if emoji_type == EMOJI_TYPE_NONE: | |
106 | return 'EMOJI_NONE' | |
107 | ||
108 | types = [] | |
109 | if emoji_type & EMOJI_TYPE_UNICODE: | |
110 | types.append('EMOJI_UNICODE') | |
111 | if emoji_type & EMOJI_TYPE_DOCOMO: | |
112 | types.append('EMOJI_DOCOMO') | |
113 | if emoji_type & EMOJI_TYPE_SOFTBANK: | |
114 | types.append('EMOJI_SOFTBANK') | |
115 | if emoji_type & EMOJI_TYPE_KDDI: | |
116 | types.append('EMOJI_KDDI') | |
117 | return ' | '.join(types) | |
118 | ||
119 | ||
120 | def GetIncludeGuardSymbol(file_name): | |
121 | """Returns include guard symbol for .h file. | |
122 | ||
123 | For example, returns 'SOME_EXAMPLE_H' for '/path/to/some_example.h' | |
124 | ||
125 | Args: | |
126 | file_name: a string indicating output file path. | |
127 | Returns: | |
128 | A string for include guard. | |
129 | """ | |
130 | return os.path.basename(file_name).upper().replace('.', '_') | |
131 | ||
132 | ||
133 | def WriteIncludeGuardHeader(output_file_name, output_stream): | |
134 | """Writes the include guard header for a .h file to output_stream.""" | |
135 | output_stream.write('#ifndef %s_PREDICTION_%s_\n' %( | |
136 | _MOZC_DIR_FOR_INCLUDE_GUARD, GetIncludeGuardSymbol(output_file_name))) | |
137 | output_stream.write('#define %s_PREDICTION_%s_\n' %( | |
138 | _MOZC_DIR_FOR_INCLUDE_GUARD, GetIncludeGuardSymbol(output_file_name))) | |
139 | ||
140 | ||
141 | def WriteIncludeGuardFooter(output_file_name, output_stream): | |
142 | """Writes the include guard footer for a .h file to output_stream.""" | |
143 | output_stream.write('#endif // %s_PREDICTION_%s_\n' %( | |
144 | _MOZC_DIR_FOR_INCLUDE_GUARD, GetIncludeGuardSymbol(output_file_name))) | |
145 | ||
146 | ||
147 | def WriteHeaderFileForZeroQuery( | |
148 | zero_query_dict, output_file_name, var_name, output_stream): | |
149 | """Writes a header file defining the zero query entries and list to output_stream.""" | |
150 | ||
151 | WriteIncludeGuardHeader(output_file_name, output_stream) | |
152 | output_stream.write( | |
153 | '#include "./prediction/zero_query_list.h"\n') | |
154 | output_stream.write('namespace mozc {\n') | |
155 | output_stream.write('namespace {\n') | |
156 | ||
157 | sorted_keys = sorted(zero_query_dict.keys()) | |
158 | for i, key in enumerate(sorted_keys): | |
159 | if i: | |
160 | output_stream.write('\n') | |
161 | output_stream.write('const char *%s_key%d = %s; // "%s"\n' % ( | |
162 | var_name, i, cgu.ToCppStringLiteral(key), key)) | |
163 | output_stream.write( | |
164 | 'const ZeroQueryEntry %s_values%d[] = {\n' % (var_name, i)) | |
165 | output_stream.write( | |
166 | '\n'.join([ | |
167 | ' {%s, %s, %s, 0x%x}, // "%s"' % ( | |
168 | ZeroQueryTypeToString(e.entry_type), | |
169 | cgu.ToCppStringLiteral(e.value), | |
170 | EmojiTypeToString(e.emoji_type), | |
171 | e.emoji_android_pua, | |
172 | e.value) | |
173 | for e in zero_query_dict[key]]) + | |
174 | '\n') | |
175 | output_stream.write('};\n') | |
176 | ||
177 | output_stream.write('} // namespace\n') | |
178 | ||
179 | output_stream.write('const ZeroQueryList %s_data[] = {\n' % var_name) | |
180 | output_stream.write(',\n'.join( | |
181 | [' {%s_key%d, %s_values%d, %d}' % ( | |
182 | var_name, c, var_name, c, len(zero_query_dict[key])) | |
183 | for c, key in enumerate(sorted_keys)]) + '\n') | |
184 | output_stream.write('};\n') | |
185 | output_stream.write( | |
186 | 'const size_t %s_size = %d;' % (var_name, len(sorted_keys)) + '\n') | |
187 | ||
188 | output_stream.write('} // namespace mozc\n') | |
189 | WriteIncludeGuardFooter(output_file_name, output_stream) |
54 | 54 | #include "dictionary/pos_matcher.h" |
55 | 55 | #include "prediction/predictor_interface.h" |
56 | 56 | #include "prediction/suggestion_filter.h" |
57 | #include "prediction/zero_query_data.h" | |
58 | #include "prediction/zero_query_list.h" | |
59 | #include "prediction/zero_query_number_data.h" | |
57 | #include "prediction/zero_query_dict.h" | |
60 | 58 | #include "protocol/commands.pb.h" |
61 | 59 | #include "protocol/config.pb.h" |
62 | 60 | #include "request/conversion_request.h" |
144 | 142 | FLAGS_enable_typing_correction; |
145 | 143 | } |
146 | 144 | |
147 | struct ZeroQueryListCompare { | |
148 | bool operator()(const ZeroQueryList &lhs, const ZeroQueryList &rhs) const { | |
149 | return (strcmp(lhs.key, rhs.key) < 0); | |
150 | } | |
151 | }; | |
152 | 145 | } // namespace |
153 | 146 | |
154 | 147 | class DictionaryPredictor::PredictiveLookupCallback |
279 | 272 | }; |
280 | 273 | |
281 | 274 | DictionaryPredictor::DictionaryPredictor( |
275 | const DataManagerInterface &data_manager, | |
282 | 276 | const ConverterInterface *converter, |
283 | 277 | const ImmutableConverterInterface *immutable_converter, |
284 | 278 | const DictionaryInterface *dictionary, |
295 | 289 | segmenter_(segmenter), |
296 | 290 | suggestion_filter_(suggestion_filter), |
297 | 291 | counter_suffix_word_id_(pos_matcher->GetCounterSuffixWordId()), |
298 | predictor_name_("DictionaryPredictor") {} | |
292 | predictor_name_("DictionaryPredictor") { | |
293 | StringPiece zero_query_token_array_data; | |
294 | StringPiece zero_query_string_array_data; | |
295 | StringPiece zero_query_number_token_array_data; | |
296 | StringPiece zero_query_number_string_array_data; | |
297 | data_manager.GetZeroQueryData(&zero_query_token_array_data, | |
298 | &zero_query_string_array_data, | |
299 | &zero_query_number_token_array_data, | |
300 | &zero_query_number_string_array_data); | |
301 | zero_query_dict_.Init(zero_query_token_array_data, | |
302 | zero_query_string_array_data); | |
303 | zero_query_number_dict_.Init(zero_query_number_token_array_data, | |
304 | zero_query_number_string_array_data); | |
305 | } | |
299 | 306 | |
300 | 307 | DictionaryPredictor::~DictionaryPredictor() {} |
301 | 308 | |
1731 | 1738 | |
1732 | 1739 | // static |
1733 | 1740 | bool DictionaryPredictor::GetZeroQueryCandidatesForKey( |
1734 | const ConversionRequest &request, | |
1735 | const string &key, const ZeroQueryList *begin, const ZeroQueryList *end, | |
1736 | vector<ZeroQueryResult> *results) { | |
1741 | const ConversionRequest &request, const string &key, | |
1742 | const ZeroQueryDict &dict, vector<ZeroQueryResult> *results) { | |
1737 | 1743 | const int32 available_emoji_carrier = |
1738 | 1744 | request.request().available_emoji_carrier(); |
1739 | 1745 | |
1740 | 1746 | DCHECK(results); |
1741 | 1747 | results->clear(); |
1742 | const ZeroQueryList key_item = {key.c_str(), NULL, 0}; | |
1743 | const ZeroQueryList *result_rule = | |
1744 | std::lower_bound(begin, end, key_item, ZeroQueryListCompare()); | |
1745 | if (result_rule == end || key != result_rule->key) { | |
1748 | ||
1749 | auto range = dict.equal_range(key); | |
1750 | if (range.first == range.second) { | |
1746 | 1751 | return false; |
1747 | 1752 | } |
1748 | ||
1749 | for (size_t i = 0; i < result_rule->entries_size; ++i) { | |
1750 | const ZeroQueryEntry &entry = result_rule->entries[i]; | |
1751 | if (entry.type != ZERO_QUERY_EMOJI) { | |
1752 | results->push_back(std::make_pair(entry.value, entry.type)); | |
1753 | for (; range.first != range.second; ++range.first) { | |
1754 | const auto &entry = range.first; | |
1755 | if (entry.type() != ZERO_QUERY_EMOJI) { | |
1756 | results->push_back(std::make_pair(entry.value().as_string(), | |
1757 | entry.type())); | |
1753 | 1758 | continue; |
1754 | 1759 | } |
1755 | 1760 | if (available_emoji_carrier & Request::UNICODE_EMOJI && |
1756 | entry.emoji_type & EMOJI_UNICODE) { | |
1757 | results->push_back(std::make_pair(entry.value, entry.type)); | |
1761 | entry.emoji_type() & EMOJI_UNICODE) { | |
1762 | results->push_back(std::make_pair(entry.value().as_string(), | |
1763 | entry.type())); | |
1758 | 1764 | continue; |
1759 | 1765 | } |
1760 | 1766 | |
1761 | 1767 | if ((available_emoji_carrier & Request::DOCOMO_EMOJI && |
1762 | entry.emoji_type & EMOJI_DOCOMO) || | |
1768 | entry.emoji_type() & EMOJI_DOCOMO) || | |
1763 | 1769 | (available_emoji_carrier & Request::SOFTBANK_EMOJI && |
1764 | entry.emoji_type & EMOJI_SOFTBANK) || | |
1770 | entry.emoji_type() & EMOJI_SOFTBANK) || | |
1765 | 1771 | (available_emoji_carrier & Request::KDDI_EMOJI && |
1766 | entry.emoji_type & EMOJI_KDDI)) { | |
1772 | entry.emoji_type() & EMOJI_KDDI)) { | |
1767 | 1773 | string android_pua; |
1768 | Util::UCS4ToUTF8(entry.emoji_android_pua, &android_pua); | |
1769 | results->push_back(std::make_pair(android_pua, entry.type)); | |
1774 | Util::UCS4ToUTF8(entry.emoji_android_pua(), &android_pua); | |
1775 | results->push_back(std::make_pair(android_pua, entry.type())); | |
1770 | 1776 | } |
1771 | 1777 | } |
1772 | 1778 | return !results->empty(); |
1808 | 1814 | vector<ZeroQueryResult> candidates_for_number_key; |
1809 | 1815 | GetZeroQueryCandidatesForKey(request, |
1810 | 1816 | number_key, |
1811 | kZeroQueryNum_data, | |
1812 | kZeroQueryNum_data + kZeroQueryNum_size, | |
1817 | zero_query_number_dict_, | |
1813 | 1818 | &candidates_for_number_key); |
1814 | 1819 | |
1815 | 1820 | vector<ZeroQueryResult> default_candidates_for_number; |
1816 | 1821 | GetZeroQueryCandidatesForKey(request, |
1817 | 1822 | "default", |
1818 | kZeroQueryNum_data, | |
1819 | kZeroQueryNum_data + kZeroQueryNum_size, | |
1823 | zero_query_number_dict_, | |
1820 | 1824 | &default_candidates_for_number); |
1821 | 1825 | DCHECK(!default_candidates_for_number.empty()); |
1822 | 1826 | |
1847 | 1851 | vector<ZeroQueryResult> candidates; |
1848 | 1852 | if (!GetZeroQueryCandidatesForKey(request, |
1849 | 1853 | history_value, |
1850 | kZeroQueryData_data, | |
1851 | kZeroQueryData_data + kZeroQueryData_size, | |
1854 | zero_query_dict_, | |
1852 | 1855 | &candidates)) { |
1853 | 1856 | return false; |
1854 | 1857 | } |
39 | 39 | #include "converter/immutable_converter_interface.h" |
40 | 40 | #include "converter/segmenter.h" |
41 | 41 | #include "converter/segments.h" |
42 | #include "data_manager/data_manager_interface.h" | |
42 | 43 | #include "dictionary/dictionary_interface.h" |
43 | 44 | #include "dictionary/dictionary_token.h" |
44 | 45 | #include "dictionary/pos_matcher.h" |
45 | 46 | #include "prediction/predictor_interface.h" |
46 | 47 | #include "prediction/suggestion_filter.h" |
47 | #include "prediction/zero_query_list.h" | |
48 | #include "prediction/zero_query_dict.h" | |
48 | 49 | #include "request/conversion_request.h" |
49 | 50 | // for FRIEND_TEST() |
50 | 51 | #include "testing/base/public/gunit_prod.h" |
56 | 57 | public: |
57 | 58 | // Initializes a predictor with given references to submodules. Note that |
58 | 59 | // pointers are not owned by the class and are to be deleted by the caller. |
59 | DictionaryPredictor(const ConverterInterface *converter, | |
60 | DictionaryPredictor(const DataManagerInterface& data_manager, | |
61 | const ConverterInterface *converter, | |
60 | 62 | const ImmutableConverterInterface *immutable_converter, |
61 | 63 | const dictionary::DictionaryInterface *dictionary, |
62 | 64 | const dictionary::DictionaryInterface *suffix_dictionary, |
64 | 66 | const Segmenter *segmenter, |
65 | 67 | const dictionary::POSMatcher *pos_matcher, |
66 | 68 | const SuggestionFilter *suggestion_filter); |
67 | virtual ~DictionaryPredictor(); | |
68 | ||
69 | virtual bool PredictForRequest(const ConversionRequest &request, | |
70 | Segments *segments) const; | |
71 | ||
72 | virtual void Finish(const ConversionRequest &request, Segments *segments); | |
73 | ||
74 | virtual const string &GetPredictorName() const { return predictor_name_; } | |
69 | ~DictionaryPredictor() override; | |
70 | ||
71 | bool PredictForRequest(const ConversionRequest &request, | |
72 | Segments *segments) const override; | |
73 | ||
74 | void Finish(const ConversionRequest &request, Segments *segments) override; | |
75 | ||
76 | const string &GetPredictorName() const override { return predictor_name_; } | |
75 | 77 | |
76 | 78 | protected: |
77 | 79 | // Protected members for unittesting |
233 | 235 | static bool GetZeroQueryCandidatesForKey( |
234 | 236 | const ConversionRequest &request, |
235 | 237 | const string &key, |
236 | const ZeroQueryList *begin, | |
237 | const ZeroQueryList *end, | |
238 | const ZeroQueryDict &dict, | |
238 | 239 | vector<ZeroQueryResult> *results); |
239 | 240 | |
240 | 241 | static void AppendZeroQueryToResults( |
463 | 464 | const SuggestionFilter *suggestion_filter_; |
464 | 465 | const uint16 counter_suffix_word_id_; |
465 | 466 | const string predictor_name_; |
467 | ZeroQueryDict zero_query_dict_; | |
468 | ZeroQueryDict zero_query_number_dict_; | |
466 | 469 | |
467 | 470 | DISALLOW_COPY_AND_ASSIGN(DictionaryPredictor); |
468 | 471 | }; |
38 | 38 | #include "base/flags.h" |
39 | 39 | #include "base/logging.h" |
40 | 40 | #include "base/port.h" |
41 | #include "base/serialized_string_array.h" | |
41 | 42 | #include "base/singleton.h" |
42 | 43 | #include "base/system_util.h" |
43 | 44 | #include "base/util.h" |
63 | 64 | #include "dictionary/suppression_dictionary.h" |
64 | 65 | #include "dictionary/system/system_dictionary.h" |
65 | 66 | #include "prediction/suggestion_filter.h" |
66 | #include "prediction/zero_query_list.h" | |
67 | #include "prediction/zero_query_dict.h" | |
67 | 68 | #include "protocol/commands.pb.h" |
68 | 69 | #include "protocol/config.pb.h" |
69 | 70 | #include "request/conversion_request.h" |
164 | 165 | // Test-only subclass: Just changing access levels |
165 | 166 | public: |
166 | 167 | TestableDictionaryPredictor( |
168 | const DataManagerInterface &data_manager, | |
167 | 169 | const ConverterInterface *converter, |
168 | 170 | const ImmutableConverterInterface *immutable_converter, |
169 | 171 | const DictionaryInterface *dictionary, |
172 | 174 | const Segmenter *segmenter, |
173 | 175 | const POSMatcher *pos_matcher, |
174 | 176 | const SuggestionFilter *suggestion_filter) |
175 | : DictionaryPredictor(converter, | |
177 | : DictionaryPredictor(data_manager, | |
178 | converter, | |
176 | 179 | immutable_converter, |
177 | 180 | dictionary, |
178 | 181 | suffix_dictionary, |
213 | 216 | // suffix dictionary is singleton. |
214 | 217 | void Init(const DictionaryInterface *dictionary = NULL, |
215 | 218 | const DictionaryInterface *suffix_dictionary = NULL) { |
216 | testing::MockDataManager data_manager; | |
217 | ||
218 | pos_matcher_.Set(data_manager.GetPOSMatcherData()); | |
219 | pos_matcher_.Set(data_manager_.GetPOSMatcherData()); | |
219 | 220 | suppression_dictionary_.reset(new SuppressionDictionary); |
220 | 221 | if (!dictionary) { |
221 | 222 | dictionary_mock_ = new DictionaryMock; |
226 | 227 | } |
227 | 228 | if (!suffix_dictionary) { |
228 | 229 | suffix_dictionary_.reset( |
229 | CreateSuffixDictionaryFromDataManager(data_manager)); | |
230 | CreateSuffixDictionaryFromDataManager(data_manager_)); | |
230 | 231 | } else { |
231 | 232 | suffix_dictionary_.reset(suffix_dictionary); |
232 | 233 | } |
233 | 234 | CHECK(suffix_dictionary_.get()); |
234 | 235 | |
235 | connector_.reset(Connector::CreateFromDataManager(data_manager)); | |
236 | connector_.reset(Connector::CreateFromDataManager(data_manager_)); | |
236 | 237 | CHECK(connector_.get()); |
237 | 238 | |
238 | segmenter_.reset(Segmenter::CreateFromDataManager(data_manager)); | |
239 | segmenter_.reset(Segmenter::CreateFromDataManager(data_manager_)); | |
239 | 240 | CHECK(segmenter_.get()); |
240 | 241 | |
241 | pos_group_.reset(new PosGroup(data_manager.GetPosGroupData())); | |
242 | suggestion_filter_.reset(CreateSuggestionFilter(data_manager)); | |
242 | pos_group_.reset(new PosGroup(data_manager_.GetPosGroupData())); | |
243 | suggestion_filter_.reset(CreateSuggestionFilter(data_manager_)); | |
243 | 244 | immutable_converter_.reset( |
244 | 245 | new ImmutableConverterImpl(dictionary_.get(), |
245 | 246 | suffix_dictionary_.get(), |
251 | 252 | suggestion_filter_.get())); |
252 | 253 | converter_.reset(new ConverterMock()); |
253 | 254 | dictionary_predictor_.reset( |
254 | new TestableDictionaryPredictor(converter_.get(), | |
255 | new TestableDictionaryPredictor(data_manager_, | |
256 | converter_.get(), | |
255 | 257 | immutable_converter_.get(), |
256 | 258 | dictionary_.get(), |
257 | 259 | suffix_dictionary_.get(), |
282 | 284 | } |
283 | 285 | |
284 | 286 | private: |
287 | const testing::MockDataManager data_manager_; | |
285 | 288 | POSMatcher pos_matcher_; |
286 | 289 | unique_ptr<SuppressionDictionary> suppression_dictionary_; |
287 | 290 | unique_ptr<const Connector> connector_; |
1572 | 1575 | CreateSuggestionFilter(data_manager)); |
1573 | 1576 | const dictionary::POSMatcher pos_matcher(data_manager.GetPOSMatcherData()); |
1574 | 1577 | unique_ptr<TestableDictionaryPredictor> predictor( |
1575 | new TestableDictionaryPredictor(converter.get(), | |
1578 | new TestableDictionaryPredictor(data_manager, | |
1579 | converter.get(), | |
1576 | 1580 | immutable_converter.get(), |
1577 | 1581 | dictionary.get(), |
1578 | 1582 | suffix_dictionary.get(), |
3166 | 3170 | CreateSuggestionFilter(data_manager)); |
3167 | 3171 | const dictionary::POSMatcher pos_matcher(data_manager.GetPOSMatcherData()); |
3168 | 3172 | unique_ptr<TestableDictionaryPredictor> predictor( |
3169 | new TestableDictionaryPredictor(converter.get(), | |
3173 | new TestableDictionaryPredictor(data_manager, | |
3174 | converter.get(), | |
3170 | 3175 | immutable_converter.get(), |
3171 | 3176 | dictionary.get(), |
3172 | 3177 | suffix_dictionary.get(), |
3358 | 3363 | } |
3359 | 3364 | |
3360 | 3365 | namespace { |
3361 | const char *kTestKey0 = "\xe3\x81\x82"; // "あ" | |
3362 | const ZeroQueryEntry kTestValues0[] = { | |
3363 | // emoji exclamation | |
3364 | {ZERO_QUERY_EMOJI, "", EMOJI_DOCOMO | EMOJI_SOFTBANK, 0xfeb04}, | |
3365 | {ZERO_QUERY_EMOJI, "\xE2\x9D\x95", EMOJI_UNICODE, 0xfeb0b}, // "❕" | |
3366 | {ZERO_QUERY_NONE, "\xE2\x9D\xA3", EMOJI_NONE, 0x0}, // "❣" | |
3366 | ||
3367 | const char kTestTokenArray[] = | |
3368 | // {"あ", "", ZERO_QUERY_EMOJI, EMOJI_DOCOMO | EMOJI_SOFTBANK, 0xfeb04} | |
3369 | "\x04\x00\x00\x00" | |
3370 | "\x00\x00\x00\x00" | |
3371 | "\x03\x00" | |
3372 | "\x06\x00" | |
3373 | "\x04\xeb\x0f\x00" | |
3374 | // {"あ", "❕", ZERO_QUERY_EMOJI, EMOJI_UNICODE, 0xfeb0b}, | |
3375 | "\x04\x00\x00\x00" | |
3376 | "\x02\x00\x00\x00" | |
3377 | "\x03\x00" | |
3378 | "\x01\x00" | |
3379 | "\x0b\xeb\x0f\x00" | |
3380 | // {"あ", "❣", ZERO_QUERY_NONE, EMOJI_NONE, 0x00}, | |
3381 | "\x04\x00\x00\x00" | |
3382 | "\x03\x00\x00\x00" | |
3383 | "\x00\x00" | |
3384 | "\x00\x00" | |
3385 | "\x00\x00\x00\x00" | |
3386 | // {"ああ", "( •̀ㅁ•́;)", ZERO_QUERY_EMOTICON, EMOJI_NONE, 0x00} | |
3387 | "\x05\x00\x00\x00" | |
3388 | "\x01\x00\x00\x00" | |
3389 | "\x02\x00" | |
3390 | "\x00\x00" | |
3391 | "\x00\x00\x00\x00"; | |
3392 | ||
3393 | const char *kTestStrings[] = { | |
3394 | "", | |
3395 | // "( •̀ㅁ•́;)" | |
3396 | "\x28\x20\xE2\x80\xA2\xCC\x80\xE3\x85\x81\xE2\x80\xA2\xCC\x81\x3B\x29", | |
3397 | "\xE2\x9D\x95", // "❕" | |
3398 | "\xE2\x9D\xA3", // "❣" | |
3399 | "\xE3\x81\x82", // "あ" | |
3400 | "\xE3\x81\x82\xE3\x81\x82", // "ああ" | |
3367 | 3401 | }; |
3368 | const char *kTestKey1 = "\xe3\x81\x82\xe3\x81\x82"; // "ああ" | |
3369 | const ZeroQueryEntry kTestValues1[] = { | |
3370 | // "( •̀ㅁ•́;)" | |
3371 | { | |
3372 | ZERO_QUERY_EMOTICON, | |
3373 | "\x28\x20\xE2\x80\xA2\xCC\x80\xE3\x85\x81\xE2\x80\xA2\xCC\x81\x3B\x29", | |
3374 | EMOJI_NONE, 0x0 | |
3375 | }, | |
3376 | }; | |
3377 | const ZeroQueryList kTestData_data[] = { | |
3378 | {kTestKey0, kTestValues0, 3}, | |
3379 | {kTestKey1, kTestValues1, 1}, | |
3380 | }; | |
3381 | const size_t kTestData_size = 2; | |
3382 | 3402 | |
3383 | 3403 | struct TestEntry { |
3384 | 3404 | int32 available_emoji_carrier; |
3411 | 3431 | types.c_str()); |
3412 | 3432 | } |
3413 | 3433 | }; |
3434 | ||
3414 | 3435 | } // namespace |
3415 | 3436 | |
3416 | 3437 | TEST_F(DictionaryPredictorTest, GetZeroQueryCandidates) { |
3438 | // Create test zero query data. | |
3439 | std::unique_ptr<uint32[]> string_data_buffer; | |
3440 | ZeroQueryDict zero_query_dict; | |
3441 | { | |
3442 | // kTestTokenArray contains a trailing '\0', so create a StringPiece that | |
3443 | // excludes it by subtracting 1. | |
3444 | const StringPiece token_array_data(kTestTokenArray, | |
3445 | arraysize(kTestTokenArray) - 1); | |
3446 | vector<StringPiece> strs; | |
3447 | for (const char *str : kTestStrings) { | |
3448 | strs.push_back(str); | |
3449 | } | |
3450 | const StringPiece string_array_data = | |
3451 | SerializedStringArray::SerializeToBuffer(strs, &string_data_buffer); | |
3452 | zero_query_dict.Init(token_array_data, string_array_data); | |
3453 | } | |
3454 | ||
3417 | 3455 | vector<TestEntry> test_entries; |
3418 | 3456 | { |
3419 | 3457 | TestEntry entry; |
3525 | 3563 | vector<DictionaryPredictor::ZeroQueryResult> actual_candidates; |
3526 | 3564 | const bool actual_result = |
3527 | 3565 | DictionaryPredictor::GetZeroQueryCandidatesForKey( |
3528 | request, test_entry.key, | |
3529 | kTestData_data, kTestData_data + kTestData_size, | |
3530 | &actual_candidates); | |
3566 | request, test_entry.key, zero_query_dict, &actual_candidates); | |
3531 | 3567 | EXPECT_EQ(test_entry.expected_result, actual_result) |
3532 | 3568 | << test_entry.DebugString(); |
3533 | 3569 | for (size_t j = 0; j < test_entry.expected_candidates.size(); ++j) { |
0 | # -*- coding: utf-8 -*- | |
1 | # Copyright 2010-2016, Google Inc. | |
2 | # All rights reserved. | |
3 | # | |
4 | # Redistribution and use in source and binary forms, with or without | |
5 | # modification, are permitted provided that the following conditions are | |
6 | # met: | |
7 | # | |
8 | # * Redistributions of source code must retain the above copyright | |
9 | # notice, this list of conditions and the following disclaimer. | |
10 | # * Redistributions in binary form must reproduce the above | |
11 | # copyright notice, this list of conditions and the following disclaimer | |
12 | # in the documentation and/or other materials provided with the | |
13 | # distribution. | |
14 | # * Neither the name of Google Inc. nor the names of its | |
15 | # contributors may be used to endorse or promote products derived from | |
16 | # this software without specific prior written permission. | |
17 | # | |
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | ||
30 | """Generate header file of a string array for zero query suggestion. | |
31 | ||
32 | Usage: | |
33 | gen_embedded_string_array_for_zero_query.py --input input.def \ | |
34 | --output /path/to/output/zero_query_hoge.h --var_name ZeroQueryHoge | |
35 | ||
36 | Input format: | |
37 | <key> <TAB> <candidate_1>,<candidate_2>,..,<candidate_n> | |
38 | ... | |
39 | For more details, please refer to definition files under mozc/data/zero_query/ | |
40 | ||
41 | Output format: | |
42 | const char *Var0[] = {"Key0", "Cand00", "Cand01", .., 0}; | |
43 | const char *Var1[] = {"Key1", "Cand10", "Cand11", .., 0}; | |
44 | ||
45 | const char **Var[] = {Var0, Var1, .., VarN}; | |
46 | ||
47 | Here, the arrays are sorted by key (Key0, Key1, ...) so that we can use binary search. | |
48 | """ | |
49 | ||
50 | __author__ = "toshiyuki" | |
51 | ||
52 | import optparse | |
53 | import os | |
54 | ||
55 | ||
56 | _MOZC_DIR_FOR_DEFINE_GUARD = 'MOZC' | |
57 | ||
58 | ||
59 | def EscapeString(string): | |
60 | """Escapes string.""" | |
61 | return '"' + string.encode('string_escape') + '"' | |
62 | ||
63 | ||
64 | def GetDefineGuardSymbol(file_name): | |
65 | """Returns define guard symbol for .h file. | |
66 | ||
67 | For example, returns 'SOME_EXAMPLE_H' for '/path/to/some_example.h' | |
68 | ||
69 | Args: | |
70 | file_name: a string indicating output file path. | |
71 | Returns: | |
72 | A string for define guard. | |
73 | """ | |
74 | return os.path.basename(file_name).upper().replace('.', '_') | |
75 | ||
76 | ||
77 | def GetDefineGuardHeaderLines(output_file_name): | |
78 | """Returns define guard header for .h file.""" | |
79 | result = [] | |
80 | result.append( | |
81 | '#ifndef %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, | |
82 | GetDefineGuardSymbol(output_file_name))) | |
83 | result.append( | |
84 | '#define %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, | |
85 | GetDefineGuardSymbol(output_file_name))) | |
86 | return result | |
87 | ||
88 | ||
89 | def GetDefineGuardFooterLines(output_file_name): | |
90 | """Returns define guard footer for .h file.""" | |
91 | return [ | |
92 | '#endif // %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, | |
93 | GetDefineGuardSymbol(output_file_name))] | |
94 | ||
95 | ||
96 | def GetZeroQueryRules(input_file_name): | |
97 | """Returns zero query triggering rules. The list is sorted by key.""" | |
98 | rules = [] | |
99 | with open(input_file_name, 'r') as input_file: | |
100 | for line in input_file: | |
101 | if line.startswith('#'): | |
102 | continue | |
103 | line = line.rstrip('\r\n') | |
104 | if not line: | |
105 | continue | |
106 | ||
107 | tokens = line.split('\t') | |
108 | key = tokens[0] | |
109 | values = tokens[1].split(',') | |
110 | ||
111 | rules.append((key, values)) | |
112 | rules.sort(lambda x, y: cmp(x[0], y[0])) # For binary search | |
113 | return rules | |
114 | ||
115 | ||
116 | def GetHeaderContents(input_file_name, var_name, output_file_name): | |
117 | """Returns contents for header file that contains a string array.""" | |
118 | zero_query_rules = GetZeroQueryRules(input_file_name) | |
119 | ||
120 | result = [] | |
121 | result.extend(GetDefineGuardHeaderLines(output_file_name)) | |
122 | result.append('namespace mozc {') | |
123 | result.append('namespace {') | |
124 | ||
125 | for i, rule in enumerate(zero_query_rules): | |
126 | result.append('const char *%s%d[] = {' % (var_name, i)) | |
127 | result.append(' ' + ', '.join( | |
128 | [EscapeString(s) for s in [rule[0]] + rule[1]] + ['0'])) | |
129 | result.append('};') | |
130 | ||
131 | result.append('} // namespace') | |
132 | ||
133 | result.append('const char **%s_data[] = {' % var_name) | |
134 | result.append(' ' + ', '.join( | |
135 | ['%s%d' % (var_name, c) for c in range(len(zero_query_rules))])) | |
136 | result.append('};') | |
137 | result.append( | |
138 | 'const size_t %s_size = %d;' % (var_name, len(zero_query_rules))) | |
139 | ||
140 | result.append('} // namespace mozc') | |
141 | result.extend(GetDefineGuardFooterLines(output_file_name)) | |
142 | return result | |
143 | ||
144 | ||
145 | def ParseOption(): | |
146 | """Parses command line options.""" | |
147 | parser = optparse.OptionParser() | |
148 | parser.add_option('--input', dest='input', help='Input file path') | |
149 | parser.add_option('--output', dest='output', help='Output file path') | |
150 | parser.add_option( | |
151 | '--var_name', dest='var_name', help='Var name for the array') | |
152 | return parser.parse_args()[0] | |
153 | ||
154 | ||
155 | def main(): | |
156 | options = ParseOption() | |
157 | lines = GetHeaderContents(options.input, options.var_name, options.output) | |
158 | with open(options.output, 'w') as out_file: | |
159 | out_file.write('\n'.join(lines)) | |
160 | ||
161 | ||
162 | if __name__ == '__main__': | |
163 | main() |
36 | 36 | import sys |
37 | 37 | import unicodedata |
38 | 38 | from build_tools import code_generator_util |
39 | from prediction import codegen_util_for_zero_query as util | |
40 | ||
41 | ||
42 | _VAR_NAME_FOR_HEADER = 'kZeroQueryData' | |
39 | from prediction import gen_zero_query_util as util | |
43 | 40 | |
44 | 41 | |
45 | 42 | def ParseCodePoint(s): |
294 | 291 | parser.add_option('--input_emoji', dest='input_emoji', help='emoji data file') |
295 | 292 | parser.add_option( |
296 | 293 | '--input_emoticon', dest='input_emoticon', help='emoticon data file') |
297 | parser.add_option('--output', dest='output', help='output header file') | |
294 | parser.add_option('--output_token_array', dest='output_token_array', | |
295 | help='output token array file') | |
296 | parser.add_option('--output_string_array', dest='output_string_array', | |
297 | help='output string array file') | |
298 | 298 | return parser.parse_args()[0] |
299 | 299 | |
300 | 300 | |
313 | 313 | zero_query_rule_dict, zero_query_symbol_dict, |
314 | 314 | zero_query_emoji_dict, zero_query_emoticon_dict) |
315 | 315 | |
316 | with open(options.output, 'w') as output_stream: | |
317 | util.WriteHeaderFileForZeroQuery( | |
318 | merged_zero_query_dict, options.output, | |
319 | _VAR_NAME_FOR_HEADER, output_stream) | |
316 | util.WriteZeroQueryData(merged_zero_query_dict, | |
317 | options.output_token_array, | |
318 | options.output_string_array) | |
320 | 319 | |
321 | 320 | |
322 | 321 | if __name__ == '__main__': |
31 | 31 | |
32 | 32 | from collections import defaultdict |
33 | 33 | import optparse |
34 | from prediction import codegen_util_for_zero_query as util | |
35 | 34 | |
36 | ||
37 | _VAR_NAME_FOR_HEADER = 'kZeroQueryNum' | |
35 | from prediction import gen_zero_query_util as util | |
38 | 36 | |
39 | 37 | |
40 | 38 | def ReadZeroQueryNumberData(input_stream): |
63 | 61 | """Parses command line options.""" |
64 | 62 | parser = optparse.OptionParser() |
65 | 63 | parser.add_option('--input', dest='input', help='Input file path') |
66 | parser.add_option('--output', dest='output', help='Output file path') | |
64 | parser.add_option('--output_token_array', dest='output_token_array', | |
65 | help='Output token array file path') | |
66 | parser.add_option('--output_string_array', dest='output_string_array', | |
67 | help='Output string array file path') | |
67 | 68 | return parser.parse_args()[0] |
68 | 69 | |
69 | 70 | |
71 | 72 | options = ParseOption() |
72 | 73 | with open(options.input, 'r') as input_stream: |
73 | 74 | zero_query_dict = ReadZeroQueryNumberData(input_stream) |
74 | ||
75 | with open(options.output, 'w') as output_stream: | |
76 | util.WriteHeaderFileForZeroQuery( | |
77 | zero_query_dict, options.output, _VAR_NAME_FOR_HEADER, output_stream) | |
75 | util.WriteZeroQueryData(zero_query_dict, | |
76 | options.output_token_array, | |
77 | options.output_string_array) | |
78 | 78 | |
79 | 79 | |
80 | 80 | if __name__ == '__main__': |
0 | # -*- coding: utf-8 -*- | |
1 | # Copyright 2010-2016, Google Inc. | |
2 | # All rights reserved. | |
3 | # | |
4 | # Redistribution and use in source and binary forms, with or without | |
5 | # modification, are permitted provided that the following conditions are | |
6 | # met: | |
7 | # | |
8 | # * Redistributions of source code must retain the above copyright | |
9 | # notice, this list of conditions and the following disclaimer. | |
10 | # * Redistributions in binary form must reproduce the above | |
11 | # copyright notice, this list of conditions and the following disclaimer | |
12 | # in the documentation and/or other materials provided with the | |
13 | # distribution. | |
14 | # * Neither the name of Google Inc. nor the names of its | |
15 | # contributors may be used to endorse or promote products derived from | |
16 | # this software without specific prior written permission. | |
17 | # | |
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | ||
30 | """Generate binary data for zero query suggestion. | |
31 | ||
32 | For output format, see zero_query_dict.h. | |
33 | """ | |
34 | ||
35 | __author__ = "toshiyuki" | |
36 | ||
37 | import os | |
38 | import struct | |
39 | ||
40 | from build_tools import code_generator_util as cgu | |
41 | from build_tools import serialized_string_array_builder | |
42 | ||
43 | ||
44 | ZERO_QUERY_TYPE_NONE = 0 | |
45 | ZERO_QUERY_TYPE_NUMBER_SUFFIX = 1 | |
46 | ZERO_QUERY_TYPE_EMOTICON = 2 | |
47 | ZERO_QUERY_TYPE_EMOJI = 3 | |
48 | ||
49 | # bit fields | |
50 | # These are standing for command::Request::EmojiCarrierType | |
51 | EMOJI_TYPE_NONE = 0 | |
52 | EMOJI_TYPE_UNICODE = 1 | |
53 | EMOJI_TYPE_DOCOMO = 2 | |
54 | EMOJI_TYPE_SOFTBANK = 4 | |
55 | EMOJI_TYPE_KDDI = 8 | |
56 | ||
57 | ||
58 | class ZeroQueryEntry(object): | |
59 | ||
60 | def __init__(self, entry_type, value, emoji_type, emoji_android_pua): | |
61 | self.entry_type = entry_type | |
62 | self.value = value | |
63 | self.emoji_type = emoji_type | |
64 | self.emoji_android_pua = emoji_android_pua | |
65 | ||
66 | ||
67 | def WriteZeroQueryData(zero_query_dict, output_token_array, | |
68 | output_string_array): | |
69 | # Collect all the strings and assign indices in ascending order | |
70 | string_index = {} | |
71 | for key, entry_list in zero_query_dict.iteritems(): | |
72 | string_index[key] = 0 | |
73 | for entry in entry_list: | |
74 | string_index[entry.value] = 0 | |
75 | sorted_strings = sorted(string_index) | |
76 | for i, s in enumerate(sorted_strings): | |
77 | string_index[s] = i | |
78 | ||
79 | with open(output_token_array, 'wb') as f: | |
80 | for key in sorted(zero_query_dict): | |
81 | for entry in zero_query_dict[key]: | |
82 | f.write(struct.pack('<I', string_index[key])) | |
83 | f.write(struct.pack('<I', string_index[entry.value])) | |
84 | f.write(struct.pack('<H', entry.entry_type)) | |
85 | f.write(struct.pack('<H', entry.emoji_type)) | |
86 | f.write(struct.pack('<I', entry.emoji_android_pua)) | |
87 | ||
88 | serialized_string_array_builder.SerializeToFile(sorted_strings, | |
89 | output_string_array) |
58 | 58 | '../session/session_base.gyp:request_test_util', |
59 | 59 | '../storage/storage.gyp:storage', |
60 | 60 | '../usage_stats/usage_stats_base.gyp:usage_stats', |
61 | 'gen_zero_query_data#host', | |
62 | 'gen_zero_query_number_data#host', | |
63 | 61 | 'prediction_base.gyp:suggestion_filter', |
64 | 62 | 'prediction_protocol', |
65 | ], | |
66 | }, | |
67 | { | |
68 | 'target_name': 'gen_zero_query_number_data', | |
69 | 'type': 'none', | |
70 | 'toolsets': ['host'], | |
71 | 'actions': [ | |
72 | { | |
73 | 'action_name': 'gen_zero_query_number_data', | |
74 | 'variables': { | |
75 | 'input_files': [ | |
76 | '../data/zero_query/zero_query_number.def', | |
77 | ], | |
78 | }, | |
79 | 'inputs': [ | |
80 | 'gen_zero_query_number_data.py', | |
81 | 'codegen_util_for_zero_query.py', | |
82 | '<@(input_files)', | |
83 | ], | |
84 | 'outputs': [ | |
85 | '<(gen_out_dir)/zero_query_number_data.h', | |
86 | ], | |
87 | 'action': [ | |
88 | 'python', 'gen_zero_query_number_data.py', | |
89 | '--input=<@(input_files)', | |
90 | '--output=<(gen_out_dir)/zero_query_number_data.h', | |
91 | ], | |
92 | 'message': 'Generating <(gen_out_dir)/zero_query_number_data.h', | |
93 | }, | |
94 | ], | |
95 | }, | |
96 | { | |
97 | 'target_name': 'gen_zero_query_data', | |
98 | 'type': 'none', | |
99 | 'toolsets': ['host'], | |
100 | 'actions': [ | |
101 | { | |
102 | 'action_name': 'gen_zero_query_data', | |
103 | 'variables': { | |
104 | 'input_rule': '../data/zero_query/zero_query.def', | |
105 | 'input_symbol': '../data/symbol/symbol.tsv', | |
106 | 'input_emoji': '../data/emoji/emoji_data.tsv', | |
107 | 'input_emoticon': '../data/emoticon/categorized.tsv', | |
108 | }, | |
109 | 'inputs': [ | |
110 | 'gen_zero_query_data.py', | |
111 | 'codegen_util_for_zero_query.py', | |
112 | '<(input_rule)', | |
113 | '<(input_symbol)', | |
114 | '<(input_emoji)', | |
115 | '<(input_emoticon)', | |
116 | ], | |
117 | 'outputs': [ | |
118 | '<(gen_out_dir)/zero_query_data.h', | |
119 | ], | |
120 | 'action': [ | |
121 | 'python', 'gen_zero_query_data.py', | |
122 | '--input_rule=<(input_rule)', | |
123 | '--input_symbol=<(input_symbol)', | |
124 | '--input_emoji=<(input_emoji)', | |
125 | '--input_emoticon=<(input_emoticon)', | |
126 | '--output=<(gen_out_dir)/zero_query_data.h', | |
127 | ], | |
128 | 'message': 'Generating <(gen_out_dir)/zero_query_data.h', | |
129 | }, | |
130 | 63 | ], |
131 | 64 | }, |
132 | 65 | { |
0 | // Copyright 2010-2016, Google Inc. | |
1 | // All rights reserved. | |
2 | // | |
3 | // Redistribution and use in source and binary forms, with or without | |
4 | // modification, are permitted provided that the following conditions are | |
5 | // met: | |
6 | // | |
7 | // * Redistributions of source code must retain the above copyright | |
8 | // notice, this list of conditions and the following disclaimer. | |
9 | // * Redistributions in binary form must reproduce the above | |
10 | // copyright notice, this list of conditions and the following disclaimer | |
11 | // in the documentation and/or other materials provided with the | |
12 | // distribution. | |
13 | // * Neither the name of Google Inc. nor the names of its | |
14 | // contributors may be used to endorse or promote products derived from | |
15 | // this software without specific prior written permission. | |
16 | // | |
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ||
29 | #ifndef MOZC_PREDICTION_ZERO_QUERY_DICT_H_ | |
30 | #define MOZC_PREDICTION_ZERO_QUERY_DICT_H_ | |
31 | ||
32 | #include <algorithm> | |
33 | #include <iterator> | |
34 | #include <utility> | |
35 | ||
36 | #include "base/port.h" | |
37 | #include "base/serialized_string_array.h" | |
38 | ||
39 | namespace mozc { | |
40 | ||
41 | enum ZeroQueryType { | |
42 | ZERO_QUERY_NONE = 0, // "☁" (symbol, non-unicode 6.0 emoji), and rule based. | |
43 | ZERO_QUERY_NUMBER_SUFFIX, // "階" from "2" | |
44 | ZERO_QUERY_EMOTICON, // "(>ω<)" from "うれしい" | |
45 | ZERO_QUERY_EMOJI, // <umbrella emoji> from "かさ" | |
46 | // Following types are defined for usage stats. | |
47 | // The candidates of these types will not be stored in |ZeroQueryDict|. | |
48 | // - "ヒルズ" from "六本木" | |
49 | // These candidates will be generated from dictionary entries | |
50 | // such as "六本木ヒルズ". | |
51 | ZERO_QUERY_BIGRAM, | |
52 | // - "に" from "六本木". | |
53 | // These candidates will be generated from suffix dictionary. | |
54 | ZERO_QUERY_SUFFIX, | |
55 | }; | |
56 | ||
57 | // bit fields | |
58 | enum ZeroQueryEmojiType { | |
59 | EMOJI_NONE = 0, | |
60 | EMOJI_UNICODE = 1, | |
61 | EMOJI_DOCOMO = 2, | |
62 | EMOJI_SOFTBANK = 4, | |
63 | EMOJI_KDDI = 8, | |
64 | }; | |
65 | ||
66 | // Zero query dictionary is a multimap from string to a list of zero query | |
67 | // entries, where each entry can be looked up by equal_range() method. The data | |
68 | // is serialized to two binary data: token array and string array. Token array | |
69 | // encodes an array of zero query entries, where each entry is encoded in 16 | |
70 | // bytes as follows: | |
71 | // | |
72 | // ZeroQueryEntry { | |
73 | // uint32 key_index: 4 bytes | |
74 | // uint32 value_index: 4 bytes | |
75 | // ZeroQueryType type: 2 bytes | |
76 | // uint16 emoji_type: 2 bytes | |
77 | // uint32 emoji_android_pua: 4 bytes | |
78 | // } | |
79 | // | |
80 | // The token array is sorted in ascending order of key_index for binary search. | |
81 | // String values of key and value are encoded separately in the string array, | |
82 | // which can be extracted by using |key_index| and |value_index|. The string | |
83 | // array is also sorted in ascending order of strings. For the serialization | |
84 | // format of string array, see "base/serialized_string_array.h". | |
85 | class ZeroQueryDict { | |
86 | public: | |
87 | static const size_t kTokenByteSize = 16; | |
88 | ||
89 | class iterator : public std::iterator<std::random_access_iterator_tag, | |
90 | uint32> { | |
91 | public: | |
92 | iterator(const char *ptr, const SerializedStringArray *array) | |
93 | : ptr_(ptr), string_array_(array) {} | |
94 | iterator(const iterator& x) = default; | |
95 | iterator& operator=(const iterator& x) = default; | |
96 | ||
97 | uint32 operator*() const { return key_index(); } | |
98 | ||
99 | uint32 key_index() const { | |
100 | return *reinterpret_cast<const uint32 *>(ptr_); | |
101 | } | |
102 | ||
103 | uint32 value_index() const { | |
104 | return *reinterpret_cast<const uint32 *>(ptr_ + 4); | |
105 | } | |
106 | ||
107 | ZeroQueryType type() const { | |
108 | const uint16 val = *reinterpret_cast<const uint16 *>(ptr_ + 8); | |
109 | return static_cast<ZeroQueryType>(val); | |
110 | } | |
111 | ||
112 | uint16 emoji_type() const { | |
113 | return *reinterpret_cast<const uint16 *>(ptr_ + 10); | |
114 | } | |
115 | ||
116 | uint32 emoji_android_pua() const { | |
117 | return *reinterpret_cast<const uint32 *>(ptr_ + 12); | |
118 | } | |
119 | ||
120 | StringPiece key() const { return (*string_array_)[key_index()]; } | |
121 | StringPiece value() const { return (*string_array_)[value_index()]; } | |
122 | ||
123 | iterator &operator++() { | |
124 | ptr_ += kTokenByteSize; | |
125 | return *this; | |
126 | } | |
127 | ||
128 | iterator operator++(int) { | |
129 | const iterator tmp(ptr_, string_array_); | |
130 | ptr_ += kTokenByteSize; | |
131 | return tmp; | |
132 | } | |
133 | ||
134 | iterator &operator+=(ptrdiff_t n) { | |
135 | ptr_ += n * kTokenByteSize; | |
136 | return *this; | |
137 | } | |
138 | ||
139 | friend iterator operator+(iterator iter, ptrdiff_t n) { | |
140 | iter += n; | |
141 | return iter; | |
142 | } | |
143 | ||
144 | friend iterator operator+(ptrdiff_t n, iterator iter) { | |
145 | iter += n; | |
146 | return iter; | |
147 | } | |
148 | ||
149 | iterator &operator-=(ptrdiff_t n) { | |
150 | ptr_ -= n * kTokenByteSize; | |
151 | return *this; | |
152 | } | |
153 | ||
154 | friend iterator operator-(iterator iter, ptrdiff_t n) { | |
155 | iter -= n; | |
156 | return iter; | |
157 | } | |
158 | ||
159 | friend ptrdiff_t operator-(iterator x, iterator y) { | |
160 | return (x.ptr_ - y.ptr_) / kTokenByteSize; | |
161 | } | |
162 | ||
163 | friend bool operator==(iterator x, iterator y) { | |
164 | return x.ptr_ == y.ptr_; | |
165 | } | |
166 | ||
167 | friend bool operator!=(iterator x, iterator y) { | |
168 | return x.ptr_ != y.ptr_; | |
169 | } | |
170 | ||
171 | friend bool operator<(iterator x, iterator y) { | |
172 | return x.ptr_ < y.ptr_; | |
173 | } | |
174 | ||
175 | friend bool operator<=(iterator x, iterator y) { | |
176 | return x.ptr_ <= y.ptr_; | |
177 | } | |
178 | ||
179 | friend bool operator>(iterator x, iterator y) { | |
180 | return x.ptr_ > y.ptr_; | |
181 | } | |
182 | ||
183 | friend bool operator>=(iterator x, iterator y) { | |
184 | return x.ptr_ >= y.ptr_; | |
185 | } | |
186 | ||
187 | private: | |
188 | const char *ptr_; | |
189 | const SerializedStringArray * string_array_; | |
190 | }; | |
191 | ||
192 | void Init(StringPiece token_array_data, StringPiece string_array_data) { | |
193 | token_array_ = token_array_data; | |
194 | string_array_.Set(string_array_data); | |
195 | } | |
196 | ||
197 | iterator begin() const { | |
198 | return iterator(token_array_.data(), &string_array_); | |
199 | } | |
200 | ||
201 | iterator end() const { | |
202 | return iterator(token_array_.data() + token_array_.size(), | |
203 | &string_array_); | |
204 | } | |
205 | ||
206 | std::pair<iterator, iterator> equal_range(StringPiece key) const { | |
207 | const auto iter = std::lower_bound(string_array_.begin(), | |
208 | string_array_.end(), key); | |
209 | if (iter == string_array_.end() || *iter != key) { | |
210 | return std::pair<iterator, iterator>(end(), end()); | |
211 | } | |
212 | return std::equal_range(begin(), end(), iter.index()); | |
213 | } | |
214 | ||
215 | private: | |
216 | StringPiece token_array_; | |
217 | SerializedStringArray string_array_; | |
218 | }; | |
219 | ||
220 | } // namespace mozc | |
221 | ||
222 | #endif // MOZC_PREDICTION_ZERO_QUERY_DICT_H_ |
0 | // Copyright 2010-2016, Google Inc. | |
1 | // All rights reserved. | |
2 | // | |
3 | // Redistribution and use in source and binary forms, with or without | |
4 | // modification, are permitted provided that the following conditions are | |
5 | // met: | |
6 | // | |
7 | // * Redistributions of source code must retain the above copyright | |
8 | // notice, this list of conditions and the following disclaimer. | |
9 | // * Redistributions in binary form must reproduce the above | |
10 | // copyright notice, this list of conditions and the following disclaimer | |
11 | // in the documentation and/or other materials provided with the | |
12 | // distribution. | |
13 | // * Neither the name of Google Inc. nor the names of its | |
14 | // contributors may be used to endorse or promote products derived from | |
15 | // this software without specific prior written permission. | |
16 | // | |
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ||
29 | #ifndef MOZC_PREDICTION_ZERO_QUERY_LIST_H_ | |
30 | #define MOZC_PREDICTION_ZERO_QUERY_LIST_H_ | |
31 | ||
32 | #include "base/port.h" | |
33 | ||
34 | namespace mozc { | |
35 | enum ZeroQueryType { | |
36 | ZERO_QUERY_NONE = 0, // "☁" (symbol, non-unicode 6.0 emoji), and rule based. | |
37 | ZERO_QUERY_NUMBER_SUFFIX, // "階" from "2" | |
38 | ZERO_QUERY_EMOTICON, // "(>ω<)" from "うれしい" | |
39 | ZERO_QUERY_EMOJI, // <umbrella emoji> from "かさ" | |
40 | // Following types are defined for usage stats. | |
41 | // The candidates of these types will not be stored at |ZeroQueryList|. | |
42 | // - "ヒルズ" from "六本木" | |
43 | // These candidates will be generated from dictionary entries | |
44 | // such as "六本木ヒルズ". | |
45 | ZERO_QUERY_BIGRAM, | |
46 | // - "に" from "六本木". | |
47 | // These candidates will be generated from suffix dictionary. | |
48 | ZERO_QUERY_SUFFIX, | |
49 | }; | |
50 | ||
51 | // bit fields | |
52 | enum ZeroQueryEmojiType { | |
53 | EMOJI_NONE = 0, | |
54 | EMOJI_UNICODE = 1, | |
55 | EMOJI_DOCOMO = 2, | |
56 | EMOJI_SOFTBANK = 4, | |
57 | EMOJI_KDDI = 8, | |
58 | }; | |
59 | ||
60 | struct ZeroQueryEntry { | |
61 | ZeroQueryType type; | |
62 | const char *value; | |
63 | uint8 emoji_type; // ZeroQueryEmojiType | |
64 | // The carrier dependent emoji code point on Android. | |
65 | uint32 emoji_android_pua; | |
66 | }; | |
67 | ||
68 | struct ZeroQueryList { | |
69 | const char *key; | |
70 | const ZeroQueryEntry *entries; | |
71 | const size_t entries_size; | |
72 | }; | |
73 | } // namespace mozc | |
74 | ||
75 | #endif // MOZC_PREDICTION_ZERO_QUERY_LIST_H_ |