Codebase list mozc / c4a2d46
Move zero query data to data set file BUG= TEST= REF_BUG=26841123 REF_CL=123191246 REF_TIME=2016-05-25T17:46:35+09:00 REF_TIME_RAW=1464165995 +0900 Noriyuki Takahashi 7 years ago
18 changed file(s) with 580 addition(s) and 602 deletion(s). Raw diff Collapse all Expand all
186186
187187 // This struct holds resources used by converter.
188188 struct ConverterAndData {
189 std::unique_ptr<testing::MockDataManager> data_manager;
189190 std::unique_ptr<DictionaryInterface> user_dictionary;
190191 std::unique_ptr<SuppressionDictionary> suppression_dictionary;
191192 std::unique_ptr<DictionaryInterface> suffix_dictionary;
238239 // Create a predictor with three sub-predictors, dictionary predictor, user
239240 // history predictor, and extra predictor.
240241 PredictorInterface *dictionary_predictor =
241 new DictionaryPredictor(converter_and_data.converter.get(),
242 new DictionaryPredictor(*converter_and_data.data_manager,
243 converter_and_data.converter.get(),
242244 converter_and_data.immutable_converter.get(),
243245 converter_and_data.dictionary.get(),
244246 converter_and_data.suffix_dictionary.get(),
271273 RewriterInterface *rewriter,
272274 PredictorType predictor_type,
273275 ConverterAndData *converter_and_data) {
274 testing::MockDataManager data_manager;
276 converter_and_data->data_manager.reset(new testing::MockDataManager());
277 const auto &data_manager = *converter_and_data->data_manager;
275278
276279 const char *dictionary_data = nullptr;
277280 int dictionary_size = 0;
14051408 suppression_dictionary.get(),
14061409 DefaultPredictor::CreateDefaultPredictor(
14071410 new DictionaryPredictor(
1411 data_manager,
14081412 converter.get(),
14091413 immutable_converter.get(),
14101414 dictionary.get(),
00 MAJOR=2
11 MINOR=18
2 BUILD=2568
2 BUILD=2569
33 REVISION=102
44 # CAUTION: NACL_DICTIONARY_VERSION is going to be migrated to ENGINE_VERSION.
55 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
301301 LOG(ERROR) << "Single Kanji data is broken";
302302 return Status::DATA_BROKEN;
303303 }
304 if (!reader.Get("zero_query_token_array",
305 &zero_query_token_array_data_) ||
306 !reader.Get("zero_query_string_array",
307 &zero_query_string_array_data_) ||
308 !reader.Get("zero_query_number_token_array",
309 &zero_query_number_token_array_data_) ||
310 !reader.Get("zero_query_number_string_array",
311 &zero_query_number_string_array_data_)) {
312 LOG(ERROR) << "Cannot find zero query data";
313 return Status::DATA_MISSING;
314 }
315 if (!SerializedStringArray::VerifyData(zero_query_string_array_data_) ||
316 !SerializedStringArray::VerifyData(
317 zero_query_number_string_array_data_)) {
318 LOG(ERROR) << "Zero query data is broken";
319 return Status::DATA_BROKEN;
320 }
304321
305322 if (!reader.Get("usage_item_array", &usage_items_data_)) {
306323 VLOG(2) << "Usage dictionary is not provided";
495512 *size = counter_suffix_data_.size();
496513 }
497514
// Hands out the four zero query data blobs held by this DataManager.
// Each output parameter is assigned from the member of the same name (with a
// trailing underscore); those members are the ones looked up under the
// "zero_query_*" names when the data set is read. All four pointers are
// dereferenced unconditionally, so callers must pass non-null pointers.
515 void DataManager::GetZeroQueryData(
516 StringPiece *zero_query_token_array_data,
517 StringPiece *zero_query_string_array_data,
518 StringPiece *zero_query_number_token_array_data,
519 StringPiece *zero_query_number_string_array_data) const {
520 *zero_query_token_array_data = zero_query_token_array_data_;
521 *zero_query_string_array_data = zero_query_string_array_data_;
522 *zero_query_number_token_array_data = zero_query_number_token_array_data_;
523 *zero_query_number_string_array_data = zero_query_number_string_array_data_;
524 }
525
498526 #ifndef NO_USAGE_REWRITER
499527 void DataManager::GetUsageRewriterData(
500528 StringPiece *base_conjugation_suffix_data,
119119 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host',
120120 'gen_separate_emoji_rewriter_data_for_<(dataset_tag)#host',
121121 'gen_separate_single_kanji_rewriter_data_for_<(dataset_tag)#host',
122 'gen_separate_zero_query_data_for_<(dataset_tag)#host',
122123 'gen_separate_version_data_for_<(dataset_tag)#host',
123124 ],
124125 'actions': [
160161 'single_kanji_variant_string': '<(gen_out_dir)/single_kanji_variant_string.data',
161162 'single_kanji_noun_prefix_token': '<(gen_out_dir)/single_kanji_noun_prefix_token.data',
162163 'single_kanji_noun_prefix_string': '<(gen_out_dir)/single_kanji_noun_prefix_string.data',
164 'zero_query_token_array': '<(gen_out_dir)/zero_query_token.data',
165 'zero_query_string_array': '<(gen_out_dir)/zero_query_string.data',
166 'zero_query_number_token_array': '<(gen_out_dir)/zero_query_number_token.data',
167 'zero_query_number_string_array': '<(gen_out_dir)/zero_query_number_string.data',
163168 'version': '<(gen_out_dir)/version.data',
164169 },
165170 'inputs': [
197202 '<(single_kanji_variant_string)',
198203 '<(single_kanji_noun_prefix_token)',
199204 '<(single_kanji_noun_prefix_string)',
205 '<(zero_query_token_array)',
206 '<(zero_query_string_array)',
207 '<(zero_query_number_token_array)',
208 '<(zero_query_number_string_array)',
200209 '<(version)',
201210 ],
202211 'outputs': [
240249 'single_kanji_variant_string:32:<(gen_out_dir)/single_kanji_variant_string.data',
241250 'single_kanji_noun_prefix_token:32:<(gen_out_dir)/single_kanji_noun_prefix_token.data',
242251 'single_kanji_noun_prefix_string:32:<(gen_out_dir)/single_kanji_noun_prefix_string.data',
252 'zero_query_token_array:32:<(gen_out_dir)/zero_query_token.data',
253 'zero_query_string_array:32:<(gen_out_dir)/zero_query_string.data',
254 'zero_query_number_token_array:32:<(gen_out_dir)/zero_query_number_token.data',
255 'zero_query_number_string_array:32:<(gen_out_dir)/zero_query_number_string.data',
243256 'version:32:<(gen_out_dir)/version.data',
244257 ],
245258 'conditions': [
891904 ],
892905 },
893906 {
# Host-only target that generates the zero query data files for this data
# set. It runs two generator scripts:
#   1. gen_zero_query_data.py: reads zero_query.def plus the symbol, emoji
#      and emoticon TSV files and writes zero_query_token.data and
#      zero_query_string.data.
#   2. gen_zero_query_number_data.py: reads zero_query_number.def and writes
#      zero_query_number_token.data and zero_query_number_string.data.
907 'target_name': 'gen_separate_zero_query_data_for_<(dataset_tag)',
908 'type': 'none',
909 'toolsets': ['host'],
910 'actions': [
911 {
# Action 1: general zero query data.
912 'action_name': 'gen_separate_zero_query_data_for_<(dataset_tag)',
913 'variables': {
914 'generator': '<(mozc_dir)/prediction/gen_zero_query_data.py',
915 'input_files': [
916 '<(mozc_dir)/data/emoji/emoji_data.tsv',
917 '<(mozc_dir)/data/emoticon/categorized.tsv',
918 '<(mozc_dir)/data/symbol/symbol.tsv',
919 '<(mozc_dir)/data/zero_query/zero_query.def',
920 ],
921 },
# The generator script itself is listed as an input, together with every
# data file it reads.
922 'inputs': [
923 '<(generator)',
924 '<@(input_files)',
925 ],
926 'outputs': [
927 '<(gen_out_dir)/zero_query_token.data',
928 '<(gen_out_dir)/zero_query_string.data',
929 ],
930 'action': [
931 'python', '<(generator)',
932 '--input_rule=<(mozc_dir)/data/zero_query/zero_query.def',
933 '--input_symbol=<(mozc_dir)/data/symbol/symbol.tsv',
934 '--input_emoji=<(mozc_dir)/data/emoji/emoji_data.tsv',
935 '--input_emoticon=<(mozc_dir)/data/emoticon/categorized.tsv',
936 '--output_token_array=<(gen_out_dir)/zero_query_token.data',
937 '--output_string_array=<(gen_out_dir)/zero_query_string.data',
938 ],
939 },
940 {
# Action 2: zero query data for numbers.
941 'action_name': 'gen_separate_zero_query_number_data_for_<(dataset_tag)',
942 'variables': {
943 'generator': '<(mozc_dir)/prediction/gen_zero_query_number_data.py',
944 'input_files': [
945 '<(mozc_dir)/data/zero_query/zero_query_number.def',
946 ],
947 },
948 'inputs': [
949 '<(generator)',
950 '<@(input_files)',
951 ],
952 'outputs': [
953 '<(gen_out_dir)/zero_query_number_token.data',
954 '<(gen_out_dir)/zero_query_number_string.data',
955 ],
956 'action': [
957 'python', '<(generator)',
958 '--input=<(mozc_dir)/data/zero_query/zero_query_number.def',
959 '--output_token_array=<(gen_out_dir)/zero_query_number_token.data',
960 '--output_string_array=<(gen_out_dir)/zero_query_number_string.data',
961 ],
962 },
963 ],
964 },
965 {
894966 'target_name': 'gen_separate_version_data_for_<(dataset_tag)',
895967 'type': 'none',
896968 'toolsets': ['host'],
118118 StringPiece *variant_string_array_data,
119119 StringPiece *noun_prefix_token_array_data,
120120 StringPiece *noun_prefix_string_array_data) const override;
// Returns, through the four output parameters, the token array and string
// array data for zero query prediction and for its number variant.
121 void GetZeroQueryData(
122 StringPiece *zero_query_token_array_data,
123 StringPiece *zero_query_string_array_data,
124 StringPiece *zero_query_number_token_array_data,
125 StringPiece *zero_query_number_string_array_data) const override;
121126
122127 #ifndef NO_USAGE_REWRITER
123128 void GetUsageRewriterData(
169174 StringPiece single_kanji_variant_string_array_data_;
170175 StringPiece single_kanji_noun_prefix_token_array_data_;
171176 StringPiece single_kanji_noun_prefix_string_array_data_;
177 StringPiece zero_query_token_array_data_;
178 StringPiece zero_query_string_array_data_;
179 StringPiece zero_query_number_token_array_data_;
180 StringPiece zero_query_number_string_array_data_;
172181 StringPiece usage_base_conjugation_suffix_data_;
173182 StringPiece usage_conjugation_suffix_data_;
174183 StringPiece usage_conjugation_index_data_;
3838 // files in data/dictionary, such as dictionary.txt, id.def, etc.
3939 class DataManagerInterface {
4040 public:
41 virtual ~DataManagerInterface() {}
41 virtual ~DataManagerInterface() = default;
4242
4343 // Returns data set for UserPOS.
4444 virtual void GetUserPOSData(StringPiece *token_array_data,
121121 virtual void GetCounterSuffixSortedArray(const char **array,
122122 size_t *size) const = 0;
123123
124 // Gets the zero query prediction data.
125 virtual void GetZeroQueryData(
126 StringPiece *zero_query_token_array_data,
127 StringPiece *zero_query_string_array_data,
128 StringPiece *zero_query_number_token_array_data,
129 StringPiece *zero_query_number_string_array_data) const = 0;
130
124131 // Gets the data version string.
125132 virtual StringPiece GetDataVersion() const = 0;
126133
127134 protected:
128 DataManagerInterface() {}
135 DataManagerInterface() = default;
129136
130137 private:
131138 DISALLOW_COPY_AND_ASSIGN(DataManagerInterface);
243243 // Create a predictor with three sub-predictors, dictionary predictor, user
244244 // history predictor, and extra predictor.
245245 PredictorInterface *dictionary_predictor =
246 new DictionaryPredictor(converter_.get(),
246 new DictionaryPredictor(*data_manager,
247 converter_.get(),
247248 immutable_converter_.get(),
248249 dictionary_.get(),
249250 suffix_dictionary_.get(),
+0
-190
src/prediction/codegen_util_for_zero_query.py less more
0 # -*- coding: utf-8 -*-
1 # Copyright 2010-2016, Google Inc.
2 # All rights reserved.
3 #
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
6 # met:
7 #
8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the
13 # distribution.
14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Generate header file for zero query suggestion.
31
32 Output format:
33 const char *kKey0 = "key0";
34 const ZeroQueryEntry kValues0[] = {
35 {zero_query_type00, "Cand00", emoji_type00, codepoint00},
36 {zero_query_type01, "Cand01", emoji_type01, codepoint01},
37 ..
38 };
39 const char *kKey1 = "key1";
40 const ZeroQueryEntry kValues1[] = {
41 {zero_query_type10, "Cand10", emoji_type10, codepoint10},
42 {zero_query_type11, "Cand11", emoji_type11, codepoint11},
43 ..
44 };
45 ..
46
47 const ZeroQueryList kData_data[] = {
48 {kKey0, kValues0, values0_size},
49 {kKey1, kValues1, values1_size},
50 ..
51 };
52 const size_t kData_size = data_size;
53
54
55 Here, (kKey0, kKey1, ...) is sorted so that we can use binary search.
56 """
57
58 __author__ = "toshiyuki"
59
60 import os
61 from build_tools import code_generator_util as cgu
62
63
64 _MOZC_DIR_FOR_INCLUDE_GUARD = (
65 'MOZC')
66
67 ZERO_QUERY_TYPE_NONE = 0
68 ZERO_QUERY_TYPE_NUMBER_SUFFIX = 1
69 ZERO_QUERY_TYPE_EMOTICON = 2
70 ZERO_QUERY_TYPE_EMOJI = 3
71
72 # bit fields
73 # These are standing for command::Request::EmojiCarrierType
74 EMOJI_TYPE_NONE = 0
75 EMOJI_TYPE_UNICODE = 1
76 EMOJI_TYPE_DOCOMO = 2
77 EMOJI_TYPE_SOFTBANK = 4
78 EMOJI_TYPE_KDDI = 8
79
80
class ZeroQueryEntry(object):
  """Plain record describing one zero query candidate.

  The constructor stores its arguments unchanged under attributes of the
  same name:
    entry_type: zero query type (one of the ZERO_QUERY_TYPE_* constants).
    value: candidate string.
    emoji_type: bit field built from the EMOJI_TYPE_* constants.
    emoji_android_pua: integer; the header generator formats it as hex.
  """

  def __init__(self, entry_type, value, emoji_type, emoji_android_pua):
    (self.entry_type,
     self.value,
     self.emoji_type,
     self.emoji_android_pua) = (
         entry_type, value, emoji_type, emoji_android_pua)
88
89
def ZeroQueryTypeToString(zero_query_type):
  """Maps a ZERO_QUERY_TYPE_* constant to its C++ enumerator name.

  Args:
    zero_query_type: one of the ZERO_QUERY_TYPE_* constants of this module.
  Returns:
    The matching 'ZERO_QUERY_*' string. For any other value the integer 0
    (not a string) is returned.
  """
  # Checked in this order; the first equal constant wins.
  cpp_names = (
      (ZERO_QUERY_TYPE_NONE, 'ZERO_QUERY_NONE'),
      (ZERO_QUERY_TYPE_NUMBER_SUFFIX, 'ZERO_QUERY_NUMBER_SUFFIX'),
      (ZERO_QUERY_TYPE_EMOTICON, 'ZERO_QUERY_EMOTICON'),
      (ZERO_QUERY_TYPE_EMOJI, 'ZERO_QUERY_EMOJI'),
  )
  for known_type, cpp_name in cpp_names:
    if zero_query_type == known_type:
      return cpp_name
  return 0
101
102
def EmojiTypeToString(emoji_type):
  """Builds the C++ expression for an emoji type bit field.

  Args:
    emoji_type: bitwise OR of the EMOJI_TYPE_* constants of this module.
  Returns:
    'EMOJI_NONE' when emoji_type equals EMOJI_TYPE_NONE. Otherwise the names
    of the set flags joined with ' | ', in the order UNICODE, DOCOMO,
    SOFTBANK, KDDI (an empty string if none of those bits is set).
  """
  if emoji_type == EMOJI_TYPE_NONE:
    return 'EMOJI_NONE'

  flag_names = (
      (EMOJI_TYPE_UNICODE, 'EMOJI_UNICODE'),
      (EMOJI_TYPE_DOCOMO, 'EMOJI_DOCOMO'),
      (EMOJI_TYPE_SOFTBANK, 'EMOJI_SOFTBANK'),
      (EMOJI_TYPE_KDDI, 'EMOJI_KDDI'),
  )
  return ' | '.join(
      [cpp_name for flag, cpp_name in flag_names if emoji_type & flag])
118
119
def GetIncludeGuardSymbol(file_name):
  """Returns include guard symbol for .h file.

  For example, returns 'SOME_EXAMPLE_H' for '/path/to/some_example.h'.

  The base name is upper-cased and every character that is not valid in a
  C preprocessor identifier (anything other than A-Z, 0-9 and '_') is
  replaced with '_'. This covers '.', and also characters such as '-' or
  spaces that would otherwise produce an unusable include guard.

  Args:
    file_name: a string indicating output file path.
  Returns:
    A string for include guard.
  """
  import re  # Local import: only this helper needs it.

  upper_base_name = os.path.basename(file_name).upper()
  return re.sub(r'[^0-9A-Z_]', '_', upper_base_name)
131
132
def WriteIncludeGuardHeader(output_file_name, output_stream):
  """Writes the opening '#ifndef' / '#define' include guard lines.

  Args:
    output_file_name: path of the header being generated; its base name
      determines the guard symbol.
    output_stream: object with a write() method that receives the two lines.
  """
  guard_parts = (
      _MOZC_DIR_FOR_INCLUDE_GUARD, GetIncludeGuardSymbol(output_file_name))
  for line_template in ('#ifndef %s_PREDICTION_%s_\n',
                        '#define %s_PREDICTION_%s_\n'):
    output_stream.write(line_template % guard_parts)
139
140
def WriteIncludeGuardFooter(output_file_name, output_stream):
  """Writes the closing '#endif' include guard line.

  Args:
    output_file_name: path of the header being generated; its base name
      determines the guard symbol named in the trailing comment.
    output_stream: object with a write() method that receives the line.
  """
  guard_symbol = GetIncludeGuardSymbol(output_file_name)
  closing_line = '#endif // %s_PREDICTION_%s_\n' % (
      _MOZC_DIR_FOR_INCLUDE_GUARD, guard_symbol)
  output_stream.write(closing_line)
145
146
def WriteHeaderFileForZeroQuery(
    zero_query_dict, output_file_name, var_name, output_stream):
  """Writes a C++ header that embeds the zero query table.

  The header contains, inside an anonymous namespace, one key string and one
  ZeroQueryEntry array per dictionary key (keys are emitted in sorted
  order), followed by a ZeroQueryList table '<var_name>_data' and its size
  '<var_name>_size'. Everything is wrapped in namespace mozc and an include
  guard.

  Args:
    zero_query_dict: mapping from key string to a sequence of entries; each
      entry provides entry_type, value, emoji_type and emoji_android_pua.
    output_file_name: path of the header; used for the include guard.
    var_name: prefix for all generated C++ identifiers.
    output_stream: object with a write() method that receives the text.
  """
  emit = output_stream.write

  WriteIncludeGuardHeader(output_file_name, output_stream)
  emit('#include "./prediction/zero_query_list.h"\n')
  emit('namespace mozc {\n')
  emit('namespace {\n')

  ordered_keys = sorted(zero_query_dict.keys())
  for index, key in enumerate(ordered_keys):
    # Blank line between consecutive key/value groups, not before the first.
    if index > 0:
      emit('\n')
    emit('const char *%s_key%d = %s; // "%s"\n' % (
        var_name, index, cgu.ToCppStringLiteral(key), key))
    emit('const ZeroQueryEntry %s_values%d[] = {\n' % (var_name, index))

    entry_lines = []
    for entry in zero_query_dict[key]:
      entry_lines.append(' {%s, %s, %s, 0x%x}, // "%s"' % (
          ZeroQueryTypeToString(entry.entry_type),
          cgu.ToCppStringLiteral(entry.value),
          EmojiTypeToString(entry.emoji_type),
          entry.emoji_android_pua,
          entry.value))
    emit('\n'.join(entry_lines) + '\n')
    emit('};\n')

  emit('} // namespace\n')

  # The table rows follow the same sorted key order as the arrays above.
  emit('const ZeroQueryList %s_data[] = {\n' % var_name)
  table_rows = []
  for index, key in enumerate(ordered_keys):
    table_rows.append(' {%s_key%d, %s_values%d, %d}' % (
        var_name, index, var_name, index, len(zero_query_dict[key])))
  emit(',\n'.join(table_rows) + '\n')
  emit('};\n')
  emit('const size_t %s_size = %d;' % (var_name, len(ordered_keys)) + '\n')

  emit('} // namespace mozc\n')
  WriteIncludeGuardFooter(output_file_name, output_stream)
5454 #include "dictionary/pos_matcher.h"
5555 #include "prediction/predictor_interface.h"
5656 #include "prediction/suggestion_filter.h"
57 #include "prediction/zero_query_data.h"
58 #include "prediction/zero_query_list.h"
59 #include "prediction/zero_query_number_data.h"
57 #include "prediction/zero_query_dict.h"
6058 #include "protocol/commands.pb.h"
6159 #include "protocol/config.pb.h"
6260 #include "request/conversion_request.h"
144142 FLAGS_enable_typing_correction;
145143 }
146144
147 struct ZeroQueryListCompare {
148 bool operator()(const ZeroQueryList &lhs, const ZeroQueryList &rhs) const {
149 return (strcmp(lhs.key, rhs.key) < 0);
150 }
151 };
152145 } // namespace
153146
154147 class DictionaryPredictor::PredictiveLookupCallback
279272 };
280273
281274 DictionaryPredictor::DictionaryPredictor(
275 const DataManagerInterface &data_manager,
282276 const ConverterInterface *converter,
283277 const ImmutableConverterInterface *immutable_converter,
284278 const DictionaryInterface *dictionary,
295289 segmenter_(segmenter),
296290 suggestion_filter_(suggestion_filter),
297291 counter_suffix_word_id_(pos_matcher->GetCounterSuffixWordId()),
298 predictor_name_("DictionaryPredictor") {}
292 predictor_name_("DictionaryPredictor") {
293 StringPiece zero_query_token_array_data;
294 StringPiece zero_query_string_array_data;
295 StringPiece zero_query_number_token_array_data;
296 StringPiece zero_query_number_string_array_data;
297 data_manager.GetZeroQueryData(&zero_query_token_array_data,
298 &zero_query_string_array_data,
299 &zero_query_number_token_array_data,
300 &zero_query_number_string_array_data);
301 zero_query_dict_.Init(zero_query_token_array_data,
302 zero_query_string_array_data);
303 zero_query_number_dict_.Init(zero_query_number_token_array_data,
304 zero_query_number_string_array_data);
305 }
299306
300307 DictionaryPredictor::~DictionaryPredictor() {}
301308
17311738
17321739 // static
17331740 bool DictionaryPredictor::GetZeroQueryCandidatesForKey(
1734 const ConversionRequest &request,
1735 const string &key, const ZeroQueryList *begin, const ZeroQueryList *end,
1736 vector<ZeroQueryResult> *results) {
1741 const ConversionRequest &request, const string &key,
1742 const ZeroQueryDict &dict, vector<ZeroQueryResult> *results) {
17371743 const int32 available_emoji_carrier =
17381744 request.request().available_emoji_carrier();
17391745
17401746 DCHECK(results);
17411747 results->clear();
1742 const ZeroQueryList key_item = {key.c_str(), NULL, 0};
1743 const ZeroQueryList *result_rule =
1744 std::lower_bound(begin, end, key_item, ZeroQueryListCompare());
1745 if (result_rule == end || key != result_rule->key) {
1748
1749 auto range = dict.equal_range(key);
1750 if (range.first == range.second) {
17461751 return false;
17471752 }
1748
1749 for (size_t i = 0; i < result_rule->entries_size; ++i) {
1750 const ZeroQueryEntry &entry = result_rule->entries[i];
1751 if (entry.type != ZERO_QUERY_EMOJI) {
1752 results->push_back(std::make_pair(entry.value, entry.type));
1753 for (; range.first != range.second; ++range.first) {
1754 const auto &entry = range.first;
1755 if (entry.type() != ZERO_QUERY_EMOJI) {
1756 results->push_back(std::make_pair(entry.value().as_string(),
1757 entry.type()));
17531758 continue;
17541759 }
17551760 if (available_emoji_carrier & Request::UNICODE_EMOJI &&
1756 entry.emoji_type & EMOJI_UNICODE) {
1757 results->push_back(std::make_pair(entry.value, entry.type));
1761 entry.emoji_type() & EMOJI_UNICODE) {
1762 results->push_back(std::make_pair(entry.value().as_string(),
1763 entry.type()));
17581764 continue;
17591765 }
17601766
17611767 if ((available_emoji_carrier & Request::DOCOMO_EMOJI &&
1762 entry.emoji_type & EMOJI_DOCOMO) ||
1768 entry.emoji_type() & EMOJI_DOCOMO) ||
17631769 (available_emoji_carrier & Request::SOFTBANK_EMOJI &&
1764 entry.emoji_type & EMOJI_SOFTBANK) ||
1770 entry.emoji_type() & EMOJI_SOFTBANK) ||
17651771 (available_emoji_carrier & Request::KDDI_EMOJI &&
1766 entry.emoji_type & EMOJI_KDDI)) {
1772 entry.emoji_type() & EMOJI_KDDI)) {
17671773 string android_pua;
1768 Util::UCS4ToUTF8(entry.emoji_android_pua, &android_pua);
1769 results->push_back(std::make_pair(android_pua, entry.type));
1774 Util::UCS4ToUTF8(entry.emoji_android_pua(), &android_pua);
1775 results->push_back(std::make_pair(android_pua, entry.type()));
17701776 }
17711777 }
17721778 return !results->empty();
18081814 vector<ZeroQueryResult> candidates_for_number_key;
18091815 GetZeroQueryCandidatesForKey(request,
18101816 number_key,
1811 kZeroQueryNum_data,
1812 kZeroQueryNum_data + kZeroQueryNum_size,
1817 zero_query_number_dict_,
18131818 &candidates_for_number_key);
18141819
18151820 vector<ZeroQueryResult> default_candidates_for_number;
18161821 GetZeroQueryCandidatesForKey(request,
18171822 "default",
1818 kZeroQueryNum_data,
1819 kZeroQueryNum_data + kZeroQueryNum_size,
1823 zero_query_number_dict_,
18201824 &default_candidates_for_number);
18211825 DCHECK(!default_candidates_for_number.empty());
18221826
18471851 vector<ZeroQueryResult> candidates;
18481852 if (!GetZeroQueryCandidatesForKey(request,
18491853 history_value,
1850 kZeroQueryData_data,
1851 kZeroQueryData_data + kZeroQueryData_size,
1854 zero_query_dict_,
18521855 &candidates)) {
18531856 return false;
18541857 }
3939 #include "converter/immutable_converter_interface.h"
4040 #include "converter/segmenter.h"
4141 #include "converter/segments.h"
42 #include "data_manager/data_manager_interface.h"
4243 #include "dictionary/dictionary_interface.h"
4344 #include "dictionary/dictionary_token.h"
4445 #include "dictionary/pos_matcher.h"
4546 #include "prediction/predictor_interface.h"
4647 #include "prediction/suggestion_filter.h"
47 #include "prediction/zero_query_list.h"
48 #include "prediction/zero_query_dict.h"
4849 #include "request/conversion_request.h"
4950 // for FRIEND_TEST()
5051 #include "testing/base/public/gunit_prod.h"
5657 public:
5758 // Initializes a predictor with given references to submodules. Note that
5859 // pointers are not owned by the class and to be deleted by the caller.
59 DictionaryPredictor(const ConverterInterface *converter,
60 DictionaryPredictor(const DataManagerInterface& data_manager,
61 const ConverterInterface *converter,
6062 const ImmutableConverterInterface *immutable_converter,
6163 const dictionary::DictionaryInterface *dictionary,
6264 const dictionary::DictionaryInterface *suffix_dictionary,
6466 const Segmenter *segmenter,
6567 const dictionary::POSMatcher *pos_matcher,
6668 const SuggestionFilter *suggestion_filter);
67 virtual ~DictionaryPredictor();
68
69 virtual bool PredictForRequest(const ConversionRequest &request,
70 Segments *segments) const;
71
72 virtual void Finish(const ConversionRequest &request, Segments *segments);
73
74 virtual const string &GetPredictorName() const { return predictor_name_; }
69 ~DictionaryPredictor() override;
70
71 bool PredictForRequest(const ConversionRequest &request,
72 Segments *segments) const override;
73
74 void Finish(const ConversionRequest &request, Segments *segments) override;
75
76 const string &GetPredictorName() const override { return predictor_name_; }
7577
7678 protected:
7779 // Protected members for unittesting
233235 static bool GetZeroQueryCandidatesForKey(
234236 const ConversionRequest &request,
235237 const string &key,
236 const ZeroQueryList *begin,
237 const ZeroQueryList *end,
238 const ZeroQueryDict &dict,
238239 vector<ZeroQueryResult> *results);
239240
240241 static void AppendZeroQueryToResults(
463464 const SuggestionFilter *suggestion_filter_;
464465 const uint16 counter_suffix_word_id_;
465466 const string predictor_name_;
467 ZeroQueryDict zero_query_dict_;
468 ZeroQueryDict zero_query_number_dict_;
466469
467470 DISALLOW_COPY_AND_ASSIGN(DictionaryPredictor);
468471 };
3838 #include "base/flags.h"
3939 #include "base/logging.h"
4040 #include "base/port.h"
41 #include "base/serialized_string_array.h"
4142 #include "base/singleton.h"
4243 #include "base/system_util.h"
4344 #include "base/util.h"
6364 #include "dictionary/suppression_dictionary.h"
6465 #include "dictionary/system/system_dictionary.h"
6566 #include "prediction/suggestion_filter.h"
66 #include "prediction/zero_query_list.h"
67 #include "prediction/zero_query_dict.h"
6768 #include "protocol/commands.pb.h"
6869 #include "protocol/config.pb.h"
6970 #include "request/conversion_request.h"
164165 // Test-only subclass: Just changing access levels
165166 public:
166167 TestableDictionaryPredictor(
168 const DataManagerInterface &data_manager,
167169 const ConverterInterface *converter,
168170 const ImmutableConverterInterface *immutable_converter,
169171 const DictionaryInterface *dictionary,
172174 const Segmenter *segmenter,
173175 const POSMatcher *pos_matcher,
174176 const SuggestionFilter *suggestion_filter)
175 : DictionaryPredictor(converter,
177 : DictionaryPredictor(data_manager,
178 converter,
176179 immutable_converter,
177180 dictionary,
178181 suffix_dictionary,
213216 // suffix dictionary is singleton.
214217 void Init(const DictionaryInterface *dictionary = NULL,
215218 const DictionaryInterface *suffix_dictionary = NULL) {
216 testing::MockDataManager data_manager;
217
218 pos_matcher_.Set(data_manager.GetPOSMatcherData());
219 pos_matcher_.Set(data_manager_.GetPOSMatcherData());
219220 suppression_dictionary_.reset(new SuppressionDictionary);
220221 if (!dictionary) {
221222 dictionary_mock_ = new DictionaryMock;
226227 }
227228 if (!suffix_dictionary) {
228229 suffix_dictionary_.reset(
229 CreateSuffixDictionaryFromDataManager(data_manager));
230 CreateSuffixDictionaryFromDataManager(data_manager_));
230231 } else {
231232 suffix_dictionary_.reset(suffix_dictionary);
232233 }
233234 CHECK(suffix_dictionary_.get());
234235
235 connector_.reset(Connector::CreateFromDataManager(data_manager));
236 connector_.reset(Connector::CreateFromDataManager(data_manager_));
236237 CHECK(connector_.get());
237238
238 segmenter_.reset(Segmenter::CreateFromDataManager(data_manager));
239 segmenter_.reset(Segmenter::CreateFromDataManager(data_manager_));
239240 CHECK(segmenter_.get());
240241
241 pos_group_.reset(new PosGroup(data_manager.GetPosGroupData()));
242 suggestion_filter_.reset(CreateSuggestionFilter(data_manager));
242 pos_group_.reset(new PosGroup(data_manager_.GetPosGroupData()));
243 suggestion_filter_.reset(CreateSuggestionFilter(data_manager_));
243244 immutable_converter_.reset(
244245 new ImmutableConverterImpl(dictionary_.get(),
245246 suffix_dictionary_.get(),
251252 suggestion_filter_.get()));
252253 converter_.reset(new ConverterMock());
253254 dictionary_predictor_.reset(
254 new TestableDictionaryPredictor(converter_.get(),
255 new TestableDictionaryPredictor(data_manager_,
256 converter_.get(),
255257 immutable_converter_.get(),
256258 dictionary_.get(),
257259 suffix_dictionary_.get(),
282284 }
283285
284286 private:
287 const testing::MockDataManager data_manager_;
285288 POSMatcher pos_matcher_;
286289 unique_ptr<SuppressionDictionary> suppression_dictionary_;
287290 unique_ptr<const Connector> connector_;
15721575 CreateSuggestionFilter(data_manager));
15731576 const dictionary::POSMatcher pos_matcher(data_manager.GetPOSMatcherData());
15741577 unique_ptr<TestableDictionaryPredictor> predictor(
1575 new TestableDictionaryPredictor(converter.get(),
1578 new TestableDictionaryPredictor(data_manager,
1579 converter.get(),
15761580 immutable_converter.get(),
15771581 dictionary.get(),
15781582 suffix_dictionary.get(),
31663170 CreateSuggestionFilter(data_manager));
31673171 const dictionary::POSMatcher pos_matcher(data_manager.GetPOSMatcherData());
31683172 unique_ptr<TestableDictionaryPredictor> predictor(
3169 new TestableDictionaryPredictor(converter.get(),
3173 new TestableDictionaryPredictor(data_manager,
3174 converter.get(),
31703175 immutable_converter.get(),
31713176 dictionary.get(),
31723177 suffix_dictionary.get(),
33583363 }
33593364
33603365 namespace {
3361 const char *kTestKey0 = "\xe3\x81\x82"; // "あ"
3362 const ZeroQueryEntry kTestValues0[] = {
3363 // emoji exclamation
3364 {ZERO_QUERY_EMOJI, "", EMOJI_DOCOMO | EMOJI_SOFTBANK, 0xfeb04},
3365 {ZERO_QUERY_EMOJI, "\xE2\x9D\x95", EMOJI_UNICODE, 0xfeb0b}, // "❕"
3366 {ZERO_QUERY_NONE, "\xE2\x9D\xA3", EMOJI_NONE, 0x0}, // "❣"
3366
// Hand-encoded token array used to build the test zero query dictionary.
// Judging from the per-record comments below, each record is 16 bytes in
// little-endian order:
//   4 bytes: index of the key in kTestStrings
//   4 bytes: index of the value in kTestStrings
//   2 bytes: zero query type
//   2 bytes: emoji type bit field
//   4 bytes: Android PUA code point of the emoji
// NOTE(review): layout inferred from this test data only; confirm against
// the ZeroQueryDict implementation.
3367 const char kTestTokenArray[] =
3368 // {"あ", "", ZERO_QUERY_EMOJI, EMOJI_DOCOMO | EMOJI_SOFTBANK, 0xfeb04}
3369 "\x04\x00\x00\x00"
3370 "\x00\x00\x00\x00"
3371 "\x03\x00"
3372 "\x06\x00"
3373 "\x04\xeb\x0f\x00"
3374 // {"あ", "❕", ZERO_QUERY_EMOJI, EMOJI_UNICODE, 0xfeb0b},
3375 "\x04\x00\x00\x00"
3376 "\x02\x00\x00\x00"
3377 "\x03\x00"
3378 "\x01\x00"
3379 "\x0b\xeb\x0f\x00"
3380 // {"あ", "❣", ZERO_QUERY_NONE, EMOJI_NONE, 0x00},
3381 "\x04\x00\x00\x00"
3382 "\x03\x00\x00\x00"
3383 "\x00\x00"
3384 "\x00\x00"
3385 "\x00\x00\x00\x00"
3386 // {"ああ", "( •̀ㅁ•́;)", ZERO_QUERY_EMOTICON, EMOJI_NONE, 0x00}
3387 "\x05\x00\x00\x00"
3388 "\x01\x00\x00\x00"
3389 "\x02\x00"
3390 "\x00\x00"
3391 "\x00\x00\x00\x00";
3392
// String table for the test zero query dictionary. The key and value indices
// in kTestTokenArray refer to positions in this array (0-based). The entries
// are listed in ascending byte order.
// NOTE(review): presumably the dictionary lookup relies on that ordering;
// confirm before reordering or inserting entries.
3393 const char *kTestStrings[] = {
3394 "",
3395 // "( •̀ㅁ•́;)"
3396 "\x28\x20\xE2\x80\xA2\xCC\x80\xE3\x85\x81\xE2\x80\xA2\xCC\x81\x3B\x29",
3397 "\xE2\x9D\x95", // "❕"
3398 "\xE2\x9D\xA3", // "❣"
3399 "\xE3\x81\x82", // "あ"
3400 "\xE3\x81\x82\xE3\x81\x82", // "ああ"
33673401 };
3368 const char *kTestKey1 = "\xe3\x81\x82\xe3\x81\x82"; // "ああ"
3369 const ZeroQueryEntry kTestValues1[] = {
3370 // "( •̀ㅁ•́;)"
3371 {
3372 ZERO_QUERY_EMOTICON,
3373 "\x28\x20\xE2\x80\xA2\xCC\x80\xE3\x85\x81\xE2\x80\xA2\xCC\x81\x3B\x29",
3374 EMOJI_NONE, 0x0
3375 },
3376 };
3377 const ZeroQueryList kTestData_data[] = {
3378 {kTestKey0, kTestValues0, 3},
3379 {kTestKey1, kTestValues1, 1},
3380 };
3381 const size_t kTestData_size = 2;
33823402
33833403 struct TestEntry {
33843404 int32 available_emoji_carrier;
34113431 types.c_str());
34123432 }
34133433 };
3434
34143435 } // namespace
34153436
34163437 TEST_F(DictionaryPredictorTest, GetZeroQueryCandidates) {
3438 // Create test zero query data.
3439 std::unique_ptr<uint32[]> string_data_buffer;
3440 ZeroQueryDict zero_query_dict;
3441 {
3442 // kTestTokenArray contains a trailing '\0', so create a StringPiece that
3443 // excludes it by subtracting 1.
3444 const StringPiece token_array_data(kTestTokenArray,
3445 arraysize(kTestTokenArray) - 1);
3446 vector<StringPiece> strs;
3447 for (const char *str : kTestStrings) {
3448 strs.push_back(str);
3449 }
3450 const StringPiece string_array_data =
3451 SerializedStringArray::SerializeToBuffer(strs, &string_data_buffer);
3452 zero_query_dict.Init(token_array_data, string_array_data);
3453 }
3454
34173455 vector<TestEntry> test_entries;
34183456 {
34193457 TestEntry entry;
35253563 vector<DictionaryPredictor::ZeroQueryResult> actual_candidates;
35263564 const bool actual_result =
35273565 DictionaryPredictor::GetZeroQueryCandidatesForKey(
3528 request, test_entry.key,
3529 kTestData_data, kTestData_data + kTestData_size,
3530 &actual_candidates);
3566 request, test_entry.key, zero_query_dict, &actual_candidates);
35313567 EXPECT_EQ(test_entry.expected_result, actual_result)
35323568 << test_entry.DebugString();
35333569 for (size_t j = 0; j < test_entry.expected_candidates.size(); ++j) {
+0
-164
src/prediction/gen_embedded_string_array_for_zero_query.py less more
0 # -*- coding: utf-8 -*-
1 # Copyright 2010-2016, Google Inc.
2 # All rights reserved.
3 #
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
6 # met:
7 #
8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the
13 # distribution.
14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Generate header file of a string array for zero query suggestion.
31
32 Usage:
33 gen_embedded_string_array_for_zero_query.py --input input.def \
34 --output /path/to/output/zero_query_hoge.h --var_name ZeroQueryHoge
35
36 Input format:
37 <key> <TAB> <candidate_1>,<candidate_2>,..,<candidate_n>
38 ...
39 For more details, please refer to definition files under mozc/data/zero_query/
40
41 Output format:
42 const char *Var0[] = {"Key0", "Cand00", "Cand01", .., 0};
43 const char *Var1[] = {"Key1", "Cand10", "Cand11", .., 0};
44
45 const char **Var[] = {Var0, Var1, .., VarN};
46
Here, the arrays are ordered by key (Key0, Key1, ...) so that we can use
binary search.
48 """
49
50 __author__ = "toshiyuki"
51
52 import optparse
53 import os
54
55
56 _MOZC_DIR_FOR_DEFINE_GUARD = 'MOZC'
57
58
def EscapeString(string):
  """Returns |string| escaped and wrapped in double quotes.

  The result is emitted as an element of a C string array by
  GetHeaderContents().

  NOTE(review): 'string_escape' looks like a Python 2-only codec -- confirm
  before running this script with Python 3.
  """
  return '"' + string.encode('string_escape') + '"'
62
63
def GetDefineGuardSymbol(file_name):
  """Builds the define guard symbol for a .h file.

  The directory part of the path is dropped, the file name is upper-cased and
  every '.' is turned into '_'. For example, '/path/to/some_example.h' yields
  'SOME_EXAMPLE_H'.

  Args:
    file_name: a string indicating output file path.
  Returns:
    A string for define guard.
  """
  upper_name = os.path.basename(file_name).upper()
  return '_'.join(upper_name.split('.'))
75
76
def GetDefineGuardHeaderLines(output_file_name):
  """Returns the '#ifndef' / '#define' lines that open the define guard."""
  guard = '%s_PREDICTION_%s_' % (_MOZC_DIR_FOR_DEFINE_GUARD,
                                 GetDefineGuardSymbol(output_file_name))
  return ['#ifndef ' + guard, '#define ' + guard]
87
88
def GetDefineGuardFooterLines(output_file_name):
  """Returns the '#endif' line that closes the define guard."""
  guard = '%s_PREDICTION_%s_' % (_MOZC_DIR_FOR_DEFINE_GUARD,
                                 GetDefineGuardSymbol(output_file_name))
  return ['#endif // ' + guard]
94
95
def GetZeroQueryRules(input_file_name):
  """Returns zero query triggering rules read from a definition file.

  Lines starting with '#' and empty lines are ignored. Every other line is
  expected to look like "<key> <TAB> <candidate_1>,<candidate_2>,..".

  Args:
    input_file_name: path to the definition file.
  Returns:
    A list of (key, [candidate, ...]) tuples sorted by key.
  """
  rules = []
  with open(input_file_name, 'r') as input_file:
    for line in input_file:
      if line.startswith('#'):
        continue
      line = line.rstrip('\r\n')
      if not line:
        continue

      tokens = line.split('\t')
      rules.append((tokens[0], tokens[1].split(',')))
  # Sorted by key for binary search. A key function gives the same (stable)
  # order as the former cmp-based sort and also works where cmp functions are
  # no longer accepted.
  rules.sort(key=lambda rule: rule[0])
  return rules
114
115
def GetHeaderContents(input_file_name, var_name, output_file_name):
  """Returns the lines of a header file that contains a string array.

  One 'const char *<var_name><i>[]' array is emitted per rule (key first,
  then its candidates, terminated by 0), followed by '<var_name>_data' that
  lists those arrays and '<var_name>_size' that holds their count.
  """
  rules = GetZeroQueryRules(input_file_name)

  lines = list(GetDefineGuardHeaderLines(output_file_name))
  lines += ['namespace mozc {', 'namespace {']

  for index, (key, candidates) in enumerate(rules):
    literals = [EscapeString(s) for s in [key] + candidates]
    lines.append('const char *%s%d[] = {' % (var_name, index))
    lines.append(' ' + ', '.join(literals + ['0']))
    lines.append('};')

  lines.append('} // namespace')

  array_names = ['%s%d' % (var_name, index) for index in range(len(rules))]
  lines.append('const char **%s_data[] = {' % var_name)
  lines.append(' ' + ', '.join(array_names))
  lines.append('};')
  lines.append('const size_t %s_size = %d;' % (var_name, len(rules)))

  lines.append('} // namespace mozc')
  lines += GetDefineGuardFooterLines(output_file_name)
  return lines
143
144
def ParseOption():
  """Parses command line options and returns the parsed option values."""
  parser = optparse.OptionParser()
  for flag, dest, help_text in (
      ('--input', 'input', 'Input file path'),
      ('--output', 'output', 'Output file path'),
      ('--var_name', 'var_name', 'Var name for the array')):
    parser.add_option(flag, dest=dest, help=help_text)
  options, _ = parser.parse_args()
  return options
153
154
def main():
  """Generates the header described by the command line and writes it out."""
  options = ParseOption()
  contents = '\n'.join(
      GetHeaderContents(options.input, options.var_name, options.output))
  with open(options.output, 'w') as out_file:
    out_file.write(contents)


if __name__ == '__main__':
  main()
3636 import sys
3737 import unicodedata
3838 from build_tools import code_generator_util
39 from prediction import codegen_util_for_zero_query as util
40
41
42 _VAR_NAME_FOR_HEADER = 'kZeroQueryData'
39 from prediction import gen_zero_query_util as util
4340
4441
4542 def ParseCodePoint(s):
294291 parser.add_option('--input_emoji', dest='input_emoji', help='emoji data file')
295292 parser.add_option(
296293 '--input_emoticon', dest='input_emoticon', help='emoticon data file')
297 parser.add_option('--output', dest='output', help='output header file')
294 parser.add_option('--output_token_array', dest='output_token_array',
295 help='output token array file')
296 parser.add_option('--output_string_array', dest='output_string_array',
297 help='output string array file')
298298 return parser.parse_args()[0]
299299
300300
313313 zero_query_rule_dict, zero_query_symbol_dict,
314314 zero_query_emoji_dict, zero_query_emoticon_dict)
315315
316 with open(options.output, 'w') as output_stream:
317 util.WriteHeaderFileForZeroQuery(
318 merged_zero_query_dict, options.output,
319 _VAR_NAME_FOR_HEADER, output_stream)
316 util.WriteZeroQueryData(merged_zero_query_dict,
317 options.output_token_array,
318 options.output_string_array)
320319
321320
322321 if __name__ == '__main__':
3131
3232 from collections import defaultdict
3333 import optparse
34 from prediction import codegen_util_for_zero_query as util
3534
36
37 _VAR_NAME_FOR_HEADER = 'kZeroQueryNum'
35 from prediction import gen_zero_query_util as util
3836
3937
4038 def ReadZeroQueryNumberData(input_stream):
6361 """Parses command line options."""
6462 parser = optparse.OptionParser()
6563 parser.add_option('--input', dest='input', help='Input file path')
66 parser.add_option('--output', dest='output', help='Output file path')
64 parser.add_option('--output_token_array', dest='output_token_array',
65 help='Output token array file path')
66 parser.add_option('--output_string_array', dest='output_string_array',
67 help='Output string array file path')
6768 return parser.parse_args()[0]
6869
6970
7172 options = ParseOption()
7273 with open(options.input, 'r') as input_stream:
7374 zero_query_dict = ReadZeroQueryNumberData(input_stream)
74
75 with open(options.output, 'w') as output_stream:
76 util.WriteHeaderFileForZeroQuery(
77 zero_query_dict, options.output, _VAR_NAME_FOR_HEADER, output_stream)
75 util.WriteZeroQueryData(zero_query_dict,
76 options.output_token_array,
77 options.output_string_array)
7878
7979
8080 if __name__ == '__main__':
0 # -*- coding: utf-8 -*-
1 # Copyright 2010-2016, Google Inc.
2 # All rights reserved.
3 #
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
6 # met:
7 #
8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the
13 # distribution.
14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Generate binary data for zero query suggestion.
31
32 For output format, see zero_query_dict.h.
33 """
34
35 __author__ = "toshiyuki"
36
37 import os
38 import struct
39
40 from build_tools import code_generator_util as cgu
41 from build_tools import serialized_string_array_builder
42
43
# Zero query entry types. The values match the first four enumerators of
# ZeroQueryType in zero_query_dict.h.
ZERO_QUERY_TYPE_NONE = 0
ZERO_QUERY_TYPE_NUMBER_SUFFIX = 1
ZERO_QUERY_TYPE_EMOTICON = 2
ZERO_QUERY_TYPE_EMOJI = 3

# Emoji types, used as bit fields.
# These stand for command::Request::EmojiCarrierType.
EMOJI_TYPE_NONE = 0
EMOJI_TYPE_UNICODE = 1
EMOJI_TYPE_DOCOMO = 2
EMOJI_TYPE_SOFTBANK = 4
EMOJI_TYPE_KDDI = 8
56
57
class ZeroQueryEntry(object):
  """One zero query candidate: its value plus type and emoji attributes.

  WriteZeroQueryData() serializes entry_type and emoji_type as 16-bit
  unsigned integers and emoji_android_pua as a 32-bit unsigned integer.
  """

  def __init__(self, entry_type, value, emoji_type, emoji_android_pua):
    self.entry_type, self.value = entry_type, value
    self.emoji_type, self.emoji_android_pua = emoji_type, emoji_android_pua
65
66
def WriteZeroQueryData(zero_query_dict, output_token_array,
                       output_string_array):
  """Serializes zero query data into a token array and a string array.

  Every key and every entry value is stored once in the string array, sorted
  in ascending order. The token array holds one 16-byte little-endian record
  per entry:
    uint32 key_index, uint32 value_index, uint16 entry_type,
    uint16 emoji_type, uint32 emoji_android_pua
  Records are written in ascending order of key, i.e. of key_index.

  Args:
    zero_query_dict: a dict from key string to a list of ZeroQueryEntry.
    output_token_array: path of the token array file to write.
    output_string_array: path of the string array file to write.
  """
  # Collect all the strings and assign indices in ascending order.
  unique_strings = set()
  # items() works on both Python 2 and 3, unlike iteritems().
  for key, entry_list in zero_query_dict.items():
    unique_strings.add(key)
    for entry in entry_list:
      unique_strings.add(entry.value)
  sorted_strings = sorted(unique_strings)
  string_index = dict((s, i) for i, s in enumerate(sorted_strings))

  with open(output_token_array, 'wb') as f:
    for key in sorted(zero_query_dict):
      key_index = string_index[key]
      for entry in zero_query_dict[key]:
        # Pack the whole record at once so that a field that is out of range
        # cannot leave a partially written token behind. '<' disables padding,
        # so the record is exactly 4 + 4 + 2 + 2 + 4 = 16 bytes.
        f.write(struct.pack('<IIHHI',
                            key_index,
                            string_index[entry.value],
                            entry.entry_type,
                            entry.emoji_type,
                            entry.emoji_android_pua))

  serialized_string_array_builder.SerializeToFile(sorted_strings,
                                                  output_string_array)
5858 '../session/session_base.gyp:request_test_util',
5959 '../storage/storage.gyp:storage',
6060 '../usage_stats/usage_stats_base.gyp:usage_stats',
61 'gen_zero_query_data#host',
62 'gen_zero_query_number_data#host',
6361 'prediction_base.gyp:suggestion_filter',
6462 'prediction_protocol',
65 ],
66 },
67 {
68 'target_name': 'gen_zero_query_number_data',
69 'type': 'none',
70 'toolsets': ['host'],
71 'actions': [
72 {
73 'action_name': 'gen_zero_query_number_data',
74 'variables': {
75 'input_files': [
76 '../data/zero_query/zero_query_number.def',
77 ],
78 },
79 'inputs': [
80 'gen_zero_query_number_data.py',
81 'codegen_util_for_zero_query.py',
82 '<@(input_files)',
83 ],
84 'outputs': [
85 '<(gen_out_dir)/zero_query_number_data.h',
86 ],
87 'action': [
88 'python', 'gen_zero_query_number_data.py',
89 '--input=<@(input_files)',
90 '--output=<(gen_out_dir)/zero_query_number_data.h',
91 ],
92 'message': 'Generating <(gen_out_dir)/zero_query_number_data.h',
93 },
94 ],
95 },
96 {
97 'target_name': 'gen_zero_query_data',
98 'type': 'none',
99 'toolsets': ['host'],
100 'actions': [
101 {
102 'action_name': 'gen_zero_query_data',
103 'variables': {
104 'input_rule': '../data/zero_query/zero_query.def',
105 'input_symbol': '../data/symbol/symbol.tsv',
106 'input_emoji': '../data/emoji/emoji_data.tsv',
107 'input_emoticon': '../data/emoticon/categorized.tsv',
108 },
109 'inputs': [
110 'gen_zero_query_data.py',
111 'codegen_util_for_zero_query.py',
112 '<(input_rule)',
113 '<(input_symbol)',
114 '<(input_emoji)',
115 '<(input_emoticon)',
116 ],
117 'outputs': [
118 '<(gen_out_dir)/zero_query_data.h',
119 ],
120 'action': [
121 'python', 'gen_zero_query_data.py',
122 '--input_rule=<(input_rule)',
123 '--input_symbol=<(input_symbol)',
124 '--input_emoji=<(input_emoji)',
125 '--input_emoticon=<(input_emoticon)',
126 '--output=<(gen_out_dir)/zero_query_data.h',
127 ],
128 'message': 'Generating <(gen_out_dir)/zero_query_data.h',
129 },
13063 ],
13164 },
13265 {
0 // Copyright 2010-2016, Google Inc.
1 // All rights reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 #ifndef MOZC_PREDICTION_ZERO_QUERY_DICT_H_
30 #define MOZC_PREDICTION_ZERO_QUERY_DICT_H_
31
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iterator>
#include <utility>

#include "base/port.h"
#include "base/serialized_string_array.h"
38
39 namespace mozc {
40
// Type of a zero query candidate.
enum ZeroQueryType {
  ZERO_QUERY_NONE = 0,  // "☁" (symbol, non-unicode 6.0 emoji), and rule based.
  ZERO_QUERY_NUMBER_SUFFIX,  // "階" from "2"
  ZERO_QUERY_EMOTICON,  // "(>ω<)" from "うれしい"
  ZERO_QUERY_EMOJI,  // <umbrella emoji> from "かさ"
  // Following types are defined for usage stats.
  // The candidates of these types will not be stored in the serialized zero
  // query data (see ZeroQueryDict).
  // - "ヒルズ" from "六本木"
  // These candidates will be generated from dictionary entries
  // such as "六本木ヒルズ".
  ZERO_QUERY_BIGRAM,
  // - "に" from "六本木".
  // These candidates will be generated from suffix dictionary.
  ZERO_QUERY_SUFFIX,
};
56
// Emoji types. The values are bit fields: each non-zero enumerator occupies
// its own bit.
enum ZeroQueryEmojiType {
  EMOJI_NONE = 0,
  EMOJI_UNICODE = 1,
  EMOJI_DOCOMO = 2,
  EMOJI_SOFTBANK = 4,
  EMOJI_KDDI = 8,
};
65
66 // Zero query dictionary is a multimap from string to a list of zero query
67 // entries, where each entry can be looked up by equal_range() method. The data
68 // is serialized to two binary data: token array and string array. Token array
69 // encodes an array of zero query entries, where each entry is encoded in 16
70 // bytes as follows:
71 //
72 // ZeroQueryEntry {
73 // uint32 key_index: 4 bytes
74 // uint32 value_index: 4 bytes
75 // ZeroQueryType type: 2 bytes
76 // uint16 emoji_type: 2 bytes
77 // uint32 emoji_android_pua: 4 bytes
78 // }
79 //
80 // The token array is sorted in ascending order of key_index for binary search.
81 // String values of key and value are encoded separately in the string array,
82 // which can be extracted by using |key_index| and |value_index|. The string
83 // array is also sorted in ascending order of strings. For the serialization
84 // format of string array, see base/serialized_string_array.h".
85 class ZeroQueryDict {
86 public:
87 static const size_t kTokenByteSize = 16;
88
89 class iterator : public std::iterator<std::random_access_iterator_tag,
90 uint32> {
91 public:
92 iterator(const char *ptr, const SerializedStringArray *array)
93 : ptr_(ptr), string_array_(array) {}
94 iterator(const iterator& x) = default;
95 iterator& operator=(const iterator& x) = default;
96
97 uint32 operator*() const { return key_index(); }
98
99 uint32 key_index() const {
100 return *reinterpret_cast<const uint32 *>(ptr_);
101 }
102
103 uint32 value_index() const {
104 return *reinterpret_cast<const uint32 *>(ptr_ + 4);
105 }
106
107 ZeroQueryType type() const {
108 const uint16 val = *reinterpret_cast<const uint16 *>(ptr_ + 8);
109 return static_cast<ZeroQueryType>(val);
110 }
111
112 uint16 emoji_type() const {
113 return *reinterpret_cast<const uint16 *>(ptr_ + 10);
114 }
115
116 uint32 emoji_android_pua() const {
117 return *reinterpret_cast<const uint32 *>(ptr_ + 12);
118 }
119
120 StringPiece key() const { return (*string_array_)[key_index()]; }
121 StringPiece value() const { return (*string_array_)[value_index()]; }
122
123 iterator &operator++() {
124 ptr_ += kTokenByteSize;
125 return *this;
126 }
127
128 iterator operator++(int) {
129 const iterator tmp(ptr_, string_array_);
130 ptr_ += kTokenByteSize;
131 return tmp;
132 }
133
134 iterator &operator+=(ptrdiff_t n) {
135 ptr_ += n * kTokenByteSize;
136 return *this;
137 }
138
139 friend iterator operator+(iterator iter, ptrdiff_t n) {
140 iter += n;
141 return iter;
142 }
143
144 friend iterator operator+(ptrdiff_t n, iterator iter) {
145 iter += n;
146 return iter;
147 }
148
149 iterator &operator-=(ptrdiff_t n) {
150 ptr_ -= n * kTokenByteSize;
151 return *this;
152 }
153
154 friend iterator operator-(iterator iter, ptrdiff_t n) {
155 iter -= n;
156 return iter;
157 }
158
159 friend ptrdiff_t operator-(iterator x, iterator y) {
160 return (x.ptr_ - y.ptr_) / kTokenByteSize;
161 }
162
163 friend bool operator==(iterator x, iterator y) {
164 return x.ptr_ == y.ptr_;
165 }
166
167 friend bool operator!=(iterator x, iterator y) {
168 return x.ptr_ != y.ptr_;
169 }
170
171 friend bool operator<(iterator x, iterator y) {
172 return x.ptr_ < y.ptr_;
173 }
174
175 friend bool operator<=(iterator x, iterator y) {
176 return x.ptr_ <= y.ptr_;
177 }
178
179 friend bool operator>(iterator x, iterator y) {
180 return x.ptr_ > y.ptr_;
181 }
182
183 friend bool operator>=(iterator x, iterator y) {
184 return x.ptr_ >= y.ptr_;
185 }
186
187 private:
188 const char *ptr_;
189 const SerializedStringArray * string_array_;
190 };
191
192 void Init(StringPiece token_array_data, StringPiece string_array_data) {
193 token_array_ = token_array_data;
194 string_array_.Set(string_array_data);
195 }
196
197 iterator begin() const {
198 return iterator(token_array_.data(), &string_array_);
199 }
200
201 iterator end() const {
202 return iterator(token_array_.data() + token_array_.size(),
203 &string_array_);
204 }
205
206 std::pair<iterator, iterator> equal_range(StringPiece key) const {
207 const auto iter = std::lower_bound(string_array_.begin(),
208 string_array_.end(), key);
209 if (iter == string_array_.end() || *iter != key) {
210 return std::pair<iterator, iterator>(end(), end());
211 }
212 return std::equal_range(begin(), end(), iter.index());
213 }
214
215 private:
216 StringPiece token_array_;
217 SerializedStringArray string_array_;
218 };
219
220 } // namespace mozc
221
222 #endif // MOZC_PREDICTION_ZERO_QUERY_DICT_H_
+0
-76
src/prediction/zero_query_list.h less more
0 // Copyright 2010-2016, Google Inc.
1 // All rights reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 #ifndef MOZC_PREDICTION_ZERO_QUERY_LIST_H_
30 #define MOZC_PREDICTION_ZERO_QUERY_LIST_H_
31
32 #include "base/port.h"
33
34 namespace mozc {
// Type of a zero query candidate.
enum ZeroQueryType {
  ZERO_QUERY_NONE = 0,  // "☁" (symbol, non-unicode 6.0 emoji), and rule based.
  ZERO_QUERY_NUMBER_SUFFIX,  // "階" from "2"
  ZERO_QUERY_EMOTICON,  // "(>ω<)" from "うれしい"
  ZERO_QUERY_EMOJI,  // <umbrella emoji> from "かさ"
  // Following types are defined for usage stats.
  // The candidates of these types will not be stored at |ZeroQueryList|.
  // - "ヒルズ" from "六本木"
  // These candidates will be generated from dictionary entries
  // such as "六本木ヒルズ".
  ZERO_QUERY_BIGRAM,
  // - "に" from "六本木".
  // These candidates will be generated from suffix dictionary.
  ZERO_QUERY_SUFFIX,
};
50
// Emoji types. The values are bit fields: each non-zero enumerator occupies
// its own bit.
enum ZeroQueryEmojiType {
  EMOJI_NONE = 0,
  EMOJI_UNICODE = 1,
  EMOJI_DOCOMO = 2,
  EMOJI_SOFTBANK = 4,
  EMOJI_KDDI = 8,
};
59
// One zero query candidate.
struct ZeroQueryEntry {
  ZeroQueryType type;
  // Candidate string.
  const char *value;
  uint8 emoji_type;  // ZeroQueryEmojiType
  // The carrier dependent emoji code point on Android.
  uint32 emoji_android_pua;
};
67
// A key together with the array of zero query entries registered for it.
struct ZeroQueryList {
  const char *key;
  // Points at the first of |entries_size| entries.
  const ZeroQueryEntry *entries;
  const size_t entries_size;
};
73 } // namespace mozc
74
75 #endif // MOZC_PREDICTION_ZERO_QUERY_LIST_H_