Commit 8f7ff0ee4bde111724b9cffc824fac2a06b92aec - mozc

+19

-0

src/data_manager/data_manager.cc less more

208	208	LOG(ERROR) << "Symbol dictionary data is broken";
209	209	return false;
210	210	}
	211	if (!reader.Get("emoticon_token", &emoticon_token_array_data_)) {
	212	LOG(ERROR) << "Cannot find an emoticon token array";
	213	return false;
	214	}
	215	if (!reader.Get("emoticon_string", &emoticon_string_array_data_)) {
	216	LOG(ERROR) << "Cannot find an emoticon string array or data is broken";
	217	return false;
	218	}
	219	if (!SerializedDictionary::VerifyData(emoticon_token_array_data_,
	220	emoticon_string_array_data_)) {
	221	LOG(ERROR) << "Emoticon dictionary data is broken";
	222	return false;
	223	}
211	224
212	225	if (!reader.Get("usage_item_array", &usage_items_data_)) {
213	226	VLOG(2) << "Usage dictionary is not provided";

336	349	*string_array_data = symbol_string_array_data_;
337	350	}
338	351
	352	void DataManager::GetEmoticonRewriterData(
	353	StringPiece token_array_data, StringPiece string_array_data) const {
	354	*token_array_data = emoticon_token_array_data_;
	355	*string_array_data = emoticon_string_array_data_;
	356	}
	357
339	358	void DataManager::GetCounterSuffixSortedArray(const char **array,
340	359	size_t *size) const {
341	360	*array = counter_suffix_data_.data();

+41

-0

src/data_manager/data_manager.gypi less more

116	116	'gen_separate_suffix_data_for_<(dataset_tag)#host',
117	117	'gen_separate_reading_correction_data_for_<(dataset_tag)#host',
118	118	'gen_separate_symbol_rewriter_data_for_<(dataset_tag)#host',
	119	'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)#host',
119	120	],
120	121	'actions': [
121	122	{

145	146	'reading_correction_correction': '<(gen_out_dir)/reading_correction_correction.data',
146	147	'symbol_token': '<(gen_out_dir)/symbol_token.data',
147	148	'symbol_string': '<(gen_out_dir)/symbol_string.data',
	149	'emoticon_token': '<(gen_out_dir)/emoticon_token.data',
	150	'emoticon_string': '<(gen_out_dir)/emoticon_string.data',
148	151	},
149	152	'inputs': [
150	153	'<(pos_matcher)',

170	173	'<(reading_correction_correction)',
171	174	'<(symbol_token)',
172	175	'<(symbol_string)',
	176	'<(emoticon_token)',
	177	'<(emoticon_string)',
173	178	],
174	179	'outputs': [
175	180	'<(gen_out_dir)/<(out_mozc_data)',

201	206	'reading_correction_correction:32:<(gen_out_dir)/reading_correction_correction.data',
202	207	'symbol_token:32:<(gen_out_dir)/symbol_token.data',
203	208	'symbol_string:32:<(gen_out_dir)/symbol_string.data',
	209	'emoticon_token:32:<(gen_out_dir)/emoticon_token.data',
	210	'emoticon_string:32:<(gen_out_dir)/emoticon_string.data',
204	211	],
205	212	'conditions': [
206	213	['target_platform!="Android"', {

697	704	],
698	705	},
699	706	{
	707	'target_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)',
	708	'type': 'none',
	709	'toolsets': ['host'],
	710	'dependencies': [
	711	'../../rewriter/rewriter_base.gyp:gen_emoticon_rewriter_data_main',
	712	],
	713	'actions': [
	714	{
	715	'action_name': 'gen_separate_emoticon_rewriter_data_for_<(dataset_tag)',
	716	'variables': {
	717	'generator': '<(PRODUCT_DIR)/gen_emoticon_rewriter_data_main<(EXECUTABLE_SUFFIX)',
	718	'input_files': [
	719	'<(mozc_dir)/data/emoticon/emoticon.tsv',
	720	],
	721	},
	722	'inputs': [
	723	'<(generator)',
	724	'<@(input_files)',
	725	],
	726	'outputs': [
	727	'<(gen_out_dir)/emoticon_token.data',
	728	'<(gen_out_dir)/emoticon_string.data',
	729	],
	730	'action': [
	731	'<(generator)',
	732	'--input=<(mozc_dir)/data/emoticon/emoticon.tsv',
	733	'--output_token_array=<(gen_out_dir)/emoticon_token.data',
	734	'--output_string_array=<(gen_out_dir)/emoticon_string.data',
	735	],
	736	'message': '[<(dataset_tag)] Generating emoticon data',
	737	},
	738	],
	739	},
	740	{
700	741	'target_name': 'gen_separate_counter_suffix_data_for_<(dataset_tag)',
701	742	'type': 'none',
702	743	'toolsets': ['host'],

+4

-0

src/data_manager/data_manager.h less more

83	83	StringPiece *correction_array_data) const override;
84	84	void GetSymbolRewriterData(StringPiece *token_array_data,
85	85	StringPiece *string_array_data) const override;
	86	void GetEmoticonRewriterData(StringPiece *token_array_data,
	87	StringPiece *string_array_data) const override;
86	88
87	89	#ifndef NO_USAGE_REWRITER
88	90	void GetUsageRewriterData(

119	121	StringPiece reading_correction_correction_array_data_;
120	122	StringPiece symbol_token_array_data_;
121	123	StringPiece symbol_string_array_data_;
	124	StringPiece emoticon_token_array_data_;
	125	StringPiece emoticon_string_array_data_;
122	126	StringPiece usage_base_conjugation_suffix_data_;
123	127	StringPiece usage_conjugation_suffix_data_;
124	128	StringPiece usage_conjugation_index_data_;

+4

-0

src/data_manager/data_manager_interface.h less more

89	89	virtual void GetSymbolRewriterData(StringPiece *token_array_data,
90	90	StringPiece *string_array_data) const = 0;
91	91
	92	// Gets the token array and the string array of the emoticon rewriter data.
	93	virtual void GetEmoticonRewriterData(
	94	StringPiece *token_array_data, StringPiece *string_array_data) const = 0;
	95
92	96	#ifndef NO_USAGE_REWRITER
93	97	// Gets the usage rewriter data.
94	98	virtual void GetUsageRewriterData(

+12

-0

src/data_manager/packed/packed_data_manager.cc less more

88	88	void GetSuggestionFilterData(const char *data, size_t size) const;
89	89	void GetSymbolRewriterData(StringPiece *token_array_data,
90	90	StringPiece *string_array_data) const;
	91	void GetEmoticonRewriterData(StringPiece *token_array_data,
	92	StringPiece *string_array_data) const;
91	93	#ifndef NO_USAGE_REWRITER
92	94	void GetUsageRewriterData(StringPiece *base_conjugation_suffix_data,
93	95	StringPiece *conjugation_suffix_data,

236	238	void PackedDataManager::Impl::GetSymbolRewriterData(
237	239	StringPiece token_array_data, StringPiece string_array_data) const {
238	240	manager_.GetSymbolRewriterData(token_array_data, string_array_data);
	241	}
	242
	243	void PackedDataManager::Impl::GetEmoticonRewriterData(
	244	StringPiece token_array_data, StringPiece string_array_data) const {
	245	manager_.GetEmoticonRewriterData(token_array_data, string_array_data);
239	246	}
240	247
241	248	#ifndef NO_USAGE_REWRITER

391	398	manager_impl_->GetSymbolRewriterData(token_array_data, string_array_data);
392	399	}
393	400
	401	void PackedDataManager::GetEmoticonRewriterData(
	402	StringPiece token_array_data, StringPiece string_array_data) const {
	403	manager_impl_->GetEmoticonRewriterData(token_array_data, string_array_data);
	404	}
	405
394	406	#ifndef NO_USAGE_REWRITER
395	407	void PackedDataManager::GetUsageRewriterData(
396	408	StringPiece *base_conjugation_suffix_data,

+2

-0

src/data_manager/packed/packed_data_manager.h less more

72	72	void GetSuggestionFilterData(const char *data, size_t size) const override;
73	73	void GetSymbolRewriterData(StringPiece *token_array_data,
74	74	StringPiece *string_array_data) const override;
	75	void GetEmoticonRewriterData(StringPiece *token_array_data,
	76	StringPiece *string_array_data) const override;
75	77	#ifndef NO_USAGE_REWRITER
76	78	void GetUsageRewriterData(
77	79	StringPiece *base_conjugation_suffix_data,

+1

-1

src/mozc_version_template.txt less more

0	0	MAJOR=2
1	1	MINOR=17
2		BUILD=2532
	2	BUILD=2533
3	3	REVISION=102
4	4	# NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
5	5	# downloaded by NaCl Mozc.

+56

-71

src/rewriter/emoticon_rewriter.cc less more

42	42	#include "protocol/commands.pb.h"
43	43	#include "protocol/config.pb.h"
44	44	#include "request/conversion_request.h"
45		#include "rewriter/embedded_dictionary.h"
46	45	#include "rewriter/rewriter_interface.h"
	46	#include "rewriter/serialized_dictionary.h"
47	47
48	48	namespace mozc {
49	49	namespace {
50	50
51		#include "rewriter/emoticon_rewriter_data.h"
52
53		class EmoticonDictionary {
54		public:
55		EmoticonDictionary()
56		: dic_(new EmbeddedDictionary(kEmoticonData_token_data,
57		kEmoticonData_token_size)) {}
58
59		~EmoticonDictionary() {}
60
61		EmbeddedDictionary *GetDictionary() const {
62		return dic_.get();
63		}
64
65		private:
66		std::unique_ptr<EmbeddedDictionary> dic_;
67		};
68
69	51	class ValueCostCompare {
70	52	public:
71		bool operator() (const EmbeddedDictionary::Value *a,
72		const EmbeddedDictionary::Value *b) const {
73		return a->cost < b->cost;
	53	bool operator() (SerializedDictionary::const_iterator a,
	54	SerializedDictionary::const_iterator b) const {
	55	return a.cost() < b.cost();
74	56	}
75	57	};
76	58
77	59	class IsEqualValue {
78	60	public:
79		bool operator() (const EmbeddedDictionary::Value *a,
80		const EmbeddedDictionary::Value *b) const {
81		return strcmp(a->value, b->value) == 0;
	61	bool operator() (const SerializedDictionary::const_iterator a,
	62	const SerializedDictionary::const_iterator b) const {
	63	return a.value() == b.value();
82	64	}
83	65	};
84	66
85	67	// Inserts emoticons into |segment|.
86	68	// The top |initial_insert_size| candidates are inserted at |initial_insert_pos|.
87	69	// The remaining candidates are appended to the bottom.
88		void InsertCandidates(const EmbeddedDictionary::Value *value,
89		size_t value_size,
	70	void InsertCandidates(SerializedDictionary::const_iterator begin,
	71	SerializedDictionary::const_iterator end,
90	72	size_t initial_insert_pos,
91	73	size_t initial_insert_size,
92	74	bool is_no_learning,

100	82	size_t offset = min(initial_insert_pos, segment->candidates_size());
101	83
102	84	// Sort values by cost just in case
103		vector<const EmbeddedDictionary::Value *> sorted_value;
104		for (size_t i = 0; i < value_size; ++i) {
105		sorted_value.push_back(&value[i]);
	85	vector<SerializedDictionary::const_iterator> sorted_value;
	86	for (auto iter = begin; iter != end; ++iter) {
	87	sorted_value.push_back(iter);
106	88	}
107	89
108	90	std::sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare());

115	97	sorted_value.end());
116	98
117	99	for (size_t i = 0; i < sorted_value.size(); ++i) {
118		Segment::Candidate *c = NULL;
	100	Segment::Candidate *c = nullptr;
119	101
120	102	if (i < initial_insert_size) {
121	103	c = segment->insert_candidate(offset);

124	106	c = segment->push_back_candidate();
125	107	}
126	108
127		if (c == NULL) {
	109	if (c == nullptr) {
128	110	LOG(ERROR) << "cannot insert candidate at " << offset;
129	111	continue;
130	112	}
131	113
132	114	c->Init();
133	115	// TODO(taku): set an appropriate POS here.
134		c->lid = sorted_value[i]->lid;
135		c->rid = sorted_value[i]->rid;
	116	c->lid = sorted_value[i].lid();
	117	c->rid = sorted_value[i].rid();
136	118	c->cost = base_candidate.cost;
137		c->value = sorted_value[i]->value;
138		c->content_value = sorted_value[i]->value;
	119	sorted_value[i].value().CopyToString(&c->value);
	120	c->content_value = c->value;
139	121	c->key = base_candidate.key;
140	122	c->content_key = base_candidate.content_key;
141	123	// no full/half width normalizations

150	132	const char kBaseEmoticonDescription[]
151	133	= "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97";
152	134
153		if (sorted_value[i]->description == NULL) {
	135	if (sorted_value[i].description().empty()) {
154	136	c->description = kBaseEmoticonDescription;
155	137	} else {
156	138	string description = kBaseEmoticonDescription;
157	139	description.append(" ");
158		description.append(sorted_value[i]->description);
	140	sorted_value[i].description().AppendToString(&description);
159	141	c->description = description;
160	142	}
161	143	}
162	144	}
163	145
164		bool RewriteCandidate(Segments *segments) {
	146	} // namespace
	147
	148	bool EmoticonRewriter::RewriteCandidate(Segments *segments) const {
165	149	bool modified = false;
166	150	for (size_t i = 0; i < segments->conversion_segments_size(); ++i) {
167	151	const string &key = segments->conversion_segment(i).key();

170	154	continue;
171	155	}
172	156	bool is_no_learning = false;
173		const EmbeddedDictionary::Value *value = NULL;
174		size_t value_size = 0;
	157	SerializedDictionary::const_iterator begin;
	158	SerializedDictionary::const_iterator end = dic_.end();
175	159	size_t initial_insert_size = 0;
176	160	size_t initial_insert_pos = 0;
177	161

183	167	if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") {
184	168	// When key is "かおもじ", default candidate size should be small enough.
185	169	// It is safe to expand all candidates at this time.
186		const EmbeddedDictionary::Token *token
187		= Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
188		CHECK(token);
	170	begin = dic_.begin();
	171	CHECK(begin != dic_.end());
	172	end = dic_.end();
189	173	// set large value(100) so that all candidates are pushed to the bottom
190		value = token->value;
191		value_size = token->value_size;
192	174	initial_insert_pos = 100;
193		initial_insert_size = token->value_size;
	175	initial_insert_size = dic_.size();
194	176	// "かお"
195	177	} else if (key == "\xE3\x81\x8B\xE3\x81\x8A") {
196	178	// When key is "かお", expand all candidates in conservative way.
197		const EmbeddedDictionary::Token *token
198		= Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
199		CHECK(token);
	179	begin = dic_.begin();
	180	CHECK(begin != dic_.end());
200	181	// The first 6 candidates are inserted at the 4th position.
201	182	// The other candidates are pushed to the bottom.
202		value = token->value;
203		value_size = token->value_size;
204	183	initial_insert_pos = 4;
205	184	initial_insert_size = 6;
206	185	} else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F"
207	186	"\xE3\x82\x89\xE3\x81\x84") { // "ふくわらい"
208	187	// Choose one emoticon randomly from the dictionary.
209	188	// TODO(taku): want to make it "generate" more funny emoticon.
210		const EmbeddedDictionary::Token *token
211		= Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken();
212		CHECK(token);
	189	begin = dic_.begin();
	190	CHECK(begin != dic_.end());
213	191	uint32 n = 0;
214	192	// use secure random not to predict the next emoticon.
215	193	Util::GetRandomSequence(reinterpret_cast<char *>(&n), sizeof(n));
216		value = token->value + n % token->value_size;
217		value_size = 1;
	194	begin += n % dic_.size();
	195	end = begin + 1;
218	196	initial_insert_pos = 4;
219	197	initial_insert_size = 1;
220	198	is_no_learning = true; // do not learn this candidate.
221	199	} else {
222		const EmbeddedDictionary::Token *token
223		= Singleton<EmoticonDictionary>::get()->GetDictionary()->Lookup(key);
224		// by default, insert canidate at 7 th position.
225		if (token != NULL) {
226		value = token->value;
227		value_size = token->value_size;
	200	const auto range = dic_.equal_range(key);
	201	begin = range.first;
	202	end = range.second;
	203	if (begin != end) {
228	204	initial_insert_pos = 6;
229		initial_insert_size = token == NULL ? 0 : token->value_size;
	205	initial_insert_size = std::distance(begin, end);
230	206	}
231	207	}
232	208
233		if (value == NULL \|\| value_size == 0) {
	209	if (begin == end) {
234	210	continue;
235	211	}
236	212
237		InsertCandidates(value, value_size,
	213	InsertCandidates(begin, end,
238	214	initial_insert_pos,
239	215	initial_insert_size,
240	216	is_no_learning,

244	220
245	221	return modified;
246	222	}
247		} // namespace
248
249		EmoticonRewriter::EmoticonRewriter() {}
250
251		EmoticonRewriter::~EmoticonRewriter() {}
	223
		// Factory function: asks |data_manager| for the emoticon token array and string
		// array, then constructs an EmoticonRewriter over those two pieces of data.
		// The returned unique_ptr owns the new rewriter.
	224	std::unique_ptr<EmoticonRewriter> EmoticonRewriter::CreateFromDataManager(
	225	const DataManagerInterface &data_manager) {
	226	StringPiece token_array_data, string_array_data;
	227	data_manager.GetEmoticonRewriterData(&token_array_data, &string_array_data);
	228	return std::unique_ptr<EmoticonRewriter>(
	229	new EmoticonRewriter(token_array_data, string_array_data));
	230	}
	231
	232	EmoticonRewriter::EmoticonRewriter(StringPiece token_array_data,
	233	StringPiece string_array_data)
	234	: dic_(token_array_data, string_array_data) {}
	235
	236	EmoticonRewriter::~EmoticonRewriter() = default;
252	237
253	238	int EmoticonRewriter::capability(const ConversionRequest &request) const {
254	239	if (request.request().mixed_conversion()) {

+17

-5

src/rewriter/emoticon_rewriter.h less more

29	29	#ifndef MOZC_REWRITER_EMOTICON_REWRITER_H_
30	30	#define MOZC_REWRITER_EMOTICON_REWRITER_H_
31	31
	32	#include <memory>
	33
	34	#include "data_manager/data_manager_interface.h"
32	35	#include "rewriter/rewriter_interface.h"
	36	#include "rewriter/serialized_dictionary.h"
33	37
34	38	namespace mozc {
35	39

38	42
39	43	class EmoticonRewriter : public RewriterInterface {
40	44	public:
41		EmoticonRewriter();
42		virtual ~EmoticonRewriter();
	45	static std::unique_ptr<EmoticonRewriter> CreateFromDataManager(
	46	const DataManagerInterface &data_manager);
43	47
44		virtual int capability(const ConversionRequest &request) const;
	48	EmoticonRewriter(StringPiece token_array_data, StringPiece string_array_data);
	49	~EmoticonRewriter() override;
45	50
46		virtual bool Rewrite(const ConversionRequest &request,
47		Segments *segments) const;
	51	int capability(const ConversionRequest &request) const override;
	52
	53	bool Rewrite(const ConversionRequest &request,
	54	Segments *segments) const override;
	55
	56	private:
	57	bool RewriteCandidate(Segments *segments) const;
	58
	59	SerializedDictionary dic_;
48	60	};
49	61
50	62	} // namespace mozc

+28

-26

src/rewriter/emoticon_rewriter_test.cc less more

29	29	#include "rewriter/emoticon_rewriter.h"
30	30
31	31	#include <cstddef>
	32	#include <memory>
32	33	#include <string>
33	34
34	35	#include "base/logging.h"

36	37	#include "base/util.h"
37	38	#include "config/config_handler.h"
38	39	#include "converter/segments.h"
	40	#include "data_manager/testing/mock_data_manager.h"
39	41	#include "protocol/commands.pb.h"
40	42	#include "protocol/config.pb.h"
41	43	#include "request/conversion_request.h"
	44	#include "testing/base/public/googletest.h"
42	45	#include "testing/base/public/gunit.h"
43
44		DECLARE_string(test_tmpdir);
	46	#include "testing/base/public/mozctest.h"
45	47
46	48	namespace mozc {
	49	namespace {
47	50
48		namespace {
49	51	void AddSegment(const string &key, const string &value,
50	52	Segments *segments) {
51	53	segments->Clear();

70	72	}
71	73	return false;
72	74	}
73		} // namespace
74	75
75		class EmoticonRewriterTest : public testing::Test {
	76	class EmoticonRewriterTest : public ::testing::Test {
76	77	protected:
77		EmoticonRewriterTest() {}
78		~EmoticonRewriterTest() {}
	78	testing::MockDataManager mock_data_manager_;
79	79
80		virtual void SetUp() {
81		SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir);
82		}
83
84		virtual void TearDown() {}
	80	private:
	81	testing::ScopedTmpUserProfileDirectory scoped_profile_dir_;
85	82	};
86	83
87	84	TEST_F(EmoticonRewriterTest, BasicTest) {
88		EmoticonRewriter emoticon_rewriter;
	85	std::unique_ptr<EmoticonRewriter> emoticon_rewriter =
	86	EmoticonRewriter::CreateFromDataManager(mock_data_manager_);
	87
89	88	config::Config config;
90	89	config::ConfigHandler::GetDefaultConfig(&config);
91	90	ConversionRequest request;

95	94
96	95	Segments segments;
97	96	AddSegment("test", "test", &segments);
98		emoticon_rewriter.Rewrite(request, &segments);
	97	emoticon_rewriter->Rewrite(request, &segments);
99	98	EXPECT_FALSE(HasEmoticon(segments));
100	99
101	100	// "かお"
102	101	AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments);
103		emoticon_rewriter.Rewrite(request, &segments);
	102	emoticon_rewriter->Rewrite(request, &segments);
104	103	EXPECT_TRUE(HasEmoticon(segments));
105	104
106	105	// "かおもじ"
107	106	AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98",
108	107	"test", &segments);
109		emoticon_rewriter.Rewrite(request, &segments);
	108	emoticon_rewriter->Rewrite(request, &segments);
110	109	EXPECT_TRUE(HasEmoticon(segments));
111	110
112	111	// "にこにこ"
113	112	AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93",
114	113	"test", &segments);
115		emoticon_rewriter.Rewrite(request, &segments);
	114	emoticon_rewriter->Rewrite(request, &segments);
116	115	EXPECT_TRUE(HasEmoticon(segments));
117	116
118	117	// "ふくわらい"
119	118	AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84",
120	119	"test", &segments);
121		emoticon_rewriter.Rewrite(request, &segments);
	120	emoticon_rewriter->Rewrite(request, &segments);
122	121	EXPECT_TRUE(HasEmoticon(segments));
123	122	}
124	123

127	126
128	127	Segments segments;
129	128	AddSegment("test", "test", &segments);
130		emoticon_rewriter.Rewrite(request, &segments);
	129	emoticon_rewriter->Rewrite(request, &segments);
131	130	EXPECT_FALSE(HasEmoticon(segments));
132	131
133	132	// "かお"
134	133	AddSegment("\xE3\x81\x8B\xE3\x81\x8A", "test", &segments);
135		emoticon_rewriter.Rewrite(request, &segments);
	134	emoticon_rewriter->Rewrite(request, &segments);
136	135	EXPECT_FALSE(HasEmoticon(segments));
137	136
138	137	// "かおもじ"
139	138	AddSegment("\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98",
140	139	"test", &segments);
141		emoticon_rewriter.Rewrite(request, &segments);
	140	emoticon_rewriter->Rewrite(request, &segments);
142	141	EXPECT_FALSE(HasEmoticon(segments));
143	142
144	143	// "にこにこ"
145	144	AddSegment("\xE3\x81\xAB\xE3\x81\x93\xE3\x81\xAB\xE3\x81\x93",
146	145	"test", &segments);
147		emoticon_rewriter.Rewrite(request, &segments);
	146	emoticon_rewriter->Rewrite(request, &segments);
148	147	EXPECT_FALSE(HasEmoticon(segments));
149	148
150	149	// "ふくわらい"
151	150	AddSegment("\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F\xE3\x82\x89\xE3\x81\x84",
152	151	"test", &segments);
153		emoticon_rewriter.Rewrite(request, &segments);
	152	emoticon_rewriter->Rewrite(request, &segments);
154	153	EXPECT_FALSE(HasEmoticon(segments));
155	154	}
156	155	}
157	156
158	157	TEST_F(EmoticonRewriterTest, MobileEnvironmentTest) {
159		EmoticonRewriter rewriter;
	158	std::unique_ptr<EmoticonRewriter> rewriter =
	159	EmoticonRewriter::CreateFromDataManager(mock_data_manager_);
	160
160	161	commands::Request request;
161	162	ConversionRequest convreq;
162	163	convreq.set_request(&request);
163	164
164	165	{
165	166	request.set_mixed_conversion(true);
166		EXPECT_EQ(RewriterInterface::ALL, rewriter.capability(convreq));
	167	EXPECT_EQ(RewriterInterface::ALL, rewriter->capability(convreq));
167	168	}
168	169
169	170	{
170	171	request.set_mixed_conversion(false);
171		EXPECT_EQ(RewriterInterface::CONVERSION, rewriter.capability(convreq));
	172	EXPECT_EQ(RewriterInterface::CONVERSION, rewriter->capability(convreq));
172	173	}
173	174	}
174	175
	176	} // namespace
175	177	} // namespace mozc

+135

-0

src/rewriter/gen_emoticon_rewriter_data.cc less more

	0	// Copyright 2010-2016, Google Inc.
	1	// All rights reserved.
	2	//
	3	// Redistribution and use in source and binary forms, with or without
	4	// modification, are permitted provided that the following conditions are
	5	// met:
	6	//
	7	// * Redistributions of source code must retain the above copyright
	8	// notice, this list of conditions and the following disclaimer.
	9	// * Redistributions in binary form must reproduce the above
	10	// copyright notice, this list of conditions and the following disclaimer
	11	// in the documentation and/or other materials provided with the
	12	// distribution.
	13	// * Neither the name of Google Inc. nor the names of its
	14	// contributors may be used to endorse or promote products derived from
	15	// this software without specific prior written permission.
	16	//
	17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28
	29	#include <algorithm>
	30	#include <memory>
	31	#include <string>
	32	#include <unordered_map>
	33	#include <vector>
	34
	35	#include "base/file_stream.h"
	36	#include "base/flags.h"
	37	#include "base/init_mozc.h"
	38	#include "base/logging.h"
	39	#include "base/string_piece.h"
	40	#include "base/util.h"
	41	#include "rewriter/serialized_dictionary.h"
	42
	43	DEFINE_string(input, "", "Emoticon dictionary file");
	44	DEFINE_string(output_token_array, "", "Output token array");
	45	DEFINE_string(output_string_array, "", "Output string array");
	46
	47	namespace mozc {
	48	namespace {
	49
	50	using KeyList = vector<string>;
	51	using CompilerToken = SerializedDictionary::CompilerToken;
	52	using TokenList = SerializedDictionary::TokenList;
	53
		// Returns the count stored for |key| in |key_count|, or 0 when |key| is absent.
		// Unlike operator[], this never inserts into the map.
	54	int LookupCount(const std::unordered_map<string, int> &key_count,
	55	const string &key) {
	56	const auto iter = key_count.find(key);
	57	return (iter == key_count.end()) ? 0 : iter->second;
	58	}
	59
	60	string GetDescription(const KeyList &key_list,
	61	const std::unordered_map<string, int> &key_count) {
	62	if (key_list.size() == 1) {
	63	return key_list[0];
	64	}
	65	KeyList sorted_key_list(key_list);
	66	sort(sorted_key_list.begin(), sorted_key_list.end(),
	67	[&key_count](const string &x, const string &y) {
	68	const int x_count = LookupCount(key_count, x);
	69	const int y_count = LookupCount(key_count, y);
	70	if (x_count == y_count) {
	71	return x < y;
	72	}
	73	return x_count < y_count;
	74	});
	75	return Util::StringPrintf("%s %s", sorted_key_list.back().c_str(),
	76	sorted_key_list.front().c_str());
	77	}
	78
	79	map<string, TokenList> ReadEmoticonTsv(const string &path) {
	80	InputFileStream ifs(path.c_str());
	81
	82	string line;
	83	getline(ifs, line); // Skip header
	84
	85	vector<pair<string, KeyList>> data;
	86	std::unordered_map<string, int> key_count;
	87	while (getline(ifs, line)) {
	88	vector<StringPiece> field_list;
	89	Util::SplitStringUsing(line, "\t", &field_list);
	90	CHECK_GE(field_list.size(), 2) << "Format error: " << line;
	91	LOG_IF(WARNING, field_list.size() > 3) << "Ignore extra columns: " << line;
	92
	93	string replaced;
	94	Util::StringReplace(field_list[1], "\xE3\x80\x80", " ", true, &replaced);
	95	KeyList key_list;
	96	Util::SplitStringUsing(field_list[1], " ", &key_list);
	97
	98	data.emplace_back(field_list[0].as_string(), std::move(key_list));
	99	for (const auto &key : key_list) {
	100	++key_count[key];
	101	}
	102	}
	103
	104	map<string, TokenList> input_data;
	105	int16 cost = 10;
	106	for (const auto &kv : data) {
	107	const string &value = kv.first;
	108	const KeyList &key_list = kv.second;
	109	const string &description = GetDescription(key_list, key_count);
	110	for (const string &key : key_list) {
	111	std::unique_ptr<CompilerToken> token(new CompilerToken());
	112	token->value = value;
	113	token->description = description;
	114	token->lid = 0;
	115	token->rid = 0;
	116	token->cost = cost;
	117	input_data[key].push_back(std::move(token));
	118	cost += 10;
	119	}
	120	}
	121
	122	return input_data;
	123	}
	124
	125	} // namespace
	126	} // namespace mozc
	127
		// Entry point of the emoticon data generator. Reads the TSV named by --input
		// with ReadEmoticonTsv and hands the result to
		// SerializedDictionary::CompileToFiles together with the paths given by
		// --output_token_array and --output_string_array.
	128	int main(int argc, char **argv) {
	129	mozc::InitMozc(argv[0], &argc, &argv, true);
	130	const auto &input_data = mozc::ReadEmoticonTsv(FLAGS_input);
	131	mozc::SerializedDictionary::CompileToFiles(
	132	input_data, FLAGS_output_token_array, FLAGS_output_string_array);
	133	return 0;
	134	}

+0

-127

~~src/rewriter/gen_emoticon_rewriter_data.py~~ less more

0		# -- coding: utf-8 --
1		# Copyright 2010-2016, Google Inc.
2		# All rights reserved.
3		#
4		# Redistribution and use in source and binary forms, with or without
5		# modification, are permitted provided that the following conditions are
6		# met:
7		#
8		# * Redistributions of source code must retain the above copyright
9		# notice, this list of conditions and the following disclaimer.
10		# * Redistributions in binary form must reproduce the above
11		# copyright notice, this list of conditions and the following disclaimer
12		# in the documentation and/or other materials provided with the
13		# distribution.
14		# * Neither the name of Google Inc. nor the names of its
15		# contributors may be used to endorse or promote products derived from
16		# this software without specific prior written permission.
17		#
18		# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19		# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20		# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21		# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22		# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23		# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24		# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25		# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26		# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27		# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28		# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30		"""Converter from emoticon data to embedded_dictionary.
31
32		Usage:
33		python gen_emoticon_rewriter_data.py --input=input.tsv --output=output_header
34		"""
35
36		__author__ = "hidehiko"
37
38		from collections import defaultdict
39		import logging
40		import optparse
41		import re
42		import sys
43		from rewriter import embedded_dictionary_compiler
44
45
46		def ParseOptions():
47		parser = optparse.OptionParser()
48		parser.add_option('--input', dest='input', help='emoticon dictionary file')
49		parser.add_option('--output', dest='output', help='output header file')
50		return parser.parse_args()[0]
51
52
53		def GetDescription(key_list, key_count):
54		"""Generates a description from readings.
55
56		We simply add 1) the most general reading and 2) the most specific reading.
57		1) and 2) are simply approximated by checking the frequency of the readings.
58
59		Args:
60		key_list: a list of key strings.
61		key_count: a dictionary of key to the number of key's occurence in the data
62		file.
63		Returns:
64		the description string.
65		"""
66		if len(key_list) == 1:
67		return key_list[0]
68
69		sorted_key_list = sorted(key_list, key=lambda key: (key_count[key], key))
70		return '%s %s' % (sorted_key_list[-1], sorted_key_list[0])
71
72
73		def ReadEmoticonTsv(stream):
74		"""Read lines from stream to a Token dictionary for a embedded dictionary."""
75		# Skip the first line (header).
76		stream.next()
77
78		data = []
79		key_count = defaultdict(int)
80		for line in stream:
81		# The file format is:
82		# value <tab> readings(space delimitered)
83		field_list = line.rstrip('\n').split('\t')
84		# Check the size of columns.
85		if len(field_list) < 2:
86		logging.critical('format error: %s', line)
87		sys.exit(1)
88		if len(field_list) > 3:
89		logging.warning('ignore extra columns: %s', line)
90
91		# \xE3\x80\x80 is full width space
92		key_list = re.split(r'(?: \|\xE3\x80\x80)+', field_list[1].strip())
93		data.append((field_list[0], key_list))
94		for key in key_list:
95		key_count[key] += 1
96
97		input_data = defaultdict(list)
98		cost = 10
99		for value, key_list in data:
100		input_value = value
101		if input_value == "":
102		input_value = None
103		description = GetDescription(key_list, key_count)
104		if description == "":
105		description = None
106
107		for key in key_list:
108		input_data[key].append(embedded_dictionary_compiler.Token(
109		key, input_value, description, None, 0, 0, cost))
110		cost += 10
111
112		return input_data
113
114
def main():
  """Converts the emoticon TSV into the 'EmoticonData' embedded dictionary.

  Reads the file named by the parsed input option with ReadEmoticonTsv() and
  hands the resulting tokens to embedded_dictionary_compiler.Compile(), which
  writes to the file named by the parsed output option.
  """
  opts = ParseOptions()

  with open(opts.input, 'r') as tsv_stream:
    tokens_by_key = ReadEmoticonTsv(tsv_stream)

  with open(opts.output, 'w') as out_stream:
    embedded_dictionary_compiler.Compile(
        'EmoticonData', tokens_by_key, out_stream)


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
  main()

+1

-1

src/rewriter/rewriter.cc less more

107	107	kEmojiDataList, arraysize(kEmojiDataList),
108	108	kEmojiTokenList, arraysize(kEmojiTokenList),
109	109	kEmojiValueList));
110		AddRewriter(new EmoticonRewriter);
	110	AddRewriter(EmoticonRewriter::CreateFromDataManager(*data_manager).release());
111	111	AddRewriter(new CalculatorRewriter(parent_converter));
112	112	AddRewriter(new SymbolRewriter(parent_converter, data_manager));
113	113	AddRewriter(new UnicodeRewriter(parent_converter));

+12

-20

src/rewriter/rewriter_base.gyp less more

67	67	],
68	68	},
69	69	{
70		'action_name': 'gen_emoticon_rewriter_data',
71		'variables': {
72		'input_file': '../data/emoticon/emoticon.tsv',
73		'output_file': '<(gen_out_dir)/emoticon_rewriter_data.h',
74		},
75		'inputs': [
76		'embedded_dictionary_compiler.py',
77		'gen_emoticon_rewriter_data.py',
78		'<(input_file)',
79		],
80		'outputs': [
81		'<(output_file)'
82		],
83		'action': [
84		'python', 'gen_emoticon_rewriter_data.py',
85		'--input=<(input_file)',
86		'--output=<(output_file)',
87		],
88		},
89		{
90	70	'action_name': 'gen_emoji_rewriter_data',
91	71	'variables': {
92	72	'input_file': '../data/emoji/emoji_data.tsv',

213	193	'../base/base.gyp:serialized_string_array',
214	194	],
215	195	},
	196	{
	197	'target_name': 'gen_emoticon_rewriter_data_main',
	198	'type': 'executable',
	199	'toolsets': ['host'],
	200	'sources': [
	201	'gen_emoticon_rewriter_data.cc',
	202	],
	203	'dependencies': [
	204	'../base/base.gyp:base',
	205	'rewriter_serialized_dictionary.gyp:serialized_dictionary',
	206	],
	207	},
216	208	],
217	209	}

+19

-14

src/rewriter/serialized_dictionary.cc less more

48	48	namespace mozc {
49	49	namespace {
50	50
51		struct CompilerToken {
52		string value;
53		string description;
54		string additional_description;
55		uint16 lid;
56		uint16 rid;
57		int16 cost;
58		};
59
60		using TokenList = vector<std::unique_ptr<CompilerToken>>;
	51	using CompilerToken = SerializedDictionary::CompilerToken;
	52	using TokenList = SerializedDictionary::TokenList;
61	53
62	54	struct CompareByCost {
63	55	bool operator()(const std::unique_ptr<CompilerToken> &t1,

112	104	std::istream *input,
113	105	std::unique_ptr<uint32[]> *output_token_array_buf,
114	106	std::unique_ptr<uint32[]> *output_string_array_buf) {
115		CHECK(SystemUtil::IsLittleEndian());
116
117	107	map<string, TokenList> dic;
118	108	LoadTokens(input, &dic);
	109	return Compile(dic, output_token_array_buf, output_string_array_buf);
	110	}
	111
	112	pair<StringPiece, StringPiece> SerializedDictionary::Compile(
	113	const map<string, TokenList> &dic,
	114	std::unique_ptr<uint32[]> *output_token_array_buf,
	115	std::unique_ptr<uint32[]> *output_string_array_buf) {
	116	CHECK(SystemUtil::IsLittleEndian());
119	117
120	118	// Build a mapping from string to its index in a serialized string array.
121	119	// Note that duplicate keys share the same index, so data is slightly

188	186	const string &output_string_array) {
189	187	InputFileStream ifs(input.c_str());
190	188	CHECK(ifs.good());
191
	189	map<string, TokenList> dic;
	190	LoadTokens(&ifs, &dic);
	191	CompileToFiles(dic, output_token_array, output_string_array);
	192	}
	193
	194	void SerializedDictionary::CompileToFiles(const map<string, TokenList> &dic,
	195	const string &output_token_array,
	196	const string &output_string_array) {
192	197	std::unique_ptr<uint32[]> buf1, buf2;
193		const pair<StringPiece, StringPiece> data = Compile(&ifs, &buf1, &buf2);
	198	const pair<StringPiece, StringPiece> data = Compile(dic, &buf1, &buf2);
194	199	CHECK(VerifyData(data.first, data.second));
195	200
196	201	OutputFileStream token_ofs(output_token_array.c_str(),

+23

-0

src/rewriter/serialized_dictionary.h less more

31	31
32	32	#include <istream>
33	33	#include <iterator>
	34	#include <map>
34	35	#include <string>
35	36	#include <utility>
36	37

105	106	// array by index.
106	107	class SerializedDictionary {
107	108	public:
	109	struct CompilerToken {  // One input entry for Compile()/CompileToFiles(); held via TokenList.
	110	string value;  // NOTE(review): presumably the converted (surface) string -- confirm.
	111	string description;
	112	string additional_description;
	113	uint16 lid;  // NOTE(review): presumably the left POS id -- confirm.
	114	uint16 rid;  // NOTE(review): presumably the right POS id -- confirm.
	115	int16 cost;  // NOTE(review): looks like the key used by CompareByCost in the .cc -- confirm.
	116	};
	117
	118	using TokenList = vector<std::unique_ptr<CompilerToken>>;
	119
108	120	static const size_t kTokenByteLength = 24;
109	121
110	122	class iterator : public std::iterator<std::random_access_iterator_tag,

280	292	std::istream *input,
281	293	std::unique_ptr<uint32[]> *output_token_array_buf,
282	294	std::unique_ptr<uint32[]> *output_string_array_buf);
	295	static pair<StringPiece, StringPiece> Compile(
	296	const map<string, TokenList> &dic,
	297	std::unique_ptr<uint32[]> *output_token_array_buf,
	298	std::unique_ptr<uint32[]> *output_string_array_buf);
283	299
284	300	// Creates serialized data and writes them to files.
285	301	static void CompileToFiles(const string &input,
286	302	const string &output_token_array,
287	303	const string &output_string_array);
	304	static void CompileToFiles(const map<string, TokenList> &dic,
	305	const string &output_token_array,
	306	const string &output_string_array);
288	307
289	308	// Validates the serialized data.
290	309	static bool VerifyData(StringPiece token_array_data,

294	313	// boundary.
295	314	SerializedDictionary(StringPiece token_array, StringPiece string_array_data);
296	315	~SerializedDictionary();
	316
	317	std::size_t size() const {
	318	return token_array_.size() / kTokenByteLength;
	319	}
297	320
298	321	iterator begin() { return iterator(token_array_.data(), &string_array_); }
299	322	const_iterator begin() const {