Codebase list mozc / 493fab9
Disallow general symbols to be content words. Emoticon (顔文字) from user dictionary has the "general symbol" (記号,一般) ID in the Mozc engine. This CL fixes the candidate filter to filter out candidates whose content words are general symbols. Closes #389. BUG=#389 TEST= REF_BUG=31760437 REF_CL=134369467 REF_TIME=2016-09-27T15:38:59+09:00 REF_TIME_RAW=1474958339 +0900 Noriyuki Takahashi 7 years ago
3 changed file(s) with 84 addition(s) and 66 deletion(s). Raw diff Collapse all Expand all
138138 return false;
139139 }
140140
141 bool ContainsIsolatedWord(const dictionary::POSMatcher &pos_matcher,
142 const vector<const Node *> &nodes) {
141 bool IsIsolatedWordOrGeneralSymbol(const dictionary::POSMatcher &pos_matcher,
142 uint16 pos_id) {
143 return pos_matcher.IsIsolatedWord(pos_id) ||
144 pos_matcher.IsGeneralSymbol(pos_id);
145 }
146
147 bool ContainsIsolatedWordOrGeneralSymbol(
148 const dictionary::POSMatcher &pos_matcher,
149 const vector<const Node *> &nodes) {
143150 for (const Node *node : nodes) {
144 if (pos_matcher.IsIsolatedWord(node->lid)) {
151 if (IsIsolatedWordOrGeneralSymbol(pos_matcher, node->lid)) {
145152 return true;
146153 }
147154 }
247254
248255 CHECK(top_candidate_);
249256
250 // "短縮よみ" must have only 1 node.
251 if (nodes.size() > 1 && ContainsIsolatedWord(*pos_matcher_, nodes)) {
252 return CandidateFilter::BAD_CANDIDATE;
253 }
254 // This case tests the case where the isolated word is in content word.
255 if (pos_matcher_->IsIsolatedWord(nodes[0]->lid) &&
257 // "短縮よみ" or "記号,一般" must have only 1 node. Note that "顔文字" POS
258 // from user dictionary is converted to "記号,一般" in Mozc engine.
259 if (nodes.size() > 1 &&
260 ContainsIsolatedWordOrGeneralSymbol(*pos_matcher_, nodes)) {
261 return CandidateFilter::BAD_CANDIDATE;
262 }
263 // This case tests the case where the isolated word or general symbol is in
264 // content word.
265 if (IsIsolatedWordOrGeneralSymbol(*pos_matcher_, nodes[0]->lid) &&
256266 (IsNormalOrConstrainedNode(nodes[0]->prev) ||
257267 IsNormalOrConstrainedNode(nodes[0]->next))) {
258268 return CandidateFilter::BAD_CANDIDATE;
290290 }
291291 }
292292
293 TEST_F(CandidateFilterTest, IsolatedWord) {
293 TEST_F(CandidateFilterTest, IsolatedWordOrGeneralSymbol) {
294294 std::unique_ptr<CandidateFilter> filter(CreateCandidateFilter(true));
295295 vector<const Node *> nodes;
296296 Segment::Candidate *c = NewCandidate();
301301 nodes.push_back(node);
302302 node->prev = NewNode();
303303 node->next = NewNode();
304 node->lid = pos_matcher().GetIsolatedWordId();
305 node->rid = pos_matcher().GetIsolatedWordId();
306304 node->key = "abc";
307305 node->value = "test";
308306
309 node->prev->node_type = Node::NOR_NODE;
310 node->next->node_type = Node::EOS_NODE;
311 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
312 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
313 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
314 // Clear the internal set |seen_| to prevent "abc" from being filtered by
315 // "seen" rule.
316 filter->Reset();
317 }
318
319 node->prev->node_type = Node::BOS_NODE;
320 node->next->node_type = Node::NOR_NODE;
321 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
322 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
323 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
324 filter->Reset();
325 }
326
327 node->prev->node_type = Node::NOR_NODE;
328 node->next->node_type = Node::NOR_NODE;
329 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
330 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
331 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
332 filter->Reset();
333 }
334
335 node->prev->node_type = Node::BOS_NODE;
336 node->next->node_type = Node::EOS_NODE;
337 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
338 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
339 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
340 filter->Reset();
341 }
342
343 Node *backup_node = node->prev;
344 node->prev = nullptr;
345 node->next->node_type = Node::EOS_NODE;
346 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
347 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
348 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
349 filter->Reset();
350 }
351 node->prev = backup_node;
352
353 backup_node = node->next;
354 node->prev->node_type = Node::BOS_NODE;
355 node->next = nullptr;
356 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
357 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
358 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
359 filter->Reset();
360 }
361 node->next = backup_node;
307 const uint16 pos_ids[] = {
308 pos_matcher().GetIsolatedWordId(),
309 pos_matcher().GetGeneralSymbolId(),
310 };
311 // Perform the same test for the above POS IDs.
312 for (const uint16 id : pos_ids) {
313 node->lid = id;
314 node->rid = id;
315
316 node->prev->node_type = Node::NOR_NODE;
317 node->next->node_type = Node::EOS_NODE;
318 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
319 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
320 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
321 // Clear the internal set |seen_| to prevent "abc" from being filtered by
322 // "seen" rule.
323 filter->Reset();
324 }
325
326 node->prev->node_type = Node::BOS_NODE;
327 node->next->node_type = Node::NOR_NODE;
328 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
329 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
330 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
331 filter->Reset();
332 }
333
334 node->prev->node_type = Node::NOR_NODE;
335 node->next->node_type = Node::NOR_NODE;
336 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
337 EXPECT_EQ(CandidateFilter::BAD_CANDIDATE,
338 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
339 filter->Reset();
340 }
341
342 node->prev->node_type = Node::BOS_NODE;
343 node->next->node_type = Node::EOS_NODE;
344 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
345 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
346 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
347 filter->Reset();
348 }
349
350 Node *backup_node = node->prev;
351 node->prev = nullptr;
352 node->next->node_type = Node::EOS_NODE;
353 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
354 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
355 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
356 filter->Reset();
357 }
358 node->prev = backup_node;
359
360 backup_node = node->next;
361 node->prev->node_type = Node::BOS_NODE;
362 node->next = nullptr;
363 for (size_t i = 0; i < arraysize(kRequestTypes); ++i) {
364 EXPECT_EQ(CandidateFilter::GOOD_CANDIDATE,
365 filter->FilterCandidate("abc", c, nodes, kRequestTypes[i]));
366 filter->Reset();
367 }
368 node->next = backup_node;
369 }
362370 }
363371
364372 TEST_F(CandidateFilterTest, IsolatedWordInMultipleNodes) {
2929
3030 MAJOR=2
3131 MINOR=18
32 BUILD=2616
32 BUILD=2617
3333 REVISION=102
3434 # This version represents the version of Mozc IME engine (converter, predictor,
3535 # etc.). This version info is included both in the Mozc server and in the Mozc