Codebase list mozc / 9ea0c14
Simplify codec implementation REF_BUG=73141854,73527019 REF_CL=185935444,186412338,186417457 REF_TIME=2018-02-21T15:06:51+09:00 REF_TIME_RAW=1519193211 +0900 Noriyuki Takahashi 6 years ago
6 changed file(s) with 73 addition(s) and 53 deletion(s). Raw diff Collapse all Expand all
2929
3030 MAJOR=2
3131 MINOR=23
32 BUILD=2811
32 BUILD=2812
3333 REVISION=102
3434 # This version represents the version of Mozc IME engine (converter, predictor,
3535 # etc.). This version info is included both in the Mozc server and in the Mozc
2828
2929 #include "dictionary/file/codec.h"
3030
31 #include <climits>
32
31 #include "base/hash.h"
3332 #include "base/logging.h"
3433 #include "base/port.h"
3534 #include "base/util.h"
4039 namespace mozc {
4140 namespace dictionary {
4241
43 DictionaryFileCodec::DictionaryFileCodec() : filemagic_(20110701) {}
42 DictionaryFileCodec::DictionaryFileCodec()
43 : seed_(2135654146), filemagic_(20110701) {}
4444
45 DictionaryFileCodec::~DictionaryFileCodec() {}
45 DictionaryFileCodec::~DictionaryFileCodec() = default;
4646
4747 void DictionaryFileCodec::WriteSections(
4848 const std::vector<DictionaryFileSection> &sections,
4949 std::ostream *ofs) const {
5050 DCHECK(ofs);
5151 WriteHeader(ofs);
52 for (size_t i = 0; i < sections.size(); ++i) {
53 WriteSection(sections[i], ofs);
52
53 if (sections.size() == 4) {
54 // In production, the number of sections equals 4. In this case, write the
55 // sections in the following deterministic order. This order was determined
56 // by random shuffle for engine version 24 but it's now made deterministic
57 // to obsolete DictionaryFileCodec.
58 for (size_t i : {0, 2, 1, 3}) {
59 WriteSection(sections[i], ofs);
60 }
61 } else {
62 // Some tests don't have four sections. In this case, simply write sections
63 // in given order.
64 for (const auto &section : sections) {
65 WriteSection(section, ofs);
66 }
5467 }
68
5569 filecodec_util::WriteInt(0, ofs);
5670 }
5771
5872 void DictionaryFileCodec::WriteHeader(std::ostream *ofs) const {
5973 DCHECK(ofs);
6074 filecodec_util::WriteInt(filemagic_, ofs);
75 filecodec_util::WriteInt(seed_, ofs);
6176 }
6277
6378 void DictionaryFileCodec::WriteSection(const DictionaryFileSection &section,
6479 std::ostream *ofs) const {
6580 DCHECK(ofs);
66 const string &name = GetSectionName(section.name);
67 VLOG(1) << "section=" << name << " length=" << section.len;
81 const string &name = section.name;
82 // name should be encoded
83 // uint64 needs just 8 bytes.
84 DCHECK_EQ(8, name.size());
85 string escaped;
86 Util::Escape(name, &escaped);
87 VLOG(1) << "section=" << escaped << " length=" << section.len;
6888 filecodec_util::WriteInt(section.len, ofs);
69
70 const int name_len = name.size() + 1; // including '\0'
71 ofs->write(name.data(), name_len);
72 Pad4(name_len, ofs);
89 ofs->write(name.data(), name.size());
7390
7491 ofs->write(section.ptr, section.len);
7592 Pad4(section.len, ofs);
7693 }
7794
95 void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
96 DCHECK(ofs);
97 for (int i = length; (i % 4) != 0; ++i) {
98 (*ofs) << '\0';
99 }
100 }
101
78102 string DictionaryFileCodec::GetSectionName(const string &name) const {
79 // Use the given string as is
80 return name;
103 VLOG(1) << "seed\t" << seed_;
104 const uint64 name_fp = Hash::FingerprintWithSeed(name, seed_);
105 const string fp_string(reinterpret_cast<const char *>(&name_fp),
106 sizeof(name_fp));
107 string escaped;
108 Util::Escape(fp_string, &escaped);
109 VLOG(1) << "Section name for " << name << ": " << escaped;
110 return fp_string;
81111 }
82112
83113 bool DictionaryFileCodec::ReadSections(
86116 DCHECK(sections);
87117 const char *ptr = image;
88118 const int filemagic = filecodec_util::ReadInt(ptr);
89 CHECK(filemagic == filemagic_) <<
90 "invalid dictionary file magic (recompile dictionary?)";
119 CHECK(filemagic == filemagic_)
120 << "invalid dictionary file magic (recompile dictionary?)";
91121 ptr += sizeof(filemagic);
92
122 seed_ = filecodec_util::ReadInt(ptr);
123 ptr += sizeof(seed_);
93124 int size;
94125 while ((size = filecodec_util::ReadInt(ptr))) {
95126 ptr += sizeof(size);
96 const string name(ptr);
97 VLOG(1) << "section=" << name << " length=" << size;
98 const int name_len = name.size() + 1;
99 ptr += name_len;
100 ptr += filecodec_util::Rup4(name_len);
127 // Fingerprint of the section name.
128 const string name(ptr, sizeof(uint64));
129 ptr += sizeof(uint64);
130
131 string escaped;
132 Util::Escape(name, &escaped);
133 VLOG(1) << "section=" << escaped << " length=" << size;
101134
102135 sections->push_back(DictionaryFileSection(ptr, size, name));
103136
110143 return true;
111144 }
112145
113 // Write padding
114 void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
115 DCHECK(ofs);
116 for (int i = length; (i % 4) != 0; ++i) {
117 (*ofs) << static_cast<char>(Util::Random(CHAR_MAX));
118 }
119 }
120
121146 } // namespace dictionary
122147 } // namespace mozc
4545 class DictionaryFileCodec : public DictionaryFileCodecInterface {
4646 public:
4747 DictionaryFileCodec();
48 virtual ~DictionaryFileCodec();
48 ~DictionaryFileCodec() override;
4949
50 virtual void WriteSections(const std::vector<DictionaryFileSection> &sections,
51 std::ostream *ofs) const;
52 virtual bool ReadSections(const char *image, int length,
53 std::vector<DictionaryFileSection> *sections) const;
54 virtual string GetSectionName(const string &name) const;
50 void WriteSections(const std::vector<DictionaryFileSection> &sections,
51 std::ostream *ofs) const override;
52 bool ReadSections(
53 const char *image, int length,
54 std::vector<DictionaryFileSection> *sections) const override;
55 string GetSectionName(const string &name) const override;
5556
5657 private:
5758 void WriteHeader(std::ostream *ofs) const;
6061
6162 static void Pad4(int length, std::ostream *ofs);
6263
64 // Seed value for the section-name fingerprint.
65 // Mutable because it is overwritten while reading sections.
66 mutable int seed_;
6367 // Magic value for simple file validation
6468 const int filemagic_;
6569
2828
2929 #include "dictionary/file/codec_factory.h"
3030
31
3231 #include "base/singleton.h"
3332 #include "dictionary/file/codec.h"
3433 #include "dictionary/file/codec_interface.h"
3534
36
3735 namespace mozc {
3836 namespace dictionary {
39
40
4137 namespace {
4238 DictionaryFileCodecInterface *g_dictionary_file_codec = nullptr;
4339 } // namespace
4440
45 typedef DictionaryFileCodec DefaultCodec;
46
4741 DictionaryFileCodecInterface *DictionaryFileCodecFactory::GetCodec() {
4842 if (g_dictionary_file_codec == nullptr) {
49 return Singleton<DefaultCodec>::get();
50 } else {
51 return g_dictionary_file_codec;
43 return Singleton<DictionaryFileCodec>::get();
5244 }
45 return g_dictionary_file_codec;
5346 }
5447
5548 void DictionaryFileCodecFactory::SetCodec(DictionaryFileCodecInterface *codec) {
3737 namespace mozc {
3838 namespace dictionary {
3939
40
4140 class DictionaryFileCodecFactory {
4241 public:
4342 // Returns the singleton instance.
4949 CodecTest() : test_file_(FLAGS_test_tmpdir + "testfile.txt") {}
5050
5151 protected:
52 virtual void SetUp() {
52 void SetUp() override {
5353 DictionaryFileCodecFactory::SetCodec(NULL);
5454 FileUtil::Unlink(test_file_);
5555 }
5656
57 virtual void TearDown() {
57 void TearDown() override {
5858 // Reset to default setting
5959 DictionaryFileCodecFactory::SetCodec(NULL);
6060 FileUtil::Unlink(test_file_);
184184 EXPECT_TRUE(CheckValue(sections[index], "Value 1 test test"));
185185 }
186186
187 TEST_F(CodecTest, CodecTest) {
188 std::unique_ptr<DictionaryFileCodec> default_codec(
187 TEST_F(CodecTest, RandomizedCodecTest) {
188 std::unique_ptr<DictionaryFileCodec> internal_codec(
189189 new DictionaryFileCodec);
190 DictionaryFileCodecFactory::SetCodec(default_codec.get());
190 DictionaryFileCodecFactory::SetCodec(internal_codec.get());
191191 const DictionaryFileCodecInterface *codec =
192192 DictionaryFileCodecFactory::GetCodec();
193193 EXPECT_TRUE(codec != NULL);
221221 EXPECT_TRUE(CheckValue(sections[index], "Value 1 test test"));
222222 }
223223
224
225224 } // namespace
226225 } // namespace dictionary
227226 } // namespace mozc