28 | 28 |
|
29 | 29 |
#include "dictionary/file/codec.h"
|
30 | 30 |
|
31 | |
#include <climits>
|
32 | |
|
|
31 |
#include "base/hash.h"
|
33 | 32 |
#include "base/logging.h"
|
34 | 33 |
#include "base/port.h"
|
35 | 34 |
#include "base/util.h"
|
|
40 | 39 |
namespace mozc {
|
41 | 40 |
namespace dictionary {
|
42 | 41 |
|
43 | |
DictionaryFileCodec::DictionaryFileCodec() : filemagic_(20110701) {}
|
|
42 |
DictionaryFileCodec::DictionaryFileCodec()
|
|
43 |
: seed_(2135654146), filemagic_(20110701) {}
|
44 | 44 |
|
45 | |
DictionaryFileCodec::~DictionaryFileCodec() {}
|
|
45 |
DictionaryFileCodec::~DictionaryFileCodec() = default;
|
46 | 46 |
|
47 | 47 |
void DictionaryFileCodec::WriteSections(
|
48 | 48 |
const std::vector<DictionaryFileSection> &sections,
|
49 | 49 |
std::ostream *ofs) const {
|
50 | 50 |
DCHECK(ofs);
|
51 | 51 |
WriteHeader(ofs);
|
52 | |
for (size_t i = 0; i < sections.size(); ++i) {
|
53 | |
WriteSection(sections[i], ofs);
|
|
52 |
|
|
53 |
if (sections.size() == 4) {
|
|
54 |
// In production, the number of sections equals 4. In this case, write the
|
|
55 |
// sections in the following deterministic order. This order was determined
|
|
56 |
// by random shuffle for engine version 24 but it's now made deterministic
|
|
57 |
// to obsolete DictionaryFileCodec.
|
|
58 |
for (size_t i : {0, 2, 1, 3}) {
|
|
59 |
WriteSection(sections[i], ofs);
|
|
60 |
}
|
|
61 |
} else {
|
|
62 |
// Some tests don't have four sections. In this case, simply write sections
|
|
63 |
// in given order.
|
|
64 |
for (const auto &section : sections) {
|
|
65 |
WriteSection(section, ofs);
|
|
66 |
}
|
54 | 67 |
}
|
|
68 |
|
55 | 69 |
filecodec_util::WriteInt(0, ofs);
|
56 | 70 |
}
|
57 | 71 |
|
58 | 72 |
void DictionaryFileCodec::WriteHeader(std::ostream *ofs) const {
|
59 | 73 |
DCHECK(ofs);
|
60 | 74 |
filecodec_util::WriteInt(filemagic_, ofs);
|
|
75 |
filecodec_util::WriteInt(seed_, ofs);
|
61 | 76 |
}
|
62 | 77 |
|
63 | 78 |
void DictionaryFileCodec::WriteSection(const DictionaryFileSection &section,
|
64 | 79 |
std::ostream *ofs) const {
|
65 | 80 |
DCHECK(ofs);
|
66 | |
const string &name = GetSectionName(section.name);
|
67 | |
VLOG(1) << "section=" << name << " length=" << section.len;
|
|
81 |
const string &name = section.name;
|
|
82 |
// name should be encoded
|
|
83 |
// uint64 needs just 8 bytes.
|
|
84 |
DCHECK_EQ(8, name.size());
|
|
85 |
string escaped;
|
|
86 |
Util::Escape(name, &escaped);
|
|
87 |
VLOG(1) << "section=" << escaped << " length=" << section.len;
|
68 | 88 |
filecodec_util::WriteInt(section.len, ofs);
|
69 | |
|
70 | |
const int name_len = name.size() + 1; // including '\0'
|
71 | |
ofs->write(name.data(), name_len);
|
72 | |
Pad4(name_len, ofs);
|
|
89 |
ofs->write(name.data(), name.size());
|
73 | 90 |
|
74 | 91 |
ofs->write(section.ptr, section.len);
|
75 | 92 |
Pad4(section.len, ofs);
|
76 | 93 |
}
|
77 | 94 |
|
|
95 |
void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
|
|
96 |
DCHECK(ofs);
|
|
97 |
for (int i = length; (i % 4) != 0; ++i) {
|
|
98 |
(*ofs) << '\0';
|
|
99 |
}
|
|
100 |
}
|
|
101 |
|
78 | 102 |
string DictionaryFileCodec::GetSectionName(const string &name) const {
|
79 | |
// Use the given string as is
|
80 | |
return name;
|
|
103 |
VLOG(1) << "seed\t" << seed_;
|
|
104 |
const uint64 name_fp = Hash::FingerprintWithSeed(name, seed_);
|
|
105 |
const string fp_string(reinterpret_cast<const char *>(&name_fp),
|
|
106 |
sizeof(name_fp));
|
|
107 |
string escaped;
|
|
108 |
Util::Escape(fp_string, &escaped);
|
|
109 |
VLOG(1) << "Section name for " << name << ": " << escaped;
|
|
110 |
return fp_string;
|
81 | 111 |
}
|
82 | 112 |
|
83 | 113 |
bool DictionaryFileCodec::ReadSections(
|
|
86 | 116 |
DCHECK(sections);
|
87 | 117 |
const char *ptr = image;
|
88 | 118 |
const int filemagic = filecodec_util::ReadInt(ptr);
|
89 | |
CHECK(filemagic == filemagic_) <<
|
90 | |
"invalid dictionary file magic (recompile dictionary?)";
|
|
119 |
CHECK(filemagic == filemagic_)
|
|
120 |
<< "invalid dictionary file magic (recompile dictionary?)";
|
91 | 121 |
ptr += sizeof(filemagic);
|
92 | |
|
|
122 |
seed_ = filecodec_util::ReadInt(ptr);
|
|
123 |
ptr += sizeof(seed_);
|
93 | 124 |
int size;
|
94 | 125 |
while ((size = filecodec_util::ReadInt(ptr))) {
|
95 | 126 |
ptr += sizeof(size);
|
96 | |
const string name(ptr);
|
97 | |
VLOG(1) << "section=" << name << " length=" << size;
|
98 | |
const int name_len = name.size() + 1;
|
99 | |
ptr += name_len;
|
100 | |
ptr += filecodec_util::Rup4(name_len);
|
|
127 |
// fingerprint name
|
|
128 |
const string name(ptr, sizeof(uint64));
|
|
129 |
ptr += sizeof(uint64);
|
|
130 |
|
|
131 |
string escaped;
|
|
132 |
Util::Escape(name, &escaped);
|
|
133 |
VLOG(1) << "section=" << escaped << " length=" << size;
|
101 | 134 |
|
102 | 135 |
sections->push_back(DictionaryFileSection(ptr, size, name));
|
103 | 136 |
|
|
110 | 143 |
return true;
|
111 | 144 |
}
|
112 | 145 |
|
113 | |
// Write padding
|
114 | |
void DictionaryFileCodec::Pad4(int length, std::ostream *ofs) {
|
115 | |
DCHECK(ofs);
|
116 | |
for (int i = length; (i % 4) != 0; ++i) {
|
117 | |
(*ofs) << static_cast<char>(Util::Random(CHAR_MAX));
|
118 | |
}
|
119 | |
}
|
120 | |
|
121 | 146 |
} // namespace dictionary
|
122 | 147 |
} // namespace mozc
|