New upstream version 3.0.0~git20160910
Liang Guo
7 years ago
0 | SunPinyin | |
1 | === | |
2 | ||
3 | SunPinyin is an SLM (Statistical Language Model) based input method | |
4 | engine. To model the Chinese language, it use a backoff bigram and | |
5 | trigram language model. | |
6 | ||
7 | Currently, SunPinyin 2.0 is available on IBus, SCIM, and as a | |
8 | standalone XIM Server. |
0 | SunPinyin | |
1 | === | |
2 | ||
3 | SunPinyin is an SLM (Statistical Language Model) based input method | |
4 | engine. To model the Chinese language, it uses a backoff bigram and | |
5 | trigram language model. | |
6 | ||
7 | Currently, SunPinyin 2.0 is available on IBus, SCIM, and as a | |
8 | standalone XIM Server. | |
9 | ||
10 | [![Build Status](https://travis-ci.org/sunpinyin/sunpinyin.svg?branch=master)](https://travis-ci.org/sunpinyin/sunpinyin) |
1 | 1 | import os |
2 | 2 | import sys |
3 | 3 | |
4 | version="2.0.4" | |
4 | ||
5 | version = "2.0.4" | |
5 | 6 | abi_major = 3 |
6 | 7 | abi_minor = 0 |
7 | 8 | |
12 | 13 | 'src/slm/ids2ngram/idngram_merge.cpp', |
13 | 14 | 'src/slm/mmseg/mmseg.cpp', |
14 | 15 | 'src/slm/tslminfo/tslminfo.cpp', |
15 | 'src/slm/tslmpack/arpa_slm.cpp', | |
16 | 'src/slm/tslmpack/arpa_conv.cpp', | |
17 | 'src/slm/tslmpack/slmpack.cpp', | |
18 | 16 | 'src/slm/slm.cpp', |
19 | 17 | 'src/slm/slminfo/slminfo.cpp', |
18 | 'src/slm/slmpack/arpa_slm.cpp', | |
19 | 'src/slm/slmpack/slmpack.cpp', | |
20 | 20 | 'src/slm/sim_sen.cpp', |
21 | 21 | 'src/slm/sim_slm.cpp', |
22 | 22 | 'src/slm/getwordfreq/getwordfreq.cpp', |
26 | 26 | 'src/slm/thread/ValueCompress.cpp', |
27 | 27 | 'src/slm/slmbuild/slmbuild.cpp', |
28 | 28 | 'src/slm/slmprune/slmprune.cpp', |
29 | 'src/slm/sim_slmbuilder.cpp', | |
29 | 'src/slm/slmbuild/sim_slmbuilder.cpp', | |
30 | 30 | 'src/slm/tslmendian/slm_endian.cpp', |
31 | 31 | 'src/slm/tslmendian/writer.cpp', |
32 | 32 | 'src/slm/tslmendian/slm_file.cpp', |
66 | 66 | 'src/slm/ids2ngram/idngram.h', |
67 | 67 | 'src/slm/ids2ngram/idngram_merge.h', |
68 | 68 | 'src/slm/slm.h', |
69 | 'src/slm/tslmpack/arpa_slm.h', | |
70 | 'src/slm/tslmpack/common.h', | |
71 | 'src/slm/tslmpack/arpa_conv.h', | |
72 | 69 | 'src/slm/sim_dict.h', |
73 | 70 | 'src/slm/sim_sen.h', |
74 | 71 | 'src/slm/sim_slm.h', |
75 | 72 | 'src/slm/thread/ValueCompress.h', |
76 | 73 | 'src/slm/sim_fmerge.h', |
77 | 'src/slm/sim_slmbuilder.h', | |
74 | 'src/slm/slmbuild/sim_slmbuilder.h', | |
75 | 'src/slm/slmpack/arpa_slm.h', | |
76 | 'src/slm/slmpack/common.h', | |
78 | 77 | 'src/slm/tslmendian/slm_file.h', |
79 | 78 | 'src/slm/tslmendian/writer.h', |
80 | 79 | 'src/lexicon/pytrie_gen.h', |
132 | 131 | 'src/slmthread', |
133 | 132 | 'src/tslmendian', |
134 | 133 | 'src/tslminfo', |
135 | 'src/tslmpack', | |
134 | 'src/slmpack', | |
136 | 135 | 'src/genpyt', |
137 | 136 | 'src/getwordfreq', |
138 | 137 | 'src/sunpinyin-dictgen', |
149 | 148 | 'man/slmthread.1', |
150 | 149 | 'man/tslmendian.1', |
151 | 150 | 'man/tslminfo.1', |
152 | 'man/tslmpack.1', | |
151 | 'man/slmpack.1', | |
153 | 152 | 'man/genpyt.1', |
154 | 153 | 'man/getwordfreq.1', |
155 | 154 | ] |
186 | 185 | opts.Add('DATADIR', default='/usr/local/share') |
187 | 186 | opts.Add('ENABLE_PLUGINS', default=False) |
188 | 187 | |
189 | # | |
190 | #==============================environment============================== | |
188 | ||
189 | # | |
190 | # ==============================environment============================== | |
191 | 191 | # |
192 | 192 | # |
193 | 193 | def allinc(): |
194 | inc=[] | |
195 | for root, dirs, files in os.walk('src'): | |
196 | inc.append(root) | |
197 | return inc | |
194 | return [root for root, _, _ in os.walk('src')] | |
195 | ||
198 | 196 | |
def GetOS():
    """Return the operating-system name of the build host.

    This is the first field of ``platform.uname()`` (for example
    ``'Darwin'``, ``'FreeBSD'`` or ``'SunOS'``), which is exactly what
    ``platform.system()`` reports.
    """
    return platform.system()
199 | ||
201 | 200 | |
202 | 201 | def CreateEnvironment(): |
203 | 202 | make = 'make' |
204 | 203 | wget = 'wget' |
204 | w3m = 'wget -q -O -' | |
205 | 205 | tar = 'tar' |
206 | 206 | if GetOS() == 'Darwin': |
207 | 207 | wget = 'curl -O' |
208 | w3m = 'curl -s' | |
208 | 209 | elif GetOS() == 'FreeBSD': |
209 | 210 | make = 'gmake' |
210 | 211 | wget = 'fetch' |
212 | w3m = 'fetch -o -' | |
211 | 213 | elif GetOS() == 'SunOS': |
212 | 214 | make = 'gmake' |
213 | 215 | tar = 'gtar' |
214 | 216 | libln_builder = Builder(action='cd ${TARGET.dir} && ln -s ${SOURCE.name} ${TARGET.name}') |
215 | env = Environment(ENV = os.environ, CFLAGS = cflags, CXXFLAGS = cflags, | |
216 | MAKE = make, WGET = wget, TAR = tar, | |
217 | CPPPATH = ['.'] + allinc(), | |
218 | tools = ['default', 'textfile']) | |
219 | env.Append(BUILDERS = {'InstallAsSymlink': libln_builder}) | |
217 | env = Environment(ENV=os.environ, CFLAGS=cflags, CXXFLAGS='', | |
218 | MAKE=make, WGET=wget, W3M=w3m, TAR=tar, | |
219 | CPPPATH=['.'] + allinc(), | |
220 | tools=['default', 'textfile']) | |
221 | env.Append(BUILDERS={'InstallAsSymlink': libln_builder}) | |
220 | 222 | env['ENDIANNESS'] = "be" if sys.byteorder == "big" else "le" |
221 | 223 | return env |
224 | ||
222 | 225 | |
223 | 226 | def PassVariables(envvar, env): |
224 | 227 | for (x, y) in envvar: |
275 | 278 | if GetOption('rpath') is not None and GetOS() != 'Darwin': |
276 | 279 | env.MergeFlags('-Wl,-R -Wl,%s' % GetOption('rpath')) |
277 | 280 | |
278 | # | |
279 | #==============================configure================================ | |
281 | ||
282 | # | |
283 | # ==============================configure================================ | |
280 | 284 | # |
281 | 285 | def CheckPKGConfig(context, version='0.12.0'): |
282 | 286 | context.Message('Checking for pkg-config... ') |
284 | 288 | context.Result(ret) |
285 | 289 | return ret |
286 | 290 | |
291 | ||
def CheckPKG(context, name):
    """Custom configure test: ask pkg-config whether package *name* exists.

    Announces the check through ``context.Message``, runs
    ``pkg-config --exists '<name>'`` through ``context.TryAction``, reports
    the first element of that call's result through ``context.Result`` and
    returns it.
    """
    context.Message('Checking for %s... ' % name)
    command = "pkg-config --exists '%s'" % name
    status = context.TryAction(command)[0]
    context.Result(status)
    return status
297 | ||
292 | 298 | |
293 | 299 | def CheckPython(context): |
294 | 300 | context.Message('Checking for Python library...') |
299 | 305 | '!python-config --libs']) |
300 | 306 | return ret |
301 | 307 | |
308 | ||
302 | 309 | def AppendEndianCheck(conf): |
303 | 310 | conf.config_h_text += r''' |
304 | 311 | |
312 | 319 | || defined(_POWER) || defined(__powerpc__) \ |
313 | 320 | || defined(__ppc__) || defined(__hpux) || defined(__hppa) \ |
314 | 321 | || defined(_MIPSEB) || defined(_POWER) \ |
315 | || defined(__s390__) || (defined(__sh__) && defined(__BIG_ENDIAN__)) | |
322 | || defined(__s390__) || (defined(__sh__) && defined(__BIG_ENDIAN__)) \ | |
323 | || defined(__AARCH64EB__) | |
316 | 324 | # define WORDS_BIGENDIAN 1 |
317 | 325 | |
318 | 326 | #elif defined(__i386__) || defined(__i386) \ |
323 | 331 | || defined(__x86_64) || defined(__x86_64__) \ |
324 | 332 | || defined(_M_X64) || defined(__bfin__) \ |
325 | 333 | || defined(__alpha__) || defined(__ARMEL__) \ |
326 | || defined(_MIPSEL) || (defined(__sh__) && defined(__LITTLE_ENDIAN__)) | |
334 | || defined(_MIPSEL) || (defined(__sh__) && defined(__LITTLE_ENDIAN__)) \ | |
335 | || defined(__AARCH64EL__) | |
327 | 336 | # undef WORDS_BIGENDIAN |
328 | 337 | |
329 | 338 | #else |
332 | 341 | ''' |
333 | 342 | |
334 | 343 | conf = env.Configure(clean=False, help=False, config_h='config.h', |
335 | custom_tests={'CheckPKGConfig' : CheckPKGConfig, | |
336 | 'CheckPKG' : CheckPKG, | |
344 | custom_tests={'CheckPKGConfig': CheckPKGConfig, | |
345 | 'CheckPKG': CheckPKG, | |
337 | 346 | 'CheckPython': CheckPython}) |
347 | ||
338 | 348 | |
339 | 349 | def DoConfigure(): |
340 | 350 | if GetOS() == 'Darwin': |
358 | 368 | conf.CheckCHeader('assert.h') |
359 | 369 | conf.CheckFunc('bind_textdomain_codeset') |
360 | 370 | conf.CheckFunc('dcgettext') |
361 | conf.CheckCHeader('dlfcn.h') | |
362 | 371 | conf.CheckFunc('exp2') |
363 | 372 | conf.CheckCHeader('fcntl.h') |
364 | 373 | conf.CheckCHeader('getopt.h') |
377 | 386 | conf.CheckFunc('mmap') |
378 | 387 | conf.CheckFunc('munmap') |
379 | 388 | conf.CheckFunc('setlocale') |
380 | conf.CheckFunc('strndup') | |
381 | 389 | conf.CheckCHeader('sys/mman.h') |
382 | 390 | conf.CheckCHeader('sys/param.h') |
383 | 391 | conf.CheckCHeader('sys/stat.h') |
404 | 412 | DoConfigure() |
405 | 413 | |
406 | 414 | # |
407 | #==============================compile============================== | |
415 | # ==============================compile============================== | |
408 | 416 | # |
409 | 417 | env.Object(slmsource) |
410 | 418 | env.Command('src/pinyin/quanpin_trie.h', 'python/quanpin_trie_gen.py', |
415 | 423 | SConscript(['src/SConscript', 'man/SConscript', 'doc/SConscript'], exports='env') |
416 | 424 | |
417 | 425 | env.Substfile('sunpinyin-2.0.pc.in', SUBST_DICT={ |
418 | '@PREFIX@': env['PREFIX'], | |
419 | '@LIBDIR@': env['LIBDIR'], | |
420 | '@VERSION@': version, | |
421 | '@CFLAGS@': reduce(lambda a, b: a + ' ' + b, | |
422 | map(lambda x: '-I$${includedir}' + x[3:], | |
423 | allinc())), | |
424 | }) | |
426 | '@PREFIX@': env['PREFIX'], | |
427 | '@LIBDIR@': env['LIBDIR'], | |
428 | '@VERSION@': version, | |
429 | '@CFLAGS@': reduce(lambda a, b: a + ' ' + b, | |
430 | map(lambda x: '-I$${includedir}' + x[3:], | |
431 | allinc())), | |
432 | }) | |
425 | 433 | |
426 | 434 | libname_default = '%ssunpinyin%s' % (env.subst('${SHLIBPREFIX}'), |
427 | 435 | env.subst('${SHLIBSUFFIX}')) |
436 | 444 | else: |
437 | 445 | # TODO: add install_name on Darwin? |
438 | 446 | lib = env.SharedLibrary('sunpinyin', source=imesource) |
447 | ||
439 | 448 | |
440 | 449 | def DoInstall(): |
441 | 450 | lib_target = None |
8 | 8 | For developers and expert users |
9 | 9 | ------------------------------- |
10 | 10 | |
11 | Get `lm_sc.t3g.arpa.tar.bz2' and `dict.utf8.tar.bz2' from [1], | |
11 | Get `lm_sc.3gm.arpa.tar.bz2' and `dict.utf8.tar.bz2' from [1] or [2], | |
12 | 12 | unpack them into some directory, and issue the following commands in |
13 | 13 | that directory: |
14 | 14 | |
19 | 19 | # Install the generated data files (requires root permission). |
20 | 20 | make install |
21 | 21 | |
22 | [1] https://code.google.com/p/open-gram/downloads | |
22 | [1] https://open-gram.googlecode.com/git/ | |
23 | [2] http://sourceforge.net/projects/open-gram/files/ | |
23 | 24 | |
24 | 25 | -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
7 | 7 | ENDIANNESS = @ENDIANNESS@ |
8 | 8 | endif |
9 | 9 | |
10 | DICT_FILE = dict.utf8 | |
10 | SLM_TARGET = lm_sc | |
11 | 11 | |
12 | SLM_TARGET = lm_sc | |
13 | TSLM2_TEXT_FILE = ${SLM_TARGET}.t2g.arpa | |
12 | SLM2_TEXT_FILE = ${SLM_TARGET}.2gm.arpa | |
13 | SLM2_FILE = ${SLM_TARGET}.2gm | |
14 | 14 | TSLM2_ORIG_FILE = ${SLM_TARGET}.t2g.orig |
15 | 15 | TSLM2_DIST_FILE = ${SLM_TARGET}.t2g |
16 | TSLM3_TEXT_FILE = ${SLM_TARGET}.t3g.arpa | |
16 | ||
17 | SLM3_TEXT_FILE = ${SLM_TARGET}.3gm.arpa | |
18 | SLM3_FILE = ${SLM_TARGET}.3gm | |
17 | 19 | TSLM3_ORIG_FILE = ${SLM_TARGET}.t3g.orig |
18 | 20 | TSLM3_DIST_FILE = ${SLM_TARGET}.t3g |
19 | 21 | |
22 | DICT_FILE = dict.utf8 | |
20 | 23 | PYTRIE_FILE = pydict_sc.bin |
21 | 24 | PYTRIE_LOG_FILE = pydict_sc.log |
22 | 25 | |
25 | 28 | all: slm3_dist |
26 | 29 | install: slm3_install |
27 | 30 | |
31 | slm2: ${SLM2_FILE} | |
32 | ${SLM2_FILE}: ${SLM2_TEXT_FILE} ${DICT_FILE} | |
33 | slmpack $^ $@ | |
34 | ||
28 | 35 | tslm2_orig: ${TSLM2_ORIG_FILE} |
29 | ${TSLM2_ORIG_FILE}: ${DICT_FILE} ${TSLM2_TEXT_FILE} | |
30 | tslmpack ${TSLM2_TEXT_FILE} ${DICT_FILE} $@ | |
36 | ${TSLM2_ORIG_FILE}: ${SLM2_FILE} | |
37 | slmthread $^ $@ | |
31 | 38 | |
32 | 39 | tslm2_dist: ${TSLM2_DIST_FILE} |
33 | 40 | ${TSLM2_DIST_FILE}: ${TSLM2_ORIG_FILE} |
37 | 44 | genpyt -e ${ENDIANNESS} -i ${DICT_FILE} -s ${TSLM2_ORIG_FILE} \ |
38 | 45 | -l ${PYTRIE_LOG_FILE} -o ${PYTRIE_FILE} |
39 | 46 | |
47 | slm3: ${SLM3_FILE} | |
48 | ${SLM3_FILE}: ${SLM3_TEXT_FILE} ${DICT_FILE} | |
49 | slmpack $^ $@ | |
50 | ||
40 | 51 | tslm3_orig: ${TSLM3_ORIG_FILE} |
41 | ${TSLM3_ORIG_FILE}: ${DICT_FILE} ${TSLM3_TEXT_FILE} | |
42 | tslmpack ${TSLM3_TEXT_FILE} ${DICT_FILE} $@ | |
52 | ${TSLM3_ORIG_FILE}: ${SLM3_FILE} | |
53 | slmthread $^ $@ | |
43 | 54 | |
44 | 55 | tslm3_dist: ${TSLM3_DIST_FILE} |
45 | 56 | ${TSLM3_DIST_FILE}: ${TSLM3_ORIG_FILE} |
100 | 100 | rm -f ${BIGRAM_STAT} ${SLM2_RAW_FILE} |
101 | 101 | rm -f ${TRIGRAM_STAT} ${SLM3_RAW_FILE} |
102 | 102 | |
103 | mmseg_bigram: mmseg_ids tslm2_info | |
104 | mmseg_trigram: mmseg_ids tslm3_info | |
105 | slm_bigram: slm2_ids tslm2_info | |
106 | slm_trigram: slm3_ids tslm3_info | |
103 | mmseg_bigram: mmseg_ids slm2_info | |
104 | mmseg_trigram: mmseg_ids slm3_info | |
105 | slm_bigram: slm2_ids slm2_info | |
106 | slm_trigram: slm3_ids slm3_info | |
107 | 107 | |
108 | 108 | bootstrap2: |
109 | 109 | make mmseg_bigram |
0 | 0 | import os |
1 | 1 | Import('env') |
2 | 2 | |
3 | pod2man = Builder(action = 'pod2man < $SOURCE > $TARGET') | |
3 | pod2man = Builder(action = 'pod2man $SOURCE $TARGET') | |
4 | 4 | env.Append(BUILDERS = {'Man': pod2man}) |
5 | 5 | |
6 | 6 | env.Man('mmseg.1', 'mmseg.pod') |
10 | 10 | env.Man('slmbuild.1', 'slmbuild.pod') |
11 | 11 | env.Man('slmprune.1', 'slmprune.pod') |
12 | 12 | env.Man('slminfo.1', 'slminfo.pod') |
13 | env.Man('slmpack.1', 'slmpack.pod') | |
13 | 14 | env.Man('slmthread.1', 'slmthread.pod') |
14 | 15 | env.Man('tslmendian.1', 'tslmendian.pod') |
15 | 16 | env.Man('tslminfo.1', 'tslminfo.pod') |
16 | env.Man('tslmpack.1', 'tslmpack.pod') | |
17 | 17 | env.Man('genpyt.1', 'genpyt.pod') |
18 | 18 | env.Man('getwordfreq.1', 'getwordfreq.pod') |
19 | 19 |
0 | =head1 NAME | |
1 | ||
2 | slmpack - convert the ARPA format of SunPinyin back-off language model to its binary representation | |
3 | ||
4 | =head1 SYNOPSIS | |
5 | ||
6 | B<slmpack> I<arpa_file> I<dict_file> I<binary_slm_file> | |
7 | ||
8 | =head1 DESCRIPTION | |
9 | ||
10 | B<slmpack> converts the ARPA format of a threaded SunPinyin back-off | |
11 | language model to its binary representation. | |
12 | ||
13 | =head1 NOTE | |
14 | ||
15 | If you convert a language model to ARPA format using B<slminfo>, and | |
16 | then convert it back using B<slmpack>, the checksum of the generated | |
17 | binary file may differ from that of the original one. The reason | |
18 | is that the padding bits in the n-gram instances are not initialized before | |
19 | writing the data out. | |
20 | ||
21 | =head1 AUTHOR | |
22 | ||
23 | Originally written by Kov.Chai E<lt>tchaikov@gmail.comE<gt>. | |
24 | Currently maintained by Kov.Chai E<lt>tchaikov@gmail.comE<gt>. | |
25 | ||
26 | =head1 SEE ALSO | |
27 | ||
28 | B<slminfo>(1). | |
29 | ||
30 | =for comment | |
31 | -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
23 | 23 | |
24 | 24 | =item B<-i> I<input-lm-file> |
25 | 25 | |
26 | Identify the input file of convert. Generally, this file is generated by B<slmthread> or B<tslmpack>. | |
27 | ||
26 | Specify the input file to convert. Generally, this file is generated by B<slmthread>. | |
28 | 27 | |
29 | 28 | =item B<-o> I<out-lm-file> |
30 | 29 | |
45 | 44 | |
46 | 45 | =head1 SEE ALSO |
47 | 46 | |
48 | B<slmthread>(1). B<tslminfo>, B<tslmpack>. | |
47 | B<slmthread>(1), B<tslminfo>(1). | |
49 | 48 | |
50 | 49 | =for comment |
51 | 50 | -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | =head1 NAME | |
1 | ||
2 | tslmpack - convert the ARPA format of SunPinyin back-off language model to its binary representation | |
3 | ||
4 | =head1 SYNOPSIS | |
5 | ||
6 | B<tslmpack> I<arpa_file> I<dict_file> I<binary_slm_file> | |
7 | ||
8 | =head1 DESCRIPTION | |
9 | ||
10 | B<tslmpack> converts the ARPA format of a threaded SunPinyin back-off | |
11 | language model to its binary representation. | |
12 | ||
13 | =head1 NOTE | |
14 | ||
15 | If you convert a language model to ARPA format using B<tslminfo>, and | |
16 | then convert it back using B<tslmpack>, the checksum of the generated | |
17 | binary file may differ from that of the original one. The reason | |
18 | is that the padding bits in the n-gram instances are not initialized before | |
19 | writing the data out. | |
20 | ||
21 | ||
22 | =head1 AUTHOR | |
23 | ||
24 | Originally written by Kov.Chai E<lt>tchaikov@gmail.comE<gt>. | |
25 | Currently maintained by Kov.Chai E<lt>tchaikov@gmail.comE<gt>. | |
26 | ||
27 | =head1 SEE ALSO | |
28 | ||
29 | B<tslminfo>(1). | |
30 | ||
31 | =for comment | |
32 | -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | 0 | import os |
1 | ||
2 | ||
1 | 3 | Import('env') |
2 | 4 | |
3 | 5 | env.Program('mmseg', ['portability.o', 'slm/sim_dict.o', 'slm/sim_sen.o', |
4 | 6 | 'slm/mmseg/mmseg.o']) |
5 | 7 | |
6 | 8 | env.Program('slmseg', ['portability.o', 'slm/sim_dict.o', 'slm/sim_sen.o', |
7 | 'slm/slm.o', 'slm/slmseg/slmseg.o']) | |
9 | 'slm/slm.o', 'slm/slmseg/slmseg.o']) | |
8 | 10 | |
9 | 11 | env.Program('ids2ngram', ['portability.o', 'slm/ids2ngram/ids2ngram.o']) |
10 | 12 | |
11 | 13 | env.Program('idngram_merge', ['portability.o', 'slm/ids2ngram/idngram_merge.o']) |
12 | 14 | |
13 | env.Program('slmbuild', ['portability.o', 'slm/sim_slmbuilder.o', | |
15 | env.Program('slmbuild', ['portability.o', 'slm/slmbuild/sim_slmbuilder.o', | |
14 | 16 | 'slm/slmbuild/slmbuild.o']) |
15 | 17 | |
16 | 18 | env.Program('slmprune', ['portability.o', 'slm/sim_slm.o', |
17 | 19 | 'slm/slmprune/slmprune.o']) |
18 | 20 | |
19 | 21 | env.Program('slminfo', ['portability.o', 'slm/slminfo/slminfo.o']) |
22 | ||
23 | env.Program('slmpack', ['portability.o', 'slm/sim_slm.o', 'slm/slmpack/slmpack.o', | |
24 | 'slm/slmpack/arpa_slm.o']) | |
20 | 25 | |
21 | 26 | env.Program('slmthread', ['portability.o', 'slm/sim_slm.o', |
22 | 27 | 'slm/thread/ValueCompress.o', 'slm/thread/slmthread.o']) |
27 | 32 | |
28 | 33 | env.Program('tslminfo', ['portability.o', 'slm/slm.o', 'slm/tslminfo/tslminfo.o']) |
29 | 34 | |
30 | env.Program('tslmpack', ['portability.o', 'slm/slm.o', | |
31 | 'slm/thread/ValueCompress.o', 'slm/tslmpack/slmpack.o', | |
32 | 'slm/tslmpack/arpa_conv.o', 'slm/tslmpack/arpa_slm.o']) | |
33 | ||
34 | 35 | env.Program('genpyt', ['portability.o', 'slm/slm.o', 'slm/tslmendian/writer.o', |
35 | 'lexicon/trie_writer.o', 'lexicon/genpyt.o', | |
36 | 'lexicon/pytrie.o', 'lexicon/pytrie_gen.o', | |
37 | 'pinyin/pinyin_data.o']) | |
36 | 'lexicon/trie_writer.o', 'lexicon/genpyt.o', | |
37 | 'lexicon/pytrie.o', 'lexicon/pytrie_gen.o', | |
38 | 'pinyin/pinyin_data.o']) | |
38 | 39 | |
39 | 40 | env.Program('getwordfreq', ['portability.o', 'slm/slm.o', |
40 | 41 | 'slm/getwordfreq/getwordfreq.o']) |
41 | 42 | |
42 | 43 | env.Program('testvc', ['slm/thread/ValueCompress.o', 'slm/thread/test_vc.o']) |
43 | 44 | |
44 | env.Substfile('sunpinyin-dictgen.mk.in', SUBST_DICT = { | |
45 | env.Substfile('sunpinyin-dictgen.mk.in', SUBST_DICT={ | |
45 | 46 | '@MAKE@': env['MAKE'], |
46 | 47 | '@TAR@': env['TAR'], |
47 | 48 | '@WGET@': env['WGET'], |
49 | '@W3M@': env['W3M'], | |
48 | 50 | '@DATADIR@': env['DATADIR'], |
49 | 51 | '@ENDIANNESS@': env['ENDIANNESS'], |
50 | }) | |
52 | }) | |
51 | 53 | env.Command('sunpinyin-dictgen', 'sunpinyin-dictgen.mk', [ |
52 | Copy("$TARGET", "$SOURCE"), | |
53 | Chmod("$TARGET", 0755), | |
54 | ]) | |
54 | Copy("$TARGET", "$SOURCE"), | |
55 | Chmod("$TARGET", 0755), | |
56 | ]) | |
55 | 57 | |
56 | 58 | # -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
34 | 34 | * to such option by the copyright holder. |
35 | 35 | */ |
36 | 36 | |
37 | #ifdef MACOSX | |
38 | #include <Python/Python.h> | |
39 | #else | |
37 | 40 | #include <Python.h> |
41 | #endif | |
42 | ||
38 | 43 | #include <signal.h> |
39 | 44 | #include <sstream> |
40 | 45 |
193 | 193 | && !m_pIC->isEmpty()) { |
194 | 194 | changeMasks |= KEYEVENT_USED; |
195 | 195 | if (m_candiPageFirst > 0) { |
196 | m_candiPageFirst -= m_candiWindowSize; | |
197 | if (m_candiPageFirst < 0) m_candiPageFirst = 0; | |
196 | if (m_candiPageFirst > m_candiWindowSize) { | |
197 | m_candiPageFirst -= m_candiWindowSize; | |
198 | } else { | |
199 | m_candiPageFirst = 0; | |
200 | } | |
198 | 201 | changeMasks |= CANDIDATE_MASK; |
199 | 202 | } |
200 | 203 | } else if (((modifiers == 0 && keycode == IM_VK_PAGE_DOWN) |
61 | 61 | return true; |
62 | 62 | |
63 | 63 | if (m_start == other.m_start) |
64 | return m_len < m_len; | |
64 | return m_len < other.m_len; | |
65 | 65 | |
66 | 66 | return false; |
67 | 67 | } |
193 | 193 | const unsigned char *src = (const unsigned char*)s; |
194 | 194 | TWCHAR* dst = pwcs; |
195 | 195 | |
196 | while (dst - pwcs < n) { | |
196 | while (dst - pwcs < (ssize_t)n) { | |
197 | 197 | if (*src < 0xc0 || *src >= 0xfe) { |
198 | 198 | if (*src < 0x80) *dst++ = *src; |
199 | 199 | if (*src++ == 0) break; |
264 | 264 | return sz; |
265 | 265 | } |
266 | 266 | |
#if !defined (HAVE_STRNDUP)
// Fallback strndup() for platforms whose C library does not provide one.
//
// Returns a newly malloc()ed, NUL-terminated copy of at most n bytes of s,
// or NULL when s is NULL or the allocation fails.  The caller owns the
// result and must free() it.
extern "C" char *
strndup(const char *s, size_t n)
{
    if (!s)
        return NULL;

    // Measure at most n bytes: the source need not be NUL-terminated within
    // its first n bytes, and this also avoids the "n + 1" wrap-around that
    // would occur for n == SIZE_MAX.
    size_t len = 0;
    while (len < n && s[len] != '\0')
        ++len;

    char *p = (char*)malloc(len + 1);
    if (!p)
        return NULL;

    memcpy(p, s, len);
    p[len] = '\0';

    return p;
}
#endif //HAVE_STRNDUP
289 | ||
290 | 267 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
325 | 325 | } |
326 | 326 | #endif |
327 | 327 | |
328 | #if !defined (HAVE_STRNDUP) | |
329 | extern "C" char *strndup(const char *s, size_t n); | |
330 | #endif //HAVE_STRNDUP | |
331 | ||
332 | 328 | #endif |
333 | 329 | |
334 | 330 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | /* | |
1 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. | |
2 | * | |
3 | * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. | |
4 | * | |
5 | * The contents of this file are subject to the terms of either the GNU Lesser | |
6 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
7 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
8 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
9 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
10 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
11 | * specific language governing permissions and limitations under the License. When | |
12 | * distributing the software, include this License Header Notice in each file and | |
13 | * include the full text of the License in the License file as well as the | |
14 | * following notice: | |
15 | * | |
16 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
17 | * (CDDL) | |
18 | * For Covered Software in this distribution, this License shall be governed by the | |
19 | * laws of the State of California (excluding conflict-of-law provisions). | |
20 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
21 | * the Federal Courts of the Northern District of California and the state courts | |
22 | * of the State of California, with venue lying in Santa Clara County, California. | |
23 | * | |
24 | * Contributor(s): | |
25 | * | |
26 | * If you wish your version of this file to be governed by only the CDDL or only | |
27 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
28 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
29 | * license." If you don't indicate a single choice of license, a recipient has the | |
30 | * option to distribute your version of this file under either the CDDL or the LGPL | |
31 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
32 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
33 | * Version 2 license, then the option applies only if the new code is made subject | |
34 | * to such option by the copyright holder. | |
35 | */ | |
36 | ||
37 | #ifdef HAVE_CONFIG_H | |
38 | #include "config.h" | |
39 | #endif | |
40 | ||
41 | #ifdef HAVE_ASSERT_H | |
42 | #include <assert.h> | |
43 | #endif | |
44 | ||
45 | #include <stdlib.h> | |
46 | #include <math.h> | |
47 | #include <vector> | |
48 | #include <algorithm> | |
49 | ||
50 | #include "sim_slmbuilder.h" | |
51 | ||
52 | void | |
53 | CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
54 | { | |
55 | if (dis != NULL) | |
56 | delete [] dis; | |
57 | dis = new double[--n]; | |
58 | if (thres > n) thres = n; | |
59 | for (int freq = 1; freq < n; ++freq) { | |
60 | if (nr[freq] == 0 || nr[freq + 1] == 0) | |
61 | dis[freq] = 1.0; | |
62 | else | |
63 | dis[freq] = double(nr[freq + 1]) / nr[freq]; | |
64 | printf("%lf ", dis[freq]); fflush(stdout); | |
65 | } | |
66 | } | |
67 | ||
68 | double | |
69 | CSlmGTDiscounter::discount(int freq) | |
70 | { | |
71 | double newfreq = freq * ((freq < thres) ? dis[freq] : hd); | |
72 | if (newfreq >= double(freq)) | |
73 | newfreq = freq * hd; | |
74 | return newfreq; | |
75 | } | |
76 | ||
77 | void | |
78 | CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
79 | { | |
80 | // normally, c should not greater than 1.0, yet when cut-off is used, it could be so. | |
81 | if (c <= 0.0) { | |
82 | c = double(nr[1]) / (nr[1] + 2.0 * nr[2]); | |
83 | printf("parameter c=%lf", c); fflush(stdout); | |
84 | } else { | |
85 | printf("Using given parameter c=%lf", c); fflush(stdout); | |
86 | } | |
87 | } | |
88 | ||
89 | double | |
90 | CSlmAbsoluteDiscounter::discount(int freq) | |
91 | { | |
92 | return (freq > 0) ? (freq - c) : (0.0); | |
93 | } | |
94 | ||
95 | void | |
96 | CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
97 | { | |
98 | if (dis <= 0.0 || dis >= 1.0) { | |
99 | dis = 1.0 - double(nr[1]) / nr[0]; | |
100 | printf("parameter d=%lf", dis); fflush(stdout); | |
101 | } else { | |
102 | printf("Using given parameter d=%lf", dis); fflush(stdout); | |
103 | } | |
104 | } | |
105 | ||
106 | double | |
107 | CSlmLinearDiscounter::discount(int freq) | |
108 | { | |
109 | return freq * dis; | |
110 | } | |
111 | ||
112 | // n=1 for unigram, n=2 for bigram; | |
113 | // level[0] is for psuedo 0 gram, ... | |
114 | void | |
115 | CSlmBuilder::Create(int n) | |
116 | { | |
117 | assert(n != 0); | |
118 | nlevel = n; | |
119 | level = new void * [n + 1]; | |
120 | for (int i = 0; i < n; ++i) { | |
121 | level[i] = new std::vector<TNode>; | |
122 | if (i) ((TNodeLevel*)level[i])->reserve(1024); | |
123 | } | |
124 | //Add leaf level | |
125 | level[n] = new std::vector<TLeaf>; | |
126 | ((TLeafLevel*)level[n])->reserve(1024); | |
127 | ||
128 | //Add psuedo root node | |
129 | ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0)); | |
130 | ||
131 | //Initialize the nr[n+1][SLM_MAX_R] 2-D array | |
132 | nr = new FREQ_TYPE[n + 1][SLM_MAX_R]; | |
133 | for (int lvl = 0; lvl < n + 1; ++lvl) | |
134 | for (int r = 0; r < SLM_MAX_R; ++r) | |
135 | nr[lvl][r] = 0; | |
136 | } | |
137 | ||
138 | void | |
139 | CSlmBuilder::SetCut(FREQ_TYPE threshold[]) | |
140 | { | |
141 | if (cut != NULL) | |
142 | delete [] cut; | |
143 | cut = new FREQ_TYPE[nlevel + 1]; | |
144 | for (int i = 0; i < nlevel; ++i) | |
145 | cut[i + 1] = threshold[i]; | |
146 | } | |
147 | ||
148 | void | |
149 | CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[]) | |
150 | { | |
151 | if (discounter != NULL) | |
152 | delete [] discounter; | |
153 | discounter = new CSlmDiscounter* [nlevel + 1]; | |
154 | for (int i = 0; i < nlevel; ++i) | |
155 | discounter[i + 1] = dis[i]; | |
156 | } | |
157 | ||
158 | void | |
159 | CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[]) | |
160 | { | |
161 | breaker.clear(); | |
162 | for (int i = 0; i < nId; ++i) | |
163 | breaker.push_back(brks[i]); | |
164 | std::make_heap(breaker.begin(), breaker.end()); | |
165 | std::sort_heap(breaker.begin(), breaker.end()); | |
166 | } | |
167 | ||
168 | void | |
169 | CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[]) | |
170 | { | |
171 | m_excludes.clear(); | |
172 | for (int i = 0; i < nId; ++i) | |
173 | m_excludes.push_back(excludes[i]); | |
174 | std::make_heap(m_excludes.begin(), m_excludes.end()); | |
175 | std::sort_heap(m_excludes.begin(), m_excludes.end()); | |
176 | } | |
177 | ||
178 | bool | |
179 | CSlmBuilder::isBreakId(TSIMWordId id) | |
180 | { | |
181 | return std::binary_search(breaker.begin(), breaker.end(), id); | |
182 | } | |
183 | ||
184 | bool | |
185 | CSlmBuilder::isExcludeId(TSIMWordId id) | |
186 | { | |
187 | return std::binary_search(m_excludes.begin(), m_excludes.end(), id); | |
188 | } | |
189 | ||
190 | void | |
191 | CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr) | |
192 | { | |
193 | int ch; | |
194 | bool brk = isExcludeId(*ngram); | |
195 | ||
196 | for (int i = 1; i < nlevel; ++i) { | |
197 | TNodeLevel* pnl = (TNodeLevel*)(level[i]); | |
198 | if (pnl->capacity() == pnl->size()) { | |
199 | size_t newsz = 2 * pnl->capacity(); | |
200 | if (pnl->capacity() > 1024 * 1024) | |
201 | newsz = pnl->capacity() + 1024 * 1024; | |
202 | pnl->reserve(newsz); | |
203 | } | |
204 | } | |
205 | TLeafLevel* pll = (TLeafLevel*)(level[nlevel]); | |
206 | if (pll->capacity() == pll->size()) { | |
207 | size_t newsz = 2 * pll->capacity(); | |
208 | if (pll->capacity() > 1024 * 1024) | |
209 | newsz = pll->capacity() + 1024 * 1024; | |
210 | pll->reserve(newsz); | |
211 | } | |
212 | ||
213 | if (!brk) | |
214 | (*(TNodeLevel*)(level[0]))[0].freq += fr; | |
215 | ||
216 | bool branch = false; | |
217 | for (int i = 1; (!brk && i < nlevel); ++i) { | |
218 | std::vector<TNode> & pv = *(TNodeLevel*)(level[i - 1]); | |
219 | std::vector<TNode> & v = *(TNodeLevel*)(level[i]); | |
220 | branch = branch || (pv.back().child >= (int) v.size()) || | |
221 | (v.back().id != ngram[i - 1]); | |
222 | if (branch) { | |
223 | if (i == nlevel - 1) | |
224 | ch = ((TLeafLevel*)(level[i + 1]))->size(); | |
225 | else | |
226 | ch = ((TNodeLevel*)(level[i + 1]))->size(); | |
227 | v.push_back(TNode(ngram[i - 1], ch, fr)); | |
228 | } else { | |
229 | v.back().freq += fr; | |
230 | } | |
231 | brk = (i > 1 && isBreakId(ngram[i - 1])) || isExcludeId(ngram[i]); | |
232 | } | |
233 | ||
234 | // Insert to the leaf level | |
235 | if (!brk) { | |
236 | if (fr > cut[nlevel]) { | |
237 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
238 | v.push_back(TLeaf(ngram[nlevel - 1], fr)); | |
239 | } else { | |
240 | nr[nlevel][0] += fr; | |
241 | nr[nlevel][fr] += fr; | |
242 | } | |
243 | } | |
244 | } | |
245 | ||
246 | void | |
247 | CSlmBuilder::CountNr() | |
248 | { | |
249 | printf("\nCounting Nr..."); fflush(stdout); | |
250 | for (int lvl = 1; lvl < nlevel; ++lvl) { | |
251 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
252 | for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) { | |
253 | FREQ_TYPE freq = it->freq; | |
254 | nr[lvl][0] += freq; | |
255 | if (freq < (int) SLM_MAX_R && freq > 0) | |
256 | nr[lvl][freq] += freq; | |
257 | } | |
258 | } | |
259 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
260 | for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) { | |
261 | FREQ_TYPE freq = it->freq; | |
262 | nr[nlevel][0] += freq; | |
263 | if (freq < (int) SLM_MAX_R && freq > 0) | |
264 | nr[nlevel][freq] += freq; | |
265 | } | |
266 | printf("\n"); fflush(stdout); | |
267 | } | |
268 | ||
269 | int | |
270 | CSlmBuilder::CutLeafLevel(TNodeIterator pfirst, | |
271 | TNodeIterator plast, | |
272 | TLeafIterator chfirst, | |
273 | TLeafIterator chlast, | |
274 | int thred) | |
275 | { | |
276 | int idxfirst, idxchk; | |
277 | TLeafIterator chchk = chfirst; | |
278 | for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) { | |
279 | //do not cut item whoese 1. freq > thred; 2. psuedo tail | |
280 | if ((int) chchk->freq > thred || (chchk + 1) == chlast) { | |
281 | if (idxfirst < idxchk) | |
282 | *chfirst = *chchk; | |
283 | for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst) | |
284 | pfirst->child = idxfirst; | |
285 | ++idxfirst; | |
286 | ++chfirst; | |
287 | } | |
288 | } | |
289 | assert(pfirst == plast); | |
290 | return idxfirst; | |
291 | } | |
292 | ||
293 | int | |
294 | CSlmBuilder::CutNodeLevel(TNodeIterator pfirst, | |
295 | TNodeIterator plast, | |
296 | TNodeIterator chfirst, | |
297 | TNodeIterator chlast, | |
298 | int thred) | |
299 | { | |
300 | int idxfirst, idxchk; | |
301 | TNodeIterator chchk = chfirst; | |
302 | for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) { | |
303 | //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children | |
304 | TNodeIterator chnext = chchk + 1; | |
305 | if ((int) chchk->freq > thred || chnext == chlast || | |
306 | (chnext->child != chchk->child)) { | |
307 | if (idxfirst < idxchk) | |
308 | *chfirst = *chchk; | |
309 | for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst) | |
310 | pfirst->child = idxfirst; | |
311 | ++idxfirst; | |
312 | ++chfirst; | |
313 | } | |
314 | } | |
315 | assert(pfirst == plast); | |
316 | return idxfirst; | |
317 | } | |
318 | ||
319 | void | |
320 | CSlmBuilder::Cut() | |
321 | { | |
322 | printf("\nCuting according freq..."); fflush(stdout); | |
323 | for (int lvl = nlevel; lvl > 0; --lvl) { | |
324 | printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]); | |
325 | fflush(stdout); | |
326 | TNodeLevel& parent = *(TNodeLevel*)(level[lvl - 1]); | |
327 | if (lvl == nlevel) { | |
328 | if (cut[lvl] > 0) { | |
329 | TLeafLevel& v = *(TLeafLevel*)(level[lvl]); | |
330 | int newsize = CutLeafLevel(parent.begin(), | |
331 | parent.end(), v.begin(), | |
332 | v.end(), cut[lvl]); | |
333 | v.erase(v.begin() + newsize, v.end()); | |
334 | } | |
335 | } else { | |
336 | if (cut[lvl] > 0) { | |
337 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
338 | int newsize = CutNodeLevel(parent.begin(), | |
339 | parent.end(), v.begin(), | |
340 | v.end(), cut[lvl]); | |
341 | v.erase(v.begin() + newsize, v.end()); | |
342 | } | |
343 | } | |
344 | } | |
345 | printf("\n"); fflush(stdout); | |
346 | } | |
347 | ||
348 | void | |
349 | CSlmBuilder::AppendTails() | |
350 | { | |
351 | printf("\nAppending psuedo tail node for each level..."); fflush(stdout); | |
352 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
353 | int child_size = 0; | |
354 | if (lvl == nlevel - 1) { | |
355 | child_size = ((TLeafLevel*)(level[lvl + 1]))->size(); | |
356 | } else { | |
357 | child_size = ((TNodeLevel*)(level[lvl + 1]))->size(); | |
358 | } | |
359 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
360 | v.push_back(TNode(0x00FFFFFF, child_size, 1)); | |
361 | } | |
362 | //also make a psuedo tail node for the leaf level | |
363 | ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1)); | |
364 | printf("\n"); fflush(stdout); | |
365 | } | |
366 | ||
367 | template<class TChildLevel> | |
368 | void | |
369 | DiscountOneLevel(CSlmBuilder::TNodeLevel& v, | |
370 | TChildLevel& ch, | |
371 | CSlmDiscounter* disc, | |
372 | int bUseLogPr) | |
373 | { | |
374 | CSlmBuilder::TNodeIterator it = v.begin(); | |
375 | CSlmBuilder::TNodeIterator ite = v.begin() + (v.size() - 1); | |
376 | for (; it != ite; ++it) { //do not calc the psuedo tail item | |
377 | CSlmBuilder::TNodeIterator itnext = it + 1; | |
378 | double root_freq = it->freq; | |
379 | for (int h = it->child, t = itnext->child; h < t; ++h) { | |
380 | double pr = disc->discount(ch[h].freq) / root_freq; | |
381 | assert(pr > 0.0 && pr < 1.0); | |
382 | if (bUseLogPr) { | |
383 | ch[h].pr = CSlmBuilder::PR_TYPE(-log(pr)); | |
384 | } else { | |
385 | ch[h].pr = CSlmBuilder::PR_TYPE(pr); | |
386 | } | |
387 | } | |
388 | } | |
389 | } | |
390 | ||
391 | void | |
392 | CSlmBuilder::Discount() | |
393 | { | |
394 | printf("\nDiscounting..."); | |
395 | for (int lvl = nlevel; lvl > 0; --lvl) { | |
396 | printf("\n Initializing level %d's %s discount method: ", | |
397 | lvl, | |
398 | discounter[lvl]->getName()); | |
399 | discounter[lvl]->init(SLM_MAX_R, nr[lvl]); | |
400 | } | |
401 | printf("\n"); | |
402 | for (int lvl = nlevel - 1; lvl >= 0; --lvl) { | |
403 | printf("\n Discounting level %d ...", lvl + 1); fflush(stdout); | |
404 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
405 | if (lvl == nlevel - 1) { //its child is leaf | |
406 | TLeafLevel& ch = *(TLeafLevel*)(level[lvl + 1]); | |
407 | DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr); | |
408 | } else { | |
409 | TNodeLevel& ch = *(TNodeLevel*)(level[lvl + 1]); | |
410 | DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr); | |
411 | } | |
412 | } | |
413 | printf("\n Giving psuedo root level 0 a distribution..."); | |
414 | //make the psuedo 0-gram a equal distribution | |
415 | TNodeLevel& v0 = *(TNodeLevel*)(level[0]); | |
416 | if (bUseLogPr) { | |
417 | v0[0].pr = PR_TYPE(-log(double(1.0) / m_nWord)); | |
418 | } else { | |
419 | v0[0].pr = PR_TYPE(double(1.0) / m_nWord); | |
420 | } | |
421 | printf("\n"); fflush(stdout); | |
422 | } | |
423 | ||
424 | template<class chIterator> | |
425 | double | |
426 | CalcNodeBow(CSlmBuilder* builder, | |
427 | int lvl, | |
428 | TSIMWordId words[], | |
429 | chIterator chh, | |
430 | chIterator cht, | |
431 | int bUseLogPr) | |
432 | { | |
433 | if (chh == cht) return 1.0; | |
434 | double sumnext = 0.0, sum = 0.0; | |
435 | for (; chh < cht; ++chh) { | |
436 | if (bUseLogPr) { | |
437 | sumnext += exp(-(chh->pr)); | |
438 | } else { | |
439 | sumnext += double(chh->pr); | |
440 | } | |
441 | words[lvl + 1] = chh->id; | |
442 | sum += builder->getPr(lvl, words + 2); | |
443 | } | |
444 | assert(sumnext > 0.0 && sumnext < 1.05); | |
445 | assert(sum < 1.05 && sum > 0.0); | |
446 | //消除计算误差的影响 | |
447 | if (sumnext >= 1.0 || sum >= 1.0) { | |
448 | double bow = ((sumnext > sum) ? sumnext : sum) + 0.0001; | |
449 | bow = (bow - sumnext) / (bow - sum); | |
450 | printf( | |
451 | "\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:", | |
452 | sumnext, | |
453 | sum, | |
454 | bow, | |
455 | lvl); | |
456 | for (int i = 1; i <= lvl; ++i) | |
457 | printf("%d ", words[i]); | |
458 | return bow; | |
459 | } | |
460 | return (1.0 - sumnext) / (1.0 - sum); | |
461 | } | |
462 | ||
463 | void | |
464 | CSlmBuilder::CalcBOW() | |
465 | { | |
466 | printf("\nCalculating Back-Off Weight..."); | |
467 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
468 | printf("\n Processing level %d ", lvl); fflush(stdout); | |
469 | TNode* base[16]; //it should be lvl+1, yet some compiler does not support it | |
470 | int idx[16]; //it should be lvl+1, yet some compiler does not support it | |
471 | for (int i = 0; i <= lvl; ++i) { | |
472 | base[i] = &((*(TNodeLevel*)level[i])[0]); | |
473 | idx[i] = 0; | |
474 | } | |
475 | TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it | |
476 | int sz = ((TNodeLevel*)(level[lvl]))->size() - 1; | |
477 | printf("(%d items)...", sz + 1); fflush(stdout); | |
478 | for (; idx[lvl] < sz; ++idx[lvl]) { | |
479 | words[lvl] = base[lvl][idx[lvl]].id; | |
480 | for (int k = lvl - 1; k >= 0; --k) { | |
481 | while (base[k][idx[k] + 1].child <= idx[k + 1]) | |
482 | ++idx[k]; | |
483 | words[k] = base[k][idx[k]].id; | |
484 | } | |
485 | TNode & node = base[lvl][idx[lvl]]; | |
486 | TNode & nodenext = *((&node) + 1); | |
487 | double bow; | |
488 | if (lvl == nlevel - 1) { | |
489 | TLeaf * ch = &((*(TLeafLevel*)level[lvl + 1])[0]); | |
490 | bow = CalcNodeBow(this, | |
491 | lvl, | |
492 | words, | |
493 | ch + node.child, | |
494 | ch + nodenext.child, | |
495 | bUseLogPr); | |
496 | } else { | |
497 | TNode * ch = &((*(TNodeLevel*)level[lvl + 1])[0]); | |
498 | bow = CalcNodeBow(this, | |
499 | lvl, | |
500 | words, | |
501 | ch + node.child, | |
502 | ch + nodenext.child, | |
503 | bUseLogPr); | |
504 | } | |
505 | if (bUseLogPr) { | |
506 | node.bow = PR_TYPE(-log(bow)); | |
507 | } else { | |
508 | node.bow = PR_TYPE(bow); | |
509 | } | |
510 | } | |
511 | } | |
512 | printf("\n"); fflush(stdout); | |
513 | } | |
514 | ||
515 | double | |
516 | CSlmBuilder::getPr(int n, TSIMWordId *words) | |
517 | { | |
518 | int lvl; | |
519 | double bow = 1.0; | |
520 | void* pnode = &((*(TNodeLevel*)level[0])[0]); | |
521 | ||
522 | assert(n <= nlevel); | |
523 | ||
524 | if (n == 0) { | |
525 | if (bUseLogPr) { | |
526 | return exp(-((TNode*)pnode)->pr); | |
527 | } else { | |
528 | return ((TNode*)pnode)->pr; | |
529 | } | |
530 | } | |
531 | ||
532 | for (lvl = 0; pnode != NULL && lvl < n; ++lvl) { | |
533 | if (bUseLogPr) { | |
534 | bow = exp(-((TNode*)pnode)->bow); | |
535 | } else { | |
536 | bow = ((TNode*)pnode)->bow; | |
537 | } | |
538 | pnode = FindChild(lvl, (TNode*)pnode, words[lvl]); | |
539 | } | |
540 | ||
541 | if (pnode != NULL) { // find the whole string | |
542 | if (bUseLogPr) { | |
543 | return exp(-((TLeaf*)pnode)->pr); | |
544 | } else { | |
545 | return ((TLeaf*)pnode)->pr; | |
546 | } | |
547 | } else if (lvl == n - 1) { // only find the history | |
548 | return bow * getPr(n - 1, words + 1); | |
549 | } else { //even not find the history | |
550 | return getPr(n - 1, words + 1); | |
551 | } | |
552 | } | |
553 | ||
554 | void* | |
555 | CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id) | |
556 | { | |
557 | int chh = root->child, cht = (root + 1)->child; | |
558 | if (lvl == nlevel - 1) { | |
559 | TLeaf* pleaf = &((*(TLeafLevel*)level[lvl + 1])[0]); | |
560 | return (void*)binary_find(pleaf, chh, cht, TLeaf(id)); | |
561 | } else { | |
562 | TNode* pnode = &((*(TNodeLevel*)level[lvl + 1])[0]); | |
563 | return (void*)binary_find(pnode, chh, cht, TNode(id)); | |
564 | } | |
565 | } | |
566 | ||
567 | void | |
568 | CSlmBuilder::Build() | |
569 | { | |
570 | CountNr(); | |
571 | AppendTails(); | |
572 | Cut(); | |
573 | Discount(); | |
574 | CalcBOW(); | |
575 | } | |
576 | ||
577 | void | |
578 | CSlmBuilder::Write(FILE *out) | |
579 | { | |
580 | fwrite(&nlevel, sizeof(nlevel), 1, out); | |
581 | fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out); | |
582 | for (int lvl = 0; lvl <= nlevel; ++lvl) { | |
583 | int sz = 0; | |
584 | if (lvl == nlevel) | |
585 | sz = ((TLeafLevel*)(level[lvl]))->size(); | |
586 | else | |
587 | sz = ((TNodeLevel*)(level[lvl]))->size(); | |
588 | fwrite(&sz, sizeof(sz), 1, out); | |
589 | } | |
590 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
591 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
592 | for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) | |
593 | fwrite(&(*it), sizeof(TNode), 1, out); | |
594 | } | |
595 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
596 | for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) | |
597 | fwrite(&(*it), sizeof(TLeaf), 1, out); | |
598 | } | |
599 | ||
600 | void | |
601 | CSlmBuilder::Close(void) | |
602 | { | |
603 | if (level != NULL) { | |
604 | for (int lvl = 0; lvl <= nlevel; ++lvl) { | |
605 | if (lvl == nlevel) | |
606 | delete (TLeafLevel*)(level[lvl]); | |
607 | else | |
608 | delete (TNodeLevel*)(level[lvl]); | |
609 | } | |
610 | delete [] level; | |
611 | level = NULL; | |
612 | } | |
613 | if (cut != NULL) { | |
614 | delete [] cut; | |
615 | cut = NULL; | |
616 | } | |
617 | if (discounter != NULL) { | |
618 | for (int lvl = 1; lvl <= nlevel; ++lvl) { | |
619 | delete discounter[lvl]; | |
620 | } | |
621 | delete [] discounter; | |
622 | discounter = NULL; | |
623 | } | |
624 | if (nr != NULL) { | |
625 | delete [] nr; | |
626 | nr = NULL; | |
627 | } | |
628 | breaker.clear(); | |
629 | m_nWord = 0; | |
630 | nlevel = 0; | |
631 | } | |
632 | ||
633 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. | |
3 | * | |
4 | * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. | |
5 | * | |
6 | * The contents of this file are subject to the terms of either the GNU Lesser | |
7 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
8 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
9 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
10 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
11 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
12 | * specific language governing permissions and limitations under the License. When | |
13 | * distributing the software, include this License Header Notice in each file and | |
14 | * include the full text of the License in the License file as well as the | |
15 | * following notice: | |
16 | * | |
17 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
18 | * (CDDL) | |
19 | * For Covered Software in this distribution, this License shall be governed by the | |
20 | * laws of the State of California (excluding conflict-of-law provisions). | |
21 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
22 | * the Federal Courts of the Northern District of California and the state courts | |
23 | * of the State of California, with venue lying in Santa Clara County, California. | |
24 | * | |
25 | * Contributor(s): | |
26 | * | |
27 | * If you wish your version of this file to be governed by only the CDDL or only | |
28 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
29 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
30 | * license." If you don't indicate a single choice of license, a recipient has the | |
31 | * option to distribute your version of this file under either the CDDL or the LGPL | |
32 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
33 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
34 | * Version 2 license, then the option applies only if the new code is made subject | |
35 | * to such option by the copyright holder. | |
36 | */ | |
37 | ||
38 | #ifndef _SIM_SLM_BUILDER_H | |
39 | #define _SIM_SLM_BUILDER_H | |
40 | ||
41 | #include "../portability.h" | |
42 | ||
43 | #include "sim_slm.h" | |
44 | ||
45 | class CSlmDiscounter; | |
46 | ||
47 | class CSlmBuilder { | |
48 | public: | |
49 | static const int SLM_MAX_R = 16; | |
50 | typedef CSIMSlm::FREQ_TYPE FREQ_TYPE; | |
51 | typedef CSIMSlm::PR_TYPE PR_TYPE; | |
52 | typedef CSIMSlm::TNode TNode; | |
53 | typedef CSIMSlm::TLeaf TLeaf; | |
54 | ||
55 | public: | |
56 | CSlmBuilder() | |
57 | : nlevel(0), bUseLogPr(0), level(NULL), m_nWord(0), cut(NULL), | |
58 | discounter(NULL), nr(NULL), breaker(), m_excludes() { } | |
59 | ~CSlmBuilder() | |
60 | { Close(); } | |
61 | ||
62 | void Create(int n); | |
63 | void SetNumberOfWord(int nWord) { this->m_nWord = nWord; } | |
64 | void SetCut(FREQ_TYPE threshold[]); | |
65 | void SetDiscounter(CSlmDiscounter * dis[]); | |
66 | void SetBreakerIds(int nId, TSIMWordId brks[]); | |
67 | void SetExcludeIds(int nId, TSIMWordId excludes[]); | |
68 | void SetUseLogPr(int bUse) | |
69 | { bUseLogPr = bUse; } | |
70 | ||
71 | void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr); | |
72 | void Build(); | |
73 | void Write(FILE* out); | |
74 | void Close(); | |
75 | ||
76 | //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels) | |
77 | double getPr(int n, TSIMWordId* w); | |
78 | ||
79 | public: | |
80 | typedef std::vector<TNode> TNodeLevel; | |
81 | typedef std::vector<TLeaf> TLeafLevel; | |
82 | typedef TNodeLevel::iterator TNodeIterator; | |
83 | typedef TLeafLevel::iterator TLeafIterator; | |
84 | ||
85 | protected: | |
86 | bool isBreakId(TSIMWordId id); | |
87 | bool isExcludeId(TSIMWordId id); | |
88 | void CountNr(); | |
89 | void AppendTails(); | |
90 | void Cut(); | |
91 | void Discount(); | |
92 | void CalcBOW(); | |
93 | void*FindChild(int lvl, TNode* root, TSIMWordId id); | |
94 | int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, | |
95 | TNodeIterator chfirst, TNodeIterator chlast, int thred); | |
96 | int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, | |
97 | TLeafIterator chfirst, TLeafIterator chlast, int thred); | |
98 | ||
99 | private: | |
100 | int nlevel, bUseLogPr; | |
101 | void** level; | |
102 | //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type | |
103 | ||
104 | int m_nWord; | |
105 | FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ... | |
106 | CSlmDiscounter** discounter; // discounter[1] is for 1-gram... | |
107 | FREQ_TYPE(*nr)[SLM_MAX_R]; //nr[1][SLM_MAX_R] is for 1-gram... | |
108 | std::vector<TSIMWordId> breaker; | |
109 | std::vector<TSIMWordId> m_excludes; | |
110 | }; | |
111 | ||
112 | class CSlmDiscounter { | |
113 | public: | |
114 | virtual ~CSlmDiscounter() {} | |
115 | // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr; | |
116 | // nr[1] is number of ngram items with freq 1, ... | |
117 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0; | |
118 | ||
119 | // freq is the ngram frequence, not the conditional pr | |
120 | virtual double discount(int freq) = 0; | |
121 | virtual const char* getName() = 0; | |
122 | }; | |
123 | ||
124 | //Good-Turing discount | |
125 | class CSlmGTDiscounter : public CSlmDiscounter { | |
126 | public: | |
127 | CSlmGTDiscounter(int threshold = 10, double highfreq_discount = | |
128 | 0.95) : thres(threshold), hd(highfreq_discount), | |
129 | dis(NULL) {} | |
130 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
131 | virtual double discount(int freq); | |
132 | virtual const char* getName() | |
133 | { return "Good-Turing"; } | |
134 | protected: | |
135 | int thres; | |
136 | double hd; | |
137 | double *dis; | |
138 | }; | |
139 | ||
140 | class CSlmAbsoluteDiscounter : public CSlmDiscounter { | |
141 | public: | |
142 | CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {} | |
143 | //c == 0 mean this value should be count according to r[] | |
144 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
145 | virtual double discount(int freq); // return freq - c | |
146 | virtual const char* getName() | |
147 | { return "Absolution"; } | |
148 | protected: | |
149 | double c; | |
150 | }; | |
151 | ||
152 | class CSlmLinearDiscounter : public CSlmDiscounter { | |
153 | public: | |
154 | CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {} | |
155 | //dis == 0 mean this value should be count according to r[] | |
156 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
157 | virtual double discount(int freq); // return freq * dis | |
158 | virtual const char* getName() | |
159 | { return "Linear"; } | |
160 | protected: | |
161 | double dis; | |
162 | }; | |
163 | ||
164 | #endif | |
165 | ||
166 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | /* | |
1 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. | |
2 | * | |
3 | * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. | |
4 | * | |
5 | * The contents of this file are subject to the terms of either the GNU Lesser | |
6 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
7 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
8 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
9 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
10 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
11 | * specific language governing permissions and limitations under the License. When | |
12 | * distributing the software, include this License Header Notice in each file and | |
13 | * include the full text of the License in the License file as well as the | |
14 | * following notice: | |
15 | * | |
16 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
17 | * (CDDL) | |
18 | * For Covered Software in this distribution, this License shall be governed by the | |
19 | * laws of the State of California (excluding conflict-of-law provisions). | |
20 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
21 | * the Federal Courts of the Northern District of California and the state courts | |
22 | * of the State of California, with venue lying in Santa Clara County, California. | |
23 | * | |
24 | * Contributor(s): | |
25 | * | |
26 | * If you wish your version of this file to be governed by only the CDDL or only | |
27 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
28 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
29 | * license." If you don't indicate a single choice of license, a recipient has the | |
30 | * option to distribute your version of this file under either the CDDL or the LGPL | |
31 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
32 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
33 | * Version 2 license, then the option applies only if the new code is made subject | |
34 | * to such option by the copyright holder. | |
35 | */ | |
36 | ||
37 | #ifdef HAVE_CONFIG_H | |
38 | #include "config.h" | |
39 | #endif | |
40 | ||
41 | #ifdef HAVE_ASSERT_H | |
42 | #include <assert.h> | |
43 | #endif | |
44 | ||
45 | #include <stdlib.h> | |
46 | #include <math.h> | |
47 | #include <vector> | |
48 | #include <algorithm> | |
49 | ||
50 | #include "sim_slmbuilder.h" | |
51 | ||
52 | void | |
53 | CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
54 | { | |
55 | if (dis != NULL) | |
56 | delete [] dis; | |
57 | dis = new double[--n]; | |
58 | if (thres > n) thres = n; | |
59 | for (int freq = 1; freq < n; ++freq) { | |
60 | if (nr[freq] == 0 || nr[freq + 1] == 0) | |
61 | dis[freq] = 1.0; | |
62 | else | |
63 | dis[freq] = double(nr[freq + 1]) / nr[freq]; | |
64 | printf("%lf ", dis[freq]); fflush(stdout); | |
65 | } | |
66 | } | |
67 | ||
68 | double | |
69 | CSlmGTDiscounter::discount(int freq) | |
70 | { | |
71 | double newfreq = freq * ((freq < thres) ? dis[freq] : hd); | |
72 | if (newfreq >= double(freq)) | |
73 | newfreq = freq * hd; | |
74 | return newfreq; | |
75 | } | |
76 | ||
77 | void | |
78 | CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
79 | { | |
80 | // normally, c should not greater than 1.0, yet when cut-off is used, it could be so. | |
81 | if (c <= 0.0) { | |
82 | c = double(nr[1]) / (nr[1] + 2.0 * nr[2]); | |
83 | printf("parameter c=%lf", c); fflush(stdout); | |
84 | } else { | |
85 | printf("Using given parameter c=%lf", c); fflush(stdout); | |
86 | } | |
87 | } | |
88 | ||
89 | double | |
90 | CSlmAbsoluteDiscounter::discount(int freq) | |
91 | { | |
92 | return (freq > 0) ? (freq - c) : (0.0); | |
93 | } | |
94 | ||
95 | void | |
96 | CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) | |
97 | { | |
98 | if (dis <= 0.0 || dis >= 1.0) { | |
99 | dis = 1.0 - double(nr[1]) / nr[0]; | |
100 | printf("parameter d=%lf", dis); fflush(stdout); | |
101 | } else { | |
102 | printf("Using given parameter d=%lf", dis); fflush(stdout); | |
103 | } | |
104 | } | |
105 | ||
106 | double | |
107 | CSlmLinearDiscounter::discount(int freq) | |
108 | { | |
109 | return freq * dis; | |
110 | } | |
111 | ||
112 | // n=1 for unigram, n=2 for bigram; | |
113 | // level[0] is for psuedo 0 gram, ... | |
114 | void | |
115 | CSlmBuilder::Create(int n) | |
116 | { | |
117 | assert(n != 0); | |
118 | nlevel = n; | |
119 | level = new void * [n + 1]; | |
120 | for (int i = 0; i < n; ++i) { | |
121 | level[i] = new std::vector<TNode>; | |
122 | if (i) ((TNodeLevel*)level[i])->reserve(1024); | |
123 | } | |
124 | //Add leaf level | |
125 | level[n] = new std::vector<TLeaf>; | |
126 | ((TLeafLevel*)level[n])->reserve(1024); | |
127 | ||
128 | //Add psuedo root node | |
129 | ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0)); | |
130 | ||
131 | //Initialize the nr[n+1][SLM_MAX_R] 2-D array | |
132 | nr = new FREQ_TYPE[n + 1][SLM_MAX_R]; | |
133 | for (int lvl = 0; lvl < n + 1; ++lvl) | |
134 | for (int r = 0; r < SLM_MAX_R; ++r) | |
135 | nr[lvl][r] = 0; | |
136 | } | |
137 | ||
138 | void | |
139 | CSlmBuilder::SetCut(FREQ_TYPE threshold[]) | |
140 | { | |
141 | if (cut != NULL) | |
142 | delete [] cut; | |
143 | cut = new FREQ_TYPE[nlevel + 1]; | |
144 | for (int i = 0; i < nlevel; ++i) | |
145 | cut[i + 1] = threshold[i]; | |
146 | } | |
147 | ||
148 | void | |
149 | CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[]) | |
150 | { | |
151 | if (discounter != NULL) | |
152 | delete [] discounter; | |
153 | discounter = new CSlmDiscounter* [nlevel + 1]; | |
154 | for (int i = 0; i < nlevel; ++i) | |
155 | discounter[i + 1] = dis[i]; | |
156 | } | |
157 | ||
158 | void | |
159 | CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[]) | |
160 | { | |
161 | breaker.clear(); | |
162 | for (int i = 0; i < nId; ++i) | |
163 | breaker.push_back(brks[i]); | |
164 | std::make_heap(breaker.begin(), breaker.end()); | |
165 | std::sort_heap(breaker.begin(), breaker.end()); | |
166 | } | |
167 | ||
168 | void | |
169 | CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[]) | |
170 | { | |
171 | m_excludes.clear(); | |
172 | for (int i = 0; i < nId; ++i) | |
173 | m_excludes.push_back(excludes[i]); | |
174 | std::make_heap(m_excludes.begin(), m_excludes.end()); | |
175 | std::sort_heap(m_excludes.begin(), m_excludes.end()); | |
176 | } | |
177 | ||
178 | bool | |
179 | CSlmBuilder::isBreakId(TSIMWordId id) | |
180 | { | |
181 | return std::binary_search(breaker.begin(), breaker.end(), id); | |
182 | } | |
183 | ||
184 | bool | |
185 | CSlmBuilder::isExcludeId(TSIMWordId id) | |
186 | { | |
187 | return std::binary_search(m_excludes.begin(), m_excludes.end(), id); | |
188 | } | |
189 | ||
190 | void | |
191 | CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr) | |
192 | { | |
193 | int ch; | |
194 | bool brk = isExcludeId(*ngram); | |
195 | ||
196 | for (int i = 1; i < nlevel; ++i) { | |
197 | TNodeLevel* pnl = (TNodeLevel*)(level[i]); | |
198 | if (pnl->capacity() == pnl->size()) { | |
199 | size_t newsz = 2 * pnl->capacity(); | |
200 | if (pnl->capacity() > 1024 * 1024) | |
201 | newsz = pnl->capacity() + 1024 * 1024; | |
202 | pnl->reserve(newsz); | |
203 | } | |
204 | } | |
205 | TLeafLevel* pll = (TLeafLevel*)(level[nlevel]); | |
206 | if (pll->capacity() == pll->size()) { | |
207 | size_t newsz = 2 * pll->capacity(); | |
208 | if (pll->capacity() > 1024 * 1024) | |
209 | newsz = pll->capacity() + 1024 * 1024; | |
210 | pll->reserve(newsz); | |
211 | } | |
212 | ||
213 | if (!brk) | |
214 | (*(TNodeLevel*)(level[0]))[0].freq += fr; | |
215 | ||
216 | bool branch = false; | |
217 | for (int i = 1; (!brk && i < nlevel); ++i) { | |
218 | std::vector<TNode> & pv = *(TNodeLevel*)(level[i - 1]); | |
219 | std::vector<TNode> & v = *(TNodeLevel*)(level[i]); | |
220 | branch = branch || (pv.back().child >= (int) v.size()) || | |
221 | (v.back().id != ngram[i - 1]); | |
222 | if (branch) { | |
223 | if (i == nlevel - 1) | |
224 | ch = ((TLeafLevel*)(level[i + 1]))->size(); | |
225 | else | |
226 | ch = ((TNodeLevel*)(level[i + 1]))->size(); | |
227 | v.push_back(TNode(ngram[i - 1], ch, fr)); | |
228 | } else { | |
229 | v.back().freq += fr; | |
230 | } | |
231 | brk = (i > 1 && isBreakId(ngram[i - 1])) || isExcludeId(ngram[i]); | |
232 | } | |
233 | ||
234 | // Insert to the leaf level | |
235 | if (!brk) { | |
236 | if (fr > cut[nlevel]) { | |
237 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
238 | v.push_back(TLeaf(ngram[nlevel - 1], fr)); | |
239 | } else { | |
240 | nr[nlevel][0] += fr; | |
241 | nr[nlevel][fr] += fr; | |
242 | } | |
243 | } | |
244 | } | |
245 | ||
246 | void | |
247 | CSlmBuilder::CountNr() | |
248 | { | |
249 | printf("\nCounting Nr..."); fflush(stdout); | |
250 | for (int lvl = 1; lvl < nlevel; ++lvl) { | |
251 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
252 | for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) { | |
253 | FREQ_TYPE freq = it->freq; | |
254 | nr[lvl][0] += freq; | |
255 | if (freq < (int) SLM_MAX_R && freq > 0) | |
256 | nr[lvl][freq] += freq; | |
257 | } | |
258 | } | |
259 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
260 | for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) { | |
261 | FREQ_TYPE freq = it->freq; | |
262 | nr[nlevel][0] += freq; | |
263 | if (freq < (int) SLM_MAX_R && freq > 0) | |
264 | nr[nlevel][freq] += freq; | |
265 | } | |
266 | printf("\n"); fflush(stdout); | |
267 | } | |
268 | ||
269 | int | |
270 | CSlmBuilder::CutLeafLevel(TNodeIterator pfirst, | |
271 | TNodeIterator plast, | |
272 | TLeafIterator chfirst, | |
273 | TLeafIterator chlast, | |
274 | int thred) | |
275 | { | |
276 | int idxfirst, idxchk; | |
277 | TLeafIterator chchk = chfirst; | |
278 | for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) { | |
279 | //do not cut item whoese 1. freq > thred; 2. psuedo tail | |
280 | if ((int) chchk->freq > thred || (chchk + 1) == chlast) { | |
281 | if (idxfirst < idxchk) | |
282 | *chfirst = *chchk; | |
283 | for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst) | |
284 | pfirst->child = idxfirst; | |
285 | ++idxfirst; | |
286 | ++chfirst; | |
287 | } | |
288 | } | |
289 | assert(pfirst == plast); | |
290 | return idxfirst; | |
291 | } | |
292 | ||
293 | int | |
294 | CSlmBuilder::CutNodeLevel(TNodeIterator pfirst, | |
295 | TNodeIterator plast, | |
296 | TNodeIterator chfirst, | |
297 | TNodeIterator chlast, | |
298 | int thred) | |
299 | { | |
300 | int idxfirst, idxchk; | |
301 | TNodeIterator chchk = chfirst; | |
302 | for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) { | |
303 | //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children | |
304 | TNodeIterator chnext = chchk + 1; | |
305 | if ((int) chchk->freq > thred || chnext == chlast || | |
306 | (chnext->child != chchk->child)) { | |
307 | if (idxfirst < idxchk) | |
308 | *chfirst = *chchk; | |
309 | for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst) | |
310 | pfirst->child = idxfirst; | |
311 | ++idxfirst; | |
312 | ++chfirst; | |
313 | } | |
314 | } | |
315 | assert(pfirst == plast); | |
316 | return idxfirst; | |
317 | } | |
318 | ||
319 | void | |
320 | CSlmBuilder::Cut() | |
321 | { | |
322 | printf("\nCuting according freq..."); fflush(stdout); | |
323 | for (int lvl = nlevel; lvl > 0; --lvl) { | |
324 | printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]); | |
325 | fflush(stdout); | |
326 | TNodeLevel& parent = *(TNodeLevel*)(level[lvl - 1]); | |
327 | if (lvl == nlevel) { | |
328 | if (cut[lvl] > 0) { | |
329 | TLeafLevel& v = *(TLeafLevel*)(level[lvl]); | |
330 | int newsize = CutLeafLevel(parent.begin(), | |
331 | parent.end(), v.begin(), | |
332 | v.end(), cut[lvl]); | |
333 | v.erase(v.begin() + newsize, v.end()); | |
334 | } | |
335 | } else { | |
336 | if (cut[lvl] > 0) { | |
337 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
338 | int newsize = CutNodeLevel(parent.begin(), | |
339 | parent.end(), v.begin(), | |
340 | v.end(), cut[lvl]); | |
341 | v.erase(v.begin() + newsize, v.end()); | |
342 | } | |
343 | } | |
344 | } | |
345 | printf("\n"); fflush(stdout); | |
346 | } | |
347 | ||
348 | void | |
349 | CSlmBuilder::AppendTails() | |
350 | { | |
351 | printf("\nAppending psuedo tail node for each level..."); fflush(stdout); | |
352 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
353 | int child_size = 0; | |
354 | if (lvl == nlevel - 1) { | |
355 | child_size = ((TLeafLevel*)(level[lvl + 1]))->size(); | |
356 | } else { | |
357 | child_size = ((TNodeLevel*)(level[lvl + 1]))->size(); | |
358 | } | |
359 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
360 | v.push_back(TNode(0x00FFFFFF, child_size, 1)); | |
361 | } | |
362 | //also make a psuedo tail node for the leaf level | |
363 | ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1)); | |
364 | printf("\n"); fflush(stdout); | |
365 | } | |
366 | ||
367 | template<class TChildLevel> | |
368 | void | |
369 | DiscountOneLevel(CSlmBuilder::TNodeLevel& v, | |
370 | TChildLevel& ch, | |
371 | CSlmDiscounter* disc, | |
372 | int bUseLogPr) | |
373 | { | |
374 | CSlmBuilder::TNodeIterator it = v.begin(); | |
375 | CSlmBuilder::TNodeIterator ite = v.begin() + (v.size() - 1); | |
376 | for (; it != ite; ++it) { //do not calc the psuedo tail item | |
377 | CSlmBuilder::TNodeIterator itnext = it + 1; | |
378 | double root_freq = it->freq; | |
379 | for (int h = it->child, t = itnext->child; h < t; ++h) { | |
380 | double pr = disc->discount(ch[h].freq) / root_freq; | |
381 | assert(pr > 0.0 && pr < 1.0); | |
382 | if (bUseLogPr) { | |
383 | ch[h].pr = CSlmBuilder::PR_TYPE(-log(pr)); | |
384 | } else { | |
385 | ch[h].pr = CSlmBuilder::PR_TYPE(pr); | |
386 | } | |
387 | } | |
388 | } | |
389 | } | |
390 | ||
391 | void | |
392 | CSlmBuilder::Discount() | |
393 | { | |
394 | printf("\nDiscounting..."); | |
395 | for (int lvl = nlevel; lvl > 0; --lvl) { | |
396 | printf("\n Initializing level %d's %s discount method: ", | |
397 | lvl, | |
398 | discounter[lvl]->getName()); | |
399 | discounter[lvl]->init(SLM_MAX_R, nr[lvl]); | |
400 | } | |
401 | printf("\n"); | |
402 | for (int lvl = nlevel - 1; lvl >= 0; --lvl) { | |
403 | printf("\n Discounting level %d ...", lvl + 1); fflush(stdout); | |
404 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
405 | if (lvl == nlevel - 1) { //its child is leaf | |
406 | TLeafLevel& ch = *(TLeafLevel*)(level[lvl + 1]); | |
407 | DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr); | |
408 | } else { | |
409 | TNodeLevel& ch = *(TNodeLevel*)(level[lvl + 1]); | |
410 | DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr); | |
411 | } | |
412 | } | |
413 | printf("\n Giving psuedo root level 0 a distribution..."); | |
414 | //make the psuedo 0-gram a equal distribution | |
415 | TNodeLevel& v0 = *(TNodeLevel*)(level[0]); | |
416 | if (bUseLogPr) { | |
417 | v0[0].pr = PR_TYPE(-log(double(1.0) / m_nWord)); | |
418 | } else { | |
419 | v0[0].pr = PR_TYPE(double(1.0) / m_nWord); | |
420 | } | |
421 | printf("\n"); fflush(stdout); | |
422 | } | |
423 | ||
424 | template<class chIterator> | |
425 | double | |
426 | CalcNodeBow(CSlmBuilder* builder, | |
427 | int lvl, | |
428 | TSIMWordId words[], | |
429 | chIterator chh, | |
430 | chIterator cht, | |
431 | int bUseLogPr) | |
432 | { | |
433 | if (chh == cht) return 1.0; | |
434 | double sumnext = 0.0, sum = 0.0; | |
435 | for (; chh < cht; ++chh) { | |
436 | if (bUseLogPr) { | |
437 | sumnext += exp(-(chh->pr)); | |
438 | } else { | |
439 | sumnext += double(chh->pr); | |
440 | } | |
441 | words[lvl + 1] = chh->id; | |
442 | sum += builder->getPr(lvl, words + 2); | |
443 | } | |
444 | assert(sumnext > 0.0 && sumnext < 1.05); | |
445 | assert(sum < 1.05 && sum > 0.0); | |
446 | //消除计算误差的影响 | |
447 | if (sumnext >= 1.0 || sum >= 1.0) { | |
448 | double bow = ((sumnext > sum) ? sumnext : sum) + 0.0001; | |
449 | bow = (bow - sumnext) / (bow - sum); | |
450 | printf( | |
451 | "\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:", | |
452 | sumnext, | |
453 | sum, | |
454 | bow, | |
455 | lvl); | |
456 | for (int i = 1; i <= lvl; ++i) | |
457 | printf("%d ", words[i]); | |
458 | return bow; | |
459 | } | |
460 | return (1.0 - sumnext) / (1.0 - sum); | |
461 | } | |
462 | ||
463 | void | |
464 | CSlmBuilder::CalcBOW() | |
465 | { | |
466 | printf("\nCalculating Back-Off Weight..."); | |
467 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
468 | printf("\n Processing level %d ", lvl); fflush(stdout); | |
469 | TNode* base[16]; //it should be lvl+1, yet some compiler does not support it | |
470 | int idx[16]; //it should be lvl+1, yet some compiler does not support it | |
471 | for (int i = 0; i <= lvl; ++i) { | |
472 | base[i] = &((*(TNodeLevel*)level[i])[0]); | |
473 | idx[i] = 0; | |
474 | } | |
475 | TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it | |
476 | int sz = ((TNodeLevel*)(level[lvl]))->size() - 1; | |
477 | printf("(%d items)...", sz + 1); fflush(stdout); | |
478 | for (; idx[lvl] < sz; ++idx[lvl]) { | |
479 | words[lvl] = base[lvl][idx[lvl]].id; | |
480 | for (int k = lvl - 1; k >= 0; --k) { | |
481 | while (base[k][idx[k] + 1].child <= idx[k + 1]) | |
482 | ++idx[k]; | |
483 | words[k] = base[k][idx[k]].id; | |
484 | } | |
485 | TNode & node = base[lvl][idx[lvl]]; | |
486 | TNode & nodenext = *((&node) + 1); | |
487 | double bow; | |
488 | if (lvl == nlevel - 1) { | |
489 | TLeaf * ch = &((*(TLeafLevel*)level[lvl + 1])[0]); | |
490 | bow = CalcNodeBow(this, | |
491 | lvl, | |
492 | words, | |
493 | ch + node.child, | |
494 | ch + nodenext.child, | |
495 | bUseLogPr); | |
496 | } else { | |
497 | TNode * ch = &((*(TNodeLevel*)level[lvl + 1])[0]); | |
498 | bow = CalcNodeBow(this, | |
499 | lvl, | |
500 | words, | |
501 | ch + node.child, | |
502 | ch + nodenext.child, | |
503 | bUseLogPr); | |
504 | } | |
505 | if (bUseLogPr) { | |
506 | node.bow = PR_TYPE(-log(bow)); | |
507 | } else { | |
508 | node.bow = PR_TYPE(bow); | |
509 | } | |
510 | } | |
511 | } | |
512 | printf("\n"); fflush(stdout); | |
513 | } | |
514 | ||
515 | double | |
516 | CSlmBuilder::getPr(int n, TSIMWordId *words) | |
517 | { | |
518 | int lvl; | |
519 | double bow = 1.0; | |
520 | void* pnode = &((*(TNodeLevel*)level[0])[0]); | |
521 | ||
522 | assert(n <= nlevel); | |
523 | ||
524 | if (n == 0) { | |
525 | if (bUseLogPr) { | |
526 | return exp(-((TNode*)pnode)->pr); | |
527 | } else { | |
528 | return ((TNode*)pnode)->pr; | |
529 | } | |
530 | } | |
531 | ||
532 | for (lvl = 0; pnode != NULL && lvl < n; ++lvl) { | |
533 | if (bUseLogPr) { | |
534 | bow = exp(-((TNode*)pnode)->bow); | |
535 | } else { | |
536 | bow = ((TNode*)pnode)->bow; | |
537 | } | |
538 | pnode = FindChild(lvl, (TNode*)pnode, words[lvl]); | |
539 | } | |
540 | ||
541 | if (pnode != NULL) { // find the whole string | |
542 | if (bUseLogPr) { | |
543 | return exp(-((TLeaf*)pnode)->pr); | |
544 | } else { | |
545 | return ((TLeaf*)pnode)->pr; | |
546 | } | |
547 | } else if (lvl == n - 1) { // only find the history | |
548 | return bow * getPr(n - 1, words + 1); | |
549 | } else { //even not find the history | |
550 | return getPr(n - 1, words + 1); | |
551 | } | |
552 | } | |
553 | ||
554 | void* | |
555 | CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id) | |
556 | { | |
557 | int chh = root->child, cht = (root + 1)->child; | |
558 | if (lvl == nlevel - 1) { | |
559 | TLeaf* pleaf = &((*(TLeafLevel*)level[lvl + 1])[0]); | |
560 | return (void*)binary_find(pleaf, chh, cht, TLeaf(id)); | |
561 | } else { | |
562 | TNode* pnode = &((*(TNodeLevel*)level[lvl + 1])[0]); | |
563 | return (void*)binary_find(pnode, chh, cht, TNode(id)); | |
564 | } | |
565 | } | |
566 | ||
567 | void | |
568 | CSlmBuilder::Build() | |
569 | { | |
570 | CountNr(); | |
571 | AppendTails(); | |
572 | Cut(); | |
573 | Discount(); | |
574 | CalcBOW(); | |
575 | } | |
576 | ||
577 | void | |
578 | CSlmBuilder::Write(FILE *out) | |
579 | { | |
580 | fwrite(&nlevel, sizeof(nlevel), 1, out); | |
581 | fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out); | |
582 | for (int lvl = 0; lvl <= nlevel; ++lvl) { | |
583 | int sz = 0; | |
584 | if (lvl == nlevel) | |
585 | sz = ((TLeafLevel*)(level[lvl]))->size(); | |
586 | else | |
587 | sz = ((TNodeLevel*)(level[lvl]))->size(); | |
588 | fwrite(&sz, sizeof(sz), 1, out); | |
589 | } | |
590 | for (int lvl = 0; lvl < nlevel; ++lvl) { | |
591 | TNodeLevel& v = *(TNodeLevel*)(level[lvl]); | |
592 | for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) | |
593 | fwrite(&(*it), sizeof(TNode), 1, out); | |
594 | } | |
595 | TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); | |
596 | for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) | |
597 | fwrite(&(*it), sizeof(TLeaf), 1, out); | |
598 | } | |
599 | ||
600 | void | |
601 | CSlmBuilder::Close(void) | |
602 | { | |
603 | if (level != NULL) { | |
604 | for (int lvl = 0; lvl <= nlevel; ++lvl) { | |
605 | if (lvl == nlevel) | |
606 | delete (TLeafLevel*)(level[lvl]); | |
607 | else | |
608 | delete (TNodeLevel*)(level[lvl]); | |
609 | } | |
610 | delete [] level; | |
611 | level = NULL; | |
612 | } | |
613 | if (cut != NULL) { | |
614 | delete [] cut; | |
615 | cut = NULL; | |
616 | } | |
617 | if (discounter != NULL) { | |
618 | for (int lvl = 1; lvl <= nlevel; ++lvl) { | |
619 | delete discounter[lvl]; | |
620 | } | |
621 | delete [] discounter; | |
622 | discounter = NULL; | |
623 | } | |
624 | if (nr != NULL) { | |
625 | delete [] nr; | |
626 | nr = NULL; | |
627 | } | |
628 | breaker.clear(); | |
629 | m_nWord = 0; | |
630 | nlevel = 0; | |
631 | } | |
632 | ||
633 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. | |
3 | * | |
4 | * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. | |
5 | * | |
6 | * The contents of this file are subject to the terms of either the GNU Lesser | |
7 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
8 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
9 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
10 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
11 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
12 | * specific language governing permissions and limitations under the License. When | |
13 | * distributing the software, include this License Header Notice in each file and | |
14 | * include the full text of the License in the License file as well as the | |
15 | * following notice: | |
16 | * | |
17 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
18 | * (CDDL) | |
19 | * For Covered Software in this distribution, this License shall be governed by the | |
20 | * laws of the State of California (excluding conflict-of-law provisions). | |
21 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
22 | * the Federal Courts of the Northern District of California and the state courts | |
23 | * of the State of California, with venue lying in Santa Clara County, California. | |
24 | * | |
25 | * Contributor(s): | |
26 | * | |
27 | * If you wish your version of this file to be governed by only the CDDL or only | |
28 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
29 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
30 | * license." If you don't indicate a single choice of license, a recipient has the | |
31 | * option to distribute your version of this file under either the CDDL or the LGPL | |
32 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
33 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
34 | * Version 2 license, then the option applies only if the new code is made subject | |
35 | * to such option by the copyright holder. | |
36 | */ | |
37 | ||
38 | #ifndef _SIM_SLM_BUILDER_H | |
39 | #define _SIM_SLM_BUILDER_H | |
40 | ||
41 | #include "../../portability.h" | |
42 | ||
43 | #include "sim_slm.h" | |
44 | ||
45 | class CSlmDiscounter; | |
46 | ||
47 | class CSlmBuilder { | |
48 | public: | |
49 | static const int SLM_MAX_R = 16; | |
50 | typedef CSIMSlm::FREQ_TYPE FREQ_TYPE; | |
51 | typedef CSIMSlm::PR_TYPE PR_TYPE; | |
52 | typedef CSIMSlm::TNode TNode; | |
53 | typedef CSIMSlm::TLeaf TLeaf; | |
54 | ||
55 | public: | |
56 | CSlmBuilder() | |
57 | : nlevel(0), bUseLogPr(0), level(NULL), m_nWord(0), cut(NULL), | |
58 | discounter(NULL), nr(NULL), breaker(), m_excludes() { } | |
59 | ~CSlmBuilder() | |
60 | { Close(); } | |
61 | ||
62 | void Create(int n); | |
63 | void SetNumberOfWord(int nWord) { this->m_nWord = nWord; } | |
64 | void SetCut(FREQ_TYPE threshold[]); | |
65 | void SetDiscounter(CSlmDiscounter * dis[]); | |
66 | void SetBreakerIds(int nId, TSIMWordId brks[]); | |
67 | void SetExcludeIds(int nId, TSIMWordId excludes[]); | |
68 | void SetUseLogPr(int bUse) | |
69 | { bUseLogPr = bUse; } | |
70 | ||
71 | void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr); | |
72 | void Build(); | |
73 | void Write(FILE* out); | |
74 | void Close(); | |
75 | ||
76 | //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels) | |
77 | double getPr(int n, TSIMWordId* w); | |
78 | ||
79 | public: | |
80 | typedef std::vector<TNode> TNodeLevel; | |
81 | typedef std::vector<TLeaf> TLeafLevel; | |
82 | typedef TNodeLevel::iterator TNodeIterator; | |
83 | typedef TLeafLevel::iterator TLeafIterator; | |
84 | ||
85 | protected: | |
86 | bool isBreakId(TSIMWordId id); | |
87 | bool isExcludeId(TSIMWordId id); | |
88 | void CountNr(); | |
89 | void AppendTails(); | |
90 | void Cut(); | |
91 | void Discount(); | |
92 | void CalcBOW(); | |
93 | void*FindChild(int lvl, TNode* root, TSIMWordId id); | |
94 | int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, | |
95 | TNodeIterator chfirst, TNodeIterator chlast, int thred); | |
96 | int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, | |
97 | TLeafIterator chfirst, TLeafIterator chlast, int thred); | |
98 | ||
99 | private: | |
100 | int nlevel, bUseLogPr; | |
101 | void** level; | |
102 | //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type | |
103 | ||
104 | int m_nWord; | |
105 | FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ... | |
106 | CSlmDiscounter** discounter; // discounter[1] is for 1-gram... | |
107 | FREQ_TYPE(*nr)[SLM_MAX_R]; //nr[1][SLM_MAX_R] is for 1-gram... | |
108 | std::vector<TSIMWordId> breaker; | |
109 | std::vector<TSIMWordId> m_excludes; | |
110 | }; | |
111 | ||
112 | class CSlmDiscounter { | |
113 | public: | |
114 | virtual ~CSlmDiscounter() {} | |
115 | // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr; | |
116 | // nr[1] is number of ngram items with freq 1, ... | |
117 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0; | |
118 | ||
119 | // freq is the ngram frequence, not the conditional pr | |
120 | virtual double discount(int freq) = 0; | |
121 | virtual const char* getName() = 0; | |
122 | }; | |
123 | ||
124 | //Good-Turing discount | |
125 | class CSlmGTDiscounter : public CSlmDiscounter { | |
126 | public: | |
127 | CSlmGTDiscounter(int threshold = 10, double highfreq_discount = | |
128 | 0.95) : thres(threshold), hd(highfreq_discount), | |
129 | dis(NULL) {} | |
130 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
131 | virtual double discount(int freq); | |
132 | virtual const char* getName() | |
133 | { return "Good-Turing"; } | |
134 | protected: | |
135 | int thres; | |
136 | double hd; | |
137 | double *dis; | |
138 | }; | |
139 | ||
140 | class CSlmAbsoluteDiscounter : public CSlmDiscounter { | |
141 | public: | |
142 | CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {} | |
143 | //c == 0 mean this value should be count according to r[] | |
144 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
145 | virtual double discount(int freq); // return freq - c | |
146 | virtual const char* getName() | |
147 | { return "Absolution"; } | |
148 | protected: | |
149 | double c; | |
150 | }; | |
151 | ||
152 | class CSlmLinearDiscounter : public CSlmDiscounter { | |
153 | public: | |
154 | CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {} | |
155 | //dis == 0 mean this value should be count according to r[] | |
156 | virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); | |
157 | virtual double discount(int freq); // return freq * dis | |
158 | virtual const char* getName() | |
159 | { return "Linear"; } | |
160 | protected: | |
161 | double dis; | |
162 | }; | |
163 | ||
164 | #endif | |
165 | ||
166 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
55 | 55 | #include <vector> |
56 | 56 | #include <algorithm> |
57 | 57 | |
58 | #include "../sim_slmbuilder.h" | |
58 | #include "sim_slmbuilder.h" | |
59 | 59 | |
60 | 60 | static struct option long_options[] = |
61 | 61 | { |
246 | 246 | ++nItems; |
247 | 247 | } |
248 | 248 | fclose(fp); |
249 | delete ngram; | |
249 | delete[] ngram; | |
250 | 250 | printf("%d ngrams.\n", nItems); fflush(stdout); |
251 | 251 | |
252 | 252 | builder.Build(); |
115 | 115 | |
116 | 116 | typedef std::map<TSIMWordId, std::string> TReverseLexicon; |
117 | 117 | |
// Convert a probability between its plain form and its -log() form.
//   input       the stored value
//   input_log   true when `input` is -log(pr)
//   output_log  true when the caller wants -log(pr)
// The value is returned unchanged when both representations agree.
double log_conv(double input, bool input_log, bool output_log) {
    if (input_log == output_log)
        return input;
    if (!input_log)
        return -log(input);     // plain -> -log
    return exp(-input);         // -log -> plain
}
123 | ||
118 | 124 | void |
119 | 125 | PrintARPALevel(int lvl, FILE* fp, TReverseLexicon* plexicon, bool output_log_pr) |
120 | 126 | { |
154 | 160 | } |
155 | 161 | } |
156 | 162 | |
157 | printf("/%d-gram:%d/\n", lvl, sz[lvl] - 1); | |
163 | printf("\\%d-gram\\%d\n", lvl, sz[lvl] - 1); | |
158 | 164 | while (idx[lvl] < sz[lvl] - 1) { |
159 | for (int i = lvl - 1; i > 0; --i) { | |
165 | if (lvl > 0) for (int i = lvl - 1; i > 0; --i) { | |
160 | 166 | bool change = false; |
161 | 167 | while (nodes[i][1].child <= idx[i + 1]) { |
162 | 168 | change = true; |
177 | 183 | else |
178 | 184 | printf("%d ", int(word_id)); |
179 | 185 | } |
180 | if (bLogPrFile) { | |
181 | if (output_log_pr) | |
182 | printf("%20.17lf ", double(nodes[lvl][0].pr)); | |
183 | else | |
184 | printf("%20.17lf ", exp(-double(nodes[lvl][0].pr))); | |
185 | if (lvl != N) { | |
186 | if (output_log_pr) | |
187 | printf("%20.17lf", double(nodes[lvl][0].bow)); | |
188 | else | |
189 | printf("%20.17lf", exp(-double(nodes[lvl][0].bow))); | |
190 | } | |
191 | } else { | |
192 | if (output_log_pr) | |
193 | printf("%20.17lf ", -log(double(nodes[lvl][0].pr))); | |
194 | else | |
195 | printf("%20.17lf ", double(nodes[lvl][0].pr)); | |
196 | if (lvl != N) { | |
197 | if (output_log_pr) | |
198 | printf("%20.17lf", -log(double(nodes[lvl][0].bow))); | |
199 | else | |
200 | printf("%20.17lf", double(nodes[lvl][0].bow)); | |
201 | } | |
202 | } | |
186 | printf("%20.17lf", | |
187 | log_conv(nodes[lvl][0].pr, bLogPrFile, output_log_pr)); | |
188 | if (lvl != N) printf(" %20.17lf", | |
189 | log_conv(nodes[lvl][0].bow, bLogPrFile, output_log_pr)); | |
203 | 190 | printf("\n"); |
204 | 191 | |
205 | 192 | ++idx[lvl]; |
248 | 235 | } |
249 | 236 | fseek(fp, 0, SEEK_SET); |
250 | 237 | fread(&N, sizeof(N), 1, fp); |
251 | for (int lvl = 1; lvl <= N; ++lvl) | |
238 | for (int lvl = 0; lvl <= N; ++lvl) | |
252 | 239 | PrintARPALevel(lvl, fp, plexicon, output_log_pr); |
253 | 240 | } |
254 | 241 |
0 | /* | |
1 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
2 | * | |
3 | * The contents of this file are subject to the terms of either the GNU Lesser | |
4 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
5 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
6 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
7 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
8 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
9 | * specific language governing permissions and limitations under the License. When | |
10 | * distributing the software, include this License Header Notice in each file and | |
11 | * include the full text of the License in the License file as well as the | |
12 | * following notice: | |
13 | * | |
14 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
15 | * (CDDL) | |
16 | * For Covered Software in this distribution, this License shall be governed by the | |
17 | * laws of the State of California (excluding conflict-of-law provisions). | |
18 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
19 | * the Federal Courts of the Northern District of California and the state courts | |
20 | * of the State of California, with venue lying in Santa Clara County, California. | |
21 | * | |
22 | * Contributor(s): | |
23 | * | |
24 | * If you wish your version of this file to be governed by only the CDDL or only | |
25 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
26 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
27 | * license." If you don't indicate a single choice of license, a recipient has the | |
28 | * option to distribute your version of this file under either the CDDL or the LGPL | |
29 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
30 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
31 | * Version 2 license, then the option applies only if the new code is made subject | |
32 | * to such option by the copyright holder. | |
33 | */ | |
34 | #include <string> | |
35 | #include <iostream> | |
36 | #include <fstream> | |
37 | #include <algorithm> | |
38 | #include "arpa_slm.h" | |
39 | ||
40 | using namespace std; | |
41 | ||
/**
 * Split one n-gram line into its word field and its numeric field.
 *
 * The words of an n-gram are separated from each other by a single space
 * (that is what load_words() splits on), so the word field has to end at a
 * longer separator: two consecutive spaces. The buffer is modified in
 * place; the separator is overwritten with a terminator.
 *
 * @param buf  the line, modified in place
 * @param next receives a pointer to the first character after the separator
 * @return the word field (the start of buf)
 *
 * Prints a message and exits with status 2 when the separator is missing.
 */
char*
getwords(char* buf, char** next)
{
    // keep the searched separator and the number of skipped characters
    // in one place so that they cannot disagree
    static const char separator[] = "  ";
    const size_t separator_len = sizeof(separator) - 1;

    char* delim = strstr(buf, separator);
    if (delim == NULL) {
        std::cerr << "Unknown format in: " << buf << "." << std::endl;
        exit(2);
    }
    *delim = '\0';
    *next = delim + separator_len;
    return buf;
}
55 | ||
56 | unsigned | |
57 | get_wid(const char* word, const TLexicon& lexicon) | |
58 | { | |
59 | TLexicon::const_iterator lexi = lexicon.find(word); | |
60 | unsigned wid; | |
61 | if (lexi != lexicon.end()) { | |
62 | wid = lexi->second; | |
63 | } else { | |
64 | cerr << "Error:\"" << word << "\" not found in lexicon." << endl; | |
65 | wid = 0; | |
66 | } | |
67 | return wid; | |
68 | } | |
69 | ||
70 | int | |
71 | CArpaSlm::TLeaf::load_words(char* buf, const TLexicon& lexicon) | |
72 | { | |
73 | int nword = 0; | |
74 | char* word, *end; | |
75 | for (word = end = buf; *end != 0; ++end) { | |
76 | if (*end == ' ') { | |
77 | assert(nword < N_GRAM); | |
78 | *end = 0; | |
79 | hw[nword++] = get_wid(word, lexicon); | |
80 | word = end + 1; | |
81 | } | |
82 | } | |
83 | if (buf != end) { | |
84 | wid = hw[nword++] = get_wid(word, lexicon); | |
85 | } | |
86 | return nword; | |
87 | } | |
88 | ||
89 | void | |
90 | CArpaSlm::TLeaf::load(istream& is, const TLexicon& lexicon) | |
91 | { | |
92 | char buf[1024]; | |
93 | is.getline(buf, sizeof(buf)); | |
94 | char* next = 0; | |
95 | char* words = getwords(buf, &next); | |
96 | load_words(words, lexicon); | |
97 | sscanf(next, "%f", &pr); | |
98 | } | |
99 | ||
100 | void | |
101 | CArpaSlm::TNode::load(istream& is, const TLexicon& lexicon) | |
102 | { | |
103 | char buf[1024]; | |
104 | is.getline(buf, sizeof(buf)); | |
105 | char* next = 0; | |
106 | char* words = getwords(buf, &next); | |
107 | load_words(words, lexicon); | |
108 | sscanf(next, "%f %f", &pr, &bow); | |
109 | } | |
110 | ||
111 | void | |
112 | CArpaSlm::TNode::load_level0(istream& is) | |
113 | { | |
114 | hw[0] = 0; | |
115 | char buf[1024]; | |
116 | is.getline(buf, sizeof(buf)); | |
117 | sscanf(buf, "%f %f", &pr, &bow); | |
118 | wid = 0; | |
119 | } | |
120 | ||
121 | void | |
122 | CArpaSlm::load(const char* filename, const TLexicon& lexicon) | |
123 | { | |
124 | printf("Loading ARPA slm..."); fflush(stdout); | |
125 | ifstream file(filename); | |
126 | char buf[1024]; | |
127 | for (int i = 0; i <= N_GRAM; ++i) { | |
128 | unsigned lvl; | |
129 | int size; | |
130 | file.getline(buf, sizeof(buf)); | |
131 | if (!file) { | |
132 | cerr << "Failed to read from" << filename << endl; | |
133 | exit(1); | |
134 | } | |
135 | sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size); | |
136 | assert(lvl <= N_GRAM); | |
137 | if (lvl == 0) { | |
138 | TNode node0; | |
139 | node0.load_level0(file); | |
140 | m_levels[0].push_back(node0); | |
141 | } else if (lvl < m_N) { | |
142 | m_levels[lvl].reserve(size); | |
143 | for (int i = 0; i < size; ++i) { | |
144 | TNode node; | |
145 | node.load(file, lexicon); | |
146 | m_levels[lvl].push_back(node); | |
147 | } | |
148 | } else { | |
149 | // leaf nodes | |
150 | m_lastLevel.reserve(size); | |
151 | for (int i = 0; i < size; ++i) { | |
152 | TLeaf leaf; | |
153 | leaf.load(file, lexicon); | |
154 | m_lastLevel.push_back(leaf); | |
155 | } | |
156 | } | |
157 | } | |
158 | } | |
159 | ||
160 | template <class NodeT> | |
161 | struct CompareNode { | |
162 | const unsigned m_lvl; | |
163 | CompareNode(unsigned lvl) : m_lvl(lvl) | |
164 | { | |
165 | } | |
166 | /** | |
167 | * @return true if strictly less, false otherwise | |
168 | */ | |
169 | bool | |
170 | operator ()(const NodeT& node, const TSIMWordId hw[N_GRAM]) | |
171 | { | |
172 | for (unsigned i = 0; i < m_lvl; ++i) { | |
173 | if (node.hw[i] < hw[i]) | |
174 | return true; | |
175 | if (node.hw[i] > hw[i]) | |
176 | return false; | |
177 | } | |
178 | // node.hw[:lvl] is the same as hw[:] | |
179 | return false; | |
180 | } | |
181 | }; | |
182 | ||
183 | void | |
184 | CArpaSlm::initChild() | |
185 | { | |
186 | { | |
187 | TNode& node = m_levels[0][0]; | |
188 | node.child = 0; | |
189 | } | |
190 | for (unsigned lvl = 1; lvl < m_N; ++lvl) { | |
191 | TNodeLevel& level = m_levels[lvl]; | |
192 | unsigned last_child = 0; | |
193 | for (TNodeLevel::iterator node = level.begin(); | |
194 | node != level.end(); | |
195 | ++node) { | |
196 | node->child = last_child = find_1st_child(lvl, *node, last_child); | |
197 | } | |
198 | } | |
199 | } | |
200 | ||
201 | unsigned | |
202 | CArpaSlm::find_1st_child(unsigned lvl, const TNode& node, int last_child) | |
203 | { | |
204 | assert(lvl < m_N); | |
205 | if (lvl == m_N - 1) { | |
206 | TLeafLevel::iterator found = lower_bound( | |
207 | m_lastLevel.begin(), m_lastLevel.end(), node.hw, | |
208 | CompareNode<TLeaf>(lvl)); | |
209 | return distance(m_lastLevel.begin(), found); | |
210 | } else { | |
211 | const TNodeLevel& level = m_levels[lvl + 1]; | |
212 | TNodeLevel::const_iterator found = lower_bound(level.begin(), level.end( | |
213 | ), node.hw, | |
214 | CompareNode<TNode>(lvl)); | |
215 | return distance(level.begin(), found); | |
216 | } | |
217 | } | |
218 | ||
219 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
3 | * | |
4 | * The contents of this file are subject to the terms of either the GNU Lesser | |
5 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
6 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
7 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
8 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
9 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
10 | * specific language governing permissions and limitations under the License. When | |
11 | * distributing the software, include this License Header Notice in each file and | |
12 | * include the full text of the License in the License file as well as the | |
13 | * following notice: | |
14 | * | |
15 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
16 | * (CDDL) | |
17 | * For Covered Software in this distribution, this License shall be governed by the | |
18 | * laws of the State of California (excluding conflict-of-law provisions). | |
19 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
20 | * the Federal Courts of the Northern District of California and the state courts | |
21 | * of the State of California, with venue lying in Santa Clara County, California. | |
22 | * | |
23 | * Contributor(s): | |
24 | * | |
25 | * If you wish your version of this file to be governed by only the CDDL or only | |
26 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
27 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
28 | * license." If you don't indicate a single choice of license, a recipient has the | |
29 | * option to distribute your version of this file under either the CDDL or the LGPL | |
30 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
31 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
32 | * Version 2 license, then the option applies only if the new code is made subject | |
33 | * to such option by the copyright holder. | |
34 | */ | |
35 | #ifndef _ARPA_SLM_H | |
36 | #define _ARPA_SLM_H | |
37 | ||
38 | #include <istream> | |
39 | #include "common.h" | |
40 | ||
41 | using std::istream; | |
42 | ||
43 | #define N_GRAM (3) | |
44 | ||
45 | ||
46 | /* the ARPA style representation of sunpinyin's SLM */ | |
47 | class CArpaSlm { | |
48 | public: | |
49 | struct TLeaf { | |
50 | TSIMWordId hw[N_GRAM]; | |
51 | TSIMWordId wid; | |
52 | float pr; | |
53 | void load(istream&, const TLexicon&); | |
54 | int load_words(char* buf, const TLexicon& lexicon); | |
55 | TLeaf() : wid(0), pr(.0) {} | |
56 | }; | |
57 | ||
58 | struct TNode : public TLeaf { | |
59 | int child; | |
60 | float bow; | |
61 | void load(istream&, const TLexicon&); | |
62 | void load_level0(istream&); | |
63 | }; | |
64 | ||
65 | typedef std::vector<TNode> TNodeLevel; | |
66 | typedef std::vector<TLeaf> TLeafLevel; | |
67 | ||
68 | private: | |
69 | TNodeLevel m_levels[N_GRAM + 1]; /* [0..N_GRAM] */ | |
70 | TLeafLevel m_lastLevel; | |
71 | const bool m_usingLogPr; | |
72 | const unsigned m_N; | |
73 | ||
74 | public: | |
75 | /* XXX, ARPA file does not provide these information. | |
76 | so we assume this SLM is trigram, and does not use LogPr */ | |
77 | CArpaSlm() : m_usingLogPr(false), m_N(N_GRAM) {} | |
78 | bool good() const { return m_levels[0].size() != 0; } | |
79 | unsigned getN() const { return m_N; } | |
80 | bool usingLogPr() const { return m_usingLogPr; } | |
81 | const TNodeLevel& getLevel(unsigned lvl) const { return m_levels[lvl]; } | |
82 | const TLeafLevel& getLastLevel() const { return m_lastLevel; } | |
83 | unsigned getLevelSize(unsigned lvl) const { | |
84 | assert(lvl <= m_N); | |
85 | if (lvl < m_N) { | |
86 | return m_levels[lvl].size(); | |
87 | } else { | |
88 | return m_lastLevel.size(); | |
89 | } | |
90 | } | |
91 | void initChild(); | |
92 | void load(const char* filename, const TLexicon& lexicon); | |
93 | ||
94 | private: | |
95 | /** | |
96 | * find out the first child of a given node in its next level | |
97 | * @param lvl the level where node belongs to | |
98 | * @param node the node | |
99 | * @param last_child the child index of previous node | |
100 | * @return the index of the found child | |
101 | */ | |
102 | unsigned find_1st_child(unsigned lvl, const TNode& node, int last_child); | |
103 | }; | |
104 | ||
105 | #endif //_ARPA_SLM_H | |
106 | ||
107 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
3 | * | |
4 | * The contents of this file are subject to the terms of either the GNU Lesser | |
5 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
6 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
7 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
8 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
9 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
10 | * specific language governing permissions and limitations under the License. When | |
11 | * distributing the software, include this License Header Notice in each file and | |
12 | * include the full text of the License in the License file as well as the | |
13 | * following notice: | |
14 | * | |
15 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
16 | * (CDDL) | |
17 | * For Covered Software in this distribution, this License shall be governed by the | |
18 | * laws of the State of California (excluding conflict-of-law provisions). | |
19 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
20 | * the Federal Courts of the Northern District of California and the state courts | |
21 | * of the State of California, with venue lying in Santa Clara County, California. | |
22 | * | |
23 | * Contributor(s): | |
24 | * | |
25 | * If you wish your version of this file to be governed by only the CDDL or only | |
26 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
27 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
28 | * license." If you don't indicate a single choice of license, a recipient has the | |
29 | * option to distribute your version of this file under either the CDDL or the LGPL | |
30 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
31 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
32 | * Version 2 license, then the option applies only if the new code is made subject | |
33 | * to such option by the copyright holder. | |
34 | */ | |
35 | #ifndef _SLM_PACK_COMMON_H | |
36 | #define _SLM_PACK_COMMON_H | |
37 | ||
38 | #include <vector> | |
39 | #include <map> | |
40 | #include <string> | |
41 | #include <cmath> | |
42 | #include <cassert> | |
43 | ||
44 | #include "../slm.h" | |
45 | ||
46 | typedef std::vector<CThreadSlm::TNode> TNodeLevel; | |
47 | typedef std::vector<CThreadSlm::TLeaf> TLeafLevel; | |
48 | typedef std::vector<CThreadSlm::TNode*> TNodeLevels; | |
49 | typedef std::map<std::string, unsigned int> TLexicon; // map word to wid | |
50 | ||
51 | #endif //_SLM_PACK_COMMON_H | |
52 | ||
53 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | /* | |
1 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
2 | * | |
3 | * The contents of this file are subject to the terms of either the GNU Lesser | |
4 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
5 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
6 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
7 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
8 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
9 | * specific language governing permissions and limitations under the License. When | |
10 | * distributing the software, include this License Header Notice in each file and | |
11 | * include the full text of the License in the License file as well as the | |
12 | * following notice: | |
13 | * | |
14 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
15 | * (CDDL) | |
16 | * For Covered Software in this distribution, this License shall be governed by the | |
17 | * laws of the State of California (excluding conflict-of-law provisions). | |
18 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
19 | * the Federal Courts of the Northern District of California and the state courts | |
20 | * of the State of California, with venue lying in Santa Clara County, California. | |
21 | * | |
22 | * Contributor(s): | |
23 | * | |
24 | * If you wish your version of this file to be governed by only the CDDL or only | |
25 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
26 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
27 | * license." If you don't indicate a single choice of license, a recipient has the | |
28 | * option to distribute your version of this file under either the CDDL or the LGPL | |
29 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
30 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
31 | * Version 2 license, then the option applies only if the new code is made subject | |
32 | * to such option by the copyright holder. | |
33 | */ | |
34 | ||
35 | /* | |
36 | * pack ARPA format to a binary format which can be consumed by SunPinyin | |
37 | */ | |
38 | ||
39 | #ifdef HAVE_CONFIG_H | |
40 | #include "config.h" | |
41 | #endif | |
42 | ||
43 | #ifdef HAVE_ASSERT_H | |
44 | #include <assert.h> | |
45 | #endif | |
46 | ||
47 | #include <stdio.h> | |
48 | #include <unistd.h> | |
49 | #include <stdlib.h> | |
50 | ||
51 | #include <vector> | |
52 | #include <map> | |
53 | #include <iostream> | |
54 | #include <cmath> | |
55 | ||
56 | #include "../sim_slm.h" | |
57 | #include "arpa_slm.h" | |
58 | ||
/**
 * Print the command line synopsis and a short description to stdout,
 * then terminate the process with exit status 100.
 *
 * @param progname name of the executable, substituted into the help text
 */
void
ShowUsage(const char* progname)
{
    // one formatted write instead of five; the emitted text is unchanged
    printf("Usage:\n"
           " %s arpa_slm dict_file output_slm\n"
           "\n"
           "Description:\n"
           " %s converts the ARPA representation of SLM to the binary format of SLM. \n",
           progname, progname);
    exit(100);
}
71 | ||
72 | TLexicon | |
73 | read_lexicon(const char* filename) | |
74 | { | |
75 | printf("Loading lexicon..."); fflush(stdout); | |
76 | static char word[1024 * 10]; | |
77 | FILE* f_lex = fopen(filename, "r"); | |
78 | TLexicon lexicon; | |
79 | while (fgets(word, sizeof(word), f_lex)) { | |
80 | if (strlen(word) > 0) { | |
81 | // skip to the first non hanzi character | |
82 | char* p = word; | |
83 | while (*p == ' ' || *p == '\t') | |
84 | ++p; | |
85 | while (*p != 0 && *p != ' ' && *p != '\t') | |
86 | ++p; | |
87 | if (*p == 0) continue; | |
88 | *p++ = 0; | |
89 | // skip to the word_id | |
90 | while (*p == ' ' || *p == '\t') | |
91 | ++p; | |
92 | if (!(*p >= '0' && *p <= '9')) continue; | |
93 | ||
94 | int id; | |
95 | for (id = 0; *p >= '0' && *p <= '9'; ++p) | |
96 | id = 10 * id + (*p - '0'); | |
97 | lexicon[std::string(word)] = id; | |
98 | } | |
99 | } | |
100 | fclose(f_lex); | |
101 | printf("done.\n"); fflush(stdout); | |
102 | ||
103 | return lexicon; | |
104 | } | |
105 | ||
106 | // | |
107 | // filename [in] | |
108 | // levels[0] [in] | |
109 | // ... | |
110 | // levels[N] [in] | |
111 | // lastLevel [in] | |
112 | // | |
113 | void | |
114 | write_out(const char* filename, const CArpaSlm& slm) | |
115 | { | |
116 | printf("\nWriting out..."); fflush(stdout); | |
117 | ||
118 | FILE* fp = fopen(filename, "wb"); | |
119 | const int N = slm.getN(); | |
120 | fwrite(&N, sizeof(int), 1, fp); | |
121 | const unsigned usingLogPr = slm.usingLogPr(); | |
122 | fwrite(&usingLogPr, sizeof(unsigned), 1, fp); | |
123 | ||
124 | for (int lvl = 0; lvl <= N; ++lvl) { | |
125 | unsigned len = slm.getLevelSize(lvl) + 1; | |
126 | fwrite(&len, sizeof(unsigned), 1, fp); | |
127 | } | |
128 | ||
129 | for (int lvl = 0; lvl < N; ++lvl) { | |
130 | const CArpaSlm::TNodeLevel& level = slm.getLevel(lvl); | |
131 | for (CArpaSlm::TNodeLevel::const_iterator iter = level.begin(); | |
132 | iter != level.end(); ++iter) { | |
133 | CSIMSlm::TNode node(iter->wid, iter->child, 0, iter->bow); | |
134 | node.pr = iter->pr; | |
135 | fwrite(&node, sizeof(CSIMSlm::TNode), 1, fp); | |
136 | } | |
137 | CSIMSlm::TNode node(0x00FFFFFF, slm.getLevel(lvl + 1).size(), 1, 0); | |
138 | fwrite(&node, sizeof(CSIMSlm::TNode), 1, fp); | |
139 | } | |
140 | ||
141 | const CArpaSlm::TLeafLevel& level = slm.getLastLevel(); | |
142 | for (CArpaSlm::TLeafLevel::const_iterator iter = level.begin(); | |
143 | iter != level.end(); ++iter) { | |
144 | CSIMSlm::TLeaf node(iter->wid, 0); | |
145 | node.pr = iter->pr; | |
146 | fwrite(&node, sizeof(CSIMSlm::TLeaf), 1, fp); | |
147 | } | |
148 | CSIMSlm::TLeaf node(0, 1); | |
149 | fwrite(&node, sizeof(CSIMSlm::TLeaf), 1, fp); | |
150 | ||
151 | fclose(fp); | |
152 | printf("done!\n"); fflush(stdout); | |
153 | } | |
154 | ||
155 | int | |
156 | main(int argc, char* argv[]) | |
157 | { | |
158 | if (argc != 4) | |
159 | ShowUsage(argv[0]); | |
160 | const char* arpa_path = argv[1]; | |
161 | const char* lexicon_path = argv[2]; | |
162 | const char* output_path = argv[3]; | |
163 | ||
164 | CArpaSlm slm; | |
165 | TLexicon lexicon = read_lexicon(lexicon_path); | |
166 | slm.load(arpa_path, lexicon); | |
167 | ||
168 | if (!slm.good()) { | |
169 | std::cerr << "Failed to load language model from " << arpa_path << | |
170 | "." << std::endl; | |
171 | exit(1); | |
172 | } | |
173 | slm.initChild(); | |
174 | write_out(output_path, slm); | |
175 | return 0; | |
176 | } | |
177 | ||
178 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
48 | 48 | |
49 | 49 | #include <vector> |
50 | 50 | #include <map> |
51 | #include <math.h> | |
51 | #include <cmath> | |
52 | 52 | |
53 | 53 | #include "../sim_slm.h" |
54 | 54 | #include "../slm.h" |
55 | 55 | |
56 | 56 | #include "ValueCompress.h" |
57 | ||
58 | using std::log; | |
59 | using std::exp; | |
57 | 60 | |
58 | 61 | class CSIMSlmWithIteration : public CSIMSlm { |
59 | 62 | public: |
244 | 247 | ShowUsage(); |
245 | 248 | |
246 | 249 | printf("Loading original slm..."); fflush(stdout); |
247 | if (slm.Load(argv[1]) == false) | |
250 | if (!slm.Load(argv[1])) | |
248 | 251 | ShowUsage(); |
249 | 252 | |
250 | 253 | bool usingLogPr = slm.isUseLogPr(); |
251 | 254 | |
252 | #define EffectivePr(a) (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a))))) | |
253 | #define OriginalPr(b) (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b))))) | |
254 | #define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a)))) | |
255 | #define OriginalBow(b) (float((usingLogPr) ? (-log((b))) : ((b)))) | |
255 | #define EffectivePr(a) (usingLogPr ? ((a) / log(2.0)) : -log2f((a))) | |
256 | #define OriginalPr(b) (usingLogPr ? ((b) * log(2.0)) : exp2(-(b))) | |
257 | #define EffectiveBow(a) (usingLogPr ? exp(-(a)) : (a)) | |
258 | #define OriginalBow(b) (usingLogPr ? -log((b)) : (b)) | |
256 | 259 | |
257 | 260 | printf("\nfirst pass..."); fflush(stdout); |
258 | 261 | for (int lvl = 0; lvl <= slm.getN(); ++lvl) { |
290 | 293 | }; |
291 | 294 | |
292 | 295 | for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) { |
293 | float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]); | |
296 | float real_pr = usingLogPr ? -log(msprs[i]) : msprs[i]; | |
294 | 297 | float eff_pr = EffectivePr(real_pr); |
295 | 298 | if (pr_eff.find(eff_pr) == pr_eff.end()) { |
296 | 299 | pr_eff[eff_pr] = real_pr; |
308 | 311 | }; |
309 | 312 | |
310 | 313 | for (unsigned i = 0, sz = sizeof(msbows) / sizeof(float); i < sz; ++i) { |
311 | float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]); | |
314 | float real_bow = usingLogPr ? -log(msbows[i]) : msbows[i]; | |
312 | 315 | float eff_bow = EffectiveBow(real_bow); |
313 | 316 | if (bow_eff.find(eff_bow) == bow_eff.end()) { |
314 | 317 | bow_eff[eff_bow] = real_bow; |
357 | 360 | |
358 | 361 | std::map<float, int>::iterator prit = pr_map.find(pn->pr); |
359 | 362 | if (prit == pr_map.end()) { // This would be cause by precision error |
360 | double val = EffectivePr(pn->pr); | |
363 | float val = EffectivePr(pn->pr); | |
361 | 364 | val = OriginalPr(val); |
362 | 365 | prit = pr_map.find(val); |
363 | 366 | assert(prit != pr_map.end()); |
404 | 407 | |
405 | 408 | std::map<float, int>::iterator prit = pr_map.find(pn->pr); |
406 | 409 | if (prit == pr_map.end()) { // This would be cause by precision error |
407 | double val = EffectivePr(pn->pr); | |
410 | float val = EffectivePr(pn->pr); | |
408 | 411 | val = OriginalPr(val); |
409 | 412 | prit = pr_map.find(val); |
410 | 413 | assert(prit != pr_map.end()); |
0 | /* | |
1 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
2 | * | |
3 | * The contents of this file are subject to the terms of either the GNU Lesser | |
4 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
5 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
6 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
7 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
8 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
9 | * specific language governing permissions and limitations under the License. When | |
10 | * distributing the software, include this License Header Notice in each file and | |
11 | * include the full text of the License in the License file as well as the | |
12 | * following notice: | |
13 | * | |
14 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
15 | * (CDDL) | |
16 | * For Covered Software in this distribution, this License shall be governed by the | |
17 | * laws of the State of California (excluding conflict-of-law provisions). | |
18 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
19 | * the Federal Courts of the Northern District of California and the state courts | |
20 | * of the State of California, with venue lying in Santa Clara County, California. | |
21 | * | |
22 | * Contributor(s): | |
23 | * | |
24 | * If you wish your version of this file to be governed by only the CDDL or only | |
25 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
26 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
27 | * license." If you don't indicate a single choice of license, a recipient has the | |
28 | * option to distribute your version of this file under either the CDDL or the LGPL | |
29 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
30 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
31 | * Version 2 license, then the option applies only if the new code is made subject | |
32 | * to such option by the copyright holder. | |
33 | */ | |
34 | #include <algorithm> | |
35 | #include "common.h" | |
36 | #include "arpa_slm.h" | |
37 | #include "arpa_conv.h" | |
38 | ||
39 | ||
40 | // | |
41 | // convert CArpaSlm::TLeaf to CThreadSlm::TLeaf | |
42 | // | |
43 | class CArpaLeafConv | |
44 | { | |
45 | const bool usingLogPr; | |
46 | CompressedTable& m_pr_table; | |
47 | RealIndexMap& m_pr_map; | |
48 | ||
49 | public: | |
50 | CArpaLeafConv(bool usingLogPr_, | |
51 | RealIndexMap* pr_map, | |
52 | CompressedTable* pr_table) : | |
53 | usingLogPr(usingLogPr_), | |
54 | m_pr_table(*pr_table), | |
55 | m_pr_map(*pr_map) | |
56 | { | |
57 | } | |
58 | ||
59 | CThreadSlm::TLeaf | |
60 | operator()(const CArpaSlm::TLeaf& leaf) | |
61 | { | |
62 | CThreadSlm::TLeaf tleaf; | |
63 | tleaf.set_wid(leaf.wid); | |
64 | tleaf.set_bon(leaf.bon); | |
65 | tleaf.set_bol(leaf.bol); | |
66 | unsigned pr_idx = get_pr_index(leaf.pr); | |
67 | tleaf.set_pr(pr_idx); | |
68 | return tleaf; | |
69 | } | |
70 | ||
71 | // | |
72 | // lookup the Real/Effective value in the RealIndexMap for its index | |
73 | // in the CompressedTable | |
74 | // | |
75 | unsigned | |
76 | get_pr_index(float pr) | |
77 | { | |
78 | std::map<float, int>::iterator prit = m_pr_map.find(pr); | |
79 | if (prit == m_pr_map.end()) { // This could be caused by precision error | |
80 | double val = EffectivePr(pr); | |
81 | val = OriginalPr(val); | |
82 | prit = m_pr_map.find(val); | |
83 | assert(prit != m_pr_map.end()); | |
84 | } | |
85 | int idx_pr = prit->second; | |
86 | assert(usingLogPr || | |
87 | (m_pr_table[idx_pr] > 0.0 && m_pr_table[idx_pr] < 1.0)); | |
88 | assert(!usingLogPr || m_pr_table[idx_pr] > 0.0); | |
89 | return idx_pr; | |
90 | } | |
91 | }; | |
92 | ||
93 | // | |
94 | // convert CArpaSlm::TNode to CThreadSlm::TNode | |
95 | // | |
96 | class CArpaNodeConv | |
97 | { | |
98 | const bool usingLogPr; | |
99 | CArpaLeafConv m_leaf_conv; | |
100 | CompressedTable& m_bow_table; | |
101 | RealIndexMap& m_bow_map; | |
102 | ||
103 | public: | |
104 | CArpaNodeConv(bool usingLogPr_, | |
105 | RealIndexMap* pr_map, | |
106 | CompressedTable* pr_table, | |
107 | RealIndexMap* bow_map, | |
108 | CompressedTable* bow_table) : | |
109 | usingLogPr(usingLogPr_), | |
110 | m_leaf_conv(usingLogPr, pr_map, pr_table), | |
111 | m_bow_table(*bow_table), | |
112 | m_bow_map(*bow_map) | |
113 | { | |
114 | } | |
115 | ||
116 | CThreadSlm::TNode | |
117 | operator()(const CArpaSlm::TNode& node) | |
118 | { | |
119 | CThreadSlm::TNode tnode; | |
120 | tnode.set_wid(node.wid); | |
121 | tnode.set_bon(node.bon); | |
122 | tnode.set_bol(node.bol); | |
123 | tnode.set_ch(node.ch); | |
124 | unsigned pr_idx = m_leaf_conv.get_pr_index(node.pr); | |
125 | tnode.set_pr(pr_idx); | |
126 | unsigned bow_idx = get_bow_index(node.bow); | |
127 | tnode.set_bow(bow_idx); | |
128 | return tnode; | |
129 | } | |
130 | ||
131 | unsigned | |
132 | get_bow_index(float bow) | |
133 | { | |
134 | FreqMap::iterator bowit = m_bow_map.find(bow); | |
135 | if (bowit == m_bow_map.end()) { | |
136 | double val = EffectiveBow(bow); | |
137 | val = OriginalBow(val); | |
138 | bowit = m_bow_map.find(val); | |
139 | assert(bowit != m_bow_map.end()); | |
140 | } | |
141 | return bowit->second; | |
142 | } | |
143 | }; | |
144 | ||
145 | void | |
146 | compress(const CArpaSlm& slm, | |
147 | CompressedTable& pr_table, RealIndexMap& pr_map, | |
148 | CompressedTable& bow_table, RealIndexMap& bow_map, | |
149 | TNodeLevels& nodeLevels, CThreadSlm::TLeaf*& leafLevel) | |
150 | { | |
151 | CArpaLeafConv leaf_conv(slm.usingLogPr(), &pr_map, &pr_table); | |
152 | CArpaNodeConv node_conv( | |
153 | slm.usingLogPr(), &pr_map, &pr_table, &bow_map, &bow_table); | |
154 | const int N = slm.getN(); | |
155 | TNodeLevels node_levels(N); | |
156 | for (int lvl = 0; lvl < N; ++lvl) { | |
157 | const CArpaSlm::TNodeLevel& level = slm.getLevel(lvl); | |
158 | unsigned len = level.size(); | |
159 | node_levels[lvl] = new CThreadSlm::TNode[len + 1]; | |
160 | std::transform(level.begin(), level.end(), | |
161 | node_levels[lvl], node_conv); | |
162 | memset(&node_levels[lvl][len], 0, sizeof(CThreadSlm::TNode)); | |
163 | node_levels[lvl][len].set_ch(slm.getLevelSize(lvl + 1)); | |
164 | } | |
165 | ||
166 | const CArpaSlm::TLeafLevel& level = slm.getLastLevel(); | |
167 | unsigned len = level.size(); | |
168 | CThreadSlm::TLeaf* leaf_level = new CThreadSlm::TLeaf[len + 1]; | |
169 | std::transform(level.begin(), level.end(), | |
170 | leaf_level, leaf_conv); | |
171 | memset(&leaf_level[len], 0, sizeof(CThreadSlm::TLeaf)); | |
172 | nodeLevels = node_levels; | |
173 | leafLevel = leaf_level; | |
174 | } | |
175 | ||
176 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
3 | * | |
4 | * The contents of this file are subject to the terms of either the GNU Lesser | |
5 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
6 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
7 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
8 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
9 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
10 | * specific language governing permissions and limitations under the License. When | |
11 | * distributing the software, include this License Header Notice in each file and | |
12 | * include the full text of the License in the License file as well as the | |
13 | * following notice: | |
14 | * | |
15 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
16 | * (CDDL) | |
17 | * For Covered Software in this distribution, this License shall be governed by the | |
18 | * laws of the State of California (excluding conflict-of-law provisions). | |
19 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
20 | * the Federal Courts of the Northern District of California and the state courts | |
21 | * of the State of California, with venue lying in Santa Clara County, California. | |
22 | * | |
23 | * Contributor(s): | |
24 | * | |
25 | * If you wish your version of this file to be governed by only the CDDL or only | |
26 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
27 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
28 | * license." If you don't indicate a single choice of license, a recipient has the | |
29 | * option to distribute your version of this file under either the CDDL or the LGPL | |
30 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
31 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
32 | * Version 2 license, then the option applies only if the new code is made subject | |
33 | * to such option by the copyright holder. | |
34 | */ | |
35 | #ifndef _SLM_PACK_ARPA_CONV_H | |
36 | #define _SLM_PACK_ARPA_CONV_H | |
37 | ||
38 | #include "common.h" | |
39 | ||
40 | class CArpaSlm; | |
41 | ||
42 | // | |
43 | // slm [in] | |
44 | // pr_table [in] | |
45 | // pr_map [in] | |
46 | // bow_table [in] | |
47 | // bow_map [in] | |
48 | // nodeLevels [out] | |
49 | // leafLevel [out] | |
50 | // | |
51 | void compress(const CArpaSlm& slm, | |
52 | CompressedTable& pr_table, RealIndexMap& pr_map, | |
53 | CompressedTable& bow_table, RealIndexMap& bow_map, | |
54 | TNodeLevels& nodeLevels, CThreadSlm::TLeaf*& leafLevel); | |
55 | ||
56 | #endif //_SLM_PACK_ARPA_CONV_H | |
57 | ||
58 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | /* | |
1 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
2 | * | |
3 | * The contents of this file are subject to the terms of either the GNU Lesser | |
4 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
5 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
6 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
7 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
8 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
9 | * specific language governing permissions and limitations under the License. When | |
10 | * distributing the software, include this License Header Notice in each file and | |
11 | * include the full text of the License in the License file as well as the | |
12 | * following notice: | |
13 | * | |
14 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
15 | * (CDDL) | |
16 | * For Covered Software in this distribution, this License shall be governed by the | |
17 | * laws of the State of California (excluding conflict-of-law provisions). | |
18 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
19 | * the Federal Courts of the Northern District of California and the state courts | |
20 | * of the State of California, with venue lying in Santa Clara County, California. | |
21 | * | |
22 | * Contributor(s): | |
23 | * | |
24 | * If you wish your version of this file to be governed by only the CDDL or only | |
25 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
26 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
27 | * license." If you don't indicate a single choice of license, a recipient has the | |
28 | * option to distribute your version of this file under either the CDDL or the LGPL | |
29 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
30 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
31 | * Version 2 license, then the option applies only if the new code is made subject | |
32 | * to such option by the copyright holder. | |
33 | */ | |
34 | #include <string> | |
35 | #include <iostream> | |
36 | #include <fstream> | |
37 | #include <algorithm> | |
38 | #include "arpa_slm.h" | |
39 | ||
40 | using namespace std; | |
41 | ||
/**
 * the GNU extension is not always available, so we invent another wheel.
 *
 * Read characters from stream into buf until a newline or EOF is met, or
 * until the buffer is full. The newline is consumed but not stored, and the
 * result is always NUL terminated. When the line does not fit, the unread
 * remainder stays in the stream for the next call.
 *
 * @param buf    destination buffer
 * @param n      capacity of buf in bytes, including the terminator
 * @param stream stream to read from
 * @return the number of characters stored in buf, excluding the terminator;
 *         0 when buf is NULL or n is 0
 */
size_t
getline(char *buf, size_t n, FILE* stream)
{
    if (buf == NULL || n == 0)
        return 0;   // no room even for the terminator

    size_t len = 0;
    // stop one short of the capacity, so that no character has to be
    // read and then overwritten by the terminator
    while (len + 1 < n) {
        int c = fgetc(stream);
        if (c == '\n' || c == EOF)
            break;
        buf[len++] = static_cast<char>(c);
    }
    buf[len] = 0;
    return len;
}
63 | ||
/**
 * Split a line into its leading word list and the remaining fields.
 *
 * The word list ends at the first run of two blanks. That delimiter is
 * overwritten with a NUL so the word list becomes a C string of its own.
 * Prints a message to stderr and exits with status 2 when the line holds
 * no such delimiter.
 *
 * @param buf  the line; modified in place
 * @param next [out] set to the first character after the delimiter
 * @return pointer to the word list (same address as buf)
 */
char*
getwords(char* buf, char** next)
{
    // The skip below is derived from the delimiter's length, so searching
    // and skipping can never disagree.
    static const char delimiter[] = "  ";

    char* delim = strstr(buf, delimiter);
    if (delim == NULL) {
        std::cerr << "Unknown format in: " << buf << "." << std::endl;
        exit(2);
    }
    *delim = '\0';
    *next = delim + (sizeof(delimiter) - 1);
    return buf;
}
77 | ||
78 | unsigned | |
79 | get_wid(const char* word, const TLexicon& lexicon) | |
80 | { | |
81 | TLexicon::const_iterator lexi = lexicon.find(word); | |
82 | unsigned wid; | |
83 | if (lexi != lexicon.end()) { | |
84 | wid = lexi->second; | |
85 | } else { | |
86 | cerr << "Error:\"" << word << "\" not found in lexicon." << endl; | |
87 | wid = 0; | |
88 | } | |
89 | return wid; | |
90 | } | |
91 | ||
92 | int | |
93 | CArpaSlm::TLeaf::load_words(char* buf, const TLexicon& lexicon) | |
94 | { | |
95 | int nword = 0; | |
96 | char* word, *end; | |
97 | for (word = end = buf; *end != 0; ++end) { | |
98 | if (*end == ' ') { | |
99 | assert(nword < N_GRAM); | |
100 | *end = 0; | |
101 | hw[nword++] = get_wid(word, lexicon); | |
102 | word = end + 1; | |
103 | } | |
104 | } | |
105 | if (buf != end) { | |
106 | wid = hw[nword++] = get_wid(word, lexicon); | |
107 | } | |
108 | return nword; | |
109 | } | |
110 | ||
111 | void | |
112 | CArpaSlm::TLeaf::load(istream& is, const TLexicon& lexicon) | |
113 | { | |
114 | char buf[1024]; | |
115 | is.getline(buf, sizeof(buf)); | |
116 | char* next = 0; | |
117 | char* words = getwords(buf, &next); | |
118 | load_words(words, lexicon); | |
119 | sscanf(next, "%f (%1u, %u)", | |
120 | &pr, &bol, &bon); | |
121 | } | |
122 | ||
123 | void | |
124 | CArpaSlm::TNode::load(istream& is, const TLexicon& lexicon) | |
125 | { | |
126 | char buf[1024]; | |
127 | is.getline(buf, sizeof(buf)); | |
128 | char* next = 0; | |
129 | char* words = getwords(buf, &next); | |
130 | load_words(words, lexicon); | |
131 | sscanf(next, "%f %f (%1u, %u)", | |
132 | &pr, &bow, &bol, &bon); | |
133 | } | |
134 | ||
135 | void | |
136 | CArpaSlm::TNode::load_level0(istream& is) | |
137 | { | |
138 | hw[0] = 0; | |
139 | char buf[1024]; | |
140 | is.getline(buf, sizeof(buf)); | |
141 | sscanf(buf, "%f %f (%1u, %u)", | |
142 | &pr, &bow, &bol, &bon); | |
143 | wid = 0; | |
144 | } | |
145 | ||
146 | void | |
147 | CArpaSlm::load(const char* filename, const TLexicon& lexicon) | |
148 | { | |
149 | printf("Loading ARPA slm..."); fflush(stdout); | |
150 | ifstream file(filename); | |
151 | char buf[1024]; | |
152 | for (int i = 0; i <= N_GRAM; ++i) { | |
153 | unsigned lvl; | |
154 | int size; | |
155 | file.getline(buf, sizeof(buf)); | |
156 | if (!file) { | |
157 | cerr << "Failed to read from" << filename << endl; | |
158 | exit(1); | |
159 | } | |
160 | sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size); | |
161 | assert(lvl <= N_GRAM); | |
162 | if (lvl == 0) { | |
163 | TNode node0; | |
164 | node0.load_level0(file); | |
165 | m_levels[0].push_back(node0); | |
166 | } else if (lvl < m_N) { | |
167 | m_levels[lvl].reserve(size); | |
168 | for (int i = 0; i < size; ++i) { | |
169 | TNode node; | |
170 | node.load(file, lexicon); | |
171 | m_levels[lvl].push_back(node); | |
172 | } | |
173 | } else { | |
174 | // leaf nodes | |
175 | m_lastLevel.reserve(size); | |
176 | for (int i = 0; i < size; ++i) { | |
177 | TLeaf leaf; | |
178 | leaf.load(file, lexicon); | |
179 | m_lastLevel.push_back(leaf); | |
180 | } | |
181 | } | |
182 | } | |
183 | } | |
184 | ||
185 | template <class NodeT> | |
186 | struct CompareNode { | |
187 | const unsigned m_lvl; | |
188 | CompareNode(unsigned lvl) : m_lvl(lvl) | |
189 | { | |
190 | } | |
191 | /** | |
192 | * @return true if strictly less, false otherwise | |
193 | */ | |
194 | bool | |
195 | operator ()(const NodeT& node, const TSIMWordId hw[N_GRAM]) | |
196 | { | |
197 | for (unsigned i = 0; i < m_lvl; ++i) { | |
198 | if (node.hw[i] < hw[i]) | |
199 | return true; | |
200 | if (node.hw[i] > hw[i]) | |
201 | return false; | |
202 | } | |
203 | // node.hw[:lvl] is the same as hw[:] | |
204 | return false; | |
205 | } | |
206 | }; | |
207 | ||
208 | void | |
209 | CArpaSlm::threading() | |
210 | { | |
211 | { | |
212 | TNode& node = m_levels[0][0]; | |
213 | node.ch = 0; | |
214 | } | |
215 | for (unsigned lvl = 1; lvl < m_N; ++lvl) { | |
216 | TNodeLevel& level = m_levels[lvl]; | |
217 | unsigned last_child = 0; | |
218 | for (TNodeLevel::iterator node = level.begin(); | |
219 | node != level.end(); | |
220 | ++node) { | |
221 | node->ch = last_child = find_1st_child(lvl, *node, last_child); | |
222 | } | |
223 | } | |
224 | } | |
225 | ||
226 | unsigned | |
227 | CArpaSlm::find_1st_child(unsigned lvl, const TNode& node, int last_child) | |
228 | { | |
229 | assert(lvl < m_N); | |
230 | if (lvl == m_N - 1) { | |
231 | TLeafLevel::iterator found = lower_bound( | |
232 | m_lastLevel.begin(), m_lastLevel.end(), node.hw, | |
233 | CompareNode<TLeaf>(lvl)); | |
234 | return distance(m_lastLevel.begin(), found); | |
235 | } else { | |
236 | const TNodeLevel& level = m_levels[lvl + 1]; | |
237 | TNodeLevel::const_iterator found = lower_bound(level.begin(), level.end( | |
238 | ), node.hw, | |
239 | CompareNode<TNode>(lvl)); | |
240 | return distance(level.begin(), found); | |
241 | } | |
242 | } | |
243 | ||
244 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
3 | * | |
4 | * The contents of this file are subject to the terms of either the GNU Lesser | |
5 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
6 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
7 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
8 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
9 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
10 | * specific language governing permissions and limitations under the License. When | |
11 | * distributing the software, include this License Header Notice in each file and | |
12 | * include the full text of the License in the License file as well as the | |
13 | * following notice: | |
14 | * | |
15 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
16 | * (CDDL) | |
17 | * For Covered Software in this distribution, this License shall be governed by the | |
18 | * laws of the State of California (excluding conflict-of-law provisions). | |
19 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
20 | * the Federal Courts of the Northern District of California and the state courts | |
21 | * of the State of California, with venue lying in Santa Clara County, California. | |
22 | * | |
23 | * Contributor(s): | |
24 | * | |
25 | * If you wish your version of this file to be governed by only the CDDL or only | |
26 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
27 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
28 | * license." If you don't indicate a single choice of license, a recipient has the | |
29 | * option to distribute your version of this file under either the CDDL or the LGPL | |
30 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
31 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
32 | * Version 2 license, then the option applies only if the new code is made subject | |
33 | * to such option by the copyright holder. | |
34 | */ | |
35 | #ifndef _ARPA_SLM_H | |
36 | #define _ARPA_SLM_H | |
37 | ||
38 | #include <istream> | |
39 | #include "common.h" | |
40 | ||
41 | using std::istream; | |
42 | ||
43 | #define N_GRAM (3) | |
44 | ||
45 | ||
46 | /* the ARPA style representation of sunpinyin's SLM */ | |
47 | class CArpaSlm { | |
48 | public: | |
49 | struct TLeaf { | |
50 | TSIMWordId hw[N_GRAM]; | |
51 | TSIMWordId wid; | |
52 | float pr; | |
53 | unsigned ch; | |
54 | unsigned bon; | |
55 | unsigned bol; | |
56 | void load(istream&, const TLexicon&); | |
57 | int load_words(char* buf, const TLexicon& lexicon); | |
58 | TLeaf() : wid(0), pr(.0), ch(0), bon(0), bol(0) {} | |
59 | }; | |
60 | ||
61 | struct TNode : public TLeaf { | |
62 | float bow; | |
63 | void load(istream&, const TLexicon&); | |
64 | void load_level0(istream&); | |
65 | }; | |
66 | ||
67 | typedef std::vector<TNode> TNodeLevel; | |
68 | typedef std::vector<TLeaf> TLeafLevel; | |
69 | ||
70 | private: | |
71 | TNodeLevel m_levels[N_GRAM + 1]; /* [0..N_GRAM] */ | |
72 | TLeafLevel m_lastLevel; | |
73 | const bool m_usingLogPr; | |
74 | const unsigned m_N; | |
75 | ||
76 | public: | |
77 | /* XXX, ARPA file does not provide these information. | |
78 | so we assume this SLM is trigram, and does not use LogPr */ | |
79 | CArpaSlm() : m_usingLogPr(false), m_N(N_GRAM) {} | |
80 | bool good() const { return m_levels[0].size() != 0; } | |
81 | unsigned getN() const { return m_N; } | |
82 | bool usingLogPr() const { return m_usingLogPr; } | |
83 | const TNodeLevel& getLevel(unsigned lvl) const { return m_levels[lvl]; } | |
84 | const TLeafLevel& getLastLevel() const { return m_lastLevel; } | |
85 | unsigned getLevelSize(unsigned lvl) const { | |
86 | assert(lvl <= m_N); | |
87 | if (lvl < m_N) { | |
88 | return m_levels[lvl].size(); | |
89 | } else { | |
90 | return m_lastLevel.size(); | |
91 | } | |
92 | } | |
93 | /** | |
94 | * initialize the `ch' and `wid' fields of each node in levels | |
95 | */ | |
96 | void threading(); | |
97 | void load(const char* filename, const TLexicon& lexicon); | |
98 | ||
99 | private: | |
100 | /** | |
101 | * find out the first child of a given node in its next level | |
102 | * @param lvl the level where node belongs to | |
103 | * @param node the node | |
104 | * @param last_child the child index of previous node | |
105 | * @return the index of the found child | |
106 | */ | |
107 | unsigned find_1st_child(unsigned lvl, const TNode& node, int last_child); | |
108 | }; | |
109 | ||
110 | #endif //_ARPA_SLM_H | |
111 | ||
112 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | // -*- mode: c++ -*- | |
1 | /* | |
2 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
3 | * | |
4 | * The contents of this file are subject to the terms of either the GNU Lesser | |
5 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
6 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
7 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
8 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
9 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
10 | * specific language governing permissions and limitations under the License. When | |
11 | * distributing the software, include this License Header Notice in each file and | |
12 | * include the full text of the License in the License file as well as the | |
13 | * following notice: | |
14 | * | |
15 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
16 | * (CDDL) | |
17 | * For Covered Software in this distribution, this License shall be governed by the | |
18 | * laws of the State of California (excluding conflict-of-law provisions). | |
19 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
20 | * the Federal Courts of the Northern District of California and the state courts | |
21 | * of the State of California, with venue lying in Santa Clara County, California. | |
22 | * | |
23 | * Contributor(s): | |
24 | * | |
25 | * If you wish your version of this file to be governed by only the CDDL or only | |
26 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
27 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
28 | * license." If you don't indicate a single choice of license, a recipient has the | |
29 | * option to distribute your version of this file under either the CDDL or the LGPL | |
30 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
31 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
32 | * Version 2 license, then the option applies only if the new code is made subject | |
33 | * to such option by the copyright holder. | |
34 | */ | |
35 | #ifndef _SLM_PACK_COMMON_H | |
36 | #define _SLM_PACK_COMMON_H | |
37 | ||
38 | #include <vector> | |
39 | #include <map> | |
40 | #include <string> | |
41 | #include <cmath> | |
42 | #include <cassert> | |
43 | ||
44 | #include "../slm.h" | |
45 | ||
// Containers and value maps shared by the slmpack sources.
typedef std::vector<CThreadSlm::TNode> TNodeLevel;
typedef std::vector<CThreadSlm::TLeaf> TLeafLevel;
typedef std::vector<CThreadSlm::TNode*> TNodeLevels;
typedef std::map<float, float> EffRealMap; // map from efficient values to the real ones
typedef std::map<float, int> FreqMap; // how often the efficient value appears
typedef std::vector<float> CompressedTable; // array of real values, the index is stored in RealIndexMap
typedef std::map<float, int> RealIndexMap; // map real values to their indices
typedef std::map<std::string, unsigned int> TLexicon; // map word to wid

// Conversions between real values and their "effective" counterparts.
// NOTE: every macro below expands to an expression that reads a name called
// `usingLogPr`, so such a variable (or parameter) has to be in scope wherever
// one of them is used.
//   EffectivePr(a):  a / log(2.0) when usingLogPr, -log2(a) otherwise
//   OriginalPr(b):   the inverse, b * log(2.0) or exp2(-b)
//   EffectiveBow(a): exp(-a) when usingLogPr, a itself otherwise
//   OriginalBow(b):  the inverse, -log(b) or b itself
// The result is always converted to float.
#define EffectivePr(a) (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a)))))
#define OriginalPr(b) (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b)))))
#define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a))))
#define OriginalBow(b) (float((usingLogPr) ? (-log((b))) : ((b))))
59 | ||
60 | #endif //_SLM_PACK_COMMON_H | |
61 | ||
62 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | /* | |
1 | * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com> | |
2 | * | |
3 | * The contents of this file are subject to the terms of either the GNU Lesser | |
4 | * General Public License Version 2.1 only ("LGPL") or the Common Development and | |
5 | * Distribution License ("CDDL")(collectively, the "License"). You may not use this | |
6 | * file except in compliance with the License. You can obtain a copy of the CDDL at | |
7 | * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at | |
8 | * http://www.opensource.org/licenses/lgpl-license.php. See the License for the | |
9 | * specific language governing permissions and limitations under the License. When | |
10 | * distributing the software, include this License Header Notice in each file and | |
11 | * include the full text of the License in the License file as well as the | |
12 | * following notice: | |
13 | * | |
14 | * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE | |
15 | * (CDDL) | |
16 | * For Covered Software in this distribution, this License shall be governed by the | |
17 | * laws of the State of California (excluding conflict-of-law provisions). | |
18 | * Any litigation relating to this License shall be subject to the jurisdiction of | |
19 | * the Federal Courts of the Northern District of California and the state courts | |
20 | * of the State of California, with venue lying in Santa Clara County, California. | |
21 | * | |
22 | * Contributor(s): | |
23 | * | |
24 | * If you wish your version of this file to be governed by only the CDDL or only | |
25 | * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to | |
26 | * include this software in this distribution under the [CDDL or LGPL Version 2.1] | |
27 | * license." If you don't indicate a single choice of license, a recipient has the | |
28 | * option to distribute your version of this file under either the CDDL or the LGPL | |
29 | * Version 2.1, or to extend the choice of license to its licensees as provided | |
30 | * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL | |
31 | * Version 2 license, then the option applies only if the new code is made subject | |
32 | * to such option by the copyright holder. | |
33 | */ | |
34 | ||
35 | /* | |
36 | * pack ARPA format to a binary format which can be consumed by SunPinyin | |
37 | */ | |
38 | ||
39 | #ifdef HAVE_CONFIG_H | |
40 | #include "config.h" | |
41 | #endif | |
42 | ||
43 | #ifdef HAVE_ASSERT_H | |
44 | #include <assert.h> | |
45 | #endif | |
46 | ||
47 | #include <stdio.h> | |
48 | #include <unistd.h> | |
49 | #include <stdlib.h> | |
50 | ||
51 | #include <vector> | |
52 | #include <map> | |
53 | #include <iostream> | |
54 | #include <cmath> | |
55 | ||
56 | //#include "../sim_slm.h" | |
57 | #include "../slm.h" | |
58 | ||
59 | #include "../thread/ValueCompress.h" | |
60 | #include "arpa_slm.h" | |
61 | #include "arpa_conv.h" | |
62 | ||
63 | ||
/**
 * Print the command line synopsis to stdout and terminate the process
 * with exit status 100.
 *
 * @param progname the name the program was invoked with
 */
void
ShowUsage(const char* progname)
{
    // a single call emits exactly the text the individual calls used to print
    printf("Usage:\n"
           " %s arpa_slm dict_file threaded_slm\n"
           "\n"
           "Description:\n"
           " %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n",
           progname, progname);
    exit(100);
}
76 | ||
77 | /** | |
78 | * slm [in] | |
79 | * pr_eff, pr_values [out] | |
80 | * bow_eff, bow_values [out] | |
81 | */ | |
82 | ||
83 | void | |
84 | build_map(const CArpaSlm& slm, | |
85 | EffRealMap &pr_eff, | |
86 | FreqMap& pr_values, | |
87 | EffRealMap &bow_eff, | |
88 | FreqMap& bow_values) | |
89 | { | |
90 | bool usingLogPr = slm.usingLogPr(); | |
91 | ||
92 | printf("\nfirst pass..."); fflush(stdout); | |
93 | ||
94 | for (unsigned lvl = 0; lvl < slm.getN(); ++lvl) { | |
95 | typedef CArpaSlm::TNodeLevel TNodeLevel; | |
96 | const TNodeLevel& level = slm.getLevel(lvl); | |
97 | for (TNodeLevel::const_iterator node = level.begin(); | |
98 | node != level.end(); | |
99 | ++node) { | |
100 | float real_pr, eff_pr; | |
101 | real_pr = node->pr; | |
102 | eff_pr = EffectivePr(real_pr); | |
103 | if (pr_eff.find(eff_pr) == pr_eff.end()) { | |
104 | pr_eff[eff_pr] = real_pr; | |
105 | } else { // precision error cause non 1:1 mapping | |
106 | pr_eff[eff_pr] = OriginalPr(eff_pr); | |
107 | } | |
108 | ++(pr_values[eff_pr]); | |
109 | ||
110 | float real_bow, eff_bow; | |
111 | real_bow = node->bow; | |
112 | eff_bow = EffectiveBow(real_bow); | |
113 | if (bow_eff.find(eff_bow) == bow_eff.end()) { | |
114 | bow_eff[eff_bow] = real_bow; | |
115 | } else { // two values map to same distance value due to precision error | |
116 | bow_eff[eff_bow] = OriginalBow(eff_bow); | |
117 | } | |
118 | ++(bow_values[eff_bow]); | |
119 | } | |
120 | } | |
121 | typedef CArpaSlm::TLeafLevel TLeafLevel; | |
122 | const TLeafLevel& level = slm.getLastLevel(); | |
123 | for (TLeafLevel::const_iterator leaf = level.begin(); | |
124 | leaf != level.end(); | |
125 | ++leaf) { | |
126 | float real_pr, eff_pr; | |
127 | real_pr = leaf->pr; | |
128 | eff_pr = EffectivePr(real_pr); | |
129 | if (pr_eff.find(eff_pr) == pr_eff.end()) { | |
130 | pr_eff[eff_pr] = real_pr; | |
131 | } else { // precision error cause non 1:1 mapping | |
132 | pr_eff[eff_pr] = OriginalPr(eff_pr); | |
133 | } | |
134 | ++(pr_values[eff_pr]); | |
135 | } | |
136 | // Following pr value should not be grouped, or as milestone values. | |
137 | static const float msprs[] = { | |
138 | 0.9, 0.8, 0.7, 0.6, | |
139 | 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128, | |
140 | 1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192, | |
141 | 1.0 / 16384, 1.0 / 32768, 1.0 / 65536 | |
142 | }; | |
143 | ||
144 | for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) { | |
145 | float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]); | |
146 | float eff_pr = EffectivePr(real_pr); | |
147 | assert(usingLogPr || (real_pr > 0.0 && real_pr < 1.0)); | |
148 | assert(!usingLogPr || real_pr > 0.0); | |
149 | ||
150 | if (pr_eff.find(eff_pr) == pr_eff.end()) { | |
151 | pr_eff[eff_pr] = real_pr; | |
152 | } else { // precision error causes non 1:1 mapping | |
153 | pr_eff[eff_pr] = OriginalPr(eff_pr); | |
154 | } | |
155 | pr_values[eff_pr] = 0; | |
156 | } | |
157 | ||
158 | // Following bow value should not be grouped, or as milestone values. | |
159 | static const float msbows[] = { | |
160 | 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, | |
161 | 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, | |
162 | 0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001 | |
163 | }; | |
164 | ||
165 | for (unsigned i = 0; i < sizeof(msbows) / sizeof(msbows[0]); ++i) { | |
166 | float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]); | |
167 | float eff_bow = EffectiveBow(real_bow); | |
168 | if (bow_eff.find(eff_bow) == bow_eff.end()) { | |
169 | bow_eff[eff_bow] = real_bow; | |
170 | } else { // two values map to same distance value due to precision error | |
171 | bow_eff[eff_bow] = OriginalBow(eff_bow); | |
172 | } | |
173 | bow_values[eff_bow] = 0; | |
174 | } | |
175 | } | |
176 | ||
177 | /** | |
178 | * group vaules into a smaller set of their approximations | |
179 | * | |
180 | * bow_eff [in], bow_values [in], bow_map [out], bow_table [out] | |
181 | * pr_eff [in], pr_values [in], pr_map [out], pr_table [out] | |
182 | * | |
183 | */ | |
184 | void | |
185 | group_values(bool usingLogPr, | |
186 | EffRealMap& pr_eff, | |
187 | FreqMap& pr_values, | |
188 | CompressedTable& pr_table, | |
189 | RealIndexMap& pr_map, | |
190 | EffRealMap& bow_eff, | |
191 | FreqMap& bow_values, | |
192 | CompressedTable& bow_table, | |
193 | RealIndexMap& bow_map) | |
194 | { | |
195 | printf("\nCompressing pr values..."); fflush(stdout); | |
196 | CValueCompressor vc; | |
197 | vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR)); | |
198 | CompressedTable::iterator itt, itte; | |
199 | itte = pr_table.end(); | |
200 | for (itt = pr_table.begin(); itt != itte; ++itt) { | |
201 | *itt = OriginalPr(*itt); | |
202 | assert(usingLogPr || (*itt > 0.0 && *itt < 1.0)); | |
203 | assert(!usingLogPr || *itt > 0.0); | |
204 | } | |
205 | printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size()); | |
206 | ||
207 | printf("\nCompressing bow values..."); fflush(stdout); | |
208 | vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW)); | |
209 | itte = bow_table.end(); | |
210 | for (itt = bow_table.begin(); itt != itte; ++itt) | |
211 | *itt = OriginalBow(*itt); | |
212 | printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size()); | |
213 | } | |
214 | ||
215 | TLexicon | |
216 | read_lexicon(const char* filename) | |
217 | { | |
218 | printf("Loading lexicon..."); fflush(stdout); | |
219 | static char word[1024 * 10]; | |
220 | FILE* f_lex = fopen(filename, "r"); | |
221 | TLexicon lexicon; | |
222 | while (fgets(word, sizeof(word), f_lex)) { | |
223 | if (strlen(word) > 0) { | |
224 | // skip to the first non hanzi character | |
225 | char* p = word; | |
226 | while (*p == ' ' || *p == '\t') | |
227 | ++p; | |
228 | while (*p != 0 && *p != ' ' && *p != '\t') | |
229 | ++p; | |
230 | if (*p == 0) continue; | |
231 | *p++ = 0; | |
232 | // skip to the word_id | |
233 | while (*p == ' ' || *p == '\t') | |
234 | ++p; | |
235 | if (!(*p >= '0' && *p <= '9')) continue; | |
236 | ||
237 | int id; | |
238 | for (id = 0; *p >= '0' && *p <= '9'; ++p) | |
239 | id = 10 * id + (*p - '0'); | |
240 | lexicon[std::string(word)] = id; | |
241 | } | |
242 | } | |
243 | fclose(f_lex); | |
244 | printf("done.\n"); fflush(stdout); | |
245 | ||
246 | return lexicon; | |
247 | } | |
248 | ||
249 | ||
250 | ||
251 | // | |
252 | // filename [in] | |
253 | // pr_table [in] | |
254 | // bow_table [in] | |
255 | // levels[0] [in] | |
256 | // ... | |
257 | // levels[N] [in] | |
258 | // lastLevel [in] | |
259 | // | |
260 | void | |
261 | write_out(const char* filename, const CArpaSlm& slm, | |
262 | CompressedTable& pr_table, CompressedTable& bow_table, | |
263 | const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel) | |
264 | { | |
265 | printf("\nWriting out..."); fflush(stdout); | |
266 | ||
267 | FILE* fp = fopen(filename, "wb"); | |
268 | const int N = slm.getN(); | |
269 | fwrite(&N, sizeof(int), 1, fp); | |
270 | const unsigned usingLogPr = slm.usingLogPr(); | |
271 | fwrite(&usingLogPr, sizeof(unsigned), 1, fp); | |
272 | ||
273 | for (int lvl = 0; lvl <= N; ++lvl) { | |
274 | unsigned len = slm.getLevelSize(lvl) + 1; | |
275 | fwrite(&len, sizeof(unsigned), 1, fp); | |
276 | } | |
277 | ||
278 | for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR); | |
279 | ++i) { | |
280 | if (i < sz) { | |
281 | fwrite(&pr_table[i], sizeof(float), 1, fp); | |
282 | } else { | |
283 | float dummy = 0.0F; | |
284 | fwrite(&dummy, sizeof(float), 1, fp); | |
285 | } | |
286 | } | |
287 | ||
288 | for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW); | |
289 | ++i) { | |
290 | if (i < sz) { | |
291 | fwrite(&bow_table[i], sizeof(float), 1, fp); | |
292 | } else { | |
293 | float dummy = 0.0F; | |
294 | fwrite(&dummy, sizeof(float), 1, fp); | |
295 | } | |
296 | } | |
297 | ||
298 | for (int lvl = 0; lvl < N; ++lvl) { | |
299 | fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize( | |
300 | lvl) + 1, fp); | |
301 | } | |
302 | ||
303 | fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N) + 1, fp); | |
304 | ||
305 | fclose(fp); | |
306 | ||
307 | printf("done!\n"); fflush(stdout); | |
308 | } | |
309 | ||
310 | ||
311 | void | |
312 | cleanup(CompressedTable& pr_table, CompressedTable& bow_table, | |
313 | TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel) | |
314 | { | |
315 | for (unsigned lvl = 0; lvl < levels.size(); ++lvl) | |
316 | delete[] levels[lvl]; | |
317 | delete[] lastLevel; | |
318 | bow_table.clear(); | |
319 | pr_table.clear(); | |
320 | } | |
321 | ||
322 | int | |
323 | main(int argc, char* argv[]) | |
324 | { | |
325 | if (argc != 4) | |
326 | ShowUsage(argv[0]); | |
327 | const char* arpa_path = argv[1]; | |
328 | const char* lexicon_path = argv[2]; | |
329 | const char* threaded_path = argv[3]; | |
330 | ||
331 | CArpaSlm slm; | |
332 | TLexicon lexicon = read_lexicon(lexicon_path); | |
333 | slm.load(arpa_path, lexicon); | |
334 | ||
335 | if (!slm.good()) { | |
336 | std::cerr << "Failed to load language model from " << arpa_path << | |
337 | "." << std::endl; | |
338 | exit(1); | |
339 | } | |
340 | slm.threading(); | |
341 | ||
342 | EffRealMap pr_eff, bow_eff; // effval --> val | |
343 | FreqMap pr_values, bow_values; // effval --> freq | |
344 | build_map(slm, pr_eff, pr_values, bow_eff, bow_values); | |
345 | ||
346 | RealIndexMap pr_map, bow_map; // result: val --> int | |
347 | CompressedTable pr_table, bow_table; // result: val vector | |
348 | group_values(slm.usingLogPr(), | |
349 | pr_eff, pr_values, pr_table, pr_map, | |
350 | bow_eff, bow_values, bow_table, bow_map); | |
351 | pr_values.clear(); | |
352 | bow_values.clear(); | |
353 | ||
354 | TNodeLevels levels; | |
355 | CThreadSlm::TLeaf* lastLevel; | |
356 | compress(slm, pr_table, pr_map, bow_table, bow_map, | |
357 | levels, lastLevel); | |
358 | ||
359 | pr_map.clear(); | |
360 | bow_map.clear(); | |
361 | write_out(threaded_path, slm, pr_table, bow_table, levels, lastLevel); | |
362 | ||
363 | cleanup(pr_table, bow_table, levels, lastLevel); | |
364 | return 0; | |
365 | } | |
366 | ||
367 | // -*- indent-tabs-mode: nil -*- vim:et:ts=4 |
0 | 0 | #!/usr/bin/@MAKE@ -f |
1 | 1 | # -*- mode: makefile; indent-tabs-mode: t -*- vim:noet:ts=4 |
2 | 2 | |
3 | # In case of problems, also try the following ${DL_HOST} values: | |
4 | # (copied from Gentoo's `thirdpartymirrors' file) | |
5 | # http://aarnet.dl.sourceforge.net | |
6 | # http://colocrossing.dl.sourceforge.net | |
7 | # http://cznic.dl.sourceforge.net | |
8 | # http://dfn.dl.sourceforge.net | |
9 | # http://freefr.dl.sourceforge.net | |
10 | # http://garr.dl.sourceforge.net | |
11 | # http://heanet.dl.sourceforge.net | |
12 | # http://hivelocity.dl.sourceforge.net | |
13 | # http://ignum.dl.sourceforge.net | |
14 | # http://internode.dl.sourceforge.net | |
15 | # http://iweb.dl.sourceforge.net | |
16 | # http://jaist.dl.sourceforge.net | |
17 | # http://kaz.dl.sourceforge.net | |
18 | # http://kent.dl.sourceforge.net | |
19 | # http://nchc.dl.sourceforge.net | |
20 | # http://ncu.dl.sourceforge.net | |
21 | # http://netcologne.dl.sourceforge.net | |
22 | # http://optimate.dl.sourceforge.net | |
23 | # http://softlayer.dl.sourceforge.net | |
24 | # http://sunet.dl.sourceforge.net | |
25 | # http://surfnet.dl.sourceforge.net | |
26 | # http://switch.dl.sourceforge.net | |
27 | # http://tcpdiag.dl.sourceforge.net | |
28 | # http://ufpr.dl.sourceforge.net | |
29 | # http://waia.dl.sourceforge.net | |
30 | # http://waix.dl.sourceforge.net | |
31 | ||
3 | 32 | WGET = @WGET@ |
4 | 33 | TAR = @TAR@ |
34 | W3M = @W3M@ | |
5 | 35 | ENDIANNESS = @ENDIANNESS@ |
6 | DATA_DIR = '@DATADIR@/sunpinyin' | |
36 | DATA_DIR = @DATADIR@/sunpinyin | |
7 | 37 | |
8 | DL_LIST = 'http://code.google.com/p/open-gram/downloads/list' | |
9 | DL_ROOT = 'http://open-gram.googlecode.com/files/' | |
38 | DL_LIST = https://sourceforge.net/projects/open-gram/files/ | |
39 | DL_HOST = http://heanet.dl.sourceforge.net | |
40 | DL_ROOT = ${DL_HOST}/open-gram | |
10 | 41 | DICT_PAT = 'dict\.utf8-[0-9]\+.tar.bz2' |
11 | TSLM_PAT = 'lm_sc\.t3g\.arpa-[0-9]\+.tar.bz2' | |
42 | SLM_PAT = 'lm_sc\.3gm\.arpa-[0-9]\+.tar.bz2' | |
12 | 43 | |
13 | DICT_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${DICT_PAT} | sort | tail -n 1) | |
14 | TSLM_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${TSLM_PAT} | sort | tail -n 1) | |
44 | DICT_AR = $(shell ${W3M} ${DL_LIST} | grep -o ${DICT_PAT} | sort -u | tail -n 1) | |
45 | SLM_AR = $(shell ${W3M} ${DL_LIST} | grep -o ${SLM_PAT} | sort -u | tail -n 1) | |
15 | 46 | |
16 | 47 | all: install |
17 | 48 | |
21 | 52 | dict.utf8: ${DICT_AR} |
22 | 53 | ${TAR} xmf $^ |
23 | 54 | |
24 | ${TSLM_AR}: | |
55 | ${SLM_AR}: | |
25 | 56 | ${WGET} ${DL_ROOT}/$@ |
26 | 57 | |
27 | lm_sc.t3g.arpa: ${TSLM_AR} | |
58 | lm_sc.3gm.arpa: ${SLM_AR} | |
28 | 59 | ${TAR} xmf $^ |
29 | 60 | |
30 | lm_sc.t3g.orig: dict.utf8 lm_sc.t3g.arpa | |
31 | tslmpack lm_sc.t3g.arpa dict.utf8 $@ | |
61 | lm_sc.3gm: lm_sc.3gm.arpa dict.utf8 | |
62 | slmpack $^ $@ | |
63 | ||
64 | lm_sc.t3g.orig: lm_sc.3gm | |
65 | slmthread $^ $@ | |
32 | 66 | |
33 | 67 | lm_sc.t3g: lm_sc.t3g.orig |
34 | 68 | tslmendian -e ${ENDIANNESS} -i $^ -o $@ |
35 | 69 | |
36 | 70 | pydict_sc.bin: dict.utf8 lm_sc.t3g |
37 | 71 | genpyt -e ${ENDIANNESS} -i dict.utf8 -s lm_sc.t3g \ |
38 | -l lm_sc.t3g.log -o $@ | |
72 | -l pydict_sc.log -o $@ | |
39 | 73 | |
40 | 74 | install: lm_sc.t3g pydict_sc.bin |
41 | 75 | install -d ${DATA_DIR} |
42 | install -Dm644 $^ ${DATA_DIR} | |
76 | install -m644 $^ ${DATA_DIR} | |
43 | 77 | |
44 | 78 | clean: |
45 | rm -rf ${DICT_AR} ${TSLM_AR} dict.utf8 lm_sc.t3g.arpa \ | |
46 | lm_sc.t3g.orig lm_sc.t3g lm_sc.t3g.log pydict_sc.bin | |
47 | ||
79 | rm -rf ${DICT_AR} ${SLM_AR} dict.utf8 lm_sc.3gm.arpa lm_sc.3gm \ | |
80 | lm_sc.t3g.orig lm_sc.t3g pydict_sc.log pydict_sc.bin |