Codebase list sunpinyin / 642cb80
New upstream version 3.0.0~git20160910 Liang Guo 7 years ago
34 changed file(s) with 1586 addition(s) and 2023 deletion(s). Raw diff Collapse all Expand all
+0
-9
README less more
0 SunPinyin
1 ===
2
3 SunPinyin is an SLM (Statistical Language Model) based input method
4 engine. To model the Chinese language, it use a backoff bigram and
5 trigram language model.
6
7 Currently, SunPinyin 2.0 is available on IBus, SCIM, and as a
8 standalone XIM Server.
0 SunPinyin
1 ===
2
3 SunPinyin is an SLM (Statistical Language Model) based input method
4 engine. To model the Chinese language, it uses a backoff bigram and
5 trigram language model.
6
7 Currently, SunPinyin 2.0 is available on IBus, SCIM, and as a
8 standalone XIM Server.
9
10 [![Build Status](https://travis-ci.org/sunpinyin/sunpinyin.svg?branch=master)](https://travis-ci.org/sunpinyin/sunpinyin)
11 import os
22 import sys
33
4 version="2.0.4"
4
5 version = "2.0.4"
56 abi_major = 3
67 abi_minor = 0
78
1213 'src/slm/ids2ngram/idngram_merge.cpp',
1314 'src/slm/mmseg/mmseg.cpp',
1415 'src/slm/tslminfo/tslminfo.cpp',
15 'src/slm/tslmpack/arpa_slm.cpp',
16 'src/slm/tslmpack/arpa_conv.cpp',
17 'src/slm/tslmpack/slmpack.cpp',
1816 'src/slm/slm.cpp',
1917 'src/slm/slminfo/slminfo.cpp',
18 'src/slm/slmpack/arpa_slm.cpp',
19 'src/slm/slmpack/slmpack.cpp',
2020 'src/slm/sim_sen.cpp',
2121 'src/slm/sim_slm.cpp',
2222 'src/slm/getwordfreq/getwordfreq.cpp',
2626 'src/slm/thread/ValueCompress.cpp',
2727 'src/slm/slmbuild/slmbuild.cpp',
2828 'src/slm/slmprune/slmprune.cpp',
29 'src/slm/sim_slmbuilder.cpp',
29 'src/slm/slmbuild/sim_slmbuilder.cpp',
3030 'src/slm/tslmendian/slm_endian.cpp',
3131 'src/slm/tslmendian/writer.cpp',
3232 'src/slm/tslmendian/slm_file.cpp',
6666 'src/slm/ids2ngram/idngram.h',
6767 'src/slm/ids2ngram/idngram_merge.h',
6868 'src/slm/slm.h',
69 'src/slm/tslmpack/arpa_slm.h',
70 'src/slm/tslmpack/common.h',
71 'src/slm/tslmpack/arpa_conv.h',
7269 'src/slm/sim_dict.h',
7370 'src/slm/sim_sen.h',
7471 'src/slm/sim_slm.h',
7572 'src/slm/thread/ValueCompress.h',
7673 'src/slm/sim_fmerge.h',
77 'src/slm/sim_slmbuilder.h',
74 'src/slm/slmbuild/sim_slmbuilder.h',
75 'src/slm/slmpack/arpa_slm.h',
76 'src/slm/slmpack/common.h',
7877 'src/slm/tslmendian/slm_file.h',
7978 'src/slm/tslmendian/writer.h',
8079 'src/lexicon/pytrie_gen.h',
132131 'src/slmthread',
133132 'src/tslmendian',
134133 'src/tslminfo',
135 'src/tslmpack',
134 'src/slmpack',
136135 'src/genpyt',
137136 'src/getwordfreq',
138137 'src/sunpinyin-dictgen',
149148 'man/slmthread.1',
150149 'man/tslmendian.1',
151150 'man/tslminfo.1',
152 'man/tslmpack.1',
151 'man/slmpack.1',
153152 'man/genpyt.1',
154153 'man/getwordfreq.1',
155154 ]
186185 opts.Add('DATADIR', default='/usr/local/share')
187186 opts.Add('ENABLE_PLUGINS', default=False)
188187
189 #
190 #==============================environment==============================
188
189 #
190 # ==============================environment==============================
191191 #
192192 #
193193 def allinc():
194 inc=[]
195 for root, dirs, files in os.walk('src'):
196 inc.append(root)
197 return inc
194 return [root for root, _, _ in os.walk('src')]
195
198196
199197 def GetOS():
200198 return platform.uname()[0]
199
201200
202201 def CreateEnvironment():
203202 make = 'make'
204203 wget = 'wget'
204 w3m = 'wget -q -O -'
205205 tar = 'tar'
206206 if GetOS() == 'Darwin':
207207 wget = 'curl -O'
208 w3m = 'curl -s'
208209 elif GetOS() == 'FreeBSD':
209210 make = 'gmake'
210211 wget = 'fetch'
212 w3m = 'fetch -o -'
211213 elif GetOS() == 'SunOS':
212214 make = 'gmake'
213215 tar = 'gtar'
214216 libln_builder = Builder(action='cd ${TARGET.dir} && ln -s ${SOURCE.name} ${TARGET.name}')
215 env = Environment(ENV = os.environ, CFLAGS = cflags, CXXFLAGS = cflags,
216 MAKE = make, WGET = wget, TAR = tar,
217 CPPPATH = ['.'] + allinc(),
218 tools = ['default', 'textfile'])
219 env.Append(BUILDERS = {'InstallAsSymlink': libln_builder})
217 env = Environment(ENV=os.environ, CFLAGS=cflags, CXXFLAGS='',
218 MAKE=make, WGET=wget, W3M=w3m, TAR=tar,
219 CPPPATH=['.'] + allinc(),
220 tools=['default', 'textfile'])
221 env.Append(BUILDERS={'InstallAsSymlink': libln_builder})
220222 env['ENDIANNESS'] = "be" if sys.byteorder == "big" else "le"
221223 return env
224
222225
223226 def PassVariables(envvar, env):
224227 for (x, y) in envvar:
275278 if GetOption('rpath') is not None and GetOS() != 'Darwin':
276279 env.MergeFlags('-Wl,-R -Wl,%s' % GetOption('rpath'))
277280
278 #
279 #==============================configure================================
281
282 #
283 # ==============================configure================================
280284 #
281285 def CheckPKGConfig(context, version='0.12.0'):
282286 context.Message('Checking for pkg-config... ')
284288 context.Result(ret)
285289 return ret
286290
291
287292 def CheckPKG(context, name):
288293 context.Message('Checking for %s... ' % name)
289294 ret = context.TryAction('pkg-config --exists \'%s\'' % name)[0]
290295 context.Result(ret)
291296 return ret
297
292298
293299 def CheckPython(context):
294300 context.Message('Checking for Python library...')
299305 '!python-config --libs'])
300306 return ret
301307
308
302309 def AppendEndianCheck(conf):
303310 conf.config_h_text += r'''
304311
312319 || defined(_POWER) || defined(__powerpc__) \
313320 || defined(__ppc__) || defined(__hpux) || defined(__hppa) \
314321 || defined(_MIPSEB) || defined(_POWER) \
315 || defined(__s390__) || (defined(__sh__) && defined(__BIG_ENDIAN__))
322 || defined(__s390__) || (defined(__sh__) && defined(__BIG_ENDIAN__)) \
323 || defined(__AARCH64EB__)
316324 # define WORDS_BIGENDIAN 1
317325
318326 #elif defined(__i386__) || defined(__i386) \
323331 || defined(__x86_64) || defined(__x86_64__) \
324332 || defined(_M_X64) || defined(__bfin__) \
325333 || defined(__alpha__) || defined(__ARMEL__) \
326 || defined(_MIPSEL) || (defined(__sh__) && defined(__LITTLE_ENDIAN__))
334 || defined(_MIPSEL) || (defined(__sh__) && defined(__LITTLE_ENDIAN__)) \
335 || defined(__AARCH64EL__)
327336 # undef WORDS_BIGENDIAN
328337
329338 #else
332341 '''
333342
334343 conf = env.Configure(clean=False, help=False, config_h='config.h',
335 custom_tests={'CheckPKGConfig' : CheckPKGConfig,
336 'CheckPKG' : CheckPKG,
344 custom_tests={'CheckPKGConfig': CheckPKGConfig,
345 'CheckPKG': CheckPKG,
337346 'CheckPython': CheckPython})
347
338348
339349 def DoConfigure():
340350 if GetOS() == 'Darwin':
358368 conf.CheckCHeader('assert.h')
359369 conf.CheckFunc('bind_textdomain_codeset')
360370 conf.CheckFunc('dcgettext')
361 conf.CheckCHeader('dlfcn.h')
362371 conf.CheckFunc('exp2')
363372 conf.CheckCHeader('fcntl.h')
364373 conf.CheckCHeader('getopt.h')
377386 conf.CheckFunc('mmap')
378387 conf.CheckFunc('munmap')
379388 conf.CheckFunc('setlocale')
380 conf.CheckFunc('strndup')
381389 conf.CheckCHeader('sys/mman.h')
382390 conf.CheckCHeader('sys/param.h')
383391 conf.CheckCHeader('sys/stat.h')
404412 DoConfigure()
405413
406414 #
407 #==============================compile==============================
415 # ==============================compile==============================
408416 #
409417 env.Object(slmsource)
410418 env.Command('src/pinyin/quanpin_trie.h', 'python/quanpin_trie_gen.py',
415423 SConscript(['src/SConscript', 'man/SConscript', 'doc/SConscript'], exports='env')
416424
417425 env.Substfile('sunpinyin-2.0.pc.in', SUBST_DICT={
418 '@PREFIX@': env['PREFIX'],
419 '@LIBDIR@': env['LIBDIR'],
420 '@VERSION@': version,
421 '@CFLAGS@': reduce(lambda a, b: a + ' ' + b,
422 map(lambda x: '-I$${includedir}' + x[3:],
423 allinc())),
424 })
426 '@PREFIX@': env['PREFIX'],
427 '@LIBDIR@': env['LIBDIR'],
428 '@VERSION@': version,
429 '@CFLAGS@': reduce(lambda a, b: a + ' ' + b,
430 map(lambda x: '-I$${includedir}' + x[3:],
431 allinc())),
432 })
425433
426434 libname_default = '%ssunpinyin%s' % (env.subst('${SHLIBPREFIX}'),
427435 env.subst('${SHLIBSUFFIX}'))
436444 else:
437445 # TODO: add install_name on Darwin?
438446 lib = env.SharedLibrary('sunpinyin', source=imesource)
447
439448
440449 def DoInstall():
441450 lib_target = None
88 For developers and expert users
99 -------------------------------
1010
11 Get `lm_sc.t3g.arpa.tar.bz2' and `dict.utf8.tar.bz2' from [1],
11 Get `lm_sc.3gm.arpa.tar.bz2' and `dict.utf8.tar.bz2' from [1] or [2],
1212 unpack them into some directory, and issue the following commands in
1313 that directory:
1414
1919 # Install the generated data files (requires root permission).
2020 make install
2121
22 [1] https://code.google.com/p/open-gram/downloads
22 [1] https://open-gram.googlecode.com/git/
23 [2] http://sourceforge.net/projects/open-gram/files/
2324
2425 -*- indent-tabs-mode: nil -*- vim:et:ts=4
77 ENDIANNESS = @ENDIANNESS@
88 endif
99
10 DICT_FILE = dict.utf8
10 SLM_TARGET = lm_sc
1111
12 SLM_TARGET = lm_sc
13 TSLM2_TEXT_FILE = ${SLM_TARGET}.t2g.arpa
12 SLM2_TEXT_FILE = ${SLM_TARGET}.2gm.arpa
13 SLM2_FILE = ${SLM_TARGET}.2gm
1414 TSLM2_ORIG_FILE = ${SLM_TARGET}.t2g.orig
1515 TSLM2_DIST_FILE = ${SLM_TARGET}.t2g
16 TSLM3_TEXT_FILE = ${SLM_TARGET}.t3g.arpa
16
17 SLM3_TEXT_FILE = ${SLM_TARGET}.3gm.arpa
18 SLM3_FILE = ${SLM_TARGET}.3gm
1719 TSLM3_ORIG_FILE = ${SLM_TARGET}.t3g.orig
1820 TSLM3_DIST_FILE = ${SLM_TARGET}.t3g
1921
22 DICT_FILE = dict.utf8
2023 PYTRIE_FILE = pydict_sc.bin
2124 PYTRIE_LOG_FILE = pydict_sc.log
2225
2528 all: slm3_dist
2629 install: slm3_install
2730
31 slm2: ${SLM2_FILE}
32 ${SLM2_FILE}: ${SLM2_TEXT_FILE} ${DICT_FILE}
33 slmpack $^ $@
34
2835 tslm2_orig: ${TSLM2_ORIG_FILE}
29 ${TSLM2_ORIG_FILE}: ${DICT_FILE} ${TSLM2_TEXT_FILE}
30 tslmpack ${TSLM2_TEXT_FILE} ${DICT_FILE} $@
36 ${TSLM2_ORIG_FILE}: ${SLM2_FILE}
37 slmthread $^ $@
3138
3239 tslm2_dist: ${TSLM2_DIST_FILE}
3340 ${TSLM2_DIST_FILE}: ${TSLM2_ORIG_FILE}
3744 genpyt -e ${ENDIANNESS} -i ${DICT_FILE} -s ${TSLM2_ORIG_FILE} \
3845 -l ${PYTRIE_LOG_FILE} -o ${PYTRIE_FILE}
3946
47 slm3: ${SLM3_FILE}
48 ${SLM3_FILE}: ${SLM3_TEXT_FILE} ${DICT_FILE}
49 slmpack $^ $@
50
4051 tslm3_orig: ${TSLM3_ORIG_FILE}
41 ${TSLM3_ORIG_FILE}: ${DICT_FILE} ${TSLM3_TEXT_FILE}
42 tslmpack ${TSLM3_TEXT_FILE} ${DICT_FILE} $@
52 ${TSLM3_ORIG_FILE}: ${SLM3_FILE}
53 slmthread $^ $@
4354
4455 tslm3_dist: ${TSLM3_DIST_FILE}
4556 ${TSLM3_DIST_FILE}: ${TSLM3_ORIG_FILE}
100100 rm -f ${BIGRAM_STAT} ${SLM2_RAW_FILE}
101101 rm -f ${TRIGRAM_STAT} ${SLM3_RAW_FILE}
102102
103 mmseg_bigram: mmseg_ids tslm2_info
104 mmseg_trigram: mmseg_ids tslm3_info
105 slm_bigram: slm2_ids tslm2_info
106 slm_trigram: slm3_ids tslm3_info
103 mmseg_bigram: mmseg_ids slm2_info
104 mmseg_trigram: mmseg_ids slm3_info
105 slm_bigram: slm2_ids slm2_info
106 slm_trigram: slm3_ids slm3_info
107107
108108 bootstrap2:
109109 make mmseg_bigram
00 import os
11 Import('env')
22
3 pod2man = Builder(action = 'pod2man < $SOURCE > $TARGET')
3 pod2man = Builder(action = 'pod2man $SOURCE $TARGET')
44 env.Append(BUILDERS = {'Man': pod2man})
55
66 env.Man('mmseg.1', 'mmseg.pod')
1010 env.Man('slmbuild.1', 'slmbuild.pod')
1111 env.Man('slmprune.1', 'slmprune.pod')
1212 env.Man('slminfo.1', 'slminfo.pod')
13 env.Man('slmpack.1', 'slmpack.pod')
1314 env.Man('slmthread.1', 'slmthread.pod')
1415 env.Man('tslmendian.1', 'tslmendian.pod')
1516 env.Man('tslminfo.1', 'tslminfo.pod')
16 env.Man('tslmpack.1', 'tslmpack.pod')
1717 env.Man('genpyt.1', 'genpyt.pod')
1818 env.Man('getwordfreq.1', 'getwordfreq.pod')
1919
0 =head1 NAME
1
2 slmpack - convert the ARPA format of SunPinyin back-off language model to its binary representation
3
4 =head1 SYNOPSIS
5
6 B<slmpack> I<arpa_file> I<dict_file> I<binary_slm_file>
7
8 =head1 DESCRIPTION
9
10 B<slmpack> converts the ARPA format of a SunPinyin back-off language
11 model to its binary representation.
12
13 =head1 NOTE
14
15 If you convert a language model to ARPA format using B<slminfo>, and
16 then convert it back using B<slmpack>, the checksum of the generated
17 binary file may differ from that of the original. The reason is that
18 the padding bits in the n-gram instances are not initialized before
19 the data is written out.
20
21 =head1 AUTHOR
22
23 Originally written by Kov.Chai E<lt>tchaikov@gmail.comE<gt>.
24 Currently maintained by Kov.Chai E<lt>tchaikov@gmail.comE<gt>.
25
26 =head1 SEE ALSO
27
28 B<slminfo>(1).
29
30 =for comment
31 -*- indent-tabs-mode: nil -*- vim:et:ts=4
2323
2424 =item B<-i> I<input-lm-file>
2525
26 Identify the input file of convert. Generally, this file is generated by B<slmthread> or B<tslmpack>.
27
26 Specify the input file to convert. Generally, this file is generated by B<slmthread>.
2827
2928 =item B<-o> I<out-lm-file>
3029
4544
4645 =head1 SEE ALSO
4746
48 B<slmthread>(1). B<tslminfo>, B<tslmpack>.
47 B<slmthread>(1). B<tslminfo>.
4948
5049 =for comment
5150 -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-33
man/tslmpack.pod less more
0 =head1 NAME
1
2 tslmpack - convert the ARPA format of SunPinyin back-off language model to its binary representation
3
4 =head1 SYNOPSIS
5
6 B<tslmpack> I<arpa_file> I<dict_file> I<binary_slm_file>
7
8 =head1 DESCRIPTION
9
10 B<tslmpack> converts the ARPA format of a threaded SunPinyin back-off
11 language model to its binary representation.
12
13 =head1 NOTE
14
15 If you convert a language model to ARPA format using B<tslminfo>, and
16 then convert it back using B<tslmpack>, the check-sum of generated
17 binary file may be different from that of the original one. The reason
18 is the padding bits in the n-gram instances are not initialized before
19 writing the data out.
20
21
22 =head1 AUTHOR
23
24 Originally written by Kov.Chai E<lt>tchaikov@gmail.comE<gt>.
25 Currently maintained by Kov.Chai E<lt>tchaikov@gmail.comE<gt>.
26
27 =head1 SEE ALSO
28
29 B<tslminfo>(1).
30
31 =for comment
32 -*- indent-tabs-mode: nil -*- vim:et:ts=4
00 import os
1
2
13 Import('env')
24
35 env.Program('mmseg', ['portability.o', 'slm/sim_dict.o', 'slm/sim_sen.o',
46 'slm/mmseg/mmseg.o'])
57
68 env.Program('slmseg', ['portability.o', 'slm/sim_dict.o', 'slm/sim_sen.o',
7 'slm/slm.o', 'slm/slmseg/slmseg.o'])
9 'slm/slm.o', 'slm/slmseg/slmseg.o'])
810
911 env.Program('ids2ngram', ['portability.o', 'slm/ids2ngram/ids2ngram.o'])
1012
1113 env.Program('idngram_merge', ['portability.o', 'slm/ids2ngram/idngram_merge.o'])
1214
13 env.Program('slmbuild', ['portability.o', 'slm/sim_slmbuilder.o',
15 env.Program('slmbuild', ['portability.o', 'slm/slmbuild/sim_slmbuilder.o',
1416 'slm/slmbuild/slmbuild.o'])
1517
1618 env.Program('slmprune', ['portability.o', 'slm/sim_slm.o',
1719 'slm/slmprune/slmprune.o'])
1820
1921 env.Program('slminfo', ['portability.o', 'slm/slminfo/slminfo.o'])
22
23 env.Program('slmpack', ['portability.o', 'slm/sim_slm.o', 'slm/slmpack/slmpack.o',
24 'slm/slmpack/arpa_slm.o'])
2025
2126 env.Program('slmthread', ['portability.o', 'slm/sim_slm.o',
2227 'slm/thread/ValueCompress.o', 'slm/thread/slmthread.o'])
2732
2833 env.Program('tslminfo', ['portability.o', 'slm/slm.o', 'slm/tslminfo/tslminfo.o'])
2934
30 env.Program('tslmpack', ['portability.o', 'slm/slm.o',
31 'slm/thread/ValueCompress.o', 'slm/tslmpack/slmpack.o',
32 'slm/tslmpack/arpa_conv.o', 'slm/tslmpack/arpa_slm.o'])
33
3435 env.Program('genpyt', ['portability.o', 'slm/slm.o', 'slm/tslmendian/writer.o',
35 'lexicon/trie_writer.o', 'lexicon/genpyt.o',
36 'lexicon/pytrie.o', 'lexicon/pytrie_gen.o',
37 'pinyin/pinyin_data.o'])
36 'lexicon/trie_writer.o', 'lexicon/genpyt.o',
37 'lexicon/pytrie.o', 'lexicon/pytrie_gen.o',
38 'pinyin/pinyin_data.o'])
3839
3940 env.Program('getwordfreq', ['portability.o', 'slm/slm.o',
4041 'slm/getwordfreq/getwordfreq.o'])
4142
4243 env.Program('testvc', ['slm/thread/ValueCompress.o', 'slm/thread/test_vc.o'])
4344
44 env.Substfile('sunpinyin-dictgen.mk.in', SUBST_DICT = {
45 env.Substfile('sunpinyin-dictgen.mk.in', SUBST_DICT={
4546 '@MAKE@': env['MAKE'],
4647 '@TAR@': env['TAR'],
4748 '@WGET@': env['WGET'],
49 '@W3M@': env['W3M'],
4850 '@DATADIR@': env['DATADIR'],
4951 '@ENDIANNESS@': env['ENDIANNESS'],
50 })
52 })
5153 env.Command('sunpinyin-dictgen', 'sunpinyin-dictgen.mk', [
52 Copy("$TARGET", "$SOURCE"),
53 Chmod("$TARGET", 0755),
54 ])
54 Copy("$TARGET", "$SOURCE"),
55 Chmod("$TARGET", 0755),
56 ])
5557
5658 # -*- indent-tabs-mode: nil -*- vim:et:ts=4
3434 * to such option by the copyright holder.
3535 */
3636
37 #ifdef MACOSX
38 #include <Python/Python.h>
39 #else
3740 #include <Python.h>
41 #endif
42
3843 #include <signal.h>
3944 #include <sstream>
4045
193193 && !m_pIC->isEmpty()) {
194194 changeMasks |= KEYEVENT_USED;
195195 if (m_candiPageFirst > 0) {
196 m_candiPageFirst -= m_candiWindowSize;
197 if (m_candiPageFirst < 0) m_candiPageFirst = 0;
196 if (m_candiPageFirst > m_candiWindowSize) {
197 m_candiPageFirst -= m_candiWindowSize;
198 } else {
199 m_candiPageFirst = 0;
200 }
198201 changeMasks |= CANDIDATE_MASK;
199202 }
200203 } else if (((modifiers == 0 && keycode == IM_VK_PAGE_DOWN)
6161 return true;
6262
6363 if (m_start == other.m_start)
64 return m_len < m_len;
64 return m_len < other.m_len;
6565
6666 return false;
6767 }
193193 const unsigned char *src = (const unsigned char*)s;
194194 TWCHAR* dst = pwcs;
195195
196 while (dst - pwcs < n) {
196 while (dst - pwcs < (ssize_t)n) {
197197 if (*src < 0xc0 || *src >= 0xfe) {
198198 if (*src < 0x80) *dst++ = *src;
199199 if (*src++ == 0) break;
264264 return sz;
265265 }
266266
267 #if !defined (HAVE_STRNDUP)
268 extern "C" char *
269 strndup(const char *s, size_t n)
270 {
271 size_t nMost;
272 char *p = NULL;
273
274 if (!s)
275 return NULL;
276
277 #ifdef __cplusplus
278 nMost = std::min(strlen(s) + 1, n + 1);
279 #else
280 nMost = min(strlen(s) + 1, n + 1);
281 #endif
282 p = (char*)malloc(nMost);
283 memcpy(p, s, nMost);
284 p[nMost - 1] = '\0';
285
286 return p;
287 }
288 #endif //HAVE_STRNDUP
289
290267 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
325325 }
326326 #endif
327327
328 #if !defined (HAVE_STRNDUP)
329 extern "C" char *strndup(const char *s, size_t n);
330 #endif //HAVE_STRNDUP
331
332328 #endif
333329
334330 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-634
src/slm/sim_slmbuilder.cpp less more
0 /*
1 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
2 *
3 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
4 *
5 * The contents of this file are subject to the terms of either the GNU Lesser
6 * General Public License Version 2.1 only ("LGPL") or the Common Development and
7 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
8 * file except in compliance with the License. You can obtain a copy of the CDDL at
9 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
10 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11 * specific language governing permissions and limitations under the License. When
12 * distributing the software, include this License Header Notice in each file and
13 * include the full text of the License in the License file as well as the
14 * following notice:
15 *
16 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
17 * (CDDL)
18 * For Covered Software in this distribution, this License shall be governed by the
19 * laws of the State of California (excluding conflict-of-law provisions).
20 * Any litigation relating to this License shall be subject to the jurisdiction of
21 * the Federal Courts of the Northern District of California and the state courts
22 * of the State of California, with venue lying in Santa Clara County, California.
23 *
24 * Contributor(s):
25 *
26 * If you wish your version of this file to be governed by only the CDDL or only
27 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
28 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
29 * license." If you don't indicate a single choice of license, a recipient has the
30 * option to distribute your version of this file under either the CDDL or the LGPL
31 * Version 2.1, or to extend the choice of license to its licensees as provided
32 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
33 * Version 2 license, then the option applies only if the new code is made subject
34 * to such option by the copyright holder.
35 */
36
37 #ifdef HAVE_CONFIG_H
38 #include "config.h"
39 #endif
40
41 #ifdef HAVE_ASSERT_H
42 #include <assert.h>
43 #endif
44
45 #include <stdlib.h>
46 #include <math.h>
47 #include <vector>
48 #include <algorithm>
49
50 #include "sim_slmbuilder.h"
51
52 void
53 CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
54 {
55 if (dis != NULL)
56 delete [] dis;
57 dis = new double[--n];
58 if (thres > n) thres = n;
59 for (int freq = 1; freq < n; ++freq) {
60 if (nr[freq] == 0 || nr[freq + 1] == 0)
61 dis[freq] = 1.0;
62 else
63 dis[freq] = double(nr[freq + 1]) / nr[freq];
64 printf("%lf ", dis[freq]); fflush(stdout);
65 }
66 }
67
68 double
69 CSlmGTDiscounter::discount(int freq)
70 {
71 double newfreq = freq * ((freq < thres) ? dis[freq] : hd);
72 if (newfreq >= double(freq))
73 newfreq = freq * hd;
74 return newfreq;
75 }
76
77 void
78 CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
79 {
80 // normally, c should not greater than 1.0, yet when cut-off is used, it could be so.
81 if (c <= 0.0) {
82 c = double(nr[1]) / (nr[1] + 2.0 * nr[2]);
83 printf("parameter c=%lf", c); fflush(stdout);
84 } else {
85 printf("Using given parameter c=%lf", c); fflush(stdout);
86 }
87 }
88
89 double
90 CSlmAbsoluteDiscounter::discount(int freq)
91 {
92 return (freq > 0) ? (freq - c) : (0.0);
93 }
94
95 void
96 CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
97 {
98 if (dis <= 0.0 || dis >= 1.0) {
99 dis = 1.0 - double(nr[1]) / nr[0];
100 printf("parameter d=%lf", dis); fflush(stdout);
101 } else {
102 printf("Using given parameter d=%lf", dis); fflush(stdout);
103 }
104 }
105
106 double
107 CSlmLinearDiscounter::discount(int freq)
108 {
109 return freq * dis;
110 }
111
112 // n=1 for unigram, n=2 for bigram;
113 // level[0] is for the pseudo 0-gram, ...
114 void
115 CSlmBuilder::Create(int n)
116 {
117 assert(n != 0);
118 nlevel = n;
119 level = new void * [n + 1];
120 for (int i = 0; i < n; ++i) {
121 level[i] = new std::vector<TNode>;
122 if (i) ((TNodeLevel*)level[i])->reserve(1024);
123 }
124 //Add leaf level
125 level[n] = new std::vector<TLeaf>;
126 ((TLeafLevel*)level[n])->reserve(1024);
127
128 //Add psuedo root node
129 ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0));
130
131 //Initialize the nr[n+1][SLM_MAX_R] 2-D array
132 nr = new FREQ_TYPE[n + 1][SLM_MAX_R];
133 for (int lvl = 0; lvl < n + 1; ++lvl)
134 for (int r = 0; r < SLM_MAX_R; ++r)
135 nr[lvl][r] = 0;
136 }
137
138 void
139 CSlmBuilder::SetCut(FREQ_TYPE threshold[])
140 {
141 if (cut != NULL)
142 delete [] cut;
143 cut = new FREQ_TYPE[nlevel + 1];
144 for (int i = 0; i < nlevel; ++i)
145 cut[i + 1] = threshold[i];
146 }
147
148 void
149 CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[])
150 {
151 if (discounter != NULL)
152 delete [] discounter;
153 discounter = new CSlmDiscounter* [nlevel + 1];
154 for (int i = 0; i < nlevel; ++i)
155 discounter[i + 1] = dis[i];
156 }
157
158 void
159 CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[])
160 {
161 breaker.clear();
162 for (int i = 0; i < nId; ++i)
163 breaker.push_back(brks[i]);
164 std::make_heap(breaker.begin(), breaker.end());
165 std::sort_heap(breaker.begin(), breaker.end());
166 }
167
168 void
169 CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[])
170 {
171 m_excludes.clear();
172 for (int i = 0; i < nId; ++i)
173 m_excludes.push_back(excludes[i]);
174 std::make_heap(m_excludes.begin(), m_excludes.end());
175 std::sort_heap(m_excludes.begin(), m_excludes.end());
176 }
177
178 bool
179 CSlmBuilder::isBreakId(TSIMWordId id)
180 {
181 return std::binary_search(breaker.begin(), breaker.end(), id);
182 }
183
184 bool
185 CSlmBuilder::isExcludeId(TSIMWordId id)
186 {
187 return std::binary_search(m_excludes.begin(), m_excludes.end(), id);
188 }
189
190 void
191 CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr)
192 {
193 int ch;
194 bool brk = isExcludeId(*ngram);
195
196 for (int i = 1; i < nlevel; ++i) {
197 TNodeLevel* pnl = (TNodeLevel*)(level[i]);
198 if (pnl->capacity() == pnl->size()) {
199 size_t newsz = 2 * pnl->capacity();
200 if (pnl->capacity() > 1024 * 1024)
201 newsz = pnl->capacity() + 1024 * 1024;
202 pnl->reserve(newsz);
203 }
204 }
205 TLeafLevel* pll = (TLeafLevel*)(level[nlevel]);
206 if (pll->capacity() == pll->size()) {
207 size_t newsz = 2 * pll->capacity();
208 if (pll->capacity() > 1024 * 1024)
209 newsz = pll->capacity() + 1024 * 1024;
210 pll->reserve(newsz);
211 }
212
213 if (!brk)
214 (*(TNodeLevel*)(level[0]))[0].freq += fr;
215
216 bool branch = false;
217 for (int i = 1; (!brk && i < nlevel); ++i) {
218 std::vector<TNode> & pv = *(TNodeLevel*)(level[i - 1]);
219 std::vector<TNode> & v = *(TNodeLevel*)(level[i]);
220 branch = branch || (pv.back().child >= (int) v.size()) ||
221 (v.back().id != ngram[i - 1]);
222 if (branch) {
223 if (i == nlevel - 1)
224 ch = ((TLeafLevel*)(level[i + 1]))->size();
225 else
226 ch = ((TNodeLevel*)(level[i + 1]))->size();
227 v.push_back(TNode(ngram[i - 1], ch, fr));
228 } else {
229 v.back().freq += fr;
230 }
231 brk = (i > 1 && isBreakId(ngram[i - 1])) || isExcludeId(ngram[i]);
232 }
233
234 // Insert to the leaf level
235 if (!brk) {
236 if (fr > cut[nlevel]) {
237 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
238 v.push_back(TLeaf(ngram[nlevel - 1], fr));
239 } else {
240 nr[nlevel][0] += fr;
241 nr[nlevel][fr] += fr;
242 }
243 }
244 }
245
246 void
247 CSlmBuilder::CountNr()
248 {
249 printf("\nCounting Nr..."); fflush(stdout);
250 for (int lvl = 1; lvl < nlevel; ++lvl) {
251 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
252 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
253 FREQ_TYPE freq = it->freq;
254 nr[lvl][0] += freq;
255 if (freq < (int) SLM_MAX_R && freq > 0)
256 nr[lvl][freq] += freq;
257 }
258 }
259 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
260 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
261 FREQ_TYPE freq = it->freq;
262 nr[nlevel][0] += freq;
263 if (freq < (int) SLM_MAX_R && freq > 0)
264 nr[nlevel][freq] += freq;
265 }
266 printf("\n"); fflush(stdout);
267 }
268
269 int
270 CSlmBuilder::CutLeafLevel(TNodeIterator pfirst,
271 TNodeIterator plast,
272 TLeafIterator chfirst,
273 TLeafIterator chlast,
274 int thred)
275 {
276 int idxfirst, idxchk;
277 TLeafIterator chchk = chfirst;
278 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
279 //do not cut item whoese 1. freq > thred; 2. psuedo tail
280 if ((int) chchk->freq > thred || (chchk + 1) == chlast) {
281 if (idxfirst < idxchk)
282 *chfirst = *chchk;
283 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
284 pfirst->child = idxfirst;
285 ++idxfirst;
286 ++chfirst;
287 }
288 }
289 assert(pfirst == plast);
290 return idxfirst;
291 }
292
293 int
294 CSlmBuilder::CutNodeLevel(TNodeIterator pfirst,
295 TNodeIterator plast,
296 TNodeIterator chfirst,
297 TNodeIterator chlast,
298 int thred)
299 {
300 int idxfirst, idxchk;
301 TNodeIterator chchk = chfirst;
302 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
303 //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children
304 TNodeIterator chnext = chchk + 1;
305 if ((int) chchk->freq > thred || chnext == chlast ||
306 (chnext->child != chchk->child)) {
307 if (idxfirst < idxchk)
308 *chfirst = *chchk;
309 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
310 pfirst->child = idxfirst;
311 ++idxfirst;
312 ++chfirst;
313 }
314 }
315 assert(pfirst == plast);
316 return idxfirst;
317 }
318
319 void
320 CSlmBuilder::Cut()
321 {
322 printf("\nCuting according freq..."); fflush(stdout);
323 for (int lvl = nlevel; lvl > 0; --lvl) {
324 printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]);
325 fflush(stdout);
326 TNodeLevel& parent = *(TNodeLevel*)(level[lvl - 1]);
327 if (lvl == nlevel) {
328 if (cut[lvl] > 0) {
329 TLeafLevel& v = *(TLeafLevel*)(level[lvl]);
330 int newsize = CutLeafLevel(parent.begin(),
331 parent.end(), v.begin(),
332 v.end(), cut[lvl]);
333 v.erase(v.begin() + newsize, v.end());
334 }
335 } else {
336 if (cut[lvl] > 0) {
337 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
338 int newsize = CutNodeLevel(parent.begin(),
339 parent.end(), v.begin(),
340 v.end(), cut[lvl]);
341 v.erase(v.begin() + newsize, v.end());
342 }
343 }
344 }
345 printf("\n"); fflush(stdout);
346 }
347
348 void
349 CSlmBuilder::AppendTails()
350 {
351 printf("\nAppending psuedo tail node for each level..."); fflush(stdout);
352 for (int lvl = 0; lvl < nlevel; ++lvl) {
353 int child_size = 0;
354 if (lvl == nlevel - 1) {
355 child_size = ((TLeafLevel*)(level[lvl + 1]))->size();
356 } else {
357 child_size = ((TNodeLevel*)(level[lvl + 1]))->size();
358 }
359 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
360 v.push_back(TNode(0x00FFFFFF, child_size, 1));
361 }
362 //also make a psuedo tail node for the leaf level
363 ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1));
364 printf("\n"); fflush(stdout);
365 }
366
367 template<class TChildLevel>
368 void
369 DiscountOneLevel(CSlmBuilder::TNodeLevel& v,
370 TChildLevel& ch,
371 CSlmDiscounter* disc,
372 int bUseLogPr)
373 {
374 CSlmBuilder::TNodeIterator it = v.begin();
375 CSlmBuilder::TNodeIterator ite = v.begin() + (v.size() - 1);
376 for (; it != ite; ++it) { //do not calc the psuedo tail item
377 CSlmBuilder::TNodeIterator itnext = it + 1;
378 double root_freq = it->freq;
379 for (int h = it->child, t = itnext->child; h < t; ++h) {
380 double pr = disc->discount(ch[h].freq) / root_freq;
381 assert(pr > 0.0 && pr < 1.0);
382 if (bUseLogPr) {
383 ch[h].pr = CSlmBuilder::PR_TYPE(-log(pr));
384 } else {
385 ch[h].pr = CSlmBuilder::PR_TYPE(pr);
386 }
387 }
388 }
389 }
390
391 void
392 CSlmBuilder::Discount()
393 {
394 printf("\nDiscounting...");
395 for (int lvl = nlevel; lvl > 0; --lvl) {
396 printf("\n Initializing level %d's %s discount method: ",
397 lvl,
398 discounter[lvl]->getName());
399 discounter[lvl]->init(SLM_MAX_R, nr[lvl]);
400 }
401 printf("\n");
402 for (int lvl = nlevel - 1; lvl >= 0; --lvl) {
403 printf("\n Discounting level %d ...", lvl + 1); fflush(stdout);
404 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
405 if (lvl == nlevel - 1) { //its child is leaf
406 TLeafLevel& ch = *(TLeafLevel*)(level[lvl + 1]);
407 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
408 } else {
409 TNodeLevel& ch = *(TNodeLevel*)(level[lvl + 1]);
410 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
411 }
412 }
413 printf("\n Giving psuedo root level 0 a distribution...");
414 //make the psuedo 0-gram a equal distribution
415 TNodeLevel& v0 = *(TNodeLevel*)(level[0]);
416 if (bUseLogPr) {
417 v0[0].pr = PR_TYPE(-log(double(1.0) / m_nWord));
418 } else {
419 v0[0].pr = PR_TYPE(double(1.0) / m_nWord);
420 }
421 printf("\n"); fflush(stdout);
422 }
423
424 template<class chIterator>
425 double
426 CalcNodeBow(CSlmBuilder* builder,
427 int lvl,
428 TSIMWordId words[],
429 chIterator chh,
430 chIterator cht,
431 int bUseLogPr)
432 {
433 if (chh == cht) return 1.0;
434 double sumnext = 0.0, sum = 0.0;
435 for (; chh < cht; ++chh) {
436 if (bUseLogPr) {
437 sumnext += exp(-(chh->pr));
438 } else {
439 sumnext += double(chh->pr);
440 }
441 words[lvl + 1] = chh->id;
442 sum += builder->getPr(lvl, words + 2);
443 }
444 assert(sumnext > 0.0 && sumnext < 1.05);
445 assert(sum < 1.05 && sum > 0.0);
446 //消除计算误差的影响
447 if (sumnext >= 1.0 || sum >= 1.0) {
448 double bow = ((sumnext > sum) ? sumnext : sum) + 0.0001;
449 bow = (bow - sumnext) / (bow - sum);
450 printf(
451 "\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:",
452 sumnext,
453 sum,
454 bow,
455 lvl);
456 for (int i = 1; i <= lvl; ++i)
457 printf("%d ", words[i]);
458 return bow;
459 }
460 return (1.0 - sumnext) / (1.0 - sum);
461 }
462
463 void
464 CSlmBuilder::CalcBOW()
465 {
466 printf("\nCalculating Back-Off Weight...");
467 for (int lvl = 0; lvl < nlevel; ++lvl) {
468 printf("\n Processing level %d ", lvl); fflush(stdout);
469 TNode* base[16]; //it should be lvl+1, yet some compiler does not support it
470 int idx[16]; //it should be lvl+1, yet some compiler does not support it
471 for (int i = 0; i <= lvl; ++i) {
472 base[i] = &((*(TNodeLevel*)level[i])[0]);
473 idx[i] = 0;
474 }
475 TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it
476 int sz = ((TNodeLevel*)(level[lvl]))->size() - 1;
477 printf("(%d items)...", sz + 1); fflush(stdout);
478 for (; idx[lvl] < sz; ++idx[lvl]) {
479 words[lvl] = base[lvl][idx[lvl]].id;
480 for (int k = lvl - 1; k >= 0; --k) {
481 while (base[k][idx[k] + 1].child <= idx[k + 1])
482 ++idx[k];
483 words[k] = base[k][idx[k]].id;
484 }
485 TNode & node = base[lvl][idx[lvl]];
486 TNode & nodenext = *((&node) + 1);
487 double bow;
488 if (lvl == nlevel - 1) {
489 TLeaf * ch = &((*(TLeafLevel*)level[lvl + 1])[0]);
490 bow = CalcNodeBow(this,
491 lvl,
492 words,
493 ch + node.child,
494 ch + nodenext.child,
495 bUseLogPr);
496 } else {
497 TNode * ch = &((*(TNodeLevel*)level[lvl + 1])[0]);
498 bow = CalcNodeBow(this,
499 lvl,
500 words,
501 ch + node.child,
502 ch + nodenext.child,
503 bUseLogPr);
504 }
505 if (bUseLogPr) {
506 node.bow = PR_TYPE(-log(bow));
507 } else {
508 node.bow = PR_TYPE(bow);
509 }
510 }
511 }
512 printf("\n"); fflush(stdout);
513 }
514
515 double
516 CSlmBuilder::getPr(int n, TSIMWordId *words)
517 {
518 int lvl;
519 double bow = 1.0;
520 void* pnode = &((*(TNodeLevel*)level[0])[0]);
521
522 assert(n <= nlevel);
523
524 if (n == 0) {
525 if (bUseLogPr) {
526 return exp(-((TNode*)pnode)->pr);
527 } else {
528 return ((TNode*)pnode)->pr;
529 }
530 }
531
532 for (lvl = 0; pnode != NULL && lvl < n; ++lvl) {
533 if (bUseLogPr) {
534 bow = exp(-((TNode*)pnode)->bow);
535 } else {
536 bow = ((TNode*)pnode)->bow;
537 }
538 pnode = FindChild(lvl, (TNode*)pnode, words[lvl]);
539 }
540
541 if (pnode != NULL) { // find the whole string
542 if (bUseLogPr) {
543 return exp(-((TLeaf*)pnode)->pr);
544 } else {
545 return ((TLeaf*)pnode)->pr;
546 }
547 } else if (lvl == n - 1) { // only find the history
548 return bow * getPr(n - 1, words + 1);
549 } else { //even not find the history
550 return getPr(n - 1, words + 1);
551 }
552 }
553
554 void*
555 CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id)
556 {
557 int chh = root->child, cht = (root + 1)->child;
558 if (lvl == nlevel - 1) {
559 TLeaf* pleaf = &((*(TLeafLevel*)level[lvl + 1])[0]);
560 return (void*)binary_find(pleaf, chh, cht, TLeaf(id));
561 } else {
562 TNode* pnode = &((*(TNodeLevel*)level[lvl + 1])[0]);
563 return (void*)binary_find(pnode, chh, cht, TNode(id));
564 }
565 }
566
567 void
568 CSlmBuilder::Build()
569 {
570 CountNr();
571 AppendTails();
572 Cut();
573 Discount();
574 CalcBOW();
575 }
576
577 void
578 CSlmBuilder::Write(FILE *out)
579 {
580 fwrite(&nlevel, sizeof(nlevel), 1, out);
581 fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out);
582 for (int lvl = 0; lvl <= nlevel; ++lvl) {
583 int sz = 0;
584 if (lvl == nlevel)
585 sz = ((TLeafLevel*)(level[lvl]))->size();
586 else
587 sz = ((TNodeLevel*)(level[lvl]))->size();
588 fwrite(&sz, sizeof(sz), 1, out);
589 }
590 for (int lvl = 0; lvl < nlevel; ++lvl) {
591 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
592 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it)
593 fwrite(&(*it), sizeof(TNode), 1, out);
594 }
595 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
596 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it)
597 fwrite(&(*it), sizeof(TLeaf), 1, out);
598 }
599
600 void
601 CSlmBuilder::Close(void)
602 {
603 if (level != NULL) {
604 for (int lvl = 0; lvl <= nlevel; ++lvl) {
605 if (lvl == nlevel)
606 delete (TLeafLevel*)(level[lvl]);
607 else
608 delete (TNodeLevel*)(level[lvl]);
609 }
610 delete [] level;
611 level = NULL;
612 }
613 if (cut != NULL) {
614 delete [] cut;
615 cut = NULL;
616 }
617 if (discounter != NULL) {
618 for (int lvl = 1; lvl <= nlevel; ++lvl) {
619 delete discounter[lvl];
620 }
621 delete [] discounter;
622 discounter = NULL;
623 }
624 if (nr != NULL) {
625 delete [] nr;
626 nr = NULL;
627 }
628 breaker.clear();
629 m_nWord = 0;
630 nlevel = 0;
631 }
632
633 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-167
src/slm/sim_slmbuilder.h less more
0 // -*- mode: c++ -*-
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
5 *
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
15 * following notice:
16 *
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
18 * (CDDL)
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
24 *
25 * Contributor(s):
26 *
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
36 */
37
38 #ifndef _SIM_SLM_BUILDER_H
39 #define _SIM_SLM_BUILDER_H
40
41 #include "../portability.h"
42
43 #include "sim_slm.h"
44
45 class CSlmDiscounter;
46
47 class CSlmBuilder {
48 public:
49 static const int SLM_MAX_R = 16;
50 typedef CSIMSlm::FREQ_TYPE FREQ_TYPE;
51 typedef CSIMSlm::PR_TYPE PR_TYPE;
52 typedef CSIMSlm::TNode TNode;
53 typedef CSIMSlm::TLeaf TLeaf;
54
55 public:
56 CSlmBuilder()
57 : nlevel(0), bUseLogPr(0), level(NULL), m_nWord(0), cut(NULL),
58 discounter(NULL), nr(NULL), breaker(), m_excludes() { }
59 ~CSlmBuilder()
60 { Close(); }
61
62 void Create(int n);
63 void SetNumberOfWord(int nWord) { this->m_nWord = nWord; }
64 void SetCut(FREQ_TYPE threshold[]);
65 void SetDiscounter(CSlmDiscounter * dis[]);
66 void SetBreakerIds(int nId, TSIMWordId brks[]);
67 void SetExcludeIds(int nId, TSIMWordId excludes[]);
68 void SetUseLogPr(int bUse)
69 { bUseLogPr = bUse; }
70
71 void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr);
72 void Build();
73 void Write(FILE* out);
74 void Close();
75
76 //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels)
77 double getPr(int n, TSIMWordId* w);
78
79 public:
80 typedef std::vector<TNode> TNodeLevel;
81 typedef std::vector<TLeaf> TLeafLevel;
82 typedef TNodeLevel::iterator TNodeIterator;
83 typedef TLeafLevel::iterator TLeafIterator;
84
85 protected:
86 bool isBreakId(TSIMWordId id);
87 bool isExcludeId(TSIMWordId id);
88 void CountNr();
89 void AppendTails();
90 void Cut();
91 void Discount();
92 void CalcBOW();
93 void*FindChild(int lvl, TNode* root, TSIMWordId id);
94 int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
95 TNodeIterator chfirst, TNodeIterator chlast, int thred);
96 int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
97 TLeafIterator chfirst, TLeafIterator chlast, int thred);
98
99 private:
100 int nlevel, bUseLogPr;
101 void** level;
102 //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type
103
104 int m_nWord;
105 FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ...
106 CSlmDiscounter** discounter; // discounter[1] is for 1-gram...
107 FREQ_TYPE(*nr)[SLM_MAX_R]; //nr[1][SLM_MAX_R] is for 1-gram...
108 std::vector<TSIMWordId> breaker;
109 std::vector<TSIMWordId> m_excludes;
110 };
111
112 class CSlmDiscounter {
113 public:
114 virtual ~CSlmDiscounter() {}
115 // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr;
116 // nr[1] is number of ngram items with freq 1, ...
117 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0;
118
119 // freq is the ngram frequence, not the conditional pr
120 virtual double discount(int freq) = 0;
121 virtual const char* getName() = 0;
122 };
123
124 //Good-Turing discount
125 class CSlmGTDiscounter : public CSlmDiscounter {
126 public:
127 CSlmGTDiscounter(int threshold = 10, double highfreq_discount =
128 0.95) : thres(threshold), hd(highfreq_discount),
129 dis(NULL) {}
130 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
131 virtual double discount(int freq);
132 virtual const char* getName()
133 { return "Good-Turing"; }
134 protected:
135 int thres;
136 double hd;
137 double *dis;
138 };
139
140 class CSlmAbsoluteDiscounter : public CSlmDiscounter {
141 public:
142 CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {}
143 //c == 0 mean this value should be count according to r[]
144 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
145 virtual double discount(int freq); // return freq - c
146 virtual const char* getName()
147 { return "Absolution"; }
148 protected:
149 double c;
150 };
151
152 class CSlmLinearDiscounter : public CSlmDiscounter {
153 public:
154 CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {}
155 //dis == 0 mean this value should be count according to r[]
156 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
157 virtual double discount(int freq); // return freq * dis
158 virtual const char* getName()
159 { return "Linear"; }
160 protected:
161 double dis;
162 };
163
164 #endif
165
166 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
0 /*
1 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
2 *
3 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
4 *
5 * The contents of this file are subject to the terms of either the GNU Lesser
6 * General Public License Version 2.1 only ("LGPL") or the Common Development and
7 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
8 * file except in compliance with the License. You can obtain a copy of the CDDL at
9 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
10 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11 * specific language governing permissions and limitations under the License. When
12 * distributing the software, include this License Header Notice in each file and
13 * include the full text of the License in the License file as well as the
14 * following notice:
15 *
16 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
17 * (CDDL)
18 * For Covered Software in this distribution, this License shall be governed by the
19 * laws of the State of California (excluding conflict-of-law provisions).
20 * Any litigation relating to this License shall be subject to the jurisdiction of
21 * the Federal Courts of the Northern District of California and the state courts
22 * of the State of California, with venue lying in Santa Clara County, California.
23 *
24 * Contributor(s):
25 *
26 * If you wish your version of this file to be governed by only the CDDL or only
27 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
28 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
29 * license." If you don't indicate a single choice of license, a recipient has the
30 * option to distribute your version of this file under either the CDDL or the LGPL
31 * Version 2.1, or to extend the choice of license to its licensees as provided
32 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
33 * Version 2 license, then the option applies only if the new code is made subject
34 * to such option by the copyright holder.
35 */
36
37 #ifdef HAVE_CONFIG_H
38 #include "config.h"
39 #endif
40
41 #ifdef HAVE_ASSERT_H
42 #include <assert.h>
43 #endif
44
45 #include <stdlib.h>
46 #include <math.h>
47 #include <vector>
48 #include <algorithm>
49
50 #include "sim_slmbuilder.h"
51
52 void
53 CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
54 {
55 if (dis != NULL)
56 delete [] dis;
57 dis = new double[--n];
58 if (thres > n) thres = n;
59 for (int freq = 1; freq < n; ++freq) {
60 if (nr[freq] == 0 || nr[freq + 1] == 0)
61 dis[freq] = 1.0;
62 else
63 dis[freq] = double(nr[freq + 1]) / nr[freq];
64 printf("%lf ", dis[freq]); fflush(stdout);
65 }
66 }
67
68 double
69 CSlmGTDiscounter::discount(int freq)
70 {
71 double newfreq = freq * ((freq < thres) ? dis[freq] : hd);
72 if (newfreq >= double(freq))
73 newfreq = freq * hd;
74 return newfreq;
75 }
76
77 void
78 CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
79 {
80 // normally, c should not greater than 1.0, yet when cut-off is used, it could be so.
81 if (c <= 0.0) {
82 c = double(nr[1]) / (nr[1] + 2.0 * nr[2]);
83 printf("parameter c=%lf", c); fflush(stdout);
84 } else {
85 printf("Using given parameter c=%lf", c); fflush(stdout);
86 }
87 }
88
89 double
90 CSlmAbsoluteDiscounter::discount(int freq)
91 {
92 return (freq > 0) ? (freq - c) : (0.0);
93 }
94
95 void
96 CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
97 {
98 if (dis <= 0.0 || dis >= 1.0) {
99 dis = 1.0 - double(nr[1]) / nr[0];
100 printf("parameter d=%lf", dis); fflush(stdout);
101 } else {
102 printf("Using given parameter d=%lf", dis); fflush(stdout);
103 }
104 }
105
106 double
107 CSlmLinearDiscounter::discount(int freq)
108 {
109 return freq * dis;
110 }
111
112 // n=1 for unigram, n=2 for bigram;
113 // level[0] is for psuedo 0 gram, ...
114 void
115 CSlmBuilder::Create(int n)
116 {
117 assert(n != 0);
118 nlevel = n;
119 level = new void * [n + 1];
120 for (int i = 0; i < n; ++i) {
121 level[i] = new std::vector<TNode>;
122 if (i) ((TNodeLevel*)level[i])->reserve(1024);
123 }
124 //Add leaf level
125 level[n] = new std::vector<TLeaf>;
126 ((TLeafLevel*)level[n])->reserve(1024);
127
128 //Add psuedo root node
129 ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0));
130
131 //Initialize the nr[n+1][SLM_MAX_R] 2-D array
132 nr = new FREQ_TYPE[n + 1][SLM_MAX_R];
133 for (int lvl = 0; lvl < n + 1; ++lvl)
134 for (int r = 0; r < SLM_MAX_R; ++r)
135 nr[lvl][r] = 0;
136 }
137
138 void
139 CSlmBuilder::SetCut(FREQ_TYPE threshold[])
140 {
141 if (cut != NULL)
142 delete [] cut;
143 cut = new FREQ_TYPE[nlevel + 1];
144 for (int i = 0; i < nlevel; ++i)
145 cut[i + 1] = threshold[i];
146 }
147
148 void
149 CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[])
150 {
151 if (discounter != NULL)
152 delete [] discounter;
153 discounter = new CSlmDiscounter* [nlevel + 1];
154 for (int i = 0; i < nlevel; ++i)
155 discounter[i + 1] = dis[i];
156 }
157
158 void
159 CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[])
160 {
161 breaker.clear();
162 for (int i = 0; i < nId; ++i)
163 breaker.push_back(brks[i]);
164 std::make_heap(breaker.begin(), breaker.end());
165 std::sort_heap(breaker.begin(), breaker.end());
166 }
167
168 void
169 CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[])
170 {
171 m_excludes.clear();
172 for (int i = 0; i < nId; ++i)
173 m_excludes.push_back(excludes[i]);
174 std::make_heap(m_excludes.begin(), m_excludes.end());
175 std::sort_heap(m_excludes.begin(), m_excludes.end());
176 }
177
178 bool
179 CSlmBuilder::isBreakId(TSIMWordId id)
180 {
181 return std::binary_search(breaker.begin(), breaker.end(), id);
182 }
183
184 bool
185 CSlmBuilder::isExcludeId(TSIMWordId id)
186 {
187 return std::binary_search(m_excludes.begin(), m_excludes.end(), id);
188 }
189
190 void
191 CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr)
192 {
193 int ch;
194 bool brk = isExcludeId(*ngram);
195
196 for (int i = 1; i < nlevel; ++i) {
197 TNodeLevel* pnl = (TNodeLevel*)(level[i]);
198 if (pnl->capacity() == pnl->size()) {
199 size_t newsz = 2 * pnl->capacity();
200 if (pnl->capacity() > 1024 * 1024)
201 newsz = pnl->capacity() + 1024 * 1024;
202 pnl->reserve(newsz);
203 }
204 }
205 TLeafLevel* pll = (TLeafLevel*)(level[nlevel]);
206 if (pll->capacity() == pll->size()) {
207 size_t newsz = 2 * pll->capacity();
208 if (pll->capacity() > 1024 * 1024)
209 newsz = pll->capacity() + 1024 * 1024;
210 pll->reserve(newsz);
211 }
212
213 if (!brk)
214 (*(TNodeLevel*)(level[0]))[0].freq += fr;
215
216 bool branch = false;
217 for (int i = 1; (!brk && i < nlevel); ++i) {
218 std::vector<TNode> & pv = *(TNodeLevel*)(level[i - 1]);
219 std::vector<TNode> & v = *(TNodeLevel*)(level[i]);
220 branch = branch || (pv.back().child >= (int) v.size()) ||
221 (v.back().id != ngram[i - 1]);
222 if (branch) {
223 if (i == nlevel - 1)
224 ch = ((TLeafLevel*)(level[i + 1]))->size();
225 else
226 ch = ((TNodeLevel*)(level[i + 1]))->size();
227 v.push_back(TNode(ngram[i - 1], ch, fr));
228 } else {
229 v.back().freq += fr;
230 }
231 brk = (i > 1 && isBreakId(ngram[i - 1])) || isExcludeId(ngram[i]);
232 }
233
234 // Insert to the leaf level
235 if (!brk) {
236 if (fr > cut[nlevel]) {
237 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
238 v.push_back(TLeaf(ngram[nlevel - 1], fr));
239 } else {
240 nr[nlevel][0] += fr;
241 nr[nlevel][fr] += fr;
242 }
243 }
244 }
245
246 void
247 CSlmBuilder::CountNr()
248 {
249 printf("\nCounting Nr..."); fflush(stdout);
250 for (int lvl = 1; lvl < nlevel; ++lvl) {
251 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
252 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
253 FREQ_TYPE freq = it->freq;
254 nr[lvl][0] += freq;
255 if (freq < (int) SLM_MAX_R && freq > 0)
256 nr[lvl][freq] += freq;
257 }
258 }
259 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
260 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
261 FREQ_TYPE freq = it->freq;
262 nr[nlevel][0] += freq;
263 if (freq < (int) SLM_MAX_R && freq > 0)
264 nr[nlevel][freq] += freq;
265 }
266 printf("\n"); fflush(stdout);
267 }
268
269 int
270 CSlmBuilder::CutLeafLevel(TNodeIterator pfirst,
271 TNodeIterator plast,
272 TLeafIterator chfirst,
273 TLeafIterator chlast,
274 int thred)
275 {
276 int idxfirst, idxchk;
277 TLeafIterator chchk = chfirst;
278 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
279 //do not cut item whoese 1. freq > thred; 2. psuedo tail
280 if ((int) chchk->freq > thred || (chchk + 1) == chlast) {
281 if (idxfirst < idxchk)
282 *chfirst = *chchk;
283 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
284 pfirst->child = idxfirst;
285 ++idxfirst;
286 ++chfirst;
287 }
288 }
289 assert(pfirst == plast);
290 return idxfirst;
291 }
292
293 int
294 CSlmBuilder::CutNodeLevel(TNodeIterator pfirst,
295 TNodeIterator plast,
296 TNodeIterator chfirst,
297 TNodeIterator chlast,
298 int thred)
299 {
300 int idxfirst, idxchk;
301 TNodeIterator chchk = chfirst;
302 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
303 //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children
304 TNodeIterator chnext = chchk + 1;
305 if ((int) chchk->freq > thred || chnext == chlast ||
306 (chnext->child != chchk->child)) {
307 if (idxfirst < idxchk)
308 *chfirst = *chchk;
309 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
310 pfirst->child = idxfirst;
311 ++idxfirst;
312 ++chfirst;
313 }
314 }
315 assert(pfirst == plast);
316 return idxfirst;
317 }
318
319 void
320 CSlmBuilder::Cut()
321 {
322 printf("\nCuting according freq..."); fflush(stdout);
323 for (int lvl = nlevel; lvl > 0; --lvl) {
324 printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]);
325 fflush(stdout);
326 TNodeLevel& parent = *(TNodeLevel*)(level[lvl - 1]);
327 if (lvl == nlevel) {
328 if (cut[lvl] > 0) {
329 TLeafLevel& v = *(TLeafLevel*)(level[lvl]);
330 int newsize = CutLeafLevel(parent.begin(),
331 parent.end(), v.begin(),
332 v.end(), cut[lvl]);
333 v.erase(v.begin() + newsize, v.end());
334 }
335 } else {
336 if (cut[lvl] > 0) {
337 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
338 int newsize = CutNodeLevel(parent.begin(),
339 parent.end(), v.begin(),
340 v.end(), cut[lvl]);
341 v.erase(v.begin() + newsize, v.end());
342 }
343 }
344 }
345 printf("\n"); fflush(stdout);
346 }
347
348 void
349 CSlmBuilder::AppendTails()
350 {
351 printf("\nAppending psuedo tail node for each level..."); fflush(stdout);
352 for (int lvl = 0; lvl < nlevel; ++lvl) {
353 int child_size = 0;
354 if (lvl == nlevel - 1) {
355 child_size = ((TLeafLevel*)(level[lvl + 1]))->size();
356 } else {
357 child_size = ((TNodeLevel*)(level[lvl + 1]))->size();
358 }
359 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
360 v.push_back(TNode(0x00FFFFFF, child_size, 1));
361 }
362 //also make a psuedo tail node for the leaf level
363 ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1));
364 printf("\n"); fflush(stdout);
365 }
366
367 template<class TChildLevel>
368 void
369 DiscountOneLevel(CSlmBuilder::TNodeLevel& v,
370 TChildLevel& ch,
371 CSlmDiscounter* disc,
372 int bUseLogPr)
373 {
374 CSlmBuilder::TNodeIterator it = v.begin();
375 CSlmBuilder::TNodeIterator ite = v.begin() + (v.size() - 1);
376 for (; it != ite; ++it) { //do not calc the psuedo tail item
377 CSlmBuilder::TNodeIterator itnext = it + 1;
378 double root_freq = it->freq;
379 for (int h = it->child, t = itnext->child; h < t; ++h) {
380 double pr = disc->discount(ch[h].freq) / root_freq;
381 assert(pr > 0.0 && pr < 1.0);
382 if (bUseLogPr) {
383 ch[h].pr = CSlmBuilder::PR_TYPE(-log(pr));
384 } else {
385 ch[h].pr = CSlmBuilder::PR_TYPE(pr);
386 }
387 }
388 }
389 }
390
391 void
392 CSlmBuilder::Discount()
393 {
394 printf("\nDiscounting...");
395 for (int lvl = nlevel; lvl > 0; --lvl) {
396 printf("\n Initializing level %d's %s discount method: ",
397 lvl,
398 discounter[lvl]->getName());
399 discounter[lvl]->init(SLM_MAX_R, nr[lvl]);
400 }
401 printf("\n");
402 for (int lvl = nlevel - 1; lvl >= 0; --lvl) {
403 printf("\n Discounting level %d ...", lvl + 1); fflush(stdout);
404 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
405 if (lvl == nlevel - 1) { //its child is leaf
406 TLeafLevel& ch = *(TLeafLevel*)(level[lvl + 1]);
407 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
408 } else {
409 TNodeLevel& ch = *(TNodeLevel*)(level[lvl + 1]);
410 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
411 }
412 }
413 printf("\n Giving psuedo root level 0 a distribution...");
414 //make the psuedo 0-gram a equal distribution
415 TNodeLevel& v0 = *(TNodeLevel*)(level[0]);
416 if (bUseLogPr) {
417 v0[0].pr = PR_TYPE(-log(double(1.0) / m_nWord));
418 } else {
419 v0[0].pr = PR_TYPE(double(1.0) / m_nWord);
420 }
421 printf("\n"); fflush(stdout);
422 }
423
424 template<class chIterator>
425 double
426 CalcNodeBow(CSlmBuilder* builder,
427 int lvl,
428 TSIMWordId words[],
429 chIterator chh,
430 chIterator cht,
431 int bUseLogPr)
432 {
433 if (chh == cht) return 1.0;
434 double sumnext = 0.0, sum = 0.0;
435 for (; chh < cht; ++chh) {
436 if (bUseLogPr) {
437 sumnext += exp(-(chh->pr));
438 } else {
439 sumnext += double(chh->pr);
440 }
441 words[lvl + 1] = chh->id;
442 sum += builder->getPr(lvl, words + 2);
443 }
444 assert(sumnext > 0.0 && sumnext < 1.05);
445 assert(sum < 1.05 && sum > 0.0);
446 //消除计算误差的影响
447 if (sumnext >= 1.0 || sum >= 1.0) {
448 double bow = ((sumnext > sum) ? sumnext : sum) + 0.0001;
449 bow = (bow - sumnext) / (bow - sum);
450 printf(
451 "\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:",
452 sumnext,
453 sum,
454 bow,
455 lvl);
456 for (int i = 1; i <= lvl; ++i)
457 printf("%d ", words[i]);
458 return bow;
459 }
460 return (1.0 - sumnext) / (1.0 - sum);
461 }
462
463 void
464 CSlmBuilder::CalcBOW()
465 {
466 printf("\nCalculating Back-Off Weight...");
467 for (int lvl = 0; lvl < nlevel; ++lvl) {
468 printf("\n Processing level %d ", lvl); fflush(stdout);
469 TNode* base[16]; //it should be lvl+1, yet some compiler does not support it
470 int idx[16]; //it should be lvl+1, yet some compiler does not support it
471 for (int i = 0; i <= lvl; ++i) {
472 base[i] = &((*(TNodeLevel*)level[i])[0]);
473 idx[i] = 0;
474 }
475 TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it
476 int sz = ((TNodeLevel*)(level[lvl]))->size() - 1;
477 printf("(%d items)...", sz + 1); fflush(stdout);
478 for (; idx[lvl] < sz; ++idx[lvl]) {
479 words[lvl] = base[lvl][idx[lvl]].id;
480 for (int k = lvl - 1; k >= 0; --k) {
481 while (base[k][idx[k] + 1].child <= idx[k + 1])
482 ++idx[k];
483 words[k] = base[k][idx[k]].id;
484 }
485 TNode & node = base[lvl][idx[lvl]];
486 TNode & nodenext = *((&node) + 1);
487 double bow;
488 if (lvl == nlevel - 1) {
489 TLeaf * ch = &((*(TLeafLevel*)level[lvl + 1])[0]);
490 bow = CalcNodeBow(this,
491 lvl,
492 words,
493 ch + node.child,
494 ch + nodenext.child,
495 bUseLogPr);
496 } else {
497 TNode * ch = &((*(TNodeLevel*)level[lvl + 1])[0]);
498 bow = CalcNodeBow(this,
499 lvl,
500 words,
501 ch + node.child,
502 ch + nodenext.child,
503 bUseLogPr);
504 }
505 if (bUseLogPr) {
506 node.bow = PR_TYPE(-log(bow));
507 } else {
508 node.bow = PR_TYPE(bow);
509 }
510 }
511 }
512 printf("\n"); fflush(stdout);
513 }
514
515 double
516 CSlmBuilder::getPr(int n, TSIMWordId *words)
517 {
518 int lvl;
519 double bow = 1.0;
520 void* pnode = &((*(TNodeLevel*)level[0])[0]);
521
522 assert(n <= nlevel);
523
524 if (n == 0) {
525 if (bUseLogPr) {
526 return exp(-((TNode*)pnode)->pr);
527 } else {
528 return ((TNode*)pnode)->pr;
529 }
530 }
531
532 for (lvl = 0; pnode != NULL && lvl < n; ++lvl) {
533 if (bUseLogPr) {
534 bow = exp(-((TNode*)pnode)->bow);
535 } else {
536 bow = ((TNode*)pnode)->bow;
537 }
538 pnode = FindChild(lvl, (TNode*)pnode, words[lvl]);
539 }
540
541 if (pnode != NULL) { // find the whole string
542 if (bUseLogPr) {
543 return exp(-((TLeaf*)pnode)->pr);
544 } else {
545 return ((TLeaf*)pnode)->pr;
546 }
547 } else if (lvl == n - 1) { // only find the history
548 return bow * getPr(n - 1, words + 1);
549 } else { //even not find the history
550 return getPr(n - 1, words + 1);
551 }
552 }
553
554 void*
555 CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id)
556 {
557 int chh = root->child, cht = (root + 1)->child;
558 if (lvl == nlevel - 1) {
559 TLeaf* pleaf = &((*(TLeafLevel*)level[lvl + 1])[0]);
560 return (void*)binary_find(pleaf, chh, cht, TLeaf(id));
561 } else {
562 TNode* pnode = &((*(TNodeLevel*)level[lvl + 1])[0]);
563 return (void*)binary_find(pnode, chh, cht, TNode(id));
564 }
565 }
566
567 void
568 CSlmBuilder::Build()
569 {
570 CountNr();
571 AppendTails();
572 Cut();
573 Discount();
574 CalcBOW();
575 }
576
577 void
578 CSlmBuilder::Write(FILE *out)
579 {
580 fwrite(&nlevel, sizeof(nlevel), 1, out);
581 fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out);
582 for (int lvl = 0; lvl <= nlevel; ++lvl) {
583 int sz = 0;
584 if (lvl == nlevel)
585 sz = ((TLeafLevel*)(level[lvl]))->size();
586 else
587 sz = ((TNodeLevel*)(level[lvl]))->size();
588 fwrite(&sz, sizeof(sz), 1, out);
589 }
590 for (int lvl = 0; lvl < nlevel; ++lvl) {
591 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
592 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it)
593 fwrite(&(*it), sizeof(TNode), 1, out);
594 }
595 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
596 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it)
597 fwrite(&(*it), sizeof(TLeaf), 1, out);
598 }
599
600 void
601 CSlmBuilder::Close(void)
602 {
603 if (level != NULL) {
604 for (int lvl = 0; lvl <= nlevel; ++lvl) {
605 if (lvl == nlevel)
606 delete (TLeafLevel*)(level[lvl]);
607 else
608 delete (TNodeLevel*)(level[lvl]);
609 }
610 delete [] level;
611 level = NULL;
612 }
613 if (cut != NULL) {
614 delete [] cut;
615 cut = NULL;
616 }
617 if (discounter != NULL) {
618 for (int lvl = 1; lvl <= nlevel; ++lvl) {
619 delete discounter[lvl];
620 }
621 delete [] discounter;
622 discounter = NULL;
623 }
624 if (nr != NULL) {
625 delete [] nr;
626 nr = NULL;
627 }
628 breaker.clear();
629 m_nWord = 0;
630 nlevel = 0;
631 }
632
633 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
0 // -*- mode: c++ -*-
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
5 *
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
15 * following notice:
16 *
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
18 * (CDDL)
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
24 *
25 * Contributor(s):
26 *
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
36 */
37
38 #ifndef _SIM_SLM_BUILDER_H
39 #define _SIM_SLM_BUILDER_H
40
41 #include "../../portability.h"
42
43 #include "sim_slm.h"
44
45 class CSlmDiscounter;
46
// Collects n-grams with their frequencies (AddNGram) and exposes Build() and
// Write() to produce a model. Only the declarations are visible in this
// header; the notes below are limited to what the declarations show.
47 class CSlmBuilder {
48 public:
// Second dimension of the `nr` table declared at the bottom of the class.
49 static const int SLM_MAX_R = 16;
// Re-export the CSIMSlm types under the builder's own scope.
50 typedef CSIMSlm::FREQ_TYPE FREQ_TYPE;
51 typedef CSIMSlm::PR_TYPE PR_TYPE;
52 typedef CSIMSlm::TNode TNode;
53 typedef CSIMSlm::TLeaf TLeaf;
54
55 public:
// Starts with all counters at zero, all pointers NULL and both id lists empty.
56 CSlmBuilder()
57 : nlevel(0), bUseLogPr(0), level(NULL), m_nWord(0), cut(NULL),
58 discounter(NULL), nr(NULL), breaker(), m_excludes() { }
// Destruction simply delegates to Close().
59 ~CSlmBuilder()
60 { Close(); }
61
62 void Create(int n);
// Stores nWord in m_nWord; nothing else is touched.
63 void SetNumberOfWord(int nWord) { this->m_nWord = nWord; }
64 void SetCut(FREQ_TYPE threshold[]);
65 void SetDiscounter(CSlmDiscounter * dis[]);
66 void SetBreakerIds(int nId, TSIMWordId brks[]);
67 void SetExcludeIds(int nId, TSIMWordId excludes[]);
// Stores the flag as-is in bUseLogPr (an int, not a bool).
68 void SetUseLogPr(int bUse)
69 { bUseLogPr = bUse; }
70
71 void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr);
72 void Build();
73 void Write(FILE* out);
74 void Close();
75
76 // Get pr(w[n-1] | w[0]...w[n-2]) on the constructed partial model (low levels).
77 double getPr(int n, TSIMWordId* w);
78
79 public:
// Per-level containers and their iterators, used by the Cut*Level helpers.
80 typedef std::vector<TNode> TNodeLevel;
81 typedef std::vector<TLeaf> TLeafLevel;
82 typedef TNodeLevel::iterator TNodeIterator;
83 typedef TLeafLevel::iterator TLeafIterator;
84
85 protected:
86 bool isBreakId(TSIMWordId id);
87 bool isExcludeId(TSIMWordId id);
88 void CountNr();
89 void AppendTails();
90 void Cut();
91 void Discount();
92 void CalcBOW();
// Returns an untyped pointer; presumably a TNode* or TLeaf* depending on lvl
// -- TODO confirm against the definition.
93 void*FindChild(int lvl, TNode* root, TSIMWordId id);
94 int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
95 TNodeIterator chfirst, TNodeIterator chlast, int thred);
96 int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
97 TLeafIterator chfirst, TLeafIterator chlast, int thred);
98
99 private:
100 int nlevel, bUseLogPr;
101 void** level;
102 // level[0] is the pseudo root level, level[1] is the unigram level, ...; all are vector type
103
104 int m_nWord;
// NOTE(review): the wording "is not cut threshold" below is kept from the
// original comment; it may have been meant as "is the cut threshold" -- confirm.
105 FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ...
106 CSlmDiscounter** discounter; // discounter[1] is for 1-gram...
107 FREQ_TYPE(*nr)[SLM_MAX_R]; // nr[1][SLM_MAX_R] is for 1-gram...
108 std::vector<TSIMWordId> breaker;
109 std::vector<TSIMWordId> m_excludes;
110 };
111
// Abstract interface for a discounting method: every member except the
// virtual destructor is pure virtual.
112 class CSlmDiscounter {
113 public:
114 virtual ~CSlmDiscounter() {}
115 // n is the array size, nr is FREQ_TYPE[n]; nr[0] is the corpus size, or sigma r*nr;
116 // nr[1] is the number of ngram items with freq 1, ...
117 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0;
118
119 // freq is the ngram frequency, not the conditional pr
120 virtual double discount(int freq) = 0;
// Human-readable name of the discounting method.
121 virtual const char* getName() = 0;
122 };
123
124 //Good-Turing discount
// CSlmDiscounter implementation that reports its name as "Good-Turing".
// init() and discount() are defined elsewhere.
125 class CSlmGTDiscounter : public CSlmDiscounter {
126 public:
// Defaults: threshold = 10, highfreq_discount = 0.95; dis starts out NULL.
127 CSlmGTDiscounter(int threshold = 10, double highfreq_discount =
128 0.95) : thres(threshold), hd(highfreq_discount),
129 dis(NULL) {}
130 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
131 virtual double discount(int freq);
132 virtual const char* getName()
133 { return "Good-Turing"; }
134 protected:
135 int thres;
136 double hd;
// NOTE(review): no destructor is declared in this class; if init() allocates
// dis, its release is not visible from here -- confirm ownership.
137 double *dis;
138 };
139
// CSlmDiscounter implementation parameterised by a single constant c
// (constructor argument `substract`, default 0.0).
140 class CSlmAbsoluteDiscounter : public CSlmDiscounter {
141 public:
142 CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {}
143 // c == 0 means this value should be calculated according to r[]
144 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
145 virtual double discount(int freq); // return freq - c
// The returned text is a runtime string and is left exactly as written.
146 virtual const char* getName()
147 { return "Absolution"; }
148 protected:
149 double c;
150 };
151
// CSlmDiscounter implementation parameterised by a single factor dis
// (constructor argument `shrink`, default 0.0).
152 class CSlmLinearDiscounter : public CSlmDiscounter {
153 public:
154 CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {}
155 // dis == 0 means this value should be calculated according to r[]
156 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
157 virtual double discount(int freq); // return freq * dis
158 virtual const char* getName()
159 { return "Linear"; }
160 protected:
161 double dis;
162 };
163
164 #endif
165
166 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
5555 #include <vector>
5656 #include <algorithm>
5757
58 #include "../sim_slmbuilder.h"
58 #include "sim_slmbuilder.h"
5959
6060 static struct option long_options[] =
6161 {
246246 ++nItems;
247247 }
248248 fclose(fp);
249 delete ngram;
249 delete[] ngram;
250250 printf("%d ngrams.\n", nItems); fflush(stdout);
251251
252252 builder.Build();
115115
116116 typedef std::map<TSIMWordId, std::string> TReverseLexicon;
117117
// Convert a value between plain form and negative-natural-log form.
//   input_log  - true when `input` is already stored as -log(x)
//   output_log - true when the caller wants -log(x) back
// When both flags agree the value is returned untouched.
double log_conv(double input, bool input_log, bool output_log) {
    if (input_log == output_log)
        return input;
    return input_log ? exp(-input) : -log(input);
}
123
118124 void
119125 PrintARPALevel(int lvl, FILE* fp, TReverseLexicon* plexicon, bool output_log_pr)
120126 {
154160 }
155161 }
156162
157 printf("/%d-gram:%d/\n", lvl, sz[lvl] - 1);
163 printf("\\%d-gram\\%d\n", lvl, sz[lvl] - 1);
158164 while (idx[lvl] < sz[lvl] - 1) {
159 for (int i = lvl - 1; i > 0; --i) {
165 if (lvl > 0) for (int i = lvl - 1; i > 0; --i) {
160166 bool change = false;
161167 while (nodes[i][1].child <= idx[i + 1]) {
162168 change = true;
177183 else
178184 printf("%d ", int(word_id));
179185 }
180 if (bLogPrFile) {
181 if (output_log_pr)
182 printf("%20.17lf ", double(nodes[lvl][0].pr));
183 else
184 printf("%20.17lf ", exp(-double(nodes[lvl][0].pr)));
185 if (lvl != N) {
186 if (output_log_pr)
187 printf("%20.17lf", double(nodes[lvl][0].bow));
188 else
189 printf("%20.17lf", exp(-double(nodes[lvl][0].bow)));
190 }
191 } else {
192 if (output_log_pr)
193 printf("%20.17lf ", -log(double(nodes[lvl][0].pr)));
194 else
195 printf("%20.17lf ", double(nodes[lvl][0].pr));
196 if (lvl != N) {
197 if (output_log_pr)
198 printf("%20.17lf", -log(double(nodes[lvl][0].bow)));
199 else
200 printf("%20.17lf", double(nodes[lvl][0].bow));
201 }
202 }
186 printf("%20.17lf",
187 log_conv(nodes[lvl][0].pr, bLogPrFile, output_log_pr));
188 if (lvl != N) printf(" %20.17lf",
189 log_conv(nodes[lvl][0].bow, bLogPrFile, output_log_pr));
203190 printf("\n");
204191
205192 ++idx[lvl];
248235 }
249236 fseek(fp, 0, SEEK_SET);
250237 fread(&N, sizeof(N), 1, fp);
251 for (int lvl = 1; lvl <= N; ++lvl)
238 for (int lvl = 0; lvl <= N; ++lvl)
252239 PrintARPALevel(lvl, fp, plexicon, output_log_pr);
253240 }
254241
0 /*
1 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
2 *
3 * The contents of this file are subject to the terms of either the GNU Lesser
4 * General Public License Version 2.1 only ("LGPL") or the Common Development and
5 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
6 * file except in compliance with the License. You can obtain a copy of the CDDL at
7 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
8 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9 * specific language governing permissions and limitations under the License. When
10 * distributing the software, include this License Header Notice in each file and
11 * include the full text of the License in the License file as well as the
12 * following notice:
13 *
14 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
15 * (CDDL)
16 * For Covered Software in this distribution, this License shall be governed by the
17 * laws of the State of California (excluding conflict-of-law provisions).
18 * Any litigation relating to this License shall be subject to the jurisdiction of
19 * the Federal Courts of the Northern District of California and the state courts
20 * of the State of California, with venue lying in Santa Clara County, California.
21 *
22 * Contributor(s):
23 *
24 * If you wish your version of this file to be governed by only the CDDL or only
25 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
26 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
27 * license." If you don't indicate a single choice of license, a recipient has the
28 * option to distribute your version of this file under either the CDDL or the LGPL
29 * Version 2.1, or to extend the choice of license to its licensees as provided
30 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
31 * Version 2 license, then the option applies only if the new code is made subject
32 * to such option by the copyright holder.
33 */
34 #include <string>
35 #include <iostream>
36 #include <fstream>
37 #include <algorithm>
38 #include "arpa_slm.h"
39
40 using namespace std;
41
// Split a line buffer in place at the first match of the delimiter string.
// The first character of the match is overwritten with '\0', *next is set to
// two characters past the start of the match, and buf itself is returned.
// If the delimiter is not found, a message is printed and exit(2) is called.
// NOTE(review): the search literal below is one character long while the code
// skips two characters (delim + 2); confirm the literal was not meant to be
// two spaces (whitespace may have been collapsed in this copy of the file).
42 char*
43 getwords(char* buf, char** next)
44 {
45 char* word = buf;
46 char* delim = strstr(buf, " ");
47 if (delim == NULL) {
48 cerr << "Unknown format in: " << buf << "." << endl;
49 exit(2);
50 }
// Terminate the word part so the caller can treat buf as a C string.
51 *delim = '\0';
52 *next = delim + 2;
53 return word;
54 }
55
56 unsigned
57 get_wid(const char* word, const TLexicon& lexicon)
58 {
59 TLexicon::const_iterator lexi = lexicon.find(word);
60 unsigned wid;
61 if (lexi != lexicon.end()) {
62 wid = lexi->second;
63 } else {
64 cerr << "Error:\"" << word << "\" not found in lexicon." << endl;
65 wid = 0;
66 }
67 return wid;
68 }
69
70 int
71 CArpaSlm::TLeaf::load_words(char* buf, const TLexicon& lexicon)
72 {
73 int nword = 0;
74 char* word, *end;
75 for (word = end = buf; *end != 0; ++end) {
76 if (*end == ' ') {
77 assert(nword < N_GRAM);
78 *end = 0;
79 hw[nword++] = get_wid(word, lexicon);
80 word = end + 1;
81 }
82 }
83 if (buf != end) {
84 wid = hw[nword++] = get_wid(word, lexicon);
85 }
86 return nword;
87 }
88
89 void
90 CArpaSlm::TLeaf::load(istream& is, const TLexicon& lexicon)
91 {
92 char buf[1024];
93 is.getline(buf, sizeof(buf));
94 char* next = 0;
95 char* words = getwords(buf, &next);
96 load_words(words, lexicon);
97 sscanf(next, "%f", &pr);
98 }
99
100 void
101 CArpaSlm::TNode::load(istream& is, const TLexicon& lexicon)
102 {
103 char buf[1024];
104 is.getline(buf, sizeof(buf));
105 char* next = 0;
106 char* words = getwords(buf, &next);
107 load_words(words, lexicon);
108 sscanf(next, "%f %f", &pr, &bow);
109 }
110
111 void
112 CArpaSlm::TNode::load_level0(istream& is)
113 {
114 hw[0] = 0;
115 char buf[1024];
116 is.getline(buf, sizeof(buf));
117 sscanf(buf, "%f %f", &pr, &bow);
118 wid = 0;
119 }
120
121 void
122 CArpaSlm::load(const char* filename, const TLexicon& lexicon)
123 {
124 printf("Loading ARPA slm..."); fflush(stdout);
125 ifstream file(filename);
126 char buf[1024];
127 for (int i = 0; i <= N_GRAM; ++i) {
128 unsigned lvl;
129 int size;
130 file.getline(buf, sizeof(buf));
131 if (!file) {
132 cerr << "Failed to read from" << filename << endl;
133 exit(1);
134 }
135 sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size);
136 assert(lvl <= N_GRAM);
137 if (lvl == 0) {
138 TNode node0;
139 node0.load_level0(file);
140 m_levels[0].push_back(node0);
141 } else if (lvl < m_N) {
142 m_levels[lvl].reserve(size);
143 for (int i = 0; i < size; ++i) {
144 TNode node;
145 node.load(file, lexicon);
146 m_levels[lvl].push_back(node);
147 }
148 } else {
149 // leaf nodes
150 m_lastLevel.reserve(size);
151 for (int i = 0; i < size; ++i) {
152 TLeaf leaf;
153 leaf.load(file, lexicon);
154 m_lastLevel.push_back(leaf);
155 }
156 }
157 }
158 }
159
160 template <class NodeT>
161 struct CompareNode {
162 const unsigned m_lvl;
163 CompareNode(unsigned lvl) : m_lvl(lvl)
164 {
165 }
166 /**
167 * @return true if strictly less, false otherwise
168 */
169 bool
170 operator ()(const NodeT& node, const TSIMWordId hw[N_GRAM])
171 {
172 for (unsigned i = 0; i < m_lvl; ++i) {
173 if (node.hw[i] < hw[i])
174 return true;
175 if (node.hw[i] > hw[i])
176 return false;
177 }
178 // node.hw[:lvl] is the same as hw[:]
179 return false;
180 }
181 };
182
183 void
184 CArpaSlm::initChild()
185 {
186 {
187 TNode& node = m_levels[0][0];
188 node.child = 0;
189 }
190 for (unsigned lvl = 1; lvl < m_N; ++lvl) {
191 TNodeLevel& level = m_levels[lvl];
192 unsigned last_child = 0;
193 for (TNodeLevel::iterator node = level.begin();
194 node != level.end();
195 ++node) {
196 node->child = last_child = find_1st_child(lvl, *node, last_child);
197 }
198 }
199 }
200
201 unsigned
202 CArpaSlm::find_1st_child(unsigned lvl, const TNode& node, int last_child)
203 {
204 assert(lvl < m_N);
205 if (lvl == m_N - 1) {
206 TLeafLevel::iterator found = lower_bound(
207 m_lastLevel.begin(), m_lastLevel.end(), node.hw,
208 CompareNode<TLeaf>(lvl));
209 return distance(m_lastLevel.begin(), found);
210 } else {
211 const TNodeLevel& level = m_levels[lvl + 1];
212 TNodeLevel::const_iterator found = lower_bound(level.begin(), level.end(
213 ), node.hw,
214 CompareNode<TNode>(lvl));
215 return distance(level.begin(), found);
216 }
217 }
218
219 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
0 // -*- mode: c++ -*-
1 /*
2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
3 *
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
13 * following notice:
14 *
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
16 * (CDDL)
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
22 *
23 * Contributor(s):
24 *
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
34 */
35 #ifndef _ARPA_SLM_H
36 #define _ARPA_SLM_H
37
38 #include <istream>
39 #include "common.h"
40
41 using std::istream;
42
43 #define N_GRAM (3)
44
45
46 /* the ARPA style representation of sunpinyin's SLM */
47 class CArpaSlm {
48 public:
// One entry of the last level: the word ids of the n-gram (hw), the id of its
// final word (wid) and its pr value. The constructor zeroes wid and pr only;
// hw is left uninitialized until load_words() fills it.
49 struct TLeaf {
50 TSIMWordId hw[N_GRAM];
51 TSIMWordId wid;
52 float pr;
53 void load(istream&, const TLexicon&);
54 int load_words(char* buf, const TLexicon& lexicon);
55 TLeaf() : wid(0), pr(.0) {}
56 };
57
// One entry of a non-last level: a TLeaf plus a child index and a bow value.
// No constructor is declared, so child and bow start out uninitialized.
58 struct TNode : public TLeaf {
59 int child;
60 float bow;
61 void load(istream&, const TLexicon&);
62 void load_level0(istream&);
63 };
64
65 typedef std::vector<TNode> TNodeLevel;
66 typedef std::vector<TLeaf> TLeafLevel;
67
68 private:
69 TNodeLevel m_levels[N_GRAM + 1]; /* [0..N_GRAM] */
70 TLeafLevel m_lastLevel;
71 const bool m_usingLogPr;
72 const unsigned m_N;
73
74 public:
75 /* XXX, ARPA file does not provide this information.
76 so we assume this SLM is trigram, and does not use LogPr */
77 CArpaSlm() : m_usingLogPr(false), m_N(N_GRAM) {}
// True once level 0 holds at least one node.
78 bool good() const { return m_levels[0].size() != 0; }
79 unsigned getN() const { return m_N; }
80 bool usingLogPr() const { return m_usingLogPr; }
// Direct, unchecked access to m_levels[lvl]; the last level is NOT reachable
// through this accessor, use getLastLevel() / getLevelSize() for it.
81 const TNodeLevel& getLevel(unsigned lvl) const { return m_levels[lvl]; }
82 const TLeafLevel& getLastLevel() const { return m_lastLevel; }
// Number of entries at level lvl; lvl == m_N refers to m_lastLevel.
83 unsigned getLevelSize(unsigned lvl) const {
84 assert(lvl <= m_N);
85 if (lvl < m_N) {
86 return m_levels[lvl].size();
87 } else {
88 return m_lastLevel.size();
89 }
90 }
91 void initChild();
92 void load(const char* filename, const TLexicon& lexicon);
93
94 private:
95 /**
96 * find out the first child of a given node in its next level
97 * @param lvl the level where node belongs to
98 * @param node the node
99 * @param last_child the child index of previous node
100 * @return the index of the found child
101 */
102 unsigned find_1st_child(unsigned lvl, const TNode& node, int last_child);
103 };
104
105 #endif //_ARPA_SLM_H
106
107 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
0 // -*- mode: c++ -*-
1 /*
2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
3 *
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
13 * following notice:
14 *
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
16 * (CDDL)
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
22 *
23 * Contributor(s):
24 *
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
34 */
35 #ifndef _SLM_PACK_COMMON_H
36 #define _SLM_PACK_COMMON_H
37
38 #include <vector>
39 #include <map>
40 #include <string>
41 #include <cmath>
42 #include <cassert>
43
44 #include "../slm.h"
45
46 typedef std::vector<CThreadSlm::TNode> TNodeLevel;
47 typedef std::vector<CThreadSlm::TLeaf> TLeafLevel;
48 typedef std::vector<CThreadSlm::TNode*> TNodeLevels;
49 typedef std::map<std::string, unsigned int> TLexicon; // map word to wid
50
51 #endif //_SLM_PACK_COMMON_H
52
53 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
0 /*
1 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
2 *
3 * The contents of this file are subject to the terms of either the GNU Lesser
4 * General Public License Version 2.1 only ("LGPL") or the Common Development and
5 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
6 * file except in compliance with the License. You can obtain a copy of the CDDL at
7 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
8 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9 * specific language governing permissions and limitations under the License. When
10 * distributing the software, include this License Header Notice in each file and
11 * include the full text of the License in the License file as well as the
12 * following notice:
13 *
14 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
15 * (CDDL)
16 * For Covered Software in this distribution, this License shall be governed by the
17 * laws of the State of California (excluding conflict-of-law provisions).
18 * Any litigation relating to this License shall be subject to the jurisdiction of
19 * the Federal Courts of the Northern District of California and the state courts
20 * of the State of California, with venue lying in Santa Clara County, California.
21 *
22 * Contributor(s):
23 *
24 * If you wish your version of this file to be governed by only the CDDL or only
25 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
26 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
27 * license." If you don't indicate a single choice of license, a recipient has the
28 * option to distribute your version of this file under either the CDDL or the LGPL
29 * Version 2.1, or to extend the choice of license to its licensees as provided
30 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
31 * Version 2 license, then the option applies only if the new code is made subject
32 * to such option by the copyright holder.
33 */
34
35 /*
36 * pack ARPA format to a binary format which can be consumed by SunPinyin
37 */
38
39 #ifdef HAVE_CONFIG_H
40 #include "config.h"
41 #endif
42
43 #ifdef HAVE_ASSERT_H
44 #include <assert.h>
45 #endif
46
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <stdlib.h>
50
51 #include <vector>
52 #include <map>
53 #include <iostream>
54 #include <cmath>
55
56 #include "../sim_slm.h"
57 #include "arpa_slm.h"
58
// Print the command-line synopsis and a one-line description to stdout, then
// terminate the process with exit status 100. This function never returns.
// progname is substituted into both the synopsis and the description.
59 void
60 ShowUsage(const char* progname)
61 {
62 printf("Usage:\n");
63 printf(" %s arpa_slm dict_file output_slm\n", progname);
64 printf("\n");
65 printf("Description:\n");
66 printf(
67 " %s converts the ARPA representation of SLM to the binary format of SLM. \n",
68 progname);
69 exit(100);
70 }
71
72 TLexicon
73 read_lexicon(const char* filename)
74 {
75 printf("Loading lexicon..."); fflush(stdout);
76 static char word[1024 * 10];
77 FILE* f_lex = fopen(filename, "r");
78 TLexicon lexicon;
79 while (fgets(word, sizeof(word), f_lex)) {
80 if (strlen(word) > 0) {
81 // skip to the first non hanzi character
82 char* p = word;
83 while (*p == ' ' || *p == '\t')
84 ++p;
85 while (*p != 0 && *p != ' ' && *p != '\t')
86 ++p;
87 if (*p == 0) continue;
88 *p++ = 0;
89 // skip to the word_id
90 while (*p == ' ' || *p == '\t')
91 ++p;
92 if (!(*p >= '0' && *p <= '9')) continue;
93
94 int id;
95 for (id = 0; *p >= '0' && *p <= '9'; ++p)
96 id = 10 * id + (*p - '0');
97 lexicon[std::string(word)] = id;
98 }
99 }
100 fclose(f_lex);
101 printf("done.\n"); fflush(stdout);
102
103 return lexicon;
104 }
105
106 //
107 // filename [in]
108 // levels[0] [in]
109 // ...
110 // levels[N] [in]
111 // lastLevel [in]
112 //
113 void
114 write_out(const char* filename, const CArpaSlm& slm)
115 {
116 printf("\nWriting out..."); fflush(stdout);
117
118 FILE* fp = fopen(filename, "wb");
119 const int N = slm.getN();
120 fwrite(&N, sizeof(int), 1, fp);
121 const unsigned usingLogPr = slm.usingLogPr();
122 fwrite(&usingLogPr, sizeof(unsigned), 1, fp);
123
124 for (int lvl = 0; lvl <= N; ++lvl) {
125 unsigned len = slm.getLevelSize(lvl) + 1;
126 fwrite(&len, sizeof(unsigned), 1, fp);
127 }
128
129 for (int lvl = 0; lvl < N; ++lvl) {
130 const CArpaSlm::TNodeLevel& level = slm.getLevel(lvl);
131 for (CArpaSlm::TNodeLevel::const_iterator iter = level.begin();
132 iter != level.end(); ++iter) {
133 CSIMSlm::TNode node(iter->wid, iter->child, 0, iter->bow);
134 node.pr = iter->pr;
135 fwrite(&node, sizeof(CSIMSlm::TNode), 1, fp);
136 }
137 CSIMSlm::TNode node(0x00FFFFFF, slm.getLevel(lvl + 1).size(), 1, 0);
138 fwrite(&node, sizeof(CSIMSlm::TNode), 1, fp);
139 }
140
141 const CArpaSlm::TLeafLevel& level = slm.getLastLevel();
142 for (CArpaSlm::TLeafLevel::const_iterator iter = level.begin();
143 iter != level.end(); ++iter) {
144 CSIMSlm::TLeaf node(iter->wid, 0);
145 node.pr = iter->pr;
146 fwrite(&node, sizeof(CSIMSlm::TLeaf), 1, fp);
147 }
148 CSIMSlm::TLeaf node(0, 1);
149 fwrite(&node, sizeof(CSIMSlm::TLeaf), 1, fp);
150
151 fclose(fp);
152 printf("done!\n"); fflush(stdout);
153 }
154
155 int
156 main(int argc, char* argv[])
157 {
158 if (argc != 4)
159 ShowUsage(argv[0]);
160 const char* arpa_path = argv[1];
161 const char* lexicon_path = argv[2];
162 const char* output_path = argv[3];
163
164 CArpaSlm slm;
165 TLexicon lexicon = read_lexicon(lexicon_path);
166 slm.load(arpa_path, lexicon);
167
168 if (!slm.good()) {
169 std::cerr << "Failed to load language model from " << arpa_path <<
170 "." << std::endl;
171 exit(1);
172 }
173 slm.initChild();
174 write_out(output_path, slm);
175 return 0;
176 }
177
178 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
4848
4949 #include <vector>
5050 #include <map>
51 #include <math.h>
51 #include <cmath>
5252
5353 #include "../sim_slm.h"
5454 #include "../slm.h"
5555
5656 #include "ValueCompress.h"
57
58 using std::log;
59 using std::exp;
5760
5861 class CSIMSlmWithIteration : public CSIMSlm {
5962 public:
244247 ShowUsage();
245248
246249 printf("Loading original slm..."); fflush(stdout);
247 if (slm.Load(argv[1]) == false)
250 if (!slm.Load(argv[1]))
248251 ShowUsage();
249252
250253 bool usingLogPr = slm.isUseLogPr();
251254
252 #define EffectivePr(a) (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a)))))
253 #define OriginalPr(b) (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b)))))
254 #define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a))))
255 #define OriginalBow(b) (float((usingLogPr) ? (-log((b))) : ((b))))
255 #define EffectivePr(a) (usingLogPr ? ((a) / log(2.0)) : -log2f((a)))
256 #define OriginalPr(b) (usingLogPr ? ((b) * log(2.0)) : exp2(-(b)))
257 #define EffectiveBow(a) (usingLogPr ? exp(-(a)) : (a))
258 #define OriginalBow(b) (usingLogPr ? -log((b)) : (b))
256259
257260 printf("\nfirst pass..."); fflush(stdout);
258261 for (int lvl = 0; lvl <= slm.getN(); ++lvl) {
290293 };
291294
292295 for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
293 float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
296 float real_pr = usingLogPr ? -log(msprs[i]) : msprs[i];
294297 float eff_pr = EffectivePr(real_pr);
295298 if (pr_eff.find(eff_pr) == pr_eff.end()) {
296299 pr_eff[eff_pr] = real_pr;
308311 };
309312
310313 for (unsigned i = 0, sz = sizeof(msbows) / sizeof(float); i < sz; ++i) {
311 float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
314 float real_bow = usingLogPr ? -log(msbows[i]) : msbows[i];
312315 float eff_bow = EffectiveBow(real_bow);
313316 if (bow_eff.find(eff_bow) == bow_eff.end()) {
314317 bow_eff[eff_bow] = real_bow;
357360
358361 std::map<float, int>::iterator prit = pr_map.find(pn->pr);
359362 if (prit == pr_map.end()) { // This would be cause by precision error
360 double val = EffectivePr(pn->pr);
363 float val = EffectivePr(pn->pr);
361364 val = OriginalPr(val);
362365 prit = pr_map.find(val);
363366 assert(prit != pr_map.end());
404407
405408 std::map<float, int>::iterator prit = pr_map.find(pn->pr);
406409 if (prit == pr_map.end()) { // This would be cause by precision error
407 double val = EffectivePr(pn->pr);
410 float val = EffectivePr(pn->pr);
408411 val = OriginalPr(val);
409412 prit = pr_map.find(val);
410413 assert(prit != pr_map.end());
+0
-177
src/slm/tslmpack/arpa_conv.cpp less more
0 /*
1 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
2 *
3 * The contents of this file are subject to the terms of either the GNU Lesser
4 * General Public License Version 2.1 only ("LGPL") or the Common Development and
5 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
6 * file except in compliance with the License. You can obtain a copy of the CDDL at
7 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
8 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9 * specific language governing permissions and limitations under the License. When
10 * distributing the software, include this License Header Notice in each file and
11 * include the full text of the License in the License file as well as the
12 * following notice:
13 *
14 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
15 * (CDDL)
16 * For Covered Software in this distribution, this License shall be governed by the
17 * laws of the State of California (excluding conflict-of-law provisions).
18 * Any litigation relating to this License shall be subject to the jurisdiction of
19 * the Federal Courts of the Northern District of California and the state courts
20 * of the State of California, with venue lying in Santa Clara County, California.
21 *
22 * Contributor(s):
23 *
24 * If you wish your version of this file to be governed by only the CDDL or only
25 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
26 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
27 * license." If you don't indicate a single choice of license, a recipient has the
28 * option to distribute your version of this file under either the CDDL or the LGPL
29 * Version 2.1, or to extend the choice of license to its licensees as provided
30 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
31 * Version 2 license, then the option applies only if the new code is made subject
32 * to such option by the copyright holder.
33 */
34 #include <algorithm>
35 #include "common.h"
36 #include "arpa_slm.h"
37 #include "arpa_conv.h"
38
39
40 //
41 // convert CArpaSlm::TLeaf to CThreadSlm::TLeaf
42 //
// Function object that turns a CArpaSlm::TLeaf into a CThreadSlm::TLeaf.
// It keeps references to the pr map and pr table handed to the constructor as
// pointers, so both must outlive this object.
43 class CArpaLeafConv
44 {
45 const bool usingLogPr;
46 CompressedTable& m_pr_table;
47 RealIndexMap& m_pr_map;
48
49 public:
// The pointer arguments are dereferenced immediately; they must not be NULL.
50 CArpaLeafConv(bool usingLogPr_,
51 RealIndexMap* pr_map,
52 CompressedTable* pr_table) :
53 usingLogPr(usingLogPr_),
54 m_pr_table(*pr_table),
55 m_pr_map(*pr_map)
56 {
57 }
58
// Copy wid, bon and bol across and store the table index of pr (not the pr
// value itself) in the result.
59 CThreadSlm::TLeaf
60 operator()(const CArpaSlm::TLeaf& leaf)
61 {
62 CThreadSlm::TLeaf tleaf;
63 tleaf.set_wid(leaf.wid);
64 tleaf.set_bon(leaf.bon);
65 tleaf.set_bol(leaf.bol);
66 unsigned pr_idx = get_pr_index(leaf.pr);
67 tleaf.set_pr(pr_idx);
68 return tleaf;
69 }
70
71 //
72 // lookup the Real/Effective value in the RealIndexMap for its index
73 // in the CompressedTable
74 //
75 unsigned
76 get_pr_index(float pr)
77 {
78 std::map<float, int>::iterator prit = m_pr_map.find(pr);
79 if (prit == m_pr_map.end()) { // This could be caused by precision error
// Retry with the value passed through EffectivePr and back through OriginalPr
// (both defined elsewhere). The lookup is only guarded by an assert.
80 double val = EffectivePr(pr);
81 val = OriginalPr(val);
82 prit = m_pr_map.find(val);
83 assert(prit != m_pr_map.end());
84 }
85 int idx_pr = prit->second;
// Sanity checks on the table entry: in (0, 1) without log pr, > 0 with log pr.
86 assert(usingLogPr ||
87 (m_pr_table[idx_pr] > 0.0 && m_pr_table[idx_pr] < 1.0));
88 assert(!usingLogPr || m_pr_table[idx_pr] > 0.0);
89 return idx_pr;
90 }
91 };
92
93 //
94 // convert CArpaSlm::TNode to CThreadSlm::TNode
95 //
// Function object that turns a CArpaSlm::TNode into a CThreadSlm::TNode.
// The pr index is obtained through an embedded CArpaLeafConv; the bow index is
// looked up in the bow map held by reference.
96 class CArpaNodeConv
97 {
98 const bool usingLogPr;
99 CArpaLeafConv m_leaf_conv;
100 CompressedTable& m_bow_table;
101 RealIndexMap& m_bow_map;
102
103 public:
// The pointer arguments are dereferenced immediately; they must not be NULL.
// m_leaf_conv is built from the member usingLogPr, which is declared (and so
// initialized) before it.
104 CArpaNodeConv(bool usingLogPr_,
105 RealIndexMap* pr_map,
106 CompressedTable* pr_table,
107 RealIndexMap* bow_map,
108 CompressedTable* bow_table) :
109 usingLogPr(usingLogPr_),
110 m_leaf_conv(usingLogPr, pr_map, pr_table),
111 m_bow_table(*bow_table),
112 m_bow_map(*bow_map)
113 {
114 }
115
// Copy wid, bon, bol and ch across and store the table indices of pr and bow
// (not the values themselves) in the result.
116 CThreadSlm::TNode
117 operator()(const CArpaSlm::TNode& node)
118 {
119 CThreadSlm::TNode tnode;
120 tnode.set_wid(node.wid);
121 tnode.set_bon(node.bon);
122 tnode.set_bol(node.bol);
123 tnode.set_ch(node.ch);
124 unsigned pr_idx = m_leaf_conv.get_pr_index(node.pr);
125 tnode.set_pr(pr_idx);
126 unsigned bow_idx = get_bow_index(node.bow);
127 tnode.set_bow(bow_idx);
128 return tnode;
129 }
130
// Return the index stored for bow in the bow map. On a miss the lookup is
// retried with the value passed through EffectiveBow and back through
// OriginalBow (both defined elsewhere); the retry is only guarded by an assert.
131 unsigned
132 get_bow_index(float bow)
133 {
134 FreqMap::iterator bowit = m_bow_map.find(bow);
135 if (bowit == m_bow_map.end()) {
136 double val = EffectiveBow(bow);
137 val = OriginalBow(val);
138 bowit = m_bow_map.find(val);
139 assert(bowit != m_bow_map.end());
140 }
141 return bowit->second;
142 }
143 };
144
// Convert every level of `slm` into freshly allocated CThreadSlm arrays.
// On return nodeLevels holds one new[]-allocated TNode array per node level
// and leafLevel points at a new[]-allocated TLeaf array; nothing in this
// function frees them, so the caller presumably takes ownership -- confirm.
// Each array has one extra, zero-filled element after the converted entries.
145 void
146 compress(const CArpaSlm& slm,
147 CompressedTable& pr_table, RealIndexMap& pr_map,
148 CompressedTable& bow_table, RealIndexMap& bow_map,
149 TNodeLevels& nodeLevels, CThreadSlm::TLeaf*& leafLevel)
150 {
151 CArpaLeafConv leaf_conv(slm.usingLogPr(), &pr_map, &pr_table);
152 CArpaNodeConv node_conv(
153 slm.usingLogPr(), &pr_map, &pr_table, &bow_map, &bow_table);
154 const int N = slm.getN();
155 TNodeLevels node_levels(N);
156 for (int lvl = 0; lvl < N; ++lvl) {
157 const CArpaSlm::TNodeLevel& level = slm.getLevel(lvl);
158 unsigned len = level.size();
// len + 1: room for the trailing zeroed element written below.
159 node_levels[lvl] = new CThreadSlm::TNode[len + 1];
160 std::transform(level.begin(), level.end(),
161 node_levels[lvl], node_conv);
// The extra element is zeroed and its ch is set to the size of the next level.
162 memset(&node_levels[lvl][len], 0, sizeof(CThreadSlm::TNode));
163 node_levels[lvl][len].set_ch(slm.getLevelSize(lvl + 1));
164 }
165
166 const CArpaSlm::TLeafLevel& level = slm.getLastLevel();
167 unsigned len = level.size();
168 CThreadSlm::TLeaf* leaf_level = new CThreadSlm::TLeaf[len + 1];
169 std::transform(level.begin(), level.end(),
170 leaf_level, leaf_conv);
171 memset(&leaf_level[len], 0, sizeof(CThreadSlm::TLeaf));
// Hand the results back through the output parameters.
172 nodeLevels = node_levels;
173 leafLevel = leaf_level;
174 }
175
176 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-59
src/slm/tslmpack/arpa_conv.h less more
0 // -*- mode: c++ -*-
1 /*
2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
3 *
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
13 * following notice:
14 *
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
16 * (CDDL)
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
22 *
23 * Contributor(s):
24 *
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
34 */
35 #ifndef _SLM_PACK_ARPA_CONV_H
36 #define _SLM_PACK_ARPA_CONV_H
37
38 #include "common.h"
39
40 class CArpaSlm;
41
42 //
43 // slm [in]
44 // pr_table [in]
45 // pr_map [in]
46 // bow_table [in]
47 // bow_map [in]
48 // nodeLevels [out]
49 // leafLevel [out]
50 //
51 void compress(const CArpaSlm& slm,
52 CompressedTable& pr_table, RealIndexMap& pr_map,
53 CompressedTable& bow_table, RealIndexMap& bow_map,
54 TNodeLevels& nodeLevels, CThreadSlm::TLeaf*& leafLevel);
55
56 #endif //_SLM_PACK_ARPA_CONV_H
57
58 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-245
src/slm/tslmpack/arpa_slm.cpp less more
0 /*
1 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
2 *
3 * The contents of this file are subject to the terms of either the GNU Lesser
4 * General Public License Version 2.1 only ("LGPL") or the Common Development and
5 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
6 * file except in compliance with the License. You can obtain a copy of the CDDL at
7 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
8 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9 * specific language governing permissions and limitations under the License. When
10 * distributing the software, include this License Header Notice in each file and
11 * include the full text of the License in the License file as well as the
12 * following notice:
13 *
14 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
15 * (CDDL)
16 * For Covered Software in this distribution, this License shall be governed by the
17 * laws of the State of California (excluding conflict-of-law provisions).
18 * Any litigation relating to this License shall be subject to the jurisdiction of
19 * the Federal Courts of the Northern District of California and the state courts
20 * of the State of California, with venue lying in Santa Clara County, California.
21 *
22 * Contributor(s):
23 *
24 * If you wish your version of this file to be governed by only the CDDL or only
25 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
26 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
27 * license." If you don't indicate a single choice of license, a recipient has the
28 * option to distribute your version of this file under either the CDDL or the LGPL
29 * Version 2.1, or to extend the choice of license to its licensees as provided
30 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
31 * Version 2 license, then the option applies only if the new code is made subject
32 * to such option by the copyright holder.
33 */
34 #include <string>
35 #include <iostream>
36 #include <fstream>
37 #include <algorithm>
38 #include "arpa_slm.h"
39
40 using namespace std;
41
42 /**
43 * the GNU extension is not always available, so we invent another wheel.
44 */
/**
 * Read one line from `stream` into `buf`, without the trailing newline.
 *
 * At most n - 1 characters are stored and the result is always
 * NUL-terminated when n > 0. Reading stops at '\n' (which is consumed and
 * dropped), at EOF, or when the buffer is full; in the last case the
 * remaining characters of the line stay in the stream for the next call.
 *
 * @param buf    destination buffer of at least `n` bytes
 * @param n      capacity of `buf`; with n == 0 nothing is read or written
 * @param stream stream to read from
 * @return the number of characters stored, not counting the terminator
 */
size_t
getline(char *buf, size_t n, FILE* stream)
{
    // No room even for the terminator: the buffer must not be touched.
    if (n == 0)
        return 0;

    size_t len = 0;
    // Always keep one byte free for the terminating NUL.
    while (len + 1 < n) {
        int c = fgetc(stream);
        if (c == '\n' || c == EOF)
            break;
        buf[len++] = static_cast<char>(c);
    }
    buf[len] = 0;
    return len;
}
63
/**
 * Split `buf` in place at the first occurrence of the delimiter.
 *
 * The delimiter's first byte is overwritten with NUL so that `buf` becomes
 * the leading part; `*next` is set two bytes past the start of the
 * delimiter. If no delimiter is found the program reports the line and
 * exits with status 2.
 *
 * NOTE(review): the delimiter literal and the `+ 2` skip have to agree;
 * confirm the literal against the upstream file, since this copy may have
 * had runs of blanks collapsed.
 *
 * @return `buf`, now terminated at the delimiter
 */
char*
getwords(char* buf, char** next)
{
    char* const sep = strstr(buf, " ");
    if (!sep) {
        cerr << "Unknown format in: " << buf << "." << endl;
        exit(2);
    }
    sep[0] = '\0';
    *next = sep + 2;
    return buf;
}
77
78 unsigned
79 get_wid(const char* word, const TLexicon& lexicon)
80 {
81 TLexicon::const_iterator lexi = lexicon.find(word);
82 unsigned wid;
83 if (lexi != lexicon.end()) {
84 wid = lexi->second;
85 } else {
86 cerr << "Error:\"" << word << "\" not found in lexicon." << endl;
87 wid = 0;
88 }
89 return wid;
90 }
91
92 int
93 CArpaSlm::TLeaf::load_words(char* buf, const TLexicon& lexicon)
94 {
95 int nword = 0;
96 char* word, *end;
97 for (word = end = buf; *end != 0; ++end) {
98 if (*end == ' ') {
99 assert(nword < N_GRAM);
100 *end = 0;
101 hw[nword++] = get_wid(word, lexicon);
102 word = end + 1;
103 }
104 }
105 if (buf != end) {
106 wid = hw[nword++] = get_wid(word, lexicon);
107 }
108 return nword;
109 }
110
111 void
112 CArpaSlm::TLeaf::load(istream& is, const TLexicon& lexicon)
113 {
114 char buf[1024];
115 is.getline(buf, sizeof(buf));
116 char* next = 0;
117 char* words = getwords(buf, &next);
118 load_words(words, lexicon);
119 sscanf(next, "%f (%1u, %u)",
120 &pr, &bol, &bon);
121 }
122
123 void
124 CArpaSlm::TNode::load(istream& is, const TLexicon& lexicon)
125 {
126 char buf[1024];
127 is.getline(buf, sizeof(buf));
128 char* next = 0;
129 char* words = getwords(buf, &next);
130 load_words(words, lexicon);
131 sscanf(next, "%f %f (%1u, %u)",
132 &pr, &bow, &bol, &bon);
133 }
134
135 void
136 CArpaSlm::TNode::load_level0(istream& is)
137 {
138 hw[0] = 0;
139 char buf[1024];
140 is.getline(buf, sizeof(buf));
141 sscanf(buf, "%f %f (%1u, %u)",
142 &pr, &bow, &bol, &bon);
143 wid = 0;
144 }
145
146 void
147 CArpaSlm::load(const char* filename, const TLexicon& lexicon)
148 {
149 printf("Loading ARPA slm..."); fflush(stdout);
150 ifstream file(filename);
151 char buf[1024];
152 for (int i = 0; i <= N_GRAM; ++i) {
153 unsigned lvl;
154 int size;
155 file.getline(buf, sizeof(buf));
156 if (!file) {
157 cerr << "Failed to read from" << filename << endl;
158 exit(1);
159 }
160 sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size);
161 assert(lvl <= N_GRAM);
162 if (lvl == 0) {
163 TNode node0;
164 node0.load_level0(file);
165 m_levels[0].push_back(node0);
166 } else if (lvl < m_N) {
167 m_levels[lvl].reserve(size);
168 for (int i = 0; i < size; ++i) {
169 TNode node;
170 node.load(file, lexicon);
171 m_levels[lvl].push_back(node);
172 }
173 } else {
174 // leaf nodes
175 m_lastLevel.reserve(size);
176 for (int i = 0; i < size; ++i) {
177 TLeaf leaf;
178 leaf.load(file, lexicon);
179 m_lastLevel.push_back(leaf);
180 }
181 }
182 }
183 }
184
185 template <class NodeT>
186 struct CompareNode {
187 const unsigned m_lvl;
188 CompareNode(unsigned lvl) : m_lvl(lvl)
189 {
190 }
191 /**
192 * @return true if strictly less, false otherwise
193 */
194 bool
195 operator ()(const NodeT& node, const TSIMWordId hw[N_GRAM])
196 {
197 for (unsigned i = 0; i < m_lvl; ++i) {
198 if (node.hw[i] < hw[i])
199 return true;
200 if (node.hw[i] > hw[i])
201 return false;
202 }
203 // node.hw[:lvl] is the same as hw[:]
204 return false;
205 }
206 };
207
208 void
209 CArpaSlm::threading()
210 {
211 {
212 TNode& node = m_levels[0][0];
213 node.ch = 0;
214 }
215 for (unsigned lvl = 1; lvl < m_N; ++lvl) {
216 TNodeLevel& level = m_levels[lvl];
217 unsigned last_child = 0;
218 for (TNodeLevel::iterator node = level.begin();
219 node != level.end();
220 ++node) {
221 node->ch = last_child = find_1st_child(lvl, *node, last_child);
222 }
223 }
224 }
225
226 unsigned
227 CArpaSlm::find_1st_child(unsigned lvl, const TNode& node, int last_child)
228 {
229 assert(lvl < m_N);
230 if (lvl == m_N - 1) {
231 TLeafLevel::iterator found = lower_bound(
232 m_lastLevel.begin(), m_lastLevel.end(), node.hw,
233 CompareNode<TLeaf>(lvl));
234 return distance(m_lastLevel.begin(), found);
235 } else {
236 const TNodeLevel& level = m_levels[lvl + 1];
237 TNodeLevel::const_iterator found = lower_bound(level.begin(), level.end(
238 ), node.hw,
239 CompareNode<TNode>(lvl));
240 return distance(level.begin(), found);
241 }
242 }
243
244 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-113
src/slm/tslmpack/arpa_slm.h less more
0 // -*- mode: c++ -*-
1 /*
2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
3 *
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
13 * following notice:
14 *
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
16 * (CDDL)
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
22 *
23 * Contributor(s):
24 *
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
34 */
35 #ifndef _ARPA_SLM_H
36 #define _ARPA_SLM_H
37
38 #include <istream>
39 #include "common.h"
40
41 using std::istream;
42
43 #define N_GRAM (3)
44
45
46 /* the ARPA style representation of sunpinyin's SLM */
47 class CArpaSlm {
48 public:
49 struct TLeaf {
50 TSIMWordId hw[N_GRAM];
51 TSIMWordId wid;
52 float pr;
53 unsigned ch;
54 unsigned bon;
55 unsigned bol;
56 void load(istream&, const TLexicon&);
57 int load_words(char* buf, const TLexicon& lexicon);
58 TLeaf() : wid(0), pr(.0), ch(0), bon(0), bol(0) {}
59 };
60
61 struct TNode : public TLeaf {
62 float bow;
63 void load(istream&, const TLexicon&);
64 void load_level0(istream&);
65 };
66
67 typedef std::vector<TNode> TNodeLevel;
68 typedef std::vector<TLeaf> TLeafLevel;
69
70 private:
71 TNodeLevel m_levels[N_GRAM + 1]; /* [0..N_GRAM] */
72 TLeafLevel m_lastLevel;
73 const bool m_usingLogPr;
74 const unsigned m_N;
75
76 public:
77 /* XXX, ARPA file does not provide these information.
78 so we assume this SLM is trigram, and does not use LogPr */
79 CArpaSlm() : m_usingLogPr(false), m_N(N_GRAM) {}
80 bool good() const { return m_levels[0].size() != 0; }
81 unsigned getN() const { return m_N; }
82 bool usingLogPr() const { return m_usingLogPr; }
83 const TNodeLevel& getLevel(unsigned lvl) const { return m_levels[lvl]; }
84 const TLeafLevel& getLastLevel() const { return m_lastLevel; }
85 unsigned getLevelSize(unsigned lvl) const {
86 assert(lvl <= m_N);
87 if (lvl < m_N) {
88 return m_levels[lvl].size();
89 } else {
90 return m_lastLevel.size();
91 }
92 }
93 /**
94 * initialize the `ch' and `wid' fields of each node in levels
95 */
96 void threading();
97 void load(const char* filename, const TLexicon& lexicon);
98
99 private:
100 /**
101 * find out the first child of a given node in its next level
102 * @param lvl the level where node belongs to
103 * @param node the node
104 * @param last_child the child index of previous node
105 * @return the index of the found child
106 */
107 unsigned find_1st_child(unsigned lvl, const TNode& node, int last_child);
108 };
109
110 #endif //_ARPA_SLM_H
111
112 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-63
src/slm/tslmpack/common.h less more
0 // -*- mode: c++ -*-
1 /*
2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
3 *
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
13 * following notice:
14 *
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
16 * (CDDL)
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
22 *
23 * Contributor(s):
24 *
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
34 */
35 #ifndef _SLM_PACK_COMMON_H
36 #define _SLM_PACK_COMMON_H
37
38 #include <vector>
39 #include <map>
40 #include <string>
41 #include <cmath>
42 #include <cassert>
43
44 #include "../slm.h"
45
46 typedef std::vector<CThreadSlm::TNode> TNodeLevel;
47 typedef std::vector<CThreadSlm::TLeaf> TLeafLevel;
48 typedef std::vector<CThreadSlm::TNode*> TNodeLevels;
49 typedef std::map<float, float> EffRealMap; // map from efficient values to the real ones
50 typedef std::map<float, int> FreqMap; // how often the efficient value appears
51 typedef std::vector<float> CompressedTable; // array of real values, the index is stored in RealIndexMap
52 typedef std::map<float, int> RealIndexMap; // map real values to their indices
53 typedef std::map<std::string, unsigned int> TLexicon; // map word to wid
54
// Conversions between a stored ("original") value and the "effective" value
// used as the key while grouping. These are macros, not functions: each one
// expands to an expression that reads a variable named `usingLogPr`, which
// must therefore be visible (as a local, a parameter or a member) wherever
// a macro is used. Every result is converted to float.
//   usingLogPr true : EffectivePr(a) = a / ln 2     OriginalPr(b) = b * ln 2
//   usingLogPr false: EffectivePr(a) = -log2(a)     OriginalPr(b) = 2^(-b)
//   usingLogPr true : EffectiveBow(a) = exp(-a)     OriginalBow(b) = -ln(b)
//   usingLogPr false: EffectiveBow(a) = a           OriginalBow(b) = b
55 #define EffectivePr(a) (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a)))))
56 #define OriginalPr(b) (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b)))))
57 #define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a))))
58 #define OriginalBow(b) (float((usingLogPr) ? (-log((b))) : ((b))))
59
60 #endif //_SLM_PACK_COMMON_H
61
62 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
+0
-368
src/slm/tslmpack/slmpack.cpp less more
0 /*
1 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
2 *
3 * The contents of this file are subject to the terms of either the GNU Lesser
4 * General Public License Version 2.1 only ("LGPL") or the Common Development and
5 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
6 * file except in compliance with the License. You can obtain a copy of the CDDL at
7 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
8 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
9 * specific language governing permissions and limitations under the License. When
10 * distributing the software, include this License Header Notice in each file and
11 * include the full text of the License in the License file as well as the
12 * following notice:
13 *
14 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
15 * (CDDL)
16 * For Covered Software in this distribution, this License shall be governed by the
17 * laws of the State of California (excluding conflict-of-law provisions).
18 * Any litigation relating to this License shall be subject to the jurisdiction of
19 * the Federal Courts of the Northern District of California and the state courts
20 * of the State of California, with venue lying in Santa Clara County, California.
21 *
22 * Contributor(s):
23 *
24 * If you wish your version of this file to be governed by only the CDDL or only
25 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
26 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
27 * license." If you don't indicate a single choice of license, a recipient has the
28 * option to distribute your version of this file under either the CDDL or the LGPL
29 * Version 2.1, or to extend the choice of license to its licensees as provided
30 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
31 * Version 2 license, then the option applies only if the new code is made subject
32 * to such option by the copyright holder.
33 */
34
35 /*
36 * pack ARPA format to a binary format which can be consumed by SunPinyin
37 */
38
39 #ifdef HAVE_CONFIG_H
40 #include "config.h"
41 #endif
42
43 #ifdef HAVE_ASSERT_H
44 #include <assert.h>
45 #endif
46
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <stdlib.h>
50
51 #include <vector>
52 #include <map>
53 #include <iostream>
54 #include <cmath>
55
56 //#include "../sim_slm.h"
57 #include "../slm.h"
58
59 #include "../thread/ValueCompress.h"
60 #include "arpa_slm.h"
61 #include "arpa_conv.h"
62
63
/**
 * Print the command line synopsis to stdout and terminate the program with
 * exit status 100.
 */
void
ShowUsage(const char* progname)
{
    fputs("Usage:\n", stdout);
    printf(" %s arpa_slm dict_file threaded_slm\n", progname);
    fputs("\n", stdout);
    fputs("Description:\n", stdout);
    printf(
        " %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n",
        progname);
    exit(100);
}
76
77 /**
78 * slm [in]
79 * pr_eff, pr_values [out]
80 * bow_eff, bow_values [out]
81 */
82
83 void
84 build_map(const CArpaSlm& slm,
85 EffRealMap &pr_eff,
86 FreqMap& pr_values,
87 EffRealMap &bow_eff,
88 FreqMap& bow_values)
89 {
90 bool usingLogPr = slm.usingLogPr();
91
92 printf("\nfirst pass..."); fflush(stdout);
93
94 for (unsigned lvl = 0; lvl < slm.getN(); ++lvl) {
95 typedef CArpaSlm::TNodeLevel TNodeLevel;
96 const TNodeLevel& level = slm.getLevel(lvl);
97 for (TNodeLevel::const_iterator node = level.begin();
98 node != level.end();
99 ++node) {
100 float real_pr, eff_pr;
101 real_pr = node->pr;
102 eff_pr = EffectivePr(real_pr);
103 if (pr_eff.find(eff_pr) == pr_eff.end()) {
104 pr_eff[eff_pr] = real_pr;
105 } else { // precision error cause non 1:1 mapping
106 pr_eff[eff_pr] = OriginalPr(eff_pr);
107 }
108 ++(pr_values[eff_pr]);
109
110 float real_bow, eff_bow;
111 real_bow = node->bow;
112 eff_bow = EffectiveBow(real_bow);
113 if (bow_eff.find(eff_bow) == bow_eff.end()) {
114 bow_eff[eff_bow] = real_bow;
115 } else { // two values map to same distance value due to precision error
116 bow_eff[eff_bow] = OriginalBow(eff_bow);
117 }
118 ++(bow_values[eff_bow]);
119 }
120 }
121 typedef CArpaSlm::TLeafLevel TLeafLevel;
122 const TLeafLevel& level = slm.getLastLevel();
123 for (TLeafLevel::const_iterator leaf = level.begin();
124 leaf != level.end();
125 ++leaf) {
126 float real_pr, eff_pr;
127 real_pr = leaf->pr;
128 eff_pr = EffectivePr(real_pr);
129 if (pr_eff.find(eff_pr) == pr_eff.end()) {
130 pr_eff[eff_pr] = real_pr;
131 } else { // precision error cause non 1:1 mapping
132 pr_eff[eff_pr] = OriginalPr(eff_pr);
133 }
134 ++(pr_values[eff_pr]);
135 }
136 // Following pr value should not be grouped, or as milestone values.
137 static const float msprs[] = {
138 0.9, 0.8, 0.7, 0.6,
139 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
140 1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
141 1.0 / 16384, 1.0 / 32768, 1.0 / 65536
142 };
143
144 for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
145 float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
146 float eff_pr = EffectivePr(real_pr);
147 assert(usingLogPr || (real_pr > 0.0 && real_pr < 1.0));
148 assert(!usingLogPr || real_pr > 0.0);
149
150 if (pr_eff.find(eff_pr) == pr_eff.end()) {
151 pr_eff[eff_pr] = real_pr;
152 } else { // precision error causes non 1:1 mapping
153 pr_eff[eff_pr] = OriginalPr(eff_pr);
154 }
155 pr_values[eff_pr] = 0;
156 }
157
158 // Following bow value should not be grouped, or as milestone values.
159 static const float msbows[] = {
160 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
161 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
162 0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
163 };
164
165 for (unsigned i = 0; i < sizeof(msbows) / sizeof(msbows[0]); ++i) {
166 float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
167 float eff_bow = EffectiveBow(real_bow);
168 if (bow_eff.find(eff_bow) == bow_eff.end()) {
169 bow_eff[eff_bow] = real_bow;
170 } else { // two values map to same distance value due to precision error
171 bow_eff[eff_bow] = OriginalBow(eff_bow);
172 }
173 bow_values[eff_bow] = 0;
174 }
175 }
176
177 /**
178 * group vaules into a smaller set of their approximations
179 *
180 * bow_eff [in], bow_values [in], bow_map [out], bow_table [out]
181 * pr_eff [in], pr_values [in], pr_map [out], pr_table [out]
182 *
183 */
184 void
185 group_values(bool usingLogPr,
186 EffRealMap& pr_eff,
187 FreqMap& pr_values,
188 CompressedTable& pr_table,
189 RealIndexMap& pr_map,
190 EffRealMap& bow_eff,
191 FreqMap& bow_values,
192 CompressedTable& bow_table,
193 RealIndexMap& bow_map)
194 {
195 printf("\nCompressing pr values..."); fflush(stdout);
196 CValueCompressor vc;
197 vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
198 CompressedTable::iterator itt, itte;
199 itte = pr_table.end();
200 for (itt = pr_table.begin(); itt != itte; ++itt) {
201 *itt = OriginalPr(*itt);
202 assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
203 assert(!usingLogPr || *itt > 0.0);
204 }
205 printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
206
207 printf("\nCompressing bow values..."); fflush(stdout);
208 vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
209 itte = bow_table.end();
210 for (itt = bow_table.begin(); itt != itte; ++itt)
211 *itt = OriginalBow(*itt);
212 printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
213 }
214
215 TLexicon
216 read_lexicon(const char* filename)
217 {
218 printf("Loading lexicon..."); fflush(stdout);
219 static char word[1024 * 10];
220 FILE* f_lex = fopen(filename, "r");
221 TLexicon lexicon;
222 while (fgets(word, sizeof(word), f_lex)) {
223 if (strlen(word) > 0) {
224 // skip to the first non hanzi character
225 char* p = word;
226 while (*p == ' ' || *p == '\t')
227 ++p;
228 while (*p != 0 && *p != ' ' && *p != '\t')
229 ++p;
230 if (*p == 0) continue;
231 *p++ = 0;
232 // skip to the word_id
233 while (*p == ' ' || *p == '\t')
234 ++p;
235 if (!(*p >= '0' && *p <= '9')) continue;
236
237 int id;
238 for (id = 0; *p >= '0' && *p <= '9'; ++p)
239 id = 10 * id + (*p - '0');
240 lexicon[std::string(word)] = id;
241 }
242 }
243 fclose(f_lex);
244 printf("done.\n"); fflush(stdout);
245
246 return lexicon;
247 }
248
249
250
251 //
252 // filename [in]
253 // pr_table [in]
254 // bow_table [in]
255 // levels[0] [in]
256 // ...
257 // levels[N] [in]
258 // lastLevel [in]
259 //
260 void
261 write_out(const char* filename, const CArpaSlm& slm,
262 CompressedTable& pr_table, CompressedTable& bow_table,
263 const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel)
264 {
265 printf("\nWriting out..."); fflush(stdout);
266
267 FILE* fp = fopen(filename, "wb");
268 const int N = slm.getN();
269 fwrite(&N, sizeof(int), 1, fp);
270 const unsigned usingLogPr = slm.usingLogPr();
271 fwrite(&usingLogPr, sizeof(unsigned), 1, fp);
272
273 for (int lvl = 0; lvl <= N; ++lvl) {
274 unsigned len = slm.getLevelSize(lvl) + 1;
275 fwrite(&len, sizeof(unsigned), 1, fp);
276 }
277
278 for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR);
279 ++i) {
280 if (i < sz) {
281 fwrite(&pr_table[i], sizeof(float), 1, fp);
282 } else {
283 float dummy = 0.0F;
284 fwrite(&dummy, sizeof(float), 1, fp);
285 }
286 }
287
288 for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW);
289 ++i) {
290 if (i < sz) {
291 fwrite(&bow_table[i], sizeof(float), 1, fp);
292 } else {
293 float dummy = 0.0F;
294 fwrite(&dummy, sizeof(float), 1, fp);
295 }
296 }
297
298 for (int lvl = 0; lvl < N; ++lvl) {
299 fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
300 lvl) + 1, fp);
301 }
302
303 fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N) + 1, fp);
304
305 fclose(fp);
306
307 printf("done!\n"); fflush(stdout);
308 }
309
310
311 void
312 cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
313 TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel)
314 {
315 for (unsigned lvl = 0; lvl < levels.size(); ++lvl)
316 delete[] levels[lvl];
317 delete[] lastLevel;
318 bow_table.clear();
319 pr_table.clear();
320 }
321
322 int
323 main(int argc, char* argv[])
324 {
325 if (argc != 4)
326 ShowUsage(argv[0]);
327 const char* arpa_path = argv[1];
328 const char* lexicon_path = argv[2];
329 const char* threaded_path = argv[3];
330
331 CArpaSlm slm;
332 TLexicon lexicon = read_lexicon(lexicon_path);
333 slm.load(arpa_path, lexicon);
334
335 if (!slm.good()) {
336 std::cerr << "Failed to load language model from " << arpa_path <<
337 "." << std::endl;
338 exit(1);
339 }
340 slm.threading();
341
342 EffRealMap pr_eff, bow_eff; // effval --> val
343 FreqMap pr_values, bow_values; // effval --> freq
344 build_map(slm, pr_eff, pr_values, bow_eff, bow_values);
345
346 RealIndexMap pr_map, bow_map; // result: val --> int
347 CompressedTable pr_table, bow_table; // result: val vector
348 group_values(slm.usingLogPr(),
349 pr_eff, pr_values, pr_table, pr_map,
350 bow_eff, bow_values, bow_table, bow_map);
351 pr_values.clear();
352 bow_values.clear();
353
354 TNodeLevels levels;
355 CThreadSlm::TLeaf* lastLevel;
356 compress(slm, pr_table, pr_map, bow_table, bow_map,
357 levels, lastLevel);
358
359 pr_map.clear();
360 bow_map.clear();
361 write_out(threaded_path, slm, pr_table, bow_table, levels, lastLevel);
362
363 cleanup(pr_table, bow_table, levels, lastLevel);
364 return 0;
365 }
366
367 // -*- indent-tabs-mode: nil -*- vim:et:ts=4
00 #!/usr/bin/@MAKE@ -f
11 # -*- mode: makefile; indent-tabs-mode: t -*- vim:noet:ts=4
22
3 # In case of problems, also try the following ${DL_HOST} values:
4 # (copied from Gentoo's `thirdpartymirrors' file)
5 # http://aarnet.dl.sourceforge.net
6 # http://colocrossing.dl.sourceforge.net
7 # http://cznic.dl.sourceforge.net
8 # http://dfn.dl.sourceforge.net
9 # http://freefr.dl.sourceforge.net
10 # http://garr.dl.sourceforge.net
11 # http://heanet.dl.sourceforge.net
12 # http://hivelocity.dl.sourceforge.net
13 # http://ignum.dl.sourceforge.net
14 # http://internode.dl.sourceforge.net
15 # http://iweb.dl.sourceforge.net
16 # http://jaist.dl.sourceforge.net
17 # http://kaz.dl.sourceforge.net
18 # http://kent.dl.sourceforge.net
19 # http://nchc.dl.sourceforge.net
20 # http://ncu.dl.sourceforge.net
21 # http://netcologne.dl.sourceforge.net
22 # http://optimate.dl.sourceforge.net
23 # http://softlayer.dl.sourceforge.net
24 # http://sunet.dl.sourceforge.net
25 # http://surfnet.dl.sourceforge.net
26 # http://switch.dl.sourceforge.net
27 # http://tcpdiag.dl.sourceforge.net
28 # http://ufpr.dl.sourceforge.net
29 # http://waia.dl.sourceforge.net
30 # http://waix.dl.sourceforge.net
31
332 WGET = @WGET@
433 TAR = @TAR@
34 W3M = @W3M@
535 ENDIANNESS = @ENDIANNESS@
6 DATA_DIR = '@DATADIR@/sunpinyin'
36 DATA_DIR = @DATADIR@/sunpinyin
737
8 DL_LIST = 'http://code.google.com/p/open-gram/downloads/list'
9 DL_ROOT = 'http://open-gram.googlecode.com/files/'
38 DL_LIST = https://sourceforge.net/projects/open-gram/files/
39 DL_HOST = http://heanet.dl.sourceforge.net
40 DL_ROOT = ${DL_HOST}/open-gram
1041 DICT_PAT = 'dict\.utf8-[0-9]\+.tar.bz2'
11 TSLM_PAT = 'lm_sc\.t3g\.arpa-[0-9]\+.tar.bz2'
42 SLM_PAT = 'lm_sc\.3gm\.arpa-[0-9]\+.tar.bz2'
1243
13 DICT_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${DICT_PAT} | sort | tail -n 1)
14 TSLM_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${TSLM_PAT} | sort | tail -n 1)
44 DICT_AR = $(shell ${W3M} ${DL_LIST} | grep -o ${DICT_PAT} | sort -u | tail -n 1)
45 SLM_AR = $(shell ${W3M} ${DL_LIST} | grep -o ${SLM_PAT} | sort -u | tail -n 1)
1546
1647 all: install
1748
2152 dict.utf8: ${DICT_AR}
2253 ${TAR} xmf $^
2354
24 ${TSLM_AR}:
55 ${SLM_AR}:
2556 ${WGET} ${DL_ROOT}/$@
2657
27 lm_sc.t3g.arpa: ${TSLM_AR}
58 lm_sc.3gm.arpa: ${SLM_AR}
2859 ${TAR} xmf $^
2960
30 lm_sc.t3g.orig: dict.utf8 lm_sc.t3g.arpa
31 tslmpack lm_sc.t3g.arpa dict.utf8 $@
61 lm_sc.3gm: lm_sc.3gm.arpa dict.utf8
62 slmpack $^ $@
63
64 lm_sc.t3g.orig: lm_sc.3gm
65 slmthread $^ $@
3266
3367 lm_sc.t3g: lm_sc.t3g.orig
3468 tslmendian -e ${ENDIANNESS} -i $^ -o $@
3569
3670 pydict_sc.bin: dict.utf8 lm_sc.t3g
3771 genpyt -e ${ENDIANNESS} -i dict.utf8 -s lm_sc.t3g \
38 -l lm_sc.t3g.log -o $@
72 -l pydict_sc.log -o $@
3973
4074 install: lm_sc.t3g pydict_sc.bin
4175 install -d ${DATA_DIR}
42 install -Dm644 $^ ${DATA_DIR}
76 install -m644 $^ ${DATA_DIR}
4377
4478 clean:
45 rm -rf ${DICT_AR} ${TSLM_AR} dict.utf8 lm_sc.t3g.arpa \
46 lm_sc.t3g.orig lm_sc.t3g lm_sc.t3g.log pydict_sc.bin
47
79 rm -rf ${DICT_AR} ${SLM_AR} dict.utf8 lm_sc.3gm.arpa lm_sc.3gm \
80 lm_sc.t3g.orig lm_sc.t3g pydict_sc.log pydict_sc.bin