Codebase list teckit / upstream/2.5.4_svn140+ds1
Imported Upstream version 2.5.4~svn140+ds1 Daniel Glassey 7 years ago
88 changed file(s) with 118202 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 AUTHORS
1
2 TECkit is written by Jonathan Kew, SIL International.
3
4 The Perl interface is by Martin Hosken, and the JNI interface is by Keith Stribley.
5 Thanks to Ulrik Petersen for patches to improve portability, especially for MS VC++
6 and for 64-bit systems. Authors of other library code included with TECkit are as
7 noted in the relevant source files.
0 see license/LICENSING.txt
0 (Not here. See the Subversion logs.)
0 INSTALL
1
2 Note: If you checked out teckit from svn, or for any other reason don't have
3 a file called "configure", you will need to initialize the directory using
4 GNU autotools. Instructions for this are given further down.
5
6 For typical Unix-like systems:
7
8 ./configure
9 make
10 [sudo] make install
11
12 Installs tools into /usr/local/bin, with libraries and header files in
13 /usr/local/lib and /usr/local/include respectively.
14
15 Typical configure options such as --prefix should work, though little testing
16 has been done.
17
18 The tools installed are:
19 teckit_compile
20 compiler to create binary mapping tables (.tec) from text files (.map)
21 txtconv
22 simple tool to apply a mapping to a plain-text file
23 sfconv
24 tool to apply mappings to a Standard Format file, as specified by
25 a control file
26
27 BUILDING for WINDOWS
28
29 You will need to install MinGW.
30
31 On Mac or Linux, you can use the package system as follows:
32
33 gcc-mingw32 (Debian/Ubuntu)
34 i386-mingw32-gcc (MacPorts)
35
36 You may also need some other packages on Mac, such as updated autotools.
37
38 On Windows, install MinGW+MSys. Probably the easiest way to do this is to
39 download and run mingw-get-inst. Check the options for C++ and MSYS Basic.
40 Once installed, you can open a Unix-like shell by running MinGW > MinGW Shell
41 from the All Programs menu. File and directory names use forward slashes in MSys,
42 and C:\ is represented as /c
43
44 Using the cd command, navigate to the place where you checked out or unzipped
45 the teckit sources. Note that if there are spaces in any of the parent directory
46 names you may experience difficulty building. If so, move the source directory
47 to a location that doesn't involve spaces (eg C:\src\teckit).
48
49 If necessary, follow the GNU AUTOTOOLS instructions below, then run:
50
51 ./build-windows-binaries.sh
52
53 This will create executables and DLLs in teckit-windows-bin. Documentation,
54 header files, sample tools, etc. can be added and the result zipped to produce
55 a release.
56
57 BUILDING a LINUX PACKAGE
58
59 Run the following script:
60
61 ./build-linux-package.sh
62
63 The results will be in the teckit-linux subdirectory.
64
65 BUILDING a MAC PACKAGE
66
67 Run the following script:
68
69 ./build-mac-binaries.sh
70
71 This will create files in the teckit-mac subdirectory. To make the package itself:
72
73 cd mac-installer
74 ./create-pkg.sh
75
76 The result will be a file called TECkit.dmg containing a single .pkg file.
77
78 GNU AUTOTOOLS
79
80 If you don't have a file called "configure" in the top-level teckit source
81 directory, you will need to initialize the directory using GNU autotools.
82
83 On Windows, install the autotools by running:
84
85 mingw-get install mingw32-autotools
86
87 On Mac or Linux, use the package system (MacPorts on Mac).
88
89 Then, on all platforms, run:
90
91 ./autogen.sh
92
93 in the teckit source directory. This normally needs to be done only once, but
94 if you update the directory from svn you may need to run it again.
# Top-level Automake input: names the sub-directories to build, the
# pkg-config file installed under $(libdir)/pkgconfig, and the licence
# texts added to the distribution through EXTRA_DIST.
0 ACLOCAL_AMFLAGS = -I m4
1
2 SUBDIRS = lib bin docs test
3
4 pkgconfigdir = $(libdir)/pkgconfig
5 pkgconfig_DATA = teckit.pc
6
7 EXTRA_DIST = license/License_CPLv05.txt
8 EXTRA_DIST += license/License_LGPLv21.txt
9 EXTRA_DIST += license/LICENSING.txt
10
0 # Makefile.in generated by automake 1.14.1 from Makefile.am.
1 # @configure_input@
2
3 # Copyright (C) 1994-2013 Free Software Foundation, Inc.
4
5 # This Makefile.in is free software; the Free Software Foundation
6 # gives unlimited permission to copy and/or distribute it,
7 # with or without modifications, as long as this notice is preserved.
8
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY, to the extent permitted by law; without
11 # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
12 # PARTICULAR PURPOSE.
13
14 @SET_MAKE@
15
16 VPATH = @srcdir@
# Helpers for inspecting the flags make was invoked with.
# am__is_gnu_make succeeds when both MAKEFILE_LIST and MAKELEVEL are non-empty.
# am__make_running_with_option expects the shell variable target_option to hold
# a single character; it scans $MAKEFLAGS (or $MFLAGS under GNU make) word by
# word, skipping variable assignments, long options and the arguments of
# options that take one, and succeeds if any remaining word contains that
# character.
# am__make_dryrun and am__make_keepgoing run the scan for 'n' and 'k'.
17 am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
18 am__make_running_with_option = \
19 case $${target_option-} in \
20 ?) ;; \
21 *) echo "am__make_running_with_option: internal error: invalid" \
22 "target option '$${target_option-}' specified" >&2; \
23 exit 1;; \
24 esac; \
25 has_opt=no; \
26 sane_makeflags=$$MAKEFLAGS; \
27 if $(am__is_gnu_make); then \
28 sane_makeflags=$$MFLAGS; \
29 else \
30 case $$MAKEFLAGS in \
31 *\\[\ \ ]*) \
32 bs=\\; \
33 sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
34 | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
35 esac; \
36 fi; \
37 skip_next=no; \
38 strip_trailopt () \
39 { \
40 flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
41 }; \
42 for flg in $$sane_makeflags; do \
43 test $$skip_next = yes && { skip_next=no; continue; }; \
44 case $$flg in \
45 *=*|--*) continue;; \
46 -*I) strip_trailopt 'I'; skip_next=yes;; \
47 -*I?*) strip_trailopt 'I';; \
48 -*O) strip_trailopt 'O'; skip_next=yes;; \
49 -*O?*) strip_trailopt 'O';; \
50 -*l) strip_trailopt 'l'; skip_next=yes;; \
51 -*l?*) strip_trailopt 'l';; \
52 -[dEDm]) skip_next=yes;; \
53 -[JT]) skip_next=yes;; \
54 esac; \
55 case $$flg in \
56 *$$target_option*) has_opt=yes; break;; \
57 esac; \
58 done; \
59 test $$has_opt = yes
60 am__make_dryrun = (target_option=n; $(am__make_running_with_option))
61 am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
# Package-specific directories (each configure directory with /@PACKAGE@
# appended), install-sh based install commands, install/uninstall phase
# markers (all set to the shell no-op ':'), and the build/host/target
# triplets substituted by configure. subdir is '.' because this is the
# top-level Makefile.
62 pkgdatadir = $(datadir)/@PACKAGE@
63 pkgincludedir = $(includedir)/@PACKAGE@
64 pkglibdir = $(libdir)/@PACKAGE@
65 pkglibexecdir = $(libexecdir)/@PACKAGE@
66 am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
67 install_sh_DATA = $(install_sh) -c -m 644
68 install_sh_PROGRAM = $(install_sh) -c
69 install_sh_SCRIPT = $(install_sh) -c
70 INSTALL_HEADER = $(INSTALL_DATA)
71 transform = $(program_transform_name)
72 NORMAL_INSTALL = :
73 PRE_INSTALL = :
74 POST_INSTALL = :
75 NORMAL_UNINSTALL = :
76 PRE_UNINSTALL = :
77 POST_UNINSTALL = :
78 build_triplet = @build@
79 host_triplet = @host@
80 target_triplet = @target@
81 subdir = .
# DIST_COMMON: build-system files that are always part of $(DISTFILES).
# am__configure_deps: the m4/autoconf inputs that the configure, Makefile.in
# and config.h.in rules list as prerequisites.
# CONFIG_CLEAN_FILES: the .pc files removed again by distclean-generic.
# AM_V_P / AM_V_GEN / AM_V_at: verbosity switches selected by the @AM_V@ and
# @AM_DEFAULT_V@ substitutions; level 0 prints a short " GEN " line or hides
# the command with '@', level 1 expands to nothing.
82 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
83 $(top_srcdir)/configure $(am__configure_deps) \
84 $(srcdir)/config.h.in $(srcdir)/installed-top.pc.in \
85 $(srcdir)/teckit.pc.in $(srcdir)/uninstalled-top.pc.in AUTHORS \
86 COPYING ChangeLog INSTALL NEWS README compile config.guess \
87 config.sub depcomp install-sh missing ltmain.sh
88 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
89 am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \
90 $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
91 $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
92 $(top_srcdir)/zlib-1.2.3/withenable.ac \
93 $(top_srcdir)/zlib-1.2.3/zlib.ac $(top_srcdir)/configure.ac
94 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
95 $(ACLOCAL_M4)
96 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
97 configure.lineno config.status.lineno
98 mkinstalldirs = $(install_sh) -d
99 CONFIG_HEADER = config.h
100 CONFIG_CLEAN_FILES = teckit.pc teckit-uninstalled.pc
101 CONFIG_CLEAN_VPATH_FILES =
102 AM_V_P = $(am__v_P_@AM_V@)
103 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
104 am__v_P_0 = false
105 am__v_P_1 = :
106 AM_V_GEN = $(am__v_GEN_@AM_V@)
107 am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
108 am__v_GEN_0 = @echo " GEN " $@;
109 am__v_GEN_1 =
110 AM_V_at = $(am__v_at_@AM_V@)
111 am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
112 am__v_at_0 = @
113 am__v_at_1 =
114 SOURCES =
115 DIST_SOURCES =
# RECURSIVE_TARGETS: the non-clean targets that are forwarded to every
# sub-directory by the $(am__recursive_targets) rule.
# am__can_run_installinfo: fails when AM_UPDATE_INFO_DIR is n, no or NO;
# otherwise succeeds only if 'install-info --version' can be run.
116 RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
117 ctags-recursive dvi-recursive html-recursive info-recursive \
118 install-data-recursive install-dvi-recursive \
119 install-exec-recursive install-html-recursive \
120 install-info-recursive install-pdf-recursive \
121 install-ps-recursive install-recursive installcheck-recursive \
122 installdirs-recursive pdf-recursive ps-recursive \
123 tags-recursive uninstall-recursive
124 am__can_run_installinfo = \
125 case $$AM_UPDATE_INFO_DIR in \
126 n|no|NO) false;; \
127 *) (install-info --version) >/dev/null 2>&1;; \
128 esac
# Shell fragments shared by the install and uninstall rules.
# am__vpath_adj / am__strip_dir: set $f to $p with the $(srcdir)/ prefix, or
# with every leading directory, removed.
# am__nobase_list: groups the names in $list by directory and prints each
# directory followed by its files, flushing a group once it reaches
# $(am__install_max) entries.
# am__base_list: joins stdin lines eight at a time and then five at a time,
# i.e. up to 40 names per output line.
# am__uninstall_files_from_dir: does nothing if $files is empty or $dir does
# not exist; otherwise echoes and runs 'rm -f $files' inside $dir.
129 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
130 am__vpath_adj = case $$p in \
131 $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
132 *) f=$$p;; \
133 esac;
134 am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
135 am__install_max = 40
136 am__nobase_strip_setup = \
137 srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
138 am__nobase_strip = \
139 for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
140 am__nobase_list = $(am__nobase_strip_setup); \
141 for p in $$list; do echo "$$p $$p"; done | \
142 sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
143 $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
144 if (++n[$$2] == $(am__install_max)) \
145 { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
146 END { for (dir in files) print dir, files[dir] }'
147 am__base_list = \
148 sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
149 sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
150 am__uninstall_files_from_dir = { \
151 test -z "$$files" \
152 || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
153 || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
154 $(am__cd) "$$dir" && rm -f $$files; }; \
155 }
# Installation directory list, recursive-target bookkeeping, tag-file helpers
# and distribution naming.
# am__recursive_targets is the full set handled by the recursive rule:
# RECURSIVE_TARGETS, the four clean variants, and any extra targets.
# am__define_uniq_tagged_files sets the shell variable $unique to the tagged
# files, each taken as-is when it exists in the build directory and prefixed
# with $(srcdir)/ otherwise, with duplicates removed.
# DISTFILES is everything the distdir rule copies; distdir is named
# $(PACKAGE)-$(VERSION).
156 am__installdirs = "$(DESTDIR)$(pkgconfigdir)"
157 DATA = $(pkgconfig_DATA)
158 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \
159 distclean-recursive maintainer-clean-recursive
160 am__recursive_targets = \
161 $(RECURSIVE_TARGETS) \
162 $(RECURSIVE_CLEAN_TARGETS) \
163 $(am__extra_recursive_targets)
164 AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
165 cscope distdir dist dist-all distcheck
166 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
167 $(LISP)config.h.in
168 # Read a list of newline-separated strings from the standard input,
169 # and print each of them once, without duplicates. Input order is
170 # *not* preserved.
171 am__uniquify_input = $(AWK) '\
172 BEGIN { nonempty = 0; } \
173 { items[$$0] = 1; nonempty = 1; } \
174 END { if (nonempty) { for (i in items) print i; }; } \
175 '
176 # Make sure the list of sources is unique. This is necessary because,
177 # e.g., the same source file might be shared among _SOURCES variables
178 # for different programs/libraries.
179 am__define_uniq_tagged_files = \
180 list='$(am__tagged_files)'; \
181 unique=`for i in $$list; do \
182 if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
183 done | $(am__uniquify_input)`
184 ETAGS = etags
185 CTAGS = ctags
186 CSCOPE = cscope
187 DIST_SUBDIRS = $(SUBDIRS)
188 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
189 distdir = $(PACKAGE)-$(VERSION)
190 top_distdir = $(distdir)
# am__remove_distdir: if $(distdir) exists, make every directory in it that
# lacks owner write permission writable and delete the tree; if that fails,
# wait 5 seconds and try the delete once more.
# am__post_remove_distdir is the same command; the 'dist' rule overrides it
# with '@:' for its sub-make so the tree is only removed once, at the end.
191 am__remove_distdir = \
192 if test -d "$(distdir)"; then \
193 find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \
194 && rm -rf "$(distdir)" \
195 || { sleep 5 && rm -rf "$(distdir)"; }; \
196 else :; fi
197 am__post_remove_distdir = $(am__remove_distdir)
# am__relativize: shell fragment that reads the variables dir1 and dir2 and
# sets reldir to dir2 rewritten so that it can be used from inside dir1.
# It consumes dir1 one path component at a time: '.' is ignored; '..' moves
# dir0 (initially the current directory) up and prepends the component just
# left to dir2; any other component is either stripped from the front of dir2
# when it matches, or compensated for by prepending '../' to dir2.
198 am__relativize = \
199 dir0=`pwd`; \
200 sed_first='s,^\([^/]*\)/.*$$,\1,'; \
201 sed_rest='s,^[^/]*/*,,'; \
202 sed_last='s,^.*/\([^/]*\)$$,\1,'; \
203 sed_butlast='s,/*[^/]*$$,,'; \
204 while test -n "$$dir1"; do \
205 first=`echo "$$dir1" | sed -e "$$sed_first"`; \
206 if test "$$first" != "."; then \
207 if test "$$first" = ".."; then \
208 dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
209 dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
210 else \
211 first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
212 if test "$$first2" = "$$first"; then \
213 dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
214 else \
215 dir2="../$$dir2"; \
216 fi; \
217 dir0="$$dir0"/"$$first"; \
218 fi; \
219 fi; \
220 dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
221 done; \
222 reldir="$$dir2"
# Default distribution format (a gzip tarball built by dist-gzip with
# GZIP=--best) and the file-listing commands used by distuninstallcheck and
# distcleancheck. The uninstall listing rewrites './' to $(prefix)/ and
# ignores $(infodir)/dir.
223 DIST_ARCHIVES = $(distdir).tar.gz
224 GZIP_ENV = --best
225 DIST_TARGETS = dist-gzip
226 distuninstallcheck_listfiles = find . -type f -print
227 am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
228 | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$'
229 distcleancheck_listfiles = find . -type f -print
230 ACLOCAL = @ACLOCAL@
231 AMTAR = @AMTAR@
232 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
233 AR = @AR@
234 AUTOCONF = @AUTOCONF@
235 AUTOHEADER = @AUTOHEADER@
236 AUTOMAKE = @AUTOMAKE@
237 AWK = @AWK@
238 CC = @CC@
239 CCDEPMODE = @CCDEPMODE@
240 CFLAGS = @CFLAGS@
241 CPP = @CPP@
242 CPPFLAGS = @CPPFLAGS@
243 CXX = @CXX@
244 CXXCPP = @CXXCPP@
245 CXXDEPMODE = @CXXDEPMODE@
246 CXXFLAGS = @CXXFLAGS@
247 CYGPATH_W = @CYGPATH_W@
248 DEFS = @DEFS@
249 DEPDIR = @DEPDIR@
250 DLLTOOL = @DLLTOOL@
251 DSYMUTIL = @DSYMUTIL@
252 DUMPBIN = @DUMPBIN@
253 ECHO_C = @ECHO_C@
254 ECHO_N = @ECHO_N@
255 ECHO_T = @ECHO_T@
256 EGREP = @EGREP@
257 EXEEXT = @EXEEXT@
258 FGREP = @FGREP@
259 GREP = @GREP@
260 INSTALL = @INSTALL@
261 INSTALL_DATA = @INSTALL_DATA@
262 INSTALL_PROGRAM = @INSTALL_PROGRAM@
263 INSTALL_SCRIPT = @INSTALL_SCRIPT@
264 INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
265 LD = @LD@
266 LDFLAGS = @LDFLAGS@
267 LDZLIB = @LDZLIB@
268 LIBOBJS = @LIBOBJS@
269 LIBS = @LIBS@
270 LIBTOOL = @LIBTOOL@
271 LIPO = @LIPO@
272 LN_S = @LN_S@
273 LTLIBOBJS = @LTLIBOBJS@
274 MAINT = @MAINT@
275 MAKEINFO = @MAKEINFO@
276 MANIFEST_TOOL = @MANIFEST_TOOL@
277 MKDIR_P = @MKDIR_P@
278 NM = @NM@
279 NMEDIT = @NMEDIT@
280 OBJDUMP = @OBJDUMP@
281 OBJEXT = @OBJEXT@
282 OTOOL = @OTOOL@
283 OTOOL64 = @OTOOL64@
284 PACKAGE = @PACKAGE@
285 PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
286 PACKAGE_NAME = @PACKAGE_NAME@
287 PACKAGE_STRING = @PACKAGE_STRING@
288 PACKAGE_TARNAME = @PACKAGE_TARNAME@
289 PACKAGE_URL = @PACKAGE_URL@
290 PACKAGE_VERSION = @PACKAGE_VERSION@
291 PATH_SEPARATOR = @PATH_SEPARATOR@
292 RANLIB = @RANLIB@
293 RC = @RC@
294 SED = @SED@
295 SET_MAKE = @SET_MAKE@
296 SHELL = @SHELL@
297 STRIP = @STRIP@
298 VERSION = @VERSION@
299 ZLIBCPPFLAGS = @ZLIBCPPFLAGS@
300 ZLIBDEP = @ZLIBDEP@
301 ZLIBDIR = @ZLIBDIR@
302 abs_builddir = @abs_builddir@
303 abs_srcdir = @abs_srcdir@
304 abs_top_builddir = @abs_top_builddir@
305 abs_top_srcdir = @abs_top_srcdir@
306 ac_ct_AR = @ac_ct_AR@
307 ac_ct_CC = @ac_ct_CC@
308 ac_ct_CXX = @ac_ct_CXX@
309 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
310 am__include = @am__include@
311 am__leading_dot = @am__leading_dot@
312 am__quote = @am__quote@
313 am__tar = @am__tar@
314 am__untar = @am__untar@
315 bindir = @bindir@
316 build = @build@
317 build_alias = @build_alias@
318 build_cpu = @build_cpu@
319 build_os = @build_os@
320 build_vendor = @build_vendor@
321 builddir = @builddir@
322 datadir = @datadir@
323 datarootdir = @datarootdir@
324 docdir = @docdir@
325 dvidir = @dvidir@
326 exec_prefix = @exec_prefix@
327 expat_CFLAGS = @expat_CFLAGS@
328 expat_LIBS = @expat_LIBS@
329 host = @host@
330 host_alias = @host_alias@
331 host_cpu = @host_cpu@
332 host_os = @host_os@
333 host_vendor = @host_vendor@
334 htmldir = @htmldir@
335 includedir = @includedir@
336 infodir = @infodir@
337 install_sh = @install_sh@
338 libdir = @libdir@
339 libexecdir = @libexecdir@
340 localedir = @localedir@
341 localstatedir = @localstatedir@
342 mandir = @mandir@
343 mkdir_p = @mkdir_p@
344 oldincludedir = @oldincludedir@
345 pdfdir = @pdfdir@
346 prefix = @prefix@
347 program_transform_name = @program_transform_name@
348 psdir = @psdir@
349 sbindir = @sbindir@
350 sharedstatedir = @sharedstatedir@
351 srcdir = @srcdir@
352 sysconfdir = @sysconfdir@
353 target = @target@
354 target_alias = @target_alias@
355 target_cpu = @target_cpu@
356 target_os = @target_os@
357 target_vendor = @target_vendor@
358 top_build_prefix = @top_build_prefix@
359 top_builddir = @top_builddir@
360 top_srcdir = @top_srcdir@
# Variables carried over from Makefile.am: the sub-directories to recurse
# into, the pkg-config file and its install directory, and the licence files
# added to the distribution.
361 ACLOCAL_AMFLAGS = -I m4
362 SUBDIRS = lib bin docs test
363 pkgconfigdir = $(libdir)/pkgconfig
364 pkgconfig_DATA = teckit.pc
365 EXTRA_DIST = license/License_CPLv05.txt license/License_LGPLv21.txt \
366 license/LICENSING.txt
# Default target: make sure config.h exists, then build everything through
# the recursive 'all' pass.
367 all: config.h
368 $(MAKE) $(AM_MAKEFLAGS) all-recursive
369
# .SUFFIXES is emptied, and am--refresh is a no-op target that depends on
# Makefile.
# Regenerating Makefile.in: if any out-of-date prerequisite appears in
# $(am__configure_deps), run '$(AUTOMAKE) --foreign' for the whole source
# tree and stop (exit 0 on success, 1 on failure); otherwise run it from
# $(top_srcdir) for this Makefile only.
370 .SUFFIXES:
371 am--refresh: Makefile
372 @:
373 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
374 @for dep in $?; do \
375 case '$(am__configure_deps)' in \
376 *$$dep*) \
377 echo ' cd $(srcdir) && $(AUTOMAKE) --foreign'; \
378 $(am__cd) $(srcdir) && $(AUTOMAKE) --foreign \
379 && exit 0; \
380 exit 1;; \
381 esac; \
382 done; \
383 echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \
384 $(am__cd) $(top_srcdir) && \
385 $(AUTOMAKE) --foreign Makefile
# Rebuild Makefile with config.status. If config.status is among the changed
# prerequisites, run it in full; otherwise ask it to regenerate only this
# Makefile. .PRECIOUS keeps make from deleting Makefile if the rule is
# interrupted.
386 .PRECIOUS: Makefile
387 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
388 @case '$?' in \
389 *config.status*) \
390 echo ' $(SHELL) ./config.status'; \
391 $(SHELL) ./config.status;; \
392 *) \
393 echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
394 cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
395 esac;
396
# Autotools regeneration chain:
#   config.status is refreshed with './config.status --recheck' when configure
#   changes; configure is rebuilt with $(AUTOCONF) and aclocal.m4 with
#   $(ACLOCAL) $(ACLOCAL_AMFLAGS), both run from $(srcdir).
# The final rule has no prerequisites and no recipe, so the m4 inputs are
# never rebuilt by make.
397 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
398 $(SHELL) ./config.status --recheck
399
400 $(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
401 $(am__cd) $(srcdir) && $(AUTOCONF)
402 $(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
403 $(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
404 $(am__aclocal_m4_deps):
405
# config.h is tracked through the stamp file stamp-h1.
#   config.h:     if the header is missing, delete the stamp and re-make it.
#   stamp-h1:     remove the stamp, then run 'config.status config.h'.
#   config.h.in:  regenerate with $(AUTOHEADER), drop the stamp, and touch
#                 the template.
#   distclean-hdr removes both config.h and the stamp.
406 config.h: stamp-h1
407 @test -f $@ || rm -f stamp-h1
408 @test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1
409
410 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
411 @rm -f stamp-h1
412 cd $(top_builddir) && $(SHELL) ./config.status config.h
413 $(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
414 ($(am__cd) $(top_srcdir) && $(AUTOHEADER))
415 rm -f stamp-h1
416 touch $@
417
418 distclean-hdr:
419 -rm -f config.h stamp-h1
# Generate the installed and uninstalled pkg-config files by asking
# config.status for the target. Each depends on its own "top" template plus
# the shared teckit.pc.in.
420 teckit.pc: $(top_builddir)/config.status $(srcdir)/installed-top.pc.in $(srcdir)/teckit.pc.in
421 cd $(top_builddir) && $(SHELL) ./config.status $@
422 teckit-uninstalled.pc: $(top_builddir)/config.status $(srcdir)/uninstalled-top.pc.in $(srcdir)/teckit.pc.in
423 cd $(top_builddir) && $(SHELL) ./config.status $@
424
# Libtool clean-up at three levels: *.lo files, the .libs/_libs directories,
# and the generated libtool script together with config.lt. The leading '-'
# makes each removal non-fatal.
425 mostlyclean-libtool:
426 -rm -f *.lo
427
428 clean-libtool:
429 -rm -rf .libs _libs
430
431 distclean-libtool:
432 -rm -f libtool config.lt
# Install $(pkgconfig_DATA) into $(DESTDIR)$(pkgconfigdir).
# The list is emptied when pkgconfigdir is empty. If anything remains, the
# directory is created, each file is looked up in the build directory first
# and $(srcdir) second, and the names are batched through $(am__base_list) so
# that $(INSTALL_DATA) is invoked once per batch. A failing install aborts
# with its exit status.
433 install-pkgconfigDATA: $(pkgconfig_DATA)
434 @$(NORMAL_INSTALL)
435 @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \
436 if test -n "$$list"; then \
437 echo " $(MKDIR_P) '$(DESTDIR)$(pkgconfigdir)'"; \
438 $(MKDIR_P) "$(DESTDIR)$(pkgconfigdir)" || exit 1; \
439 fi; \
440 for p in $$list; do \
441 if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
442 echo "$$d$$p"; \
443 done | $(am__base_list) | \
444 while read files; do \
445 echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgconfigdir)'"; \
446 $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgconfigdir)" || exit $$?; \
447 done
448
# Remove the installed pkg-config files: reduce each entry of
# $(pkgconfig_DATA) to its base name and delete those names from
# $(DESTDIR)$(pkgconfigdir) via $(am__uninstall_files_from_dir).
449 uninstall-pkgconfigDATA:
450 @$(NORMAL_UNINSTALL)
451 @list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \
452 files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
453 dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir)
454
455 # This directory's subdirectories are mostly independent; you can cd
456 # into them and run 'make' without going through this Makefile.
457 # To change the values of 'make' variables: instead of editing Makefiles,
458 # (1) if the variable is set in 'config.status', edit 'config.status'
459 # (which will cause the Makefiles to be regenerated when you run 'make');
460 # (2) otherwise, pass the desired values on the 'make' command line.
# Generic rule behind every *-recursive target.
# The target name minus '-recursive' is run in each sub-directory:
# $(DIST_SUBDIRS) for distclean/maintainer-clean, $(SUBDIRS) otherwise.
# A '.' entry runs the local '<target>-am' in place; if no '.' was listed, the
# local '<target>-am' runs after all sub-directories.
# When make was started with -k a sub-directory failure is only recorded and
# reported at the end; without -k the first failure exits immediately.
461 $(am__recursive_targets):
462 @fail=; \
463 if $(am__make_keepgoing); then \
464 failcom='fail=yes'; \
465 else \
466 failcom='exit 1'; \
467 fi; \
468 dot_seen=no; \
469 target=`echo $@ | sed s/-recursive//`; \
470 case "$@" in \
471 distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
472 *) list='$(SUBDIRS)' ;; \
473 esac; \
474 for subdir in $$list; do \
475 echo "Making $$target in $$subdir"; \
476 if test "$$subdir" = "."; then \
477 dot_seen=yes; \
478 local_target="$$target-am"; \
479 else \
480 local_target="$$target"; \
481 fi; \
482 ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
483 || eval $$failcom; \
484 done; \
485 if test "$$dot_seen" = "no"; then \
486 $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
487 fi; test -z "$$fail"
488
# Tag-file targets.
#   ID      runs mkid over the de-duplicated tagged files.
#   tags    recurses into the sub-directories; TAGS is an alias for it.
#   tags-am builds this directory's etags file. It probes whether $(ETAGS)
#           accepts --etags-include (falling back to --include), adds an
#           include option for each sub-directory that already has a TAGS
#           file, and runs $(ETAGS) unless there are no arguments and no
#           files at all.
489 ID: $(am__tagged_files)
490 $(am__define_uniq_tagged_files); mkid -fID $$unique
491 tags: tags-recursive
492 TAGS: tags
493
494 tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
495 set x; \
496 here=`pwd`; \
497 if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
498 include_option=--etags-include; \
499 empty_fix=.; \
500 else \
501 include_option=--include; \
502 empty_fix=; \
503 fi; \
504 list='$(SUBDIRS)'; for subdir in $$list; do \
505 if test "$$subdir" = .; then :; else \
506 test ! -f $$subdir/TAGS || \
507 set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
508 fi; \
509 done; \
510 $(am__define_uniq_tagged_files); \
511 shift; \
512 if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
513 test -n "$$unique" || unique=$$empty_fix; \
514 if test $$# -gt 0; then \
515 $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
516 "$$@" $$unique; \
517 else \
518 $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
519 $$unique; \
520 fi; \
521 fi
# ctags recurses into the sub-directories (CTAGS is an alias); ctags-am runs
# $(CTAGS) over the de-duplicated tagged files, and is skipped when both
# $(CTAGS_ARGS) and the file list are empty.
522 ctags: ctags-recursive
523
524 CTAGS: ctags
525 ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
526 $(am__define_uniq_tagged_files); \
527 test -z "$(CTAGS_ARGS)$$unique" \
528 || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
529 $$unique
530
# Run 'gtags -i' from the top source directory, passing the absolute path of
# the top build directory as its final argument.
531 GTAGS:
532 here=`$(am__cd) $(top_builddir) && pwd` \
533 && $(am__cd) $(top_srcdir) \
534 && gtags -i $(GTAGS_ARGS) "$$here"
# cscope support.
#   cscope         builds the cross-reference from cscope.files, unless that
#                  file is empty.
#   cscope.files   is recreated from scratch: clean-cscope deletes it, then
#                  cscopelist recurses so every directory appends its entries.
#   cscopelist-am  appends this directory's tagged files to
#                  $(top_builddir)/cscope.files, using $(subdir)/ for files
#                  present in the build directory and the source directory
#                  path for the rest.
535 cscope: cscope.files
536 test ! -s cscope.files \
537 || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS)
538 clean-cscope:
539 -rm -f cscope.files
540 cscope.files: clean-cscope cscopelist
541 cscopelist: cscopelist-recursive
542
543 cscopelist-am: $(am__tagged_files)
544 list='$(am__tagged_files)'; \
545 case "$(srcdir)" in \
546 [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
547 *) sdir=$(subdir)/$(srcdir) ;; \
548 esac; \
549 for i in $$list; do \
550 if test -f "$$i"; then \
551 echo "$(subdir)/$$i"; \
552 else \
553 echo "$$sdir/$$i"; \
554 fi; \
555 done >> $(top_builddir)/cscope.files
556
# Remove every tag and cross-reference database the targets above can create
# (etags, ctags, mkid, GNU global and cscope outputs). Failures are ignored.
557 distclean-tags:
558 -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
559 -rm -f cscope.out cscope.in.out cscope.po.out cscope.files
560
# Assemble the distribution tree in $(distdir).
#  1. Remove any previous tree and create a fresh directory.
#  2. Strip the $(srcdir)/ and $(top_srcdir)/ prefixes from $(DISTFILES),
#     create the needed sub-directories, and copy each entry. Directories are
#     copied recursively (the source-tree copy first, then the build-tree copy
#     over it); plain files are copied only if not already present.
#  3. For every entry of $(DIST_SUBDIRS) other than '.', create the matching
#     directory (skipped under 'make -n') and run 'make distdir' there with
#     distdir and top_distdir rewritten relative to that sub-directory and the
#     removal/length/mode steps disabled.
#  4. Unless am__skip_mode_fix is set, make directories u+rwx,go+rx and files
#     world-readable, falling back to 'chmod -R a+r' if the find pass fails.
561 distdir: $(DISTFILES)
562 $(am__remove_distdir)
563 test -d "$(distdir)" || mkdir "$(distdir)"
564 @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
565 topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
566 list='$(DISTFILES)'; \
567 dist_files=`for file in $$list; do echo $$file; done | \
568 sed -e "s|^$$srcdirstrip/||;t" \
569 -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
570 case $$dist_files in \
571 */*) $(MKDIR_P) `echo "$$dist_files" | \
572 sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
573 sort -u` ;; \
574 esac; \
575 for file in $$dist_files; do \
576 if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
577 if test -d $$d/$$file; then \
578 dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
579 if test -d "$(distdir)/$$file"; then \
580 find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
581 fi; \
582 if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
583 cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
584 find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
585 fi; \
586 cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
587 else \
588 test -f "$(distdir)/$$file" \
589 || cp -p $$d/$$file "$(distdir)/$$file" \
590 || exit 1; \
591 fi; \
592 done
593 @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
594 if test "$$subdir" = .; then :; else \
595 $(am__make_dryrun) \
596 || test -d "$(distdir)/$$subdir" \
597 || $(MKDIR_P) "$(distdir)/$$subdir" \
598 || exit 1; \
599 dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
600 $(am__relativize); \
601 new_distdir=$$reldir; \
602 dir1=$$subdir; dir2="$(top_distdir)"; \
603 $(am__relativize); \
604 new_top_distdir=$$reldir; \
605 echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
606 echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
607 ($(am__cd) $$subdir && \
608 $(MAKE) $(AM_MAKEFLAGS) \
609 top_distdir="$$new_top_distdir" \
610 distdir="$$new_distdir" \
611 am__remove_distdir=: \
612 am__skip_length_check=: \
613 am__skip_mode_fix=: \
614 distdir) \
615 || exit 1; \
616 fi; \
617 done
618 -test -n "$(am__skip_mode_fix)" \
619 || find "$(distdir)" -type d ! -perm -755 \
620 -exec chmod u+rwx,go+rx {} \; -o \
621 ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
622 ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
623 ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \
624 || chmod -R a+r "$(distdir)"
# Tarball targets. Each one builds the distribution tree, pipes $(am__tar)
# through the named compressor into $(distdir).tar.<ext>, and then removes
# the tree. Compression options come from GZIP_ENV for gzip, and from
# BZIP2 / LZIP_OPT / XZ_OPT in the environment, with -9, -9 and -e as the
# respective defaults.
625 dist-gzip: distdir
626 tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
627 $(am__post_remove_distdir)
628
629 dist-bzip2: distdir
630 tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2
631 $(am__post_remove_distdir)
632
633 dist-lzip: distdir
634 tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz
635 $(am__post_remove_distdir)
636
637 dist-xz: distdir
638 tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz
639 $(am__post_remove_distdir)
640
# Build $(distdir).tar.Z by piping $(am__tar) through the legacy 'compress'
# program, then remove the distribution tree. The format is deprecated, so a
# warning naming 'compress' is printed on stderr first.
641 dist-tarZ: distdir
642 @echo WARNING: "Support for distribution archives compressed with" \
643 "legacy program 'compress' is deprecated." >&2
644 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2
645 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
646 $(am__post_remove_distdir)
647
# Build $(distdir).shar.gz by running 'shar' over the distribution tree and
# gzip-compressing the result, then remove the tree. The format is
# deprecated, so a warning naming shar archives is printed on stderr first.
648 dist-shar: distdir
649 @echo WARNING: "Support for shar distribution archives is" \
650 "deprecated." >&2
651 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2
652 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
653 $(am__post_remove_distdir)
654
# dist-zip replaces any existing $(distdir).zip with a fresh recursive
# archive of the distribution tree.
# dist / dist-all build every format in $(DIST_TARGETS) in one sub-make with
# tree removal turned into a no-op ('@:'), then remove the tree once at the
# end.
655 dist-zip: distdir
656 -rm -f $(distdir).zip
657 zip -rq $(distdir).zip $(distdir)
658 $(am__post_remove_distdir)
659
660 dist dist-all:
661 $(MAKE) $(AM_MAKEFLAGS) $(DIST_TARGETS) am__post_remove_distdir='@:'
662 $(am__post_remove_distdir)
663
664 # This target untars the dist file and tries a VPATH configuration. Then
665 # it guarantees that the distribution is self-contained by making another
666 # tarfile.
# Steps performed:
#  1. Unpack whichever archive type is listed in $(DIST_ARCHIVES).
#  2. Make the unpacked tree read-only, leaving only the freshly created
#     _build and _inst directories writable.
#  3. From _build, run ../configure with --srcdir=.. and --prefix pointing at
#     _inst, then: make, dvi, check, install, installcheck, uninstall and
#     distuninstallcheck against the install prefix.
#  4. Make the install prefix read-only and repeat install, uninstall and
#     distuninstallcheck with DESTDIR set to a private temporary directory,
#     which is removed afterwards whether or not the steps succeed.
#  5. Run 'make dist' from the build tree, delete the resulting archives, and
#     run distcleancheck.
# Any failing step makes the recipe exit 1. On success the tree is removed
# and a banner listing the archives is printed.
667 distcheck: dist
668 case '$(DIST_ARCHIVES)' in \
669 *.tar.gz*) \
670 GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\
671 *.tar.bz2*) \
672 bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\
673 *.tar.lz*) \
674 lzip -dc $(distdir).tar.lz | $(am__untar) ;;\
675 *.tar.xz*) \
676 xz -dc $(distdir).tar.xz | $(am__untar) ;;\
677 *.tar.Z*) \
678 uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
679 *.shar.gz*) \
680 GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\
681 *.zip*) \
682 unzip $(distdir).zip ;;\
683 esac
684 chmod -R a-w $(distdir)
685 chmod u+w $(distdir)
686 mkdir $(distdir)/_build $(distdir)/_inst
687 chmod a-w $(distdir)
688 test -d $(distdir)/_build || exit 0; \
689 dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
690 && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
691 && am__cwd=`pwd` \
692 && $(am__cd) $(distdir)/_build \
693 && ../configure \
694 $(AM_DISTCHECK_CONFIGURE_FLAGS) \
695 $(DISTCHECK_CONFIGURE_FLAGS) \
696 --srcdir=.. --prefix="$$dc_install_base" \
697 && $(MAKE) $(AM_MAKEFLAGS) \
698 && $(MAKE) $(AM_MAKEFLAGS) dvi \
699 && $(MAKE) $(AM_MAKEFLAGS) check \
700 && $(MAKE) $(AM_MAKEFLAGS) install \
701 && $(MAKE) $(AM_MAKEFLAGS) installcheck \
702 && $(MAKE) $(AM_MAKEFLAGS) uninstall \
703 && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
704 distuninstallcheck \
705 && chmod -R a-w "$$dc_install_base" \
706 && ({ \
707 (cd ../.. && umask 077 && mkdir "$$dc_destdir") \
708 && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
709 && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
710 && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
711 distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
712 } || { rm -rf "$$dc_destdir"; exit 1; }) \
713 && rm -rf "$$dc_destdir" \
714 && $(MAKE) $(AM_MAKEFLAGS) dist \
715 && rm -rf $(DIST_ARCHIVES) \
716 && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \
717 && cd "$$am__cwd" \
718 || exit 1
719 $(am__post_remove_distdir)
720 @(echo "$(distdir) archives ready for distribution: "; \
721 list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
722 sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x'
# Verify that 'make uninstall' left nothing behind.
# Fails if $(distuninstallcheck_dir) is empty or cannot be entered. Otherwise
# it counts the files reported by $(am__distuninstallcheck_listfiles) and, if
# any remain, lists them on stderr (with a DESTDIR hint when DESTDIR is set)
# and exits 1.
723 distuninstallcheck:
724 @test -n '$(distuninstallcheck_dir)' || { \
725 echo 'ERROR: trying to run $@ with an empty' \
726 '$$(distuninstallcheck_dir)' >&2; \
727 exit 1; \
728 }; \
729 $(am__cd) '$(distuninstallcheck_dir)' || { \
730 echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \
731 exit 1; \
732 }; \
733 test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \
734 || { echo "ERROR: files left after uninstall:" ; \
735 if test -n "$(DESTDIR)"; then \
736 echo " (check DESTDIR support)"; \
737 fi ; \
738 $(distuninstallcheck_listfiles) ; \
739 exit 1; } >&2
# Verify that 'make distclean' leaves the build directory empty.
# Refuses to run when $(srcdir) is '.', since source files would then count
# as leftovers. Otherwise any file reported by $(distcleancheck_listfiles) is
# listed on stderr and the rule exits 1.
740 distcleancheck: distclean
741 @if test '$(srcdir)' = . ; then \
742 echo "ERROR: distcleancheck can only run from a VPATH build" ; \
743 exit 1 ; \
744 fi
745 @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
746 || { echo "ERROR: files left in build directory after distclean:" ; \
747 $(distcleancheck_listfiles) ; \
748 exit 1; } >&2
# Front-end targets. check, installdirs, install, install-exec, install-data,
# uninstall and installcheck each delegate to their -recursive counterpart.
# all-am is this directory's own contribution to 'all': the Makefile, the
# pkg-config data and config.h.
# installdirs-am creates $(DESTDIR)$(pkgconfigdir).
# install-strip re-runs 'install' with INSTALL_PROGRAM replaced by the
# stripping installer; when $(STRIP) is non-empty it is also passed to the
# installer as STRIPPROG via INSTALL_PROGRAM_ENV.
749 check-am: all-am
750 check: check-recursive
751 all-am: Makefile $(DATA) config.h
752 installdirs: installdirs-recursive
753 installdirs-am:
754 for dir in "$(DESTDIR)$(pkgconfigdir)"; do \
755 test -z "$$dir" || $(MKDIR_P) "$$dir"; \
756 done
757 install: install-recursive
758 install-exec: install-exec-recursive
759 install-data: install-data-recursive
760 uninstall: uninstall-recursive
761
762 install-am: all-am
763 @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
764
765 installcheck: installcheck-recursive
766 install-strip:
767 if test -z '$(STRIP)'; then \
768 $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
769 install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
770 install; \
771 else \
772 $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
773 install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
774 "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
775 fi
# Generic clean hooks. mostlyclean-generic and clean-generic have nothing to
# do here. distclean-generic removes $(CONFIG_CLEAN_FILES), and also
# $(CONFIG_CLEAN_VPATH_FILES) when building outside the source directory.
# maintainer-clean-generic only prints a warning for the maintainer.
776 mostlyclean-generic:
777
778 clean-generic:
779
780 distclean-generic:
781 -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
782 -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
783
784 maintainer-clean-generic:
785 @echo "This command is intended for maintainers to use"
786 @echo "it deletes files that may require special tools to rebuild."
# Standard target skeleton. Each public target (clean, distclean, dvi, html,
# info, install-*, maintainer-clean, mostlyclean, pdf, ps) delegates to its
# -recursive form; the matching *-am target holds this directory's own work.
# Most *-am targets are empty here. The ones that do something:
#   clean-am / distclean-am / maintainer-clean-am / mostlyclean-am chain the
#     generic, libtool, header and tags clean-ups defined above;
#   install-data-am installs the pkg-config file and uninstall-am removes it.
# distclean and maintainer-clean additionally delete the configure by-products
# and the Makefile itself; maintainer-clean also removes autom4te.cache.
# .MAKE lists the targets whose recipes invoke $(MAKE).
787 clean: clean-recursive
788
789 clean-am: clean-generic clean-libtool mostlyclean-am
790
791 distclean: distclean-recursive
792 -rm -f $(am__CONFIG_DISTCLEAN_FILES)
793 -rm -f Makefile
794 distclean-am: clean-am distclean-generic distclean-hdr \
795 distclean-libtool distclean-tags
796
797 dvi: dvi-recursive
798
799 dvi-am:
800
801 html: html-recursive
802
803 html-am:
804
805 info: info-recursive
806
807 info-am:
808
809 install-data-am: install-pkgconfigDATA
810
811 install-dvi: install-dvi-recursive
812
813 install-dvi-am:
814
815 install-exec-am:
816
817 install-html: install-html-recursive
818
819 install-html-am:
820
821 install-info: install-info-recursive
822
823 install-info-am:
824
825 install-man:
826
827 install-pdf: install-pdf-recursive
828
829 install-pdf-am:
830
831 install-ps: install-ps-recursive
832
833 install-ps-am:
834
835 installcheck-am:
836
837 maintainer-clean: maintainer-clean-recursive
838 -rm -f $(am__CONFIG_DISTCLEAN_FILES)
839 -rm -rf $(top_srcdir)/autom4te.cache
840 -rm -f Makefile
841 maintainer-clean-am: distclean-am maintainer-clean-generic
842
843 mostlyclean: mostlyclean-recursive
844
845 mostlyclean-am: mostlyclean-generic mostlyclean-libtool
846
847 pdf: pdf-recursive
848
849 pdf-am:
850
851 ps: ps-recursive
852
853 ps-am:
854
855 uninstall-am: uninstall-pkgconfigDATA
856
857 .MAKE: $(am__recursive_targets) all install-am install-strip
858
859 .PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
860 am--refresh check check-am clean clean-cscope clean-generic \
861 clean-libtool cscope cscopelist-am ctags ctags-am dist \
862 dist-all dist-bzip2 dist-gzip dist-lzip dist-shar dist-tarZ \
863 dist-xz dist-zip distcheck distclean distclean-generic \
864 distclean-hdr distclean-libtool distclean-tags distcleancheck \
865 distdir distuninstallcheck dvi dvi-am html html-am info \
866 info-am install install-am install-data install-data-am \
867 install-dvi install-dvi-am install-exec install-exec-am \
868 install-html install-html-am install-info install-info-am \
869 install-man install-pdf install-pdf-am install-pkgconfigDATA \
870 install-ps install-ps-am install-strip installcheck \
871 installcheck-am installdirs installdirs-am maintainer-clean \
872 maintainer-clean-generic mostlyclean mostlyclean-generic \
873 mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
874 uninstall-am uninstall-pkgconfigDATA
875
876
877 # Tell versions [3.59,3.63) of GNU make to not export all variables.
878 # Otherwise a system limit (for SysV at least) may be exceeded.
879 .NOEXPORT:
0 2014-06-30
1 Version 2.5.4
2 Updated Unicode character names and normalization data to 7.0.0
3 Fixed data normalization bug
4 Updated tests
5 Improved Unicode version updating
6 Fixed compiler warnings
7
8 2011-01-13
9 Updated Unicode character names and normalization data to 6.0.0
10 Updated copyright dates and contact details
11
12 2009-01-30
13 Fixed returning zero-length strings in Perl
14
15 2008-04-07
16 updated Unicode character names and normalization data to 5.1
17 now building Windows release with mingw32-gcc instead of CodeWarrior
18 minor compiler bugfixes and code cleanup for portability
19
20 2006-03-16
21 updated Unicode character names and normalization data to 5.0
22 added license files and docs to subversion repository
23 released new Windows binary package, supporting -x option in teckit_compile
0 README
1
2 This is TECkit, a library for encoding conversion, usable through standalone
3 tools or by linking with other software packages.
4
5
6 See the docs folder for TECkit mapping language and conversion tool usage notes.
7
8
9 The teckit_compile tool now supports a new option, not described in the PDF
10 documentation:
11
12 -x generate XML representation rather than compiled table
13
14 This is primarily intended for use by the Reprise utility, and the XML format
15 produced is subject to change according to the needs of that tool.
0 /*
1 * Copyright 2001-2004 Unicode, Inc.
2 *
3 * Disclaimer
4 *
5 * This source code is provided as is by Unicode, Inc. No claims are
6 * made as to fitness for any particular purpose. No warranties of any
7 * kind are expressed or implied. The recipient agrees to determine
8 * applicability of information provided. If this file has been
9 * purchased on magnetic or optical media from Unicode, Inc., the
10 * sole remedy for any claim will be exchange of defective media
11 * within 90 days of receipt.
12 *
13 * Limitations on Rights to Redistribute This Code
14 *
15 * Unicode, Inc. hereby grants the right to freely use the information
16 * supplied in this file in the creation of products supporting the
17 * Unicode Standard, and to make copies of this file in any form
18 * for internal or external distribution as long as this notice
19 * remains attached.
20 */
21
22 /* ---------------------------------------------------------------------
23
24 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
25 Author: Mark E. Davis, 1994.
26 Rev History: Rick McGowan, fixes & updates May 2001.
27 Sept 2001: fixed const & error conditions per
28 mods suggested by S. Parent & A. Lillich.
29 June 2002: Tim Dodd added detection and handling of incomplete
30 source sequences, enhanced error detection, added casts
31 to eliminate compiler warnings.
32 July 2003: slight mods to back out aggressive FFFE detection.
33 Jan 2004: updated switches in from-UTF8 conversions.
34 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
35
36 See the header file "ConvertUTF.h" for complete documentation.
37
38 ------------------------------------------------------------------------ */
39
40
41 #include "ConvertUTF.h"
42 #ifdef CVTUTF_DEBUG
43 #include <stdio.h>
44 #endif
45
46 static const int halfShift = 10; /* used for shifting by 10 bits */
47
48 static const UTF32 halfBase = 0x0010000UL;
49 static const UTF32 halfMask = 0x3FFUL;
50
51 #define UNI_SUR_HIGH_START (UTF32)0xD800
52 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
53 #define UNI_SUR_LOW_START (UTF32)0xDC00
54 #define UNI_SUR_LOW_END (UTF32)0xDFFF
55 #define false 0
56 #define true 1
57
58 /* --------------------------------------------------------------------- */
59
60 ConversionResult ConvertUTF32toUTF16 (
61 const UTF32** sourceStart, const UTF32* sourceEnd,
62 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
63 ConversionResult result = conversionOK;
64 const UTF32* source = *sourceStart;
65 UTF16* target = *targetStart;
66 while (source < sourceEnd) {
67 UTF32 ch;
68 if (target >= targetEnd) {
69 result = targetExhausted; break;
70 }
71 ch = *source++;
72 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
73 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
74 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
75 if (flags == strictConversion) {
76 --source; /* return to the illegal value itself */
77 result = sourceIllegal;
78 break;
79 } else {
80 *target++ = UNI_REPLACEMENT_CHAR;
81 }
82 } else {
83 *target++ = (UTF16)ch; /* normal case */
84 }
85 } else if (ch > UNI_MAX_LEGAL_UTF32) {
86 if (flags == strictConversion) {
87 result = sourceIllegal;
88 } else {
89 *target++ = UNI_REPLACEMENT_CHAR;
90 }
91 } else {
92 /* target is a character in range 0xFFFF - 0x10FFFF. */
93 if (target + 1 >= targetEnd) {
94 --source; /* Back up source pointer! */
95 result = targetExhausted; break;
96 }
97 ch -= halfBase;
98 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
99 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
100 }
101 }
102 *sourceStart = source;
103 *targetStart = target;
104 return result;
105 }
106
107 /* --------------------------------------------------------------------- */
108
109 ConversionResult ConvertUTF16toUTF32 (
110 const UTF16** sourceStart, const UTF16* sourceEnd,
111 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
112 ConversionResult result = conversionOK;
113 const UTF16* source = *sourceStart;
114 UTF32* target = *targetStart;
115 UTF32 ch, ch2;
116 while (source < sourceEnd) {
117 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
118 ch = *source++;
119 /* If we have a surrogate pair, convert to UTF32 first. */
120 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
121 /* If the 16 bits following the high surrogate are in the source buffer... */
122 if (source < sourceEnd) {
123 ch2 = *source;
124 /* If it's a low surrogate, convert to UTF32. */
125 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
126 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
127 + (ch2 - UNI_SUR_LOW_START) + halfBase;
128 ++source;
129 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
130 --source; /* return to the illegal value itself */
131 result = sourceIllegal;
132 break;
133 }
134 } else { /* We don't have the 16 bits following the high surrogate. */
135 --source; /* return to the high surrogate */
136 result = sourceExhausted;
137 break;
138 }
139 } else if (flags == strictConversion) {
140 /* UTF-16 surrogate values are illegal in UTF-32 */
141 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
142 --source; /* return to the illegal value itself */
143 result = sourceIllegal;
144 break;
145 }
146 }
147 if (target >= targetEnd) {
148 source = oldSource; /* Back up source pointer! */
149 result = targetExhausted; break;
150 }
151 *target++ = ch;
152 }
153 *sourceStart = source;
154 *targetStart = target;
155 #ifdef CVTUTF_DEBUG
156 if (result == sourceIllegal) {
157 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
158 fflush(stderr);
159 }
160 #endif
161 return result;
162 }
163
164 /* --------------------------------------------------------------------- */
165
166 /*
167 * Index into the table below with the first byte of a UTF-8 sequence to
168 * get the number of trailing bytes that are supposed to follow it.
169 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
170 * left as-is for anyone who may want to do such conversion, which was
171 * allowed in earlier algorithms.
172 */
173 static const char trailingBytesForUTF8[256] = {
174 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
175 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
176 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
177 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
178 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
179 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
180 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
181 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
182 };
183
184 /*
185 * Magic values subtracted from a buffer value during UTF8 conversion.
186 * This table contains as many values as there might be trailing bytes
187 * in a UTF-8 sequence.
188 */
189 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
190 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
191
192 /*
193 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
194 * into the first byte, depending on how many bytes follow. There are
195 * as many entries in this table as there are UTF-8 sequence types.
196 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
197 * for *legal* UTF-8 will be 4 or fewer bytes total.
198 */
199 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
200
201 /* --------------------------------------------------------------------- */
202
203 /* The interface converts a whole buffer to avoid function-call overhead.
204 * Constants have been gathered. Loops & conditionals have been removed as
205 * much as possible for efficiency, in favor of drop-through switches.
206 * (See "Note A" at the bottom of the file for equivalent code.)
207 * If your compiler supports it, the "isLegalUTF8" call can be turned
208 * into an inline function.
209 */
210
211 /* --------------------------------------------------------------------- */
212
213 ConversionResult ConvertUTF16toUTF8 (
214 const UTF16** sourceStart, const UTF16* sourceEnd,
215 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
216 ConversionResult result = conversionOK;
217 const UTF16* source = *sourceStart;
218 UTF8* target = *targetStart;
219 while (source < sourceEnd) {
220 UTF32 ch;
221 unsigned short bytesToWrite = 0;
222 const UTF32 byteMask = 0xBF;
223 const UTF32 byteMark = 0x80;
224 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
225 ch = *source++;
226 /* If we have a surrogate pair, convert to UTF32 first. */
227 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
228 /* If the 16 bits following the high surrogate are in the source buffer... */
229 if (source < sourceEnd) {
230 UTF32 ch2 = *source;
231 /* If it's a low surrogate, convert to UTF32. */
232 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
233 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
234 + (ch2 - UNI_SUR_LOW_START) + halfBase;
235 ++source;
236 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
237 --source; /* return to the illegal value itself */
238 result = sourceIllegal;
239 break;
240 }
241 } else { /* We don't have the 16 bits following the high surrogate. */
242 --source; /* return to the high surrogate */
243 result = sourceExhausted;
244 break;
245 }
246 } else if (flags == strictConversion) {
247 /* UTF-16 surrogate values are illegal in UTF-32 */
248 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
249 --source; /* return to the illegal value itself */
250 result = sourceIllegal;
251 break;
252 }
253 }
254 /* Figure out how many bytes the result will require */
255 if (ch < (UTF32)0x80) { bytesToWrite = 1;
256 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
257 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
258 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
259 } else { bytesToWrite = 3;
260 ch = UNI_REPLACEMENT_CHAR;
261 }
262
263 target += bytesToWrite;
264 if (target > targetEnd) {
265 source = oldSource; /* Back up source pointer! */
266 target -= bytesToWrite; result = targetExhausted; break;
267 }
268 switch (bytesToWrite) { /* note: everything falls through. */
269 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
270 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
271 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
272 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
273 }
274 target += bytesToWrite;
275 }
276 *sourceStart = source;
277 *targetStart = target;
278 return result;
279 }
280
281 /* --------------------------------------------------------------------- */
282
283 /*
284 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
285 * This must be called with the length pre-determined by the first byte.
286 * If not calling this from ConvertUTF8to*, then the length can be set by:
287 * length = trailingBytesForUTF8[*source]+1;
288 * and the sequence is illegal right away if there aren't that many bytes
289 * available.
290 * If presented with a length > 4, this returns false. The Unicode
291 * definition of UTF-8 goes up to 4-byte sequences.
292 */
293
294 static Boolean isLegalUTF8(const UTF8 *source, int length) {
295 UTF8 a;
296 const UTF8 *srcptr = source+length;
297 switch (length) {
298 default: return false;
299 /* Everything else falls through when "true"... */
300 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
301 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
302 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
303
304 switch (*source) {
305 /* no fall-through in this inner switch */
306 case 0xE0: if (a < 0xA0) return false; break;
307 case 0xED: if (a > 0x9F) return false; break;
308 case 0xF0: if (a < 0x90) return false; break;
309 case 0xF4: if (a > 0x8F) return false; break;
310 default: if (a < 0x80) return false;
311 }
312
313 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
314 }
315 if (*source > 0xF4) return false;
316 return true;
317 }
318
319 /* --------------------------------------------------------------------- */
320
321 /*
322 * Exported function to return whether a UTF-8 sequence is legal or not.
323 * This is not used here; it's just exported.
324 */
325 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
326 int length = trailingBytesForUTF8[*source]+1;
327 if (source+length > sourceEnd) {
328 return false;
329 }
330 return isLegalUTF8(source, length);
331 }
332
333 /* --------------------------------------------------------------------- */
334
335 ConversionResult ConvertUTF8toUTF16 (
336 const UTF8** sourceStart, const UTF8* sourceEnd,
337 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
338 ConversionResult result = conversionOK;
339 const UTF8* source = *sourceStart;
340 UTF16* target = *targetStart;
341 while (source < sourceEnd) {
342 UTF32 ch = 0;
343 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
344 if (source + extraBytesToRead >= sourceEnd) {
345 result = sourceExhausted; break;
346 }
347 /* Do this check whether lenient or strict */
348 if (! isLegalUTF8(source, extraBytesToRead+1)) {
349 result = sourceIllegal;
350 break;
351 }
352 /*
353 * The cases all fall through. See "Note A" below.
354 */
355 switch (extraBytesToRead) {
356 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
357 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
358 case 3: ch += *source++; ch <<= 6;
359 case 2: ch += *source++; ch <<= 6;
360 case 1: ch += *source++; ch <<= 6;
361 case 0: ch += *source++;
362 }
363 ch -= offsetsFromUTF8[extraBytesToRead];
364
365 if (target >= targetEnd) {
366 source -= (extraBytesToRead+1); /* Back up source pointer! */
367 result = targetExhausted; break;
368 }
369 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
370 /* UTF-16 surrogate values are illegal in UTF-32 */
371 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
372 if (flags == strictConversion) {
373 source -= (extraBytesToRead+1); /* return to the illegal value itself */
374 result = sourceIllegal;
375 break;
376 } else {
377 *target++ = UNI_REPLACEMENT_CHAR;
378 }
379 } else {
380 *target++ = (UTF16)ch; /* normal case */
381 }
382 } else if (ch > UNI_MAX_UTF16) {
383 if (flags == strictConversion) {
384 result = sourceIllegal;
385 source -= (extraBytesToRead+1); /* return to the start */
386 break; /* Bail out; shouldn't continue */
387 } else {
388 *target++ = UNI_REPLACEMENT_CHAR;
389 }
390 } else {
391 /* target is a character in range 0xFFFF - 0x10FFFF. */
392 if (target + 1 >= targetEnd) {
393 source -= (extraBytesToRead+1); /* Back up source pointer! */
394 result = targetExhausted; break;
395 }
396 ch -= halfBase;
397 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
398 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
399 }
400 }
401 *sourceStart = source;
402 *targetStart = target;
403 return result;
404 }
405
406 /* --------------------------------------------------------------------- */
407
408 ConversionResult ConvertUTF32toUTF8 (
409 const UTF32** sourceStart, const UTF32* sourceEnd,
410 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
411 ConversionResult result = conversionOK;
412 const UTF32* source = *sourceStart;
413 UTF8* target = *targetStart;
414 while (source < sourceEnd) {
415 UTF32 ch;
416 unsigned short bytesToWrite = 0;
417 const UTF32 byteMask = 0xBF;
418 const UTF32 byteMark = 0x80;
419 ch = *source++;
420 if (flags == strictConversion ) {
421 /* UTF-16 surrogate values are illegal in UTF-32 */
422 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
423 --source; /* return to the illegal value itself */
424 result = sourceIllegal;
425 break;
426 }
427 }
428 /*
429 * Figure out how many bytes the result will require. Turn any
430 * illegally large UTF32 things (> Plane 17) into replacement chars.
431 */
432 if (ch < (UTF32)0x80) { bytesToWrite = 1;
433 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
434 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
435 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
436 } else { bytesToWrite = 3;
437 ch = UNI_REPLACEMENT_CHAR;
438 result = sourceIllegal;
439 }
440
441 target += bytesToWrite;
442 if (target > targetEnd) {
443 --source; /* Back up source pointer! */
444 target -= bytesToWrite; result = targetExhausted; break;
445 }
446 switch (bytesToWrite) { /* note: everything falls through. */
447 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
448 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
449 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
450 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
451 }
452 target += bytesToWrite;
453 }
454 *sourceStart = source;
455 *targetStart = target;
456 return result;
457 }
458
459 /* --------------------------------------------------------------------- */
460
461 ConversionResult ConvertUTF8toUTF32 (
462 const UTF8** sourceStart, const UTF8* sourceEnd,
463 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
464 ConversionResult result = conversionOK;
465 const UTF8* source = *sourceStart;
466 UTF32* target = *targetStart;
467 while (source < sourceEnd) {
468 UTF32 ch = 0;
469 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
470 if (source + extraBytesToRead >= sourceEnd) {
471 result = sourceExhausted; break;
472 }
473 /* Do this check whether lenient or strict */
474 if (! isLegalUTF8(source, extraBytesToRead+1)) {
475 result = sourceIllegal;
476 break;
477 }
478 /*
479 * The cases all fall through. See "Note A" below.
480 */
481 switch (extraBytesToRead) {
482 case 5: ch += *source++; ch <<= 6;
483 case 4: ch += *source++; ch <<= 6;
484 case 3: ch += *source++; ch <<= 6;
485 case 2: ch += *source++; ch <<= 6;
486 case 1: ch += *source++; ch <<= 6;
487 case 0: ch += *source++;
488 }
489 ch -= offsetsFromUTF8[extraBytesToRead];
490
491 if (target >= targetEnd) {
492 source -= (extraBytesToRead+1); /* Back up the source pointer! */
493 result = targetExhausted; break;
494 }
495 if (ch <= UNI_MAX_LEGAL_UTF32) {
496 /*
497 * UTF-16 surrogate values are illegal in UTF-32, and anything
498 * over Plane 17 (> 0x10FFFF) is illegal.
499 */
500 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
501 if (flags == strictConversion) {
502 source -= (extraBytesToRead+1); /* return to the illegal value itself */
503 result = sourceIllegal;
504 break;
505 } else {
506 *target++ = UNI_REPLACEMENT_CHAR;
507 }
508 } else {
509 *target++ = ch;
510 }
511 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
512 result = sourceIllegal;
513 *target++ = UNI_REPLACEMENT_CHAR;
514 }
515 }
516 *sourceStart = source;
517 *targetStart = target;
518 return result;
519 }
520
521 /* ---------------------------------------------------------------------
522
523 Note A.
524 The fall-through switches in UTF-8 reading code save a
525 temp variable, some decrements & conditionals. The switches
526 are equivalent to the following loop:
527 {
528 int tmpBytesToRead = extraBytesToRead+1;
529 do {
530 ch += *source++;
531 --tmpBytesToRead;
532 if (tmpBytesToRead) ch <<= 6;
533 } while (tmpBytesToRead > 0);
534 }
535 In UTF-8 writing code, the switches on "bytesToWrite" are
536 similarly unrolled loops.
537
538 --------------------------------------------------------------------- */
0 /*
1 * Copyright 2001-2004 Unicode, Inc.
2 *
3 * Disclaimer
4 *
5 * This source code is provided as is by Unicode, Inc. No claims are
6 * made as to fitness for any particular purpose. No warranties of any
7 * kind are expressed or implied. The recipient agrees to determine
8 * applicability of information provided. If this file has been
9 * purchased on magnetic or optical media from Unicode, Inc., the
10 * sole remedy for any claim will be exchange of defective media
11 * within 90 days of receipt.
12 *
13 * Limitations on Rights to Redistribute This Code
14 *
15 * Unicode, Inc. hereby grants the right to freely use the information
16 * supplied in this file in the creation of products supporting the
17 * Unicode Standard, and to make copies of this file in any form
18 * for internal or external distribution as long as this notice
19 * remains attached.
20 */
21
22 /* ---------------------------------------------------------------------
23
24 Conversions between UTF32, UTF-16, and UTF-8. Header file.
25
26 Several functions are included here, forming a complete set of
27 conversions between the three formats. UTF-7 is not included
28 here, but is handled in a separate source file.
29
30 Each of these routines takes pointers to input buffers and output
31 buffers. The input buffers are const.
32
33 Each routine converts the text between *sourceStart and sourceEnd,
34 putting the result into the buffer between *targetStart and
35 targetEnd. Note: the end pointers are *after* the last item: e.g.
36 *(sourceEnd - 1) is the last item.
37
38 The return result indicates whether the conversion was successful,
39 and if not, whether the problem was in the source or target buffers.
40 (Only the first encountered problem is indicated.)
41
42 After the conversion, *sourceStart and *targetStart are both
43 updated to point to the end of last text successfully converted in
44 the respective buffers.
45
46 Input parameters:
47 sourceStart - pointer to a pointer to the source buffer.
48 The contents of this are modified on return so that
49 it points at the next thing to be converted.
50 targetStart - similarly, pointer to pointer to the target buffer.
51 sourceEnd, targetEnd - respectively pointers to the ends of the
52 two buffers, for overflow checking only.
53
54 These conversion functions take a ConversionFlags argument. When this
55 flag is set to strict, both irregular sequences and isolated surrogates
56 will cause an error. When the flag is set to lenient, both irregular
57 sequences and isolated surrogates are converted.
58
59 Whether the flag is strict or lenient, all illegal sequences will cause
60 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
61 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
62 must check for illegal sequences.
63
64 When the flag is set to lenient, characters over 0x10FFFF are converted
65 to the replacement character; otherwise (when the flag is set to strict)
66 they constitute an error.
67
68 Output parameters:
69 The value "sourceIllegal" is returned from some routines if the input
70 sequence is malformed. When "sourceIllegal" is returned, the source
71 value will point to the illegal value that caused the problem. E.g.,
72 in UTF-8 when a sequence is malformed, it points to the start of the
73 malformed sequence.
74
75 Author: Mark E. Davis, 1994.
76 Rev History: Rick McGowan, fixes & updates May 2001.
77 Fixes & updates, Sept 2001.
78
79 ------------------------------------------------------------------------ */
80
81 /* ---------------------------------------------------------------------
82 The following 4 definitions are compiler-specific.
83 The C standard does not guarantee that wchar_t has at least
84 16 bits, so wchar_t is no less portable than unsigned short!
85 All should be unsigned values to avoid sign extension during
86 bit mask & shift operations.
87 ------------------------------------------------------------------------ */
88
89 typedef unsigned long UTF32; /* at least 32 bits */
90 typedef unsigned short UTF16; /* at least 16 bits */
91 typedef unsigned char UTF8; /* typically 8 bits */
92 typedef unsigned char Boolean; /* 0 or 1 */
93
94 /* Some fundamental constants */
95 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
96 #define UNI_MAX_BMP (UTF32)0x0000FFFF
97 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
98 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
99 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
100
101 typedef enum {
102 conversionOK, /* conversion successful */
103 sourceExhausted, /* partial character in source, but hit end */
104 targetExhausted, /* insuff. room in target for conversion */
105 sourceIllegal /* source sequence is illegal/malformed */
106 } ConversionResult;
107
108 typedef enum {
109 strictConversion = 0,
110 lenientConversion
111 } ConversionFlags;
112
113 /* This is for C++ and does no harm in C */
114 #ifdef __cplusplus
115 extern "C" {
116 #endif
117
118 ConversionResult ConvertUTF8toUTF16 (
119 const UTF8** sourceStart, const UTF8* sourceEnd,
120 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
121
122 ConversionResult ConvertUTF16toUTF8 (
123 const UTF16** sourceStart, const UTF16* sourceEnd,
124 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
125
126 ConversionResult ConvertUTF8toUTF32 (
127 const UTF8** sourceStart, const UTF8* sourceEnd,
128 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
129
130 ConversionResult ConvertUTF32toUTF8 (
131 const UTF32** sourceStart, const UTF32* sourceEnd,
132 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
133
134 ConversionResult ConvertUTF16toUTF32 (
135 const UTF16** sourceStart, const UTF16* sourceEnd,
136 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
137
138 ConversionResult ConvertUTF32toUTF16 (
139 const UTF32** sourceStart, const UTF32* sourceEnd,
140 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
141
142 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
143
144 #ifdef __cplusplus
145 }
146 #endif
147
148 /* --------------------------------------------------------------------- */
0 #define DEBUG 1
1 #define _MSL_NO_LOCALE 1
2 #define XML_DTD 1
0 #define _MSL_NO_LOCALE 1
1 #define XML_DTD 1
0 #include <cstring>
1 #include <cstdio>
2 #include <cstdlib>
3
4 #include <string>
5 #include <fstream>
6 #include <map>
7 #include <vector>
8
9 using namespace std;
10
11 #include "TECkit_Engine.h"
12 #include "ConvertUTF.h"
13 #include "sfReader.h"
14
15 #ifndef platformUTF16
16 #ifdef __MWERKS__
17 #if __dest_os==__win32_os
18 #define platformUTF16 kForm_UTF16LE
19 #else
20 #define platformUTF16 kForm_UTF16BE
21 #endif
22 #endif
23 #endif
24
25 #ifndef platformUTF16
26 #ifdef __APPLE__
27 #include <TargetConditionals.h>
28 #if TARGET_RT_BIG_ENDIAN
29 #define platformUTF16 kForm_UTF16BE
30 #else
31 #define platformUTF16 kForm_UTF16LE
32 #endif
33 #endif
34 #endif
35
36 #ifndef platformUTF16
37 #include "config.h"
38 #if WORDS_BIGENDIAN
39 #define platformUTF16 kForm_UTF16BE
40 #else
41 #define platformUTF16 kForm_UTF16LE
42 #endif
43 #endif
44
45 #if HAVE_LIBEXPAT
46 #include <expat.h>
47 #else
48 #include "expat/xmlparse/xmlparse.h"
49 #endif
50
51
52 char* gMappingDirectory;
53
54 typedef basic_string<UniChar> ustring;
55
56 map<string,string> sfmMappings;
57 map<string,string> inlineMappings;
58
59 map<string,string>* mappings;
60
61 string defaultMapping;
62 string sfmMapping;
63 string inlineMapping;
64
65 ustring sfmCharsU;
66 ustring inlineCharsU;
67 long escapeCharU = 0x5c;
68 long inlineEscapeCharU = -1;
69 long startInlineU = -1;
70 long endInlineU = -1;
71
72 const char* defaultMarkerChars = "abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
73
74 map<string,TECkit_Converter> converters;
75
76 enum dir_enum {
77 kDirection_Unspecified = 0,
78 kDirection_8_U,
79 kDirection_U_8
80 } direction = kDirection_Unspecified;
81
82 int inForm = kForm_Unspecified;
83 int outForm = kForm_Unspecified;
84
85 static int sDepth;
86 static int sError;
87
// Return true when the two NUL-terminated strings have identical contents.
static inline bool
streq(const char* s, const char* t)
{
	const int	difference = strcmp(s, t);
	return difference == 0;
}
93
94 static ustring
95 Utf8ToString(const char* s)
96 {
97 int len = strlen(s);
98 UniChar* buf = new UniChar[len];
99 const Byte* sourceStart = (Byte*)s;
100 UniChar* targetStart = buf;
101 int status = ConvertUTF8toUTF16(&sourceStart, sourceStart + len, &targetStart, targetStart + len, lenientConversion);
102 if (status != conversionOK) {
103 fprintf(stderr, "error %d converting UTF-8 to UTF-16\n", status);
104 exit(1);
105 }
106 ustring ustr(buf, targetStart - buf);
107 delete[] buf;
108 return ustr;
109 }
110
111 static void
112 startElement(void* /*userData*/, const XML_Char *name, const XML_Char **atts)
113 {
114 switch (sDepth) {
115 case 0:
116 if (streq(name, "sfConversion")) {
117 while (*atts) {
118 const XML_Char* n = *atts++;
119 const XML_Char* v = *atts++;
120 if (streq(n, "defaultMapping")) {
121 defaultMapping = v;
122 }
123 else {
124 fprintf(stderr, "unrecognized attribute \"%s\" for <sfConversion>\n", n);
125 sError = 1;
126 break;
127 }
128 }
129 if (defaultMapping.length() == 0) {
130 fprintf(stderr, "<sfConversion> requires \"defaultMapping\" attribute\n");
131 sError = 1;
132 break;
133 }
134 }
135 else {
136 fprintf(stderr, "expected <sfConversion>, not <%s>, as top-level element in control file\n", name);
137 sError = 1;
138 break;
139 }
140 break;
141
142 case 1:
143 if (streq(name, "sfMarkers")) {
144 while (*atts) {
145 const XML_Char* n = *atts++;
146 const XML_Char* v = *atts++;
147 if (streq(n, "escape")) {
148 ustring u = Utf8ToString(v);
149 if (u.length() != 1) {
150 fprintf(stderr, "\"escape\" attribute of <sfMarkers> must be a single character\n");
151 sError = 1;
152 break;
153 }
154 escapeCharU = u[0];
155 }
156 else if (streq(n, "chars")) {
157 sfmCharsU = Utf8ToString(v);
158 }
159 else if (streq(n, "mapping")) {
160 sfmMapping = v;
161 }
162 else {
163 fprintf(stderr, "unrecognized attribute \"%s\" for <sfMarkers>\n", n);
164 sError = 1;
165 break;
166 }
167 }
168 if (sfmMapping.length() == 0)
169 sfmMapping = defaultMapping;
170 if (sfmCharsU.length() == 0) {
171 sfmCharsU.reserve(96);
172 for (const char* cp = defaultMarkerChars; *cp; ++cp)
173 sfmCharsU.append(1, *cp);
174 }
175 mappings = &sfmMappings;
176 }
177 else if (streq(name, "inlineMarkers")) {
178 while (*atts) {
179 const XML_Char* n = *atts++;
180 const XML_Char* v = *atts++;
181 if (streq(n, "escape")) {
182 ustring u = Utf8ToString(v);
183 if (u.length() != 1) {
184 fprintf(stderr, "\"escape\" attribute of <inlineMarkers> must be a single character\n");
185 sError = 1;
186 break;
187 }
188 inlineEscapeCharU = u[0];
189 }
190 else if (streq(n, "start")) {
191 ustring u = Utf8ToString(v);
192 if (u.length() != 1) {
193 fprintf(stderr, "\"start\" attribute of <inlineMarkers> must be a single character\n");
194 sError = 1;
195 break;
196 }
197 startInlineU = u[0];
198 }
199 else if (streq(n, "end")) {
200 ustring u = Utf8ToString(v);
201 if (u.length() != 1) {
202 fprintf(stderr, "\"end\" attribute of <inlineMarkers> must be a single character\n");
203 sError = 1;
204 break;
205 }
206 endInlineU = u[0];
207 }
208 else if (streq(n, "chars")) {
209 inlineCharsU = Utf8ToString(v);
210 }
211 else if (streq(n, "mapping")) {
212 inlineMapping = v;
213 }
214 else {
215 fprintf(stderr, "unrecognized attribute \"%s\" for <inlineMarkers>\n", n);
216 sError = 1;
217 break;
218 }
219 }
220 if (inlineMapping.length() == 0)
221 inlineMapping = defaultMapping;
222 if (inlineCharsU.length() == 0) {
223 inlineCharsU.reserve(96);
224 for (const char* cp = defaultMarkerChars; *cp; ++cp)
225 inlineCharsU.append(1, *cp);
226 }
227 mappings = &inlineMappings;
228 }
229 else {
230 fprintf(stderr, "unrecognized element <%s> in <sfConversion>\n", name);
231 sError = 1;
232 break;
233 }
234 break;
235
236 case 2:
237 if (streq(name, "marker")) {
238 string marker;
239 string mapping;
240 while (*atts) {
241 const XML_Char* n = *atts++;
242 const XML_Char* v = *atts++;
243 if (streq(n, "name")) {
244 marker = v;
245 }
246 else if (streq(n, "mapping")) {
247 mapping = v;
248 }
249 else {
250 fprintf(stderr, "unrecognized attribute \"%s\" for <marker>\n", n);
251 sError = 1;
252 break;
253 }
254 }
255 if (marker.length() == 0 || mapping.length() == 0) {
256 fprintf(stderr, "<marker> requires \"name\" and \"mapping\" attributes\n");
257 sError = 1;
258 break;
259 }
260 if ((*mappings).find(marker) != (*mappings).end()) {
261 fprintf(stderr, "duplicate entry for marker \"%s\"\n", marker.c_str());
262 sError = 1;
263 break;
264 }
265 (*mappings)[marker] = mapping;
266 }
267 else {
268 fprintf(stderr, "unrecognized element <%s>\n", name);
269 sError = 1;
270 break;
271 }
272 break;
273
274 default:
275 fprintf(stderr, "control file elements nested improperly at <%s>\n", name);
276 sError = 1;
277 break;
278 }
279
280 ++sDepth;
281 }
282
283 static void
284 endElement(void* /*userData*/, const XML_Char */*name*/)
285 {
286 --sDepth;
287 }
288
// Set by the -bom command-line option; when true, process() writes a byte-order
// mark at the start of the output in 8-bit-to-Unicode mode.
289 bool bom = false;
290
291 static TECkit_Converter
292 makeConverter(const string& mappingName, int direction)
293 {
294 string mapFileName;
295 if (gMappingDirectory)
296 mapFileName += gMappingDirectory;
297 mapFileName += mappingName;
298 mapFileName += ".tec";
299
300 FILE* mapFile = fopen(mapFileName.c_str(), "rb");
301 if (mapFile == 0) {
302 fprintf(stderr, "unable to read mapping file for %s (file %s)\n", mappingName.c_str(), mapFileName.c_str());
303 exit(1);
304 }
305
306 fseek(mapFile, 0, SEEK_END);
307 long fileSize = ftell(mapFile);
308 fseek(mapFile, 0, SEEK_SET);
309
310 unsigned char* buf = (unsigned char*)malloc(fileSize);
311 if (buf == 0) {
312 fprintf(stderr, "unable to read mapping file for %s (file %s)\n", mappingName.c_str(), mapFileName.c_str());
313 exit(1);
314 }
315 fread(buf, 1, fileSize, mapFile);
316 fclose(mapFile);
317
318 TECkit_Converter converter;
319 TECkit_Status status = TECkit_CreateConverter(buf, fileSize,
320 (direction == kDirection_8_U),
321 (direction == kDirection_8_U) ? kForm_Bytes : platformUTF16,
322 (direction == kDirection_8_U) ? outForm : kForm_Bytes,
323 &converter);
324 if (status != kStatus_NoError) {
325 fprintf(stderr, "failed to create converter for %s (file %s)\n", mappingName.c_str(), mapFileName.c_str());
326 exit(1);
327 }
328
329 free(buf);
330
331 return converter;
332 }
333
334 static bool
335 read_control_file(const char* controlFile)
336 {
337 FILE* ctlFile = fopen(controlFile, "r");
338 if (ctlFile == 0) {
339 fprintf(stderr, "unable to open control file %s\n", controlFile);
340 return false;
341 }
342
343 char buf[BUFSIZ];
344 XML_Parser parser = XML_ParserCreate(0);
345 int done;
346
347 XML_SetElementHandler(parser, startElement, endElement);
348
349 int status = 0;
350 do {
351 size_t len = fread(buf, 1, sizeof(buf), ctlFile);
352 done = len < sizeof(buf);
353 if (!XML_Parse(parser, buf, len, done)) {
354 fprintf(stderr, "XML parse error: %s at line %lu\n", XML_ErrorString(XML_GetErrorCode(parser)), (unsigned long)XML_GetCurrentLineNumber(parser));
355 status = 1;
356 }
357 } while (!status && !done);
358
359 fclose(ctlFile);
360
361 XML_ParserFree(parser);
362
363 if (status != 0)
364 return false;
365
366 converters[defaultMapping] = makeConverter(defaultMapping, direction);
367
368 if (converters.find(sfmMapping) == converters.end())
369 converters[sfmMapping] = makeConverter(sfmMapping, direction);
370 if (inlineMapping.length() > 0 && converters.find(inlineMapping) == converters.end())
371 converters[sfmMapping] = makeConverter(inlineMapping, direction);
372
373 for (map<string,string>::const_iterator i = sfmMappings.begin(); i != sfmMappings.end(); ++i) {
374 if (converters.find(i->second) == converters.end())
375 converters[i->second] = makeConverter(i->second, direction);
376 }
377 for (map<string,string>::const_iterator i = inlineMappings.begin(); i != inlineMappings.end(); ++i) {
378 if (converters.find(i->second) == converters.end())
379 converters[i->second] = makeConverter(i->second, direction);
380 }
381 return true;
382 }
383
384 static void
385 write_converted(const Byte* data, long nBytes, TECkit_Converter converter, FILE* outFile)
386 {
387 static Byte* convBuffer = 0;
388 static UInt32 bufferSize = 0;
389
390 UInt32 reqSpace = nBytes * 4 + 256; // probably plenty of space
391 UInt32 sourceUsed, destUsed;
392 int status;
393
394 // do the conversion
395
396 while (1) {
397 if (bufferSize < reqSpace) {
398 if (convBuffer != 0)
399 delete[] convBuffer;
400 bufferSize = reqSpace;
401 convBuffer = new Byte[bufferSize];
402 }
403 status = TECkit_ConvertBuffer(
404 converter,
405 const_cast<Byte*>(data),
406 nBytes,
407 &sourceUsed,
408 convBuffer,
409 bufferSize,
410 &destUsed,
411 true);
412 if (status == kStatus_OutputBufferFull) {
413 reqSpace *= 2; // output didn't fit, enlarge buffer and try again
414 continue;
415 }
416 if (status != kStatus_NoError) {
417 fprintf(stderr, "error %d in TECkit_Convert\n", status);
418 exit(1);
419 }
420 UInt32 destUsed2;
421 status = TECkit_Flush(
422 converter,
423 convBuffer + destUsed,
424 bufferSize - destUsed,
425 &destUsed2);
426 if (status == kStatus_OutputBufferFull) {
427 reqSpace *= 2;
428 continue;
429 }
430 TECkit_ResetConverter(converter);
431 if (status != kStatus_NoError) {
432 fprintf(stderr, "error %d in TECkit_Flush\n", status);
433 exit(1);
434 }
435 nBytes = destUsed + destUsed2;
436 break;
437 }
438
439 fwrite(convBuffer, 1, nBytes, outFile);
440 }
441
442 static void
443 convertMarker(const ustring& marker, TECkit_Converter converter, string& cnvMarker)
444 {
445 int status;
446 if (cnvMarker.size() < marker.length() * 4)
447 cnvMarker.resize(marker.length() * 4 + 32);
448 while (1) {
449 UInt32 sourceUsed, destUsed, destUsed2;
450 status = TECkit_ConvertBuffer(converter,
451 (Byte*)marker.data(),
452 marker.size() * 2,
453 &sourceUsed,
454 (Byte*)cnvMarker.data(),
455 cnvMarker.size(),
456 &destUsed,
457 true);
458 if (status == kStatus_OutputBufferFull) {
459 cnvMarker.resize(cnvMarker.size() * 2);
460 continue;
461 }
462 status = TECkit_Flush(converter,
463 (Byte*)cnvMarker.data() + destUsed,
464 cnvMarker.size() - destUsed,
465 &destUsed2);
466 if (status == kStatus_OutputBufferFull) {
467 cnvMarker.resize(cnvMarker.size() * 2);
468 continue;
469 }
470 cnvMarker.resize(destUsed + destUsed2);
471 TECkit_ResetConverter(converter);
472 if (status != kStatus_NoError) {
473 fprintf(stderr, "error %d converting SFM from Unicode\n", status);
474 exit(1);
475 }
476 break;
477 }
478 }
479
480 static long
481 convertSingleChar(UniChar inChar, TECkit_Converter converter)
482 {
483 int status;
484 Byte buf[32];
485 UInt32 sourceUsed, destUsed, destUsed2;
486
487 status = TECkit_ConvertBuffer(converter,
488 (Byte*)&inChar,
489 2,
490 &sourceUsed,
491 &buf[0],
492 sizeof(buf),
493 &destUsed,
494 true);
495 if (status == kStatus_OutputBufferFull || destUsed > 1) {
496 fprintf(stderr, "marker characters must map to single byte values");
497 exit(1);
498 }
499 status = TECkit_Flush(converter,
500 &buf[0] + destUsed,
501 sizeof(buf) - destUsed,
502 &destUsed2);
503 if (status == kStatus_OutputBufferFull || destUsed + destUsed2 != 1) {
504 fprintf(stderr, "marker characters must map to single byte values");
505 exit(1);
506 }
507 TECkit_ResetConverter(converter);
508
509 if (status != kStatus_NoError) {
510 fprintf(stderr, "error %d converting marker characters from Unicode\n", status);
511 exit(1);
512 }
513
514 return buf[0];
515 }
516
517 static void
518 process(const char* inputFile, const char* outputFile)
519 {
520 TECkit_Converter defaultConverter = converters[defaultMapping];
521 TECkit_Converter sfmConverter = converters[sfmMapping];
522 TECkit_Converter inlineConverter = converters[inlineMapping];
523
524 FILE* outFile = fopen(outputFile, "wb");
525 if (!outFile) {
526 fprintf(stderr, "unable to open output file %s\n", outputFile);
527 exit(1);
528 }
529
530 FILE* inFile = fopen(inputFile, "rb");
531 if (!inFile) {
532 fprintf(stderr, "unable to open input file %s\n", inputFile);
533 exit(1);
534 }
535
536 if (direction == kDirection_8_U) {
537 // *** Byte to Unicode conversion
538 if (outForm == kForm_Unspecified)
539 outForm = kForm_UTF8;
540
541 if (bom) {
542 if (outForm == kForm_UTF8) {
543 Byte bom[] = "\xEF\xBB\xBF";
544 fwrite(bom, 3, 1, outFile);
545 }
546 else if (outForm == kForm_UTF16BE) {
547 Byte bom[] = "\xFE\xFF";
548 fwrite(&bom, 2, 1, outFile);
549 }
550 else {
551 Byte bom[] = "\xFF\xFE";
552 fwrite(&bom, 2, 1, outFile);
553 }
554 }
555
556 sfReader<char> reader(inFile);
557
558 TECkit_Converter markerMapping = makeConverter(sfmMapping, kDirection_U_8);
559
560 reader.escapeChar = convertSingleChar(escapeCharU, markerMapping);
561 for (size_t i = 0; i < sfmCharsU.length(); ++i)
562 reader.sfmChars.append(1, convertSingleChar(sfmCharsU[i], markerMapping));
563
564 if (inlineEscapeCharU != -1) {
565 if (inlineMapping != sfmMapping) {
566 TECkit_DisposeConverter(markerMapping);
567 markerMapping = makeConverter(inlineMapping, kDirection_U_8);
568 }
569 reader.inlineEscapeChar = convertSingleChar(inlineEscapeCharU, markerMapping);
570 reader.startInline = convertSingleChar(startInlineU, markerMapping);
571 reader.endInline = convertSingleChar(endInlineU, markerMapping);
572 for (size_t i = 0; i < inlineCharsU.length(); ++i)
573 reader.inlineChars.append(1, convertSingleChar(inlineCharsU[i], markerMapping));
574 }
575
576 TECkit_DisposeConverter(markerMapping);
577
578 vector<TECkit_Converter> converterStack;
579 converterStack.assign(1, defaultConverter);
580 int dataType;
581 while ((dataType = reader.next(converterStack.size() > 1)) != END_OF_FILE) {
582 map<string,string>::const_iterator i;
583 switch (dataType) {
584 case BODY_TEXT:
585 write_converted((Byte*)reader.text.data(), reader.text.length(), converterStack.back(), outFile);
586 break;
587
588 case SFM:
589 i = sfmMappings.find(reader.text);
590 if (i == sfmMappings.end())
591 converterStack.assign(1, defaultConverter);
592 else
593 converterStack.assign(1, converters[i->second]);
594 reader.text.insert(reader.text.begin(), reader.escapeChar);
595 write_converted((Byte*)reader.text.data(), reader.text.length(), sfmConverter, outFile);
596 break;
597
598 case INLINE_MARKER:
599 i = inlineMappings.find(reader.text);
600 if (i == inlineMappings.end())
601 converterStack.assign(1, converterStack.back());
602 else
603 converterStack.assign(1, converters[i->second]);
604 reader.text.insert(reader.text.begin(), reader.escapeChar);
605 write_converted((Byte*)reader.text.data(), reader.text.length(), inlineConverter, outFile);
606 break;
607
608 case INLINE_START:
609 i = inlineMappings.find(reader.text);
610 if (i == inlineMappings.end())
611 converterStack.push_back(converterStack.back());
612 else
613 converterStack.push_back(converters[i->second]);
614 reader.text.insert(reader.text.begin(), reader.inlineEscapeChar);
615 reader.text.insert(reader.text.end(), reader.startInline);
616 write_converted((Byte*)reader.text.data(), reader.text.length(), inlineConverter, outFile);
617 break;
618
619 case INLINE_END:
620 reader.text.insert(reader.text.end(), reader.endInline);
621 write_converted((Byte*)reader.text.data(), reader.text.length(), inlineConverter, outFile);
622 converterStack.pop_back();
623 break;
624 }
625 }
626 }
627 else {
628 // *** Unicode to Byte conversion
629 Byte bom[3];
630 long pos = ftell(inFile);
631 if (fread(bom, 3, 1, inFile)) {
632 if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf) {
633 inForm = kForm_UTF8;
634 }
635 else if (bom[0] == 0xfe && bom[1] == 0xff) {
636 inForm = kForm_UTF16BE;
637 fseek(inFile, pos + 2, SEEK_SET);
638 }
639 else if (bom[0] == 0xff && bom[1] == 0xfe) {
640 inForm = kForm_UTF16LE;
641 fseek(inFile, pos + 2, SEEK_SET);
642 }
643 else {
644 if (inForm == kForm_Unspecified) {
645 if (bom[0] == 0)
646 inForm = kForm_UTF16BE;
647 else if (bom[1] == 0)
648 inForm = kForm_UTF16LE;
649 else
650 inForm = kForm_UTF8;
651 }
652 fseek(inFile, pos, SEEK_SET);
653 }
654 }
655 else
656 fseek(inFile, pos, SEEK_SET);
657
658 sfReader<UniChar> reader(inFile, inForm);
659
660 reader.escapeChar = escapeCharU;
661 reader.sfmChars = sfmCharsU;
662
663 if (inlineEscapeCharU != -1) {
664 reader.inlineEscapeChar = inlineEscapeCharU;
665 reader.startInline = startInlineU;
666 reader.endInline = endInlineU;
667 reader.inlineChars = inlineCharsU;
668 }
669
670 vector<TECkit_Converter> converterStack;
671 converterStack.assign(1, defaultConverter);
672 int dataType;
673 while ((dataType = reader.next(converterStack.size() > 1)) != END_OF_FILE) {
674 map<string,string>::const_iterator i;
675 static string cnvMarker;
676 switch (dataType) {
677 case BODY_TEXT:
678 write_converted((Byte*)reader.text.data(), reader.text.length() * 2, converterStack.back(), outFile);
679 break;
680
681 case SFM:
682 convertMarker(reader.text, sfmConverter, cnvMarker);
683 i = sfmMappings.find(cnvMarker);
684 if (i == sfmMappings.end())
685 converterStack.assign(1, defaultConverter);
686 else
687 converterStack.assign(1, converters[i->second]);
688 reader.text.insert(reader.text.begin(), reader.escapeChar);
689 write_converted((Byte*)reader.text.data(), reader.text.length() * 2, sfmConverter, outFile);
690 break;
691
692 case INLINE_MARKER:
693 convertMarker(reader.text, inlineConverter, cnvMarker);
694 i = inlineMappings.find(cnvMarker);
695 if (i == inlineMappings.end())
696 converterStack.assign(1, converterStack.back());
697 else
698 converterStack.assign(1, converters[i->second]);
699 reader.text.insert(reader.text.begin(), reader.inlineEscapeChar);
700 write_converted((Byte*)reader.text.data(), reader.text.length() * 2, inlineConverter, outFile);
701 break;
702
703 case INLINE_START:
704 convertMarker(reader.text, inlineConverter, cnvMarker);
705 i = inlineMappings.find(cnvMarker);
706 if (i == inlineMappings.end())
707 converterStack.push_back(converterStack.back());
708 else
709 converterStack.push_back(converters[i->second]);
710 reader.text.insert(reader.text.begin(), reader.inlineEscapeChar);
711 reader.text.insert(reader.text.end(), reader.startInline);
712 write_converted((Byte*)reader.text.data(), reader.text.length() * 2, inlineConverter, outFile);
713 break;
714
715 case INLINE_END:
716 reader.text.insert(reader.text.end(), reader.endInline);
717 write_converted((Byte*)reader.text.data(), reader.text.length() * 2, inlineConverter, outFile);
718 converterStack.pop_back();
719 break;
720 }
721 }
722 }
723
724 fclose(inFile);
725 fclose(outFile);
726 }
727
728 #ifdef __MWERKS__
729 #if (__dest_os == __mac_os)
730 #include <console.h>
731 #endif
732 #endif
733
734 int
735 main(
736 int argc,
737 char** argv)
738 {
739 #ifdef __MWERKS__
740 #if (__dest_os == __mac_os)
741 argc = ccommand(&argv);
742 #endif
743 #endif
744
745 char* controlFile = 0;
746 char* inputFile = 0;
747 char* outputFile = 0;
748
749 bool cmdLineErr = (argc < 2);
750
751 char unicodeFormat = kForm_UTF8;
752
753 int normForm = 0;
754
755 while (--argc) {
756 char *arg = *++argv;
757 if (arg[0] == '-') {
758 if (strlen(arg + 1) == 1) {
759 switch (arg[1]) {
760 case 'c':
761 if (controlFile != 0) {
762 fprintf(stderr, "repeated argument -c\n");
763 cmdLineErr = true;
764 continue;
765 }
766 if (argc == 0) {
767 fprintf(stderr, "missing file name after -c\n");
768 cmdLineErr = true;
769 continue;
770 }
771 controlFile = *++argv;
772 --argc;
773 continue;
774 case 'd':
775 if (gMappingDirectory != 0) {
776 fprintf(stderr, "repeated argument -d\n");
777 cmdLineErr = true;
778 continue;
779 }
780 if (argc == 0) {
781 fprintf(stderr, "missing directory path after -d\n");
782 cmdLineErr = true;
783 continue;
784 }
785 gMappingDirectory = *++argv;
786 --argc;
787 continue;
788 case 'i':
789 if (inputFile != 0) {
790 fprintf(stderr, "repeated argument -i\n");
791 cmdLineErr = true;
792 continue;
793 }
794 if (argc == 0) {
795 fprintf(stderr, "missing file name after -i\n");
796 cmdLineErr = true;
797 continue;
798 }
799 inputFile = *++argv;
800 --argc;
801 continue;
802 case 'o':
803 if (outputFile != 0) {
804 fprintf(stderr, "repeated argument -o\n");
805 cmdLineErr = true;
806 continue;
807 }
808 if (argc == 0) {
809 fprintf(stderr, "missing file name after -o\n");
810 cmdLineErr = true;
811 continue;
812 }
813 outputFile = *++argv;
814 --argc;
815 continue;
816 case 'h':
817 cmdLineErr = true; // to get "usage" message
818 continue;
819 }
820 }
821 else if (strcmp(arg + 1, "utf8") == 0)
822 unicodeFormat = kForm_UTF8;
823 else if (strcmp(arg + 1, "be") == 0)
824 unicodeFormat = kForm_UTF16BE;
825 else if (strcmp(arg + 1, "le") == 0)
826 unicodeFormat = kForm_UTF16LE;
827 else if (strcmp(arg + 1, "bom") == 0)
828 bom = true;
829 else if (strcmp(arg + 1, "u8") == 0)
830 direction = kDirection_U_8;
831 else if (strcmp(arg + 1, "8u") == 0)
832 direction = kDirection_8_U;
833 else if (strcmp(arg + 1, "nfc") == 0)
834 normForm = kForm_NFC;
835 else if (strcmp(arg + 1, "nfd") == 0)
836 normForm = kForm_NFD;
837 else {
838 fprintf(stderr, "Unknown option: %s\n", arg);
839 cmdLineErr = true;
840 }
841 }
842 else {
843 cmdLineErr = true;
844 }
845 }
846
847 if (cmdLineErr || direction == kDirection_Unspecified || unicodeFormat == kForm_Unspecified) {
848 fprintf(stderr, "\
849 8-bit to Unicode:\n\
850 SFconv -8u [-utf8|-be|-le] [-bom] -c ControlFile [-d MappingDirectory] -i InFile -o OutFile\n\
851 Unicode to 8-bit:\n\
852 SFconv -u8 [-utf8|-be|-le] -c ControlFile [-d MappingDirectory] -i InFile -o OutFile\n");
853 return 1;
854 }
855
856 if (direction == kDirection_8_U) {
857 inForm = kForm_Bytes;
858 outForm = unicodeFormat + normForm;
859 }
860 else {
861 inForm = unicodeFormat;
862 outForm = kForm_Bytes;
863 }
864
865 if (!read_control_file(controlFile))
866 exit(1);
867
868 process(inputFile, outputFile);
869
870 return 0;
871 }
0 /*
1 The contents of this file are subject to the Mozilla Public License
2 Version 1.1 (the "License"); you may not use this file except in
3 compliance with the License. You may obtain a copy of the License at
4 http://www.mozilla.org/MPL/
5
6 Software distributed under the License is distributed on an "AS IS"
7 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
8 License for the specific language governing rights and limitations
9 under the License.
10
11 The Original Code is expat.
12
13 The Initial Developer of the Original Code is James Clark.
14 Portions created by James Clark are Copyright (C) 1998, 1999
15 James Clark. All Rights Reserved.
16
17 Contributor(s):
18
19 Alternatively, the contents of this file may be used under the terms
20 of the GNU General Public License (the "GPL"), in which case the
21 provisions of the GPL are applicable instead of those above. If you
22 wish to allow use of your version of this file only under the terms of
23 the GPL and not to allow others to use your version of this file under
24 the MPL, indicate your decision by deleting the provisions above and
25 replace them with the notice and other provisions required by the
26 GPL. If you do not delete the provisions above, a recipient may use
27 your version of this file under either the MPL or the GPL.
28 */
29
30 #include "xmldef.h"
31
32 #ifdef XML_UNICODE_WCHAR_T
33 #ifndef XML_UNICODE
34 #define XML_UNICODE
35 #endif
36 #endif
37
38 #include "hashtable.h"
39
40 #define INIT_SIZE 64
41
42 static
43 int keyeq(KEY s1, KEY s2)
44 {
45 for (; *s1 == *s2; s1++, s2++)
46 if (*s1 == 0)
47 return 1;
48 return 0;
49 }
50
51 static
52 unsigned long hash(KEY s)
53 {
54 unsigned long h = 0;
55 while (*s)
56 h = (h << 5) + h + (unsigned char)*s++;
57 return h;
58 }
59
60 NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize)
61 {
62 size_t i;
63 if (table->size == 0) {
64 if (!createSize)
65 return 0;
66 table->v = calloc(INIT_SIZE, sizeof(NAMED *));
67 if (!table->v)
68 return 0;
69 table->size = INIT_SIZE;
70 table->usedLim = INIT_SIZE / 2;
71 i = hash(name) & (table->size - 1);
72 }
73 else {
74 unsigned long h = hash(name);
75 for (i = h & (table->size - 1);
76 table->v[i];
77 i == 0 ? i = table->size - 1 : --i) {
78 if (keyeq(name, table->v[i]->name))
79 return table->v[i];
80 }
81 if (!createSize)
82 return 0;
83 if (table->used == table->usedLim) {
84 /* check for overflow */
85 size_t newSize = table->size * 2;
86 NAMED **newV = calloc(newSize, sizeof(NAMED *));
87 if (!newV)
88 return 0;
89 for (i = 0; i < table->size; i++)
90 if (table->v[i]) {
91 size_t j;
92 for (j = hash(table->v[i]->name) & (newSize - 1);
93 newV[j];
94 j == 0 ? j = newSize - 1 : --j)
95 ;
96 newV[j] = table->v[i];
97 }
98 free(table->v);
99 table->v = newV;
100 table->size = newSize;
101 table->usedLim = newSize/2;
102 for (i = h & (table->size - 1);
103 table->v[i];
104 i == 0 ? i = table->size - 1 : --i)
105 ;
106 }
107 }
108 table->v[i] = calloc(1, createSize);
109 if (!table->v[i])
110 return 0;
111 table->v[i]->name = name;
112 (table->used)++;
113 return table->v[i];
114 }
115
116 void hashTableDestroy(HASH_TABLE *table)
117 {
118 size_t i;
119 for (i = 0; i < table->size; i++) {
120 NAMED *p = table->v[i];
121 if (p)
122 free(p);
123 }
124 free(table->v);
125 }
126
127 void hashTableInit(HASH_TABLE *p)
128 {
129 p->size = 0;
130 p->usedLim = 0;
131 p->used = 0;
132 p->v = 0;
133 }
134
135 void hashTableIterInit(HASH_TABLE_ITER *iter, const HASH_TABLE *table)
136 {
137 iter->p = table->v;
138 iter->end = iter->p + table->size;
139 }
140
141 NAMED *hashTableIterNext(HASH_TABLE_ITER *iter)
142 {
143 while (iter->p != iter->end) {
144 NAMED *tem = *(iter->p)++;
145 if (tem)
146 return tem;
147 }
148 return 0;
149 }
150
0 /*
1 The contents of this file are subject to the Mozilla Public License
2 Version 1.1 (the "License"); you may not use this file except in
3 compliance with the License. You may obtain a copy of the License at
4 http://www.mozilla.org/MPL/
5
6 Software distributed under the License is distributed on an "AS IS"
7 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
8 License for the specific language governing rights and limitations
9 under the License.
10
11 The Original Code is expat.
12
13 The Initial Developer of the Original Code is James Clark.
14 Portions created by James Clark are Copyright (C) 1998, 1999
15 James Clark. All Rights Reserved.
16
17 Contributor(s):
18
19 Alternatively, the contents of this file may be used under the terms
20 of the GNU General Public License (the "GPL"), in which case the
21 provisions of the GPL are applicable instead of those above. If you
22 wish to allow use of your version of this file only under the terms of
23 the GPL and not to allow others to use your version of this file under
24 the MPL, indicate your decision by deleting the provisions above and
25 replace them with the notice and other provisions required by the
26 GPL. If you do not delete the provisions above, a recipient may use
27 your version of this file under either the MPL or the GPL.
28 */
29
30 #include "xmldef.h"
31 #include "xmlparse.h"
32
33 #ifdef XML_UNICODE
34 #define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX
35 #define XmlConvert XmlUtf16Convert
36 #define XmlGetInternalEncoding XmlGetUtf16InternalEncoding
37 #define XmlGetInternalEncodingNS XmlGetUtf16InternalEncodingNS
38 #define XmlEncode XmlUtf16Encode
39 #define MUST_CONVERT(enc, s) (!(enc)->isUtf16 || (((unsigned long)s) & 1))
40 typedef unsigned short ICHAR;
41 #else
42 #define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX
43 #define XmlConvert XmlUtf8Convert
44 #define XmlGetInternalEncoding XmlGetUtf8InternalEncoding
45 #define XmlGetInternalEncodingNS XmlGetUtf8InternalEncodingNS
46 #define XmlEncode XmlUtf8Encode
47 #define MUST_CONVERT(enc, s) (!(enc)->isUtf8)
48 typedef char ICHAR;
49 #endif
50
51
52 #ifndef XML_NS
53
54 #define XmlInitEncodingNS XmlInitEncoding
55 #define XmlInitUnknownEncodingNS XmlInitUnknownEncoding
56 #undef XmlGetInternalEncodingNS
57 #define XmlGetInternalEncodingNS XmlGetInternalEncoding
58 #define XmlParseXmlDeclNS XmlParseXmlDecl
59
60 #endif
61
62 #ifdef XML_UNICODE_WCHAR_T
63 #define XML_T(x) L ## x
64 #else
65 #define XML_T(x) x
66 #endif
67
68 /* Round up n to be a multiple of sz, where sz is a power of 2. */
69 #define ROUND_UP(n, sz) (((n) + ((sz) - 1)) & ~((sz) - 1))
70
71 #include "xmltok.h"
72 #include "xmlrole.h"
73 #include "hashtable.h"
74
75 #define INIT_TAG_BUF_SIZE 32 /* must be a multiple of sizeof(XML_Char) */
76 #define INIT_DATA_BUF_SIZE 1024
77 #define INIT_ATTS_SIZE 16
78 #define INIT_BLOCK_SIZE 1024
79 #define INIT_BUFFER_SIZE 1024
80
81 #define EXPAND_SPARE 24
82
/* One binding record: refers to a PREFIX and an attribute id, carries a uri
   buffer, and is linked into two chains (nextTagBinding, prevPrefixBinding).
   NOTE(review): presumably a namespace prefix-to-URI binding, with uriLen and
   uriAlloc the used and allocated lengths of uri -- confirm in addBinding(). */
83 typedef struct binding {
84 struct prefix *prefix;
85 struct binding *nextTagBinding;
86 struct binding *prevPrefixBinding;
87 const struct attribute_id *attId;
88 XML_Char *uri;
89 int uriLen;
90 int uriAlloc;
91 } BINDING;
92
/* A named prefix together with a pointer to a BINDING.
   NOTE(review): presumably 'binding' is the prefix's current binding -- confirm. */
93 typedef struct prefix {
94 const XML_Char *name;
95 BINDING *binding;
96 } PREFIX;
97
/* Name of a tag: the full string, a pointer to its local part, and a URI length.
   NOTE(review): presumably localPart points inside str and uriLen is the length
   of a namespace URI portion -- confirm against storeAtts(). */
98 typedef struct {
99 const XML_Char *str;
100 const XML_Char *localPart;
101 int uriLen;
102 } TAG_NAME;
103
/* Per-tag record linked to its parent tag; keeps the raw (unconverted) name
   with its length, the TAG_NAME, a buffer delimited by buf/bufEnd, and a list
   of BINDINGs.
   NOTE(review): presumably one TAG exists per currently open element -- confirm. */
104 typedef struct tag {
105 struct tag *parent;
106 const char *rawName;
107 int rawNameLength;
108 TAG_NAME name;
109 char *buf;
110 char *bufEnd;
111 BINDING *bindings;
112 } TAG;
113
/* Description of a declared entity: its name, replacement text (textPtr with
   length textLen), external identifiers (systemId, base, publicId), notation
   name, and an 'open' flag.
   NOTE(review): presumably 'open' marks an entity that is currently being
   expanded -- confirm at the use sites. */
114 typedef struct {
115 const XML_Char *name;
116 const XML_Char *textPtr;
117 int textLen;
118 const XML_Char *systemId;
119 const XML_Char *base;
120 const XML_Char *publicId;
121 const XML_Char *notation;
122 char open;
123 } ENTITY;
124
/* One storage block in a singly linked list, with a size field and a trailing
   character array.
   NOTE(review): s[1] presumably stands for 'size' characters allocated past the
   end of the struct -- confirm in the pool allocation code. */
125 typedef struct block {
126 struct block *next;
127 int size;
128 XML_Char s[1];
129 } BLOCK;
130
/* String pool state: two BLOCK lists (blocks, freeBlocks) and three pointers
   into character storage (end, ptr, start).
   NOTE(review): presumably start..ptr is the string being built and end is the
   limit of the current block -- confirm in poolAppend()/poolStoreString(). */
131 typedef struct {
132 BLOCK *blocks;
133 BLOCK *freeBlocks;
134 const XML_Char *end;
135 XML_Char *ptr;
136 XML_Char *start;
137 } STRING_POOL;
138
/* Identity of an attribute name: the name, an optional PREFIX, and two flags.
   NOTE(review): maybeTokenized and xmlns are presumably booleans (tokenized
   attribute type / namespace declaration) -- confirm at the use sites. */
139 /* The XML_Char before the name is used to determine whether
140 an attribute has been specified. */
141 typedef struct attribute_id {
142 XML_Char *name;
143 PREFIX *prefix;
144 char maybeTokenized;
145 char xmlns;
146 } ATTRIBUTE_ID;
147
/* A default value for an attribute: which attribute (id), whether it is CDATA,
   and the value string. */
148 typedef struct {
149 const ATTRIBUTE_ID *id;
150 char isCdata;
151 const XML_Char *value;
152 } DEFAULT_ATTRIBUTE;
153
/* An element type: its name, an optional PREFIX, and an array of default
   attributes with its used count (nDefaultAtts) and allocated capacity
   (allocDefaultAtts). */
154 typedef struct {
155 const XML_Char *name;
156 PREFIX *prefix;
157 int nDefaultAtts;
158 int allocDefaultAtts;
159 DEFAULT_ATTRIBUTE *defaultAtts;
160 } ELEMENT_TYPE;
161
/* Declaration tables for a document: hash tables of general entities, element
   types, attribute ids and prefixes, a string pool, the 'complete' and
   'standalone' flags, the default PREFIX, and (only when XML_DTD is defined) a
   table of parameter entities.
   NOTE(review): presumably the pool owns the strings referenced by the tables
   -- confirm in dtdInit()/dtdDestroy(). */
162 typedef struct {
163 HASH_TABLE generalEntities;
164 HASH_TABLE elementTypes;
165 HASH_TABLE attributeIds;
166 HASH_TABLE prefixes;
167 STRING_POOL pool;
168 int complete;
169 int standalone;
170 #ifdef XML_DTD
171 HASH_TABLE paramEntities;
172 #endif /* XML_DTD */
173 PREFIX defaultPrefix;
174 } DTD;
175
/* Linked-list node referring to an ENTITY together with a pair of event
   pointers (internalEventPtr, internalEventEndPtr).
   NOTE(review): presumably a stack of internal entities currently being
   processed, used for position reporting -- confirm at the use sites. */
176 typedef struct open_internal_entity {
177 const char *internalEventPtr;
178 const char *internalEventEndPtr;
179 struct open_internal_entity *next;
180 ENTITY *entity;
181 } OPEN_INTERNAL_ENTITY;
182
/* Function type shared by the *Processor functions declared below
   (prologProcessor, contentProcessor, ...): takes the parser, the start and
   end pointers of the input to handle, and an out-parameter endPtr, and
   returns an XML_Error code. */
183 typedef enum XML_Error Processor(XML_Parser parser,
184 const char *start,
185 const char *end,
186 const char **endPtr);
187
188 static Processor prologProcessor;
189 static Processor prologInitProcessor;
190 static Processor contentProcessor;
191 static Processor cdataSectionProcessor;
192 #ifdef XML_DTD
193 static Processor ignoreSectionProcessor;
194 #endif /* XML_DTD */
195 static Processor epilogProcessor;
196 static Processor errorProcessor;
197 static Processor externalEntityInitProcessor;
198 static Processor externalEntityInitProcessor2;
199 static Processor externalEntityInitProcessor3;
200 static Processor externalEntityContentProcessor;
201
202 static enum XML_Error
203 handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName);
204 static enum XML_Error
205 processXmlDecl(XML_Parser parser, int isGeneralTextEntity, const char *, const char *);
206 static enum XML_Error
207 initializeEncoding(XML_Parser parser);
208 static enum XML_Error
209 doProlog(XML_Parser parser, const ENCODING *enc, const char *s,
210 const char *end, int tok, const char *next, const char **nextPtr);
211 static enum XML_Error
212 processInternalParamEntity(XML_Parser parser, ENTITY *entity);
213 static enum XML_Error
214 doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc,
215 const char *start, const char *end, const char **endPtr);
216 static enum XML_Error
217 doCdataSection(XML_Parser parser, const ENCODING *, const char **startPtr, const char *end, const char **nextPtr);
218 #ifdef XML_DTD
219 static enum XML_Error
220 doIgnoreSection(XML_Parser parser, const ENCODING *, const char **startPtr, const char *end, const char **nextPtr);
221 #endif /* XML_DTD */
222 static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *s,
223 TAG_NAME *tagNamePtr, BINDING **bindingsPtr);
224 static
225 int addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId, const XML_Char *uri, BINDING **bindingsPtr);
226 static int
227 defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const XML_Char *dfltValue);
228 static enum XML_Error
229 storeAttributeValue(XML_Parser parser, const ENCODING *, int isCdata, const char *, const char *,
230 STRING_POOL *);
231 static enum XML_Error
232 appendAttributeValue(XML_Parser parser, const ENCODING *, int isCdata, const char *, const char *,
233 STRING_POOL *);
234 static ATTRIBUTE_ID *
235 getAttributeId(XML_Parser parser, const ENCODING *enc, const char *start, const char *end);
236 static int setElementTypePrefix(XML_Parser parser, ELEMENT_TYPE *);
237 static enum XML_Error
238 storeEntityValue(XML_Parser parser, const ENCODING *enc, const char *start, const char *end);
239 static int
240 reportProcessingInstruction(XML_Parser parser, const ENCODING *enc, const char *start, const char *end);
241 static int
242 reportComment(XML_Parser parser, const ENCODING *enc, const char *start, const char *end);
243 static void
244 reportDefault(XML_Parser parser, const ENCODING *enc, const char *start, const char *end);
245
246 static const XML_Char *getContext(XML_Parser parser);
247 static int setContext(XML_Parser parser, const XML_Char *context);
248 static void normalizePublicId(XML_Char *s);
249 static int dtdInit(DTD *);
250 static void dtdDestroy(DTD *);
251 static int dtdCopy(DTD *newDtd, const DTD *oldDtd);
252 static int copyEntityTable(HASH_TABLE *, STRING_POOL *, const HASH_TABLE *);
253 #ifdef XML_DTD
254 static void dtdSwap(DTD *, DTD *);
255 #endif /* XML_DTD */
256 static void poolInit(STRING_POOL *);
257 static void poolClear(STRING_POOL *);
258 static void poolDestroy(STRING_POOL *);
259 static XML_Char *poolAppend(STRING_POOL *pool, const ENCODING *enc,
260 const char *ptr, const char *end);
261 static XML_Char *poolStoreString(STRING_POOL *pool, const ENCODING *enc,
262 const char *ptr, const char *end);
263 static int poolGrow(STRING_POOL *pool);
264 static const XML_Char *poolCopyString(STRING_POOL *pool, const XML_Char *s);
265 static const XML_Char *poolCopyStringN(STRING_POOL *pool, const XML_Char *s, int n);
266
/*
 * Accessor macros for a string pool.  Each takes a POINTER to a pool
 * object that has `start`, `ptr` and `end` members.  The argument is
 * parenthesized in every expansion so that expressions such as `&pool`
 * can be passed safely.  Several macros evaluate `pool` more than once,
 * so the argument must not have side effects.
 */

/* Beginning of the string currently being built. */
#define poolStart(pool) ((pool)->start)
/* Current write position (one past the last stored character). */
#define poolEnd(pool) ((pool)->ptr)
/* Number of characters stored since poolStart. */
#define poolLength(pool) ((pool)->ptr - (pool)->start)
/* Drop the most recently stored character; yields no value. */
#define poolChop(pool) ((void)--((pool)->ptr))
/* Most recently stored character (lvalue); requires poolLength > 0. */
#define poolLastChar(pool) (((pool)->ptr)[-1])
/* Abandon the string being built by rewinding ptr to start. */
#define poolDiscard(pool) ((pool)->ptr = (pool)->start)
/* Commit the string being built: the next string starts at ptr. */
#define poolFinish(pool) ((pool)->start = (pool)->ptr)
/*
 * Store character c at the write position and advance it.  When the
 * pool is full (ptr == end), poolGrow is called first.  Yields 0 if
 * poolGrow returns 0 (nothing is stored in that case), otherwise 1.
 */
#define poolAppendChar(pool, c) \
  (((pool)->ptr == (pool)->end && !poolGrow(pool)) \
   ? 0 \
   : ((*((pool)->ptr)++ = (c)), 1))
278
279 typedef struct {
280 /* The first member must be userData so that the XML_GetUserData macro works. */
281 void *m_userData;
282 void *m_handlerArg;
283 char *m_buffer;
284 /* first character to be parsed */
285 const char *m_bufferPtr;
286 /* past last character to be parsed */
287 char *m_bufferEnd;
288 /* allocated end of buffer */
289 const char *m_bufferLim;
290 long m_parseEndByteIndex;
291 const char *m_parseEndPtr;
292 XML_Char *m_dataBuf;
293 XML_Char *m_dataBufEnd;
294 XML_StartElementHandler m_startElementHandler;
295 XML_EndElementHandler m_endElementHandler;
296 XML_CharacterDataHandler m_characterDataHandler;
297 XML_ProcessingInstructionHandler m_processingInstructionHandler;
298 XML_CommentHandler m_commentHandler;
299 XML_StartCdataSectionHandler m_startCdataSectionHandler;
300 XML_EndCdataSectionHandler m_endCdataSectionHandler;
301 XML_DefaultHandler m_defaultHandler;
302 XML_StartDoctypeDeclHandler m_startDoctypeDeclHandler;
303 XML_EndDoctypeDeclHandler m_endDoctypeDeclHandler;
304 XML_UnparsedEntityDeclHandler m_unparsedEntityDeclHandler;
305 XML_NotationDeclHandler m_notationDeclHandler;
306 XML_StartNamespaceDeclHandler m_startNamespaceDeclHandler;
307 XML_EndNamespaceDeclHandler m_endNamespaceDeclHandler;
308 XML_NotStandaloneHandler m_notStandaloneHandler;
309 XML_ExternalEntityRefHandler m_externalEntityRefHandler;
310 void *m_externalEntityRefHandlerArg;
311 XML_UnknownEncodingHandler m_unknownEncodingHandler;
312 const ENCODING *m_encoding;
313 INIT_ENCODING m_initEncoding;
314 const ENCODING *m_internalEncoding;
315 const XML_Char *m_protocolEncodingName;
316 int m_ns;
317 void *m_unknownEncodingMem;
318 void *m_unknownEncodingData;
319 void *m_unknownEncodingHandlerData;
320 void (*m_unknownEncodingRelease)(void *);
321 PROLOG_STATE m_prologState;
322 Processor *m_processor;
323 enum XML_Error m_errorCode;
324 const char *m_eventPtr;
325 const char *m_eventEndPtr;
326 const char *m_positionPtr;
327 OPEN_INTERNAL_ENTITY *m_openInternalEntities;
328 int m_defaultExpandInternalEntities;
329 int m_tagLevel;
330 ENTITY *m_declEntity;
331 const XML_Char *m_declNotationName;
332 const XML_Char *m_declNotationPublicId;
333 ELEMENT_TYPE *m_declElementType;
334 ATTRIBUTE_ID *m_declAttributeId;
335 char m_declAttributeIsCdata;
336 DTD m_dtd;
337 const XML_Char *m_curBase;
338 TAG *m_tagStack;
339 TAG *m_freeTagList;
340 BINDING *m_inheritedBindings;
341 BINDING *m_freeBindingList;
342 int m_attsSize;
343 int m_nSpecifiedAtts;
344 ATTRIBUTE *m_atts;
345 POSITION m_position;
346 STRING_POOL m_tempPool;
347 STRING_POOL m_temp2Pool;
348 char *m_groupConnector;
349 unsigned m_groupSize;
350 int m_hadExternalDoctype;
351 XML_Char m_namespaceSeparator;
352 #ifdef XML_DTD
353 enum XML_ParamEntityParsing m_paramEntityParsing;
354 XML_Parser m_parentParser;
355 #endif