source/Engine.cpp - teckit (feac16e7-4be5-4b4a-807b-f5dab566d3ea/main)

Tree @feac16e7-4be5-4b4a-807b-f5dab566d3ea/main (Download .tar.gz)

Engine.cpp @feac16e7-4be5-4b4a-807b-f5dab566d3ea/main — raw · history · blame

/*------------------------------------------------------------------------
Copyright (C) 2002-2016 SIL International. All rights reserved.

Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.

File: Engine.cp
Responsibility: Jonathan Kew
Last reviewed: Not yet.

Description:
    Implements the TECkit conversion engine.
-------------------------------------------------------------------------*/

/*
	2008-01-23  jk  revised endian-ness stuff to allow Universal build
	2006-06-02	jk	added support for extended string rules (>255 per initial char)
	2006-06-02	jk	fixed bug handling passes with no mapping rules
	2006-01-12	jk	remove multi-char constants, use kTableType_XXX from TECkit_Format.h
	2005-07-19	jk	revised to use WORDS_BIGENDIAN conditional, config.h
	2005-05-06	jk	patched match() to forget matches within groups if we backtrack out
	2004-03-19	jk	rewrote match() to fix group/repeat bugs and be more efficient
	2004-03-12	jk	finished updating for version 2.1 with ...Opt APIs
*/

//#define TRACING	1

#ifdef HAVE_CONFIG_H
#	include "config.h"	/* a Unix-ish setup where we have config.h available */
#endif

#if	(defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32)	/* Windows target: little-endian */
#	undef WORDS_BIGENDIAN
#endif

#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#if defined(TARGET_RT_BIG_ENDIAN)	/* the CodeWarrior prefix files or Apple TargetConditionals.h sets this */
#	if TARGET_RT_BIG_ENDIAN
#		undef WORDS_BIGENDIAN
#		define WORDS_BIGENDIAN 1
#	else
#		undef WORDS_BIGENDIAN
#	endif
#endif

#if	(defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32)
#	define WIN32_LEAN_AND_MEAN
#	define NOSERVICE
#	define NOMCX
#	include <windows.h>

	// Minimal DLL entry point for the Windows build: no setup or teardown is
	// performed here, so every notification is simply acknowledged as successful.
	BOOL WINAPI
	DllMain(HINSTANCE /*hInst*/, DWORD /*wDataSeg*/, LPVOID /*lpReserved*/)
	{
		return TRUE;
	}
#endif

#include "Engine.h"

#ifdef TRACING
#include <iostream>

int	traceLevel = 1;
#endif

#include <cstdlib>
#include <cstring>
#include <algorithm>

#include "zlib.h"

using namespace std;

/* we apply READ to values read from the compiled table, to provide byte-swapping where needed */
// 8-bit overload of READ: a single byte has no byte order, so the value is
// returned unchanged. It exists so callers can apply READ uniformly to table
// fields of any width.
inline UInt8
READ(const UInt8 p)
{
	return p;
}

// 16-bit overload of READ: returns the value unchanged when WORDS_BIGENDIAN
// is defined, otherwise returns it with its two bytes exchanged.
inline UInt16
READ(const UInt16 p)
{
#ifndef WORDS_BIGENDIAN
	// low byte moves up, high byte moves down; the result is narrowed back to 16 bits
	return (p << 8) | (p >> 8);
#else
	return p;
#endif
}

// 32-bit overload of READ: returns the value unchanged when WORDS_BIGENDIAN
// is defined, otherwise returns it with its four bytes in reverse order.
inline UInt32
READ(const UInt32 p)
{
#ifndef WORDS_BIGENDIAN
	// byte 0 -> byte 3, byte 1 -> byte 2, byte 2 -> byte 1, byte 3 -> byte 0
	return (p << 24) | ((p << 8) & 0x00ff0000) | ((p >> 8) & 0x0000ff00) | (p >> 24);
#else
	return p;
#endif
}

// Construct a stage with no output buffer and no predecessor.
// Derived classes in this file (Normalizer, Pass) allocate oBuffer themselves.
Stage::Stage()
	: oBuffer(0)
	, oBufSize(0)
	, oBufEnd(0)
	, oBufPtr(0)
	, prevStage(0)
{
}

// Tear down the chain of preceding stages.
// A predecessor is deleted only when it has a predecessor of its own; the
// stage at the head of the chain (the one with no prevStage) is left alone,
// presumably because it is owned elsewhere -- confirm against the caller.
Stage::~Stage()
{
	if (prevStage == 0)
		return;
	if (prevStage->prevStage != 0)
		delete prevStage;
}

// Default implementation: a generic stage reports no lookahead characters.
// Pass overrides this to report what is held in its input buffer.
UInt32
Stage::lookaheadCount() const
{
	return 0;
}

#include "NormalizationData.c"

// Create a normalizer stage.
//   compose - true to recombine characters after decomposition (see
//             generateChar/compose), false to emit decomposed output only.
Normalizer::Normalizer(bool compose)
	: prevCombClass(0)
	, oBufSafe(0)
	, bCompose(compose)
{
	// start with room for 256 code points; growOutBuf() extends this on demand
	oBuffer = new UInt32[256];
	oBufSize = 256;
}

// Release the output buffer allocated by the constructor (or by growOutBuf).
Normalizer::~Normalizer()
{
	delete[] oBuffer;
}

/* constants for algorithmic Hangul decomposition */
#define	SBase	0xAC00
#define	LBase	0x1100
#define	VBase	0x1161
#define	TBase	0x11A7
#define	LCount	19
#define	VCount	21
#define	TCount	28
#define	NCount	(VCount * TCount)
#define	SCount	(LCount * NCount)

// Pull one character from the previous stage and feed its decomposition into
// the output buffer via generateChar().
// Returns kNeedMoreInput / kInvalidChar / kUnmappedChar unchanged if the
// previous stage reports them, kEndOfText after forwarding end-of-text,
// and 0 after processing an ordinary character.
UInt32
Normalizer::process()
{
	const UInt32	ch = prevStage->getChar();

	if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar)
		return ch;

	if (ch == kEndOfText) {
		generateChar(kEndOfText);
		return ch;
	}

	// unsigned subtraction: anything outside the Hangul syllable block yields
	// a value >= SCount, so one comparison tests both ends of the range
	const UInt32	syllableIndex = ch - SBase;
	if (syllableIndex < SCount) {
		// precomposed Hangul syllable: split arithmetically into L, V and optional T
		generateChar(LBase + syllableIndex / NCount);
		generateChar(VBase + (syllableIndex % NCount) / TCount);
		const UInt32	trailing = syllableIndex % TCount;
		if (trailing != 0)
			generateChar(TBase + trailing);
		return 0;
	}

	// everything else goes through the table-driven decomposition
	decompose(ch);
	return 0;
}

// Discard all buffered output and return to the initial state.
void
Normalizer::Reset()
{
	oBufSafe = 0;
	prevCombClass = 0;
	oBufEnd = 0;
	oBufPtr = 0;
}

// Recursively decompose c and emit the resulting characters in order.
// decomposeOne() rewrites its argument in place to the trailing part of the
// decomposition and returns the leading part (0xffff when there is none),
// so the leading part is expanded first and the trailing part emitted after.
void
Normalizer::decompose(UInt32 c)
{
	const UInt32	leading = decomposeOne(c);
	if (leading != 0xffff)
		decompose(leading);

	// a trailing value of 0xffff means there is nothing further to emit
	if (c == 0xffff)
		return;
	generateChar(c);
}

// Look up a single decomposition step for c in the dc* tables.
// If c has no entry (char index 0), c is left untouched and 0xffff is returned.
// Otherwise c is replaced by element [1] of the decomposition entry and
// element [0] is returned.
UInt32
Normalizer::decomposeOne(UInt32& c)
{
	// three-level lookup: plane -> page map -> per-page character index
	const UInt32	planeNo = c >> 16;
	const UInt32	pageNo = (c >> 8) & 0xff;
	const UInt32	cell = c & 0xff;
	const UInt16	entry = dcCharIndex[dcPageMaps[dcPlaneMap[planeNo]][pageNo]][cell];

	if (entry != 0) {
		c = dcDecomposition[entry][1];
		return dcDecomposition[entry][0];
	}
	return 0xffff;
}

// Add one (already decomposed) character to the output buffer.
//
// Combining marks (non-zero combining class) are buffered in canonical order:
// appended when their class is not lower than the previous mark's, otherwise
// inserted at the right place by insertChar().
//
// A character with class 0 (or kEndOfText, which is treated as class 0):
//   - when composing, is first tried as the V or T part of a Hangul syllable
//     with the last buffered character; failing that, compose() is run over
//     the buffer before the character is appended;
//   - when not composing, marks everything buffered so far as safe to output.
// kEndOfText additionally marks the whole buffer, itself included, as safe.
void
Normalizer::generateChar(UInt32 c)
{
	int	combClass = 0;
	if (c != kEndOfText) {
		UInt32	plane = c >> 16;
		UInt32	page = (c >> 8) & 0xff;
		UInt32	ch = c & 0xff;
		combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];
	}
	
	if (combClass != 0) {
		// combiners are always buffered for sorting and possible composition
		if (prevCombClass <= combClass) {
			appendChar(c);
			prevCombClass = combClass;
		}
		else
			insertChar(c, combClass);
	}
	else {
		if (bCompose) {
			if (oBufEnd > 0) {
				// check whether last buffered char and current char should form Hangul syllable
				UInt32	last = oBuffer[oBufEnd - 1];

				// 1. check to see if two current characters are L and V
				//    (unsigned subtraction makes each "< Count" test a full range check)
				UInt32	LIndex = last - LBase;
				if (LIndex < LCount) {
					UInt32	VIndex = c - VBase;
					if (VIndex < VCount) {
						// make syllable of form LV
						last = SBase + (LIndex * VCount + VIndex) * TCount;
						oBuffer[oBufEnd - 1] = last; // reset last
						return; // don't append c, and don't update oBufSafe as a following T would compose
					}
				}

				// 2. check to see if two current characters are LV and T
				UInt32	SIndex = last - SBase;
				if (SIndex < SCount && (SIndex % TCount) == 0) {
					UInt32	TIndex = c - TBase;
					// Valid trailing consonants are TBase+1 .. TBase+TCount-1.
					// TBase itself (TIndex == 0) is not a trailing consonant and
					// must not be swallowed; TBase+TCount lies beyond the
					// composable range and would produce the wrong syllable.
					if (TIndex > 0 && TIndex < TCount) {
						// make syllable of form LVT
						last += TIndex;
						oBuffer[oBufEnd - 1] = last; // reset last
						oBufSafe = oBufEnd;	// no more composition will be possible now
						return; // don't append c
					}
				}
			}

			// search for canonical compositions in the buffered text, and update oBufSafe if possible
			compose();
		}
		else
			oBufSafe = oBufEnd;
		appendChar(c);
		if (c == kEndOfText)
			oBufSafe = oBufEnd;
		prevCombClass = 0;
	}
}

// Store c at the end of the output buffer, enlarging the buffer first if it
// is already full.
void
Normalizer::appendChar(UInt32 c)
{
	// growth is rarely needed: it takes a very long run of buffered marks to fill the buffer
	if (oBufEnd == oBufSize)
		growOutBuf();

	oBuffer[oBufEnd] = c;
	++oBufEnd;
}

// Insert combining mark insCh (with combining class insCombClass) into the
// output buffer so that the buffered marks stay sorted by combining class.
// The mark is placed immediately after the last buffered character whose
// class is not greater than insCombClass, which keeps the order of marks with
// equal class stable.
//
// Every buffered character, including the one in slot 0, takes part in the
// comparison: when the buffer starts with a combining mark of higher class
// (text that begins with combiners), the new mark is placed in front of it.
// A class-0 character in slot 0 always stops the scan, so the usual
// "starter followed by marks" case is unaffected.
void
Normalizer::insertChar(UInt32 insCh, int insCombClass)
{
	if (oBufEnd == oBufSize)
		growOutBuf();

	// scan backwards for the insertion position; pos is the index at which
	// the new character will be stored
	UInt32	pos = oBufEnd;
	while (pos > 0) {
		UInt32	c = oBuffer[pos - 1];
		UInt32	plane = c >> 16;
		UInt32	page = (c >> 8) & 0xff;
		UInt32	ch = c & 0xff;
		int	combClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];
		if (insCombClass >= combClass)
			break;
		--pos;
	}

	// open a gap by shifting the tail up one slot
	for (UInt32 j = oBufEnd; j > pos; --j)
		oBuffer[j] = oBuffer[j - 1];

	oBuffer[pos] = insCh;
	oBufEnd++;
}

void
Normalizer::growOutBuf()
{
	UInt32	newSize = oBufSize + 256;
	UInt32*	newBuf = new UInt32[newSize];
	for (long i = 0; i < oBufSize; ++i)
		newBuf[i] = oBuffer[i];
	delete[] oBuffer;
	oBuffer = newBuf;
	oBufSize = newSize;
}

// Scan the buffered characters (oBuffer[0 .. oBufEnd)) for canonical
// compositions, replacing starter + combining sequences with their composite
// where the cComposites table provides one, and compacting the buffer.
// On return oBufSafe marks how much of the buffer can be handed to the next
// stage: everything if the buffer does not end in a sequence that might still
// compose, otherwise everything before the last starter.
void
Normalizer::compose()
{
	// generateChar() calls this before appending the first starter, when the
	// buffer is still empty. oBuffer[0] then holds stale or uninitialized data
	// that must not be used to index the lookup tables. With nothing buffered,
	// nothing is safe to output, so oBufSafe is 0.
	if (oBufEnd == 0) {
		oBufSafe = 0;
		return;
	}

	// search for compositions in oBuffer up to oBufEnd
	UInt32	starterPos = 0;

	UInt32	c = oBuffer[0];
	UInt32	plane = c >> 16;
	UInt32	page = (c >> 8) & 0xff;
	UInt32	ch = c & 0xff;
	int		lastClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];
	if (lastClass != 0)
		lastClass = 256;	// buffer starts with a non-starter: nothing may compose with it

	if (oBufEnd > 1) {
		UInt32	compPos = 1;	// write position for the compacted output
		UInt16	li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];	// "left" index of current starter

		for (long decompPos = 1; decompPos < oBufEnd; ++decompPos) {
			c = oBuffer[decompPos];
			plane = c >> 16;
			page = (c >> 8) & 0xff;
			ch = c & 0xff;
			int		chClass = ccCharClass[ccPageMaps[ccPlaneMap[plane]][page]][ch];
			UInt16	ri = cRCharIndex[cRPageMaps[cRPlaneMap[plane]][page]][ch];
			UInt32	cmp = cComposites[li][ri];
			// compose only if a composite exists and the character is not
			// blocked by an intervening mark of equal or higher class
			if (cmp != 0 && (lastClass < chClass || lastClass == 0)) {
				oBuffer[starterPos] = cmp;
				plane = cmp >> 16;
				page = (cmp >> 8) & 0xff;
				ch = cmp & 0xff;
				li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];
			}
			else {
				if (chClass == 0) {
					// a new starter: later marks combine with this one
					starterPos = compPos;
					plane = c >> 16;
					page = (c >> 8) & 0xff;
					ch = c & 0xff;
					li = cLCharIndex[cLPageMaps[cLPlaneMap[plane]][page]][ch];
				}
				lastClass = chClass;
				oBuffer[compPos++] = c;
			}
		}
		oBufEnd = compPos;
	}

	// update oBufSafe to pass any chars that definitely can't compose
	if (lastClass != 0)
		oBufSafe = oBufEnd;
	else
		oBufSafe = starterPos;
}

// Return the next normalized character.
// Runs process() until some buffered output is marked safe (oBufSafe != 0),
// passing kNeedMoreInput / kInvalidChar / kUnmappedChar straight back to the
// caller. Once the safe portion has been fully consumed, the remaining
// (not yet safe) characters are moved to the front of the buffer.
UInt32
Normalizer::getChar()
{
	while (oBufSafe == 0) {
		const UInt32	status = process();
		if (status == kNeedMoreInput || status == kInvalidChar || status == kUnmappedChar)
			return status;
	}

	const UInt32	result = oBuffer[oBufPtr];
	++oBufPtr;

	if (oBufPtr == oBufSafe) {
		// safe region exhausted: slide the pending tail down to index 0
		long	dst = 0;
		for (long src = oBufPtr; src < oBufEnd; ++src, ++dst)
			oBuffer[dst] = oBuffer[src];
		oBufEnd -= oBufPtr;
		oBufPtr = 0;
		oBufSafe = 0;
	}
	return result;
}

// Set up a mapping pass over one compiled table.
//   inTable - header of the compiled pass table; all section pointers are
//             computed as byte offsets from its start
//   cnv     - owning converter, consulted later for unmapped-character options
// Input and output buffers are sized from the limits declared in the table.
Pass::Pass(const TableHeader* inTable, Converter* cnv)
	: converter(cnv)
	, tableHeader(inTable)
	, iBuffer(0)
	, iBufSize(0)
	, iBufStart(0)
	, iBufEnd(0)
	, iBufPtr(0)
{
	const Byte* const	tableStart = reinterpret_cast<const Byte*>(tableHeader);
	const UInt32		tableType = READ(tableHeader->type);

	// first and last bytes of the four-character type tag give input/output encoding
	bInputIsUnicode 	= ((tableType & 0xFF000000) >> 24) == 'U';
	bOutputIsUnicode	= (tableType & 0x000000FF) == 'U';
	bSupplementaryChars	= (READ(tableHeader->flags) & kTableFlags_Supplementary) != 0;

	// section pointers, each stored in the header as an offset from the table start
	pageBase		= tableStart + READ(tableHeader->pageBase);
	lookupBase		= reinterpret_cast<const Lookup*>(tableStart + READ(tableHeader->lookupBase));
	matchClassBase	= tableStart + READ(tableHeader->matchClassBase);
	repClassBase	= tableStart + READ(tableHeader->repClassBase);
	stringListBase	= tableStart + READ(tableHeader->stringListBase);
	stringRuleData	= tableStart + READ(tableHeader->stringRuleData);

	numPageMaps = 1;
	if (bInputIsUnicode && bSupplementaryChars) {
		// supplementary-plane support: a 20-byte plane map precedes the page maps,
		// and its byte 17 holds the number of page maps
		planeMap = pageBase;
		pageBase += 20;
		numPageMaps = READ(*(planeMap + 17));
	}

	// buffer sizes: declared maximum plus slack, rounded down to a multiple of 4
	iBufSize = (READ(inTable->maxMatch) + READ(inTable->maxPre) + READ(inTable->maxPost) + 7) & ~0x0003;
	iBuffer = new UInt32[iBufSize];

	oBufSize = (READ(inTable->maxOutput) + 7) & ~0x0003;
	oBuffer = new UInt32[oBufSize];
}

// Free the input and output buffers allocated by the constructor.
Pass::~Pass()
{
	delete[] iBuffer;
	delete[] oBuffer;
}

// Empty both the input ring buffer and the output buffer.
void
Pass::Reset()
{
	oBufEnd = 0;
	oBufPtr = 0;

	iBufPtr = 0;
	iBufEnd = 0;
	iBufStart = 0;
}

// Called by the next stage when it wants the next character from this pass.
// Serves characters from the output buffer; when it is empty, runs
// DoMapping() to refill it. kNeedMoreInput / kInvalidChar / kUnmappedChar
// from DoMapping() are returned to the caller as-is.
UInt32
Pass::getChar()
{
	for (;;) {
		if (oBufPtr != oBufEnd)
			return oBuffer[oBufPtr++];

		// buffer drained: rewind it and map some more input
		oBufPtr = oBufEnd = 0;
		const UInt32	status = DoMapping();
		if (status == kNeedMoreInput || status == kInvalidChar || status == kUnmappedChar)
			return status;
	}
}

// Called by DoMapping to generate a character in the output stream.
// A character that would not fit in the output buffer is silently discarded;
// this cannot happen as long as the table's declared maxOutput is correct.
void
Pass::outputChar(UInt32 c)
{
	if (oBufEnd >= oBufSize)
		return;		// buffer full: drop rather than overrun

	oBuffer[oBufEnd] = c;
	++oBufEnd;
}

// Return how many characters of lookahead this pass holds in its input
// buffer, i.e. the distance from iBufPtr to iBufEnd around the ring.
UInt32
Pass::lookaheadCount() const
{
	// pointers in the "normal" order
	if (iBufEnd >= iBufPtr)
		return iBufEnd - iBufPtr;

	// iBufEnd has wrapped but iBufPtr hasn't
	return iBufEnd + (iBufSize - iBufPtr);
}

// Read the character at offset inIndex relative to the current input position.
//
// iBuffer is used as a ring: iBufStart is the oldest retained character,
// iBufPtr the current position and iBufEnd the slot where the next character
// fetched from prevStage will be stored.
//
//   inIndex < 0  - look back into already-consumed context. Returns the
//                  buffered character, or kEndOfText if the requested position
//                  is no longer (or never was) in the buffer.
//   inIndex >= 0 - look ahead, pulling characters from prevStage as needed.
//                  If prevStage reports kNeedMoreInput, kInvalidChar or
//                  kUnmappedChar, that code is returned and nothing is stored.
UInt32
Pass::inputChar(long inIndex)
	// Called by DoMapping or match to read the character at a given location
	// relative to the current input stream location
{
	long	target = iBufPtr + inIndex;
	if (inIndex < 0) {
		// look back
		if (target < 0)
			target += iBufSize;	// wrap around the start of the ring
		if (iBufPtr < iBufStart) {
			// iBufPtr has wrapped back to beginning of buffer, leaving iBufStart beyond it
			// so the valid pre-context is from iBufStart to iBufSize-1 and 0 to iBufPtr-1
			if (target >= iBufStart || target < iBufPtr)
				return iBuffer[target];
		}
		else {
			// iBufPtr points beyond iBufStart
			// so the valid pre-context is from iBufStart to iBufPtr-1
			if (target >= iBufStart && target < iBufPtr)
				return iBuffer[target];
		}
		// requested position is outside the retained context
		return kEndOfText;
	}
	else {
		// look ahead
		if (target >= iBufSize)
			target -= iBufSize;	// wrap around the end of the ring
		if (iBufPtr == iBufEnd) {
			// ensure that current character is actually available
			UInt32	ch = prevStage->getChar();
			if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar)
				return ch;	// don't put this into iBuffer!
			iBuffer[iBufEnd++] = ch;
			if (iBufEnd == iBufSize)
				iBufEnd = 0;
			// if the write position caught up with the oldest character, drop that character
			if (iBufEnd == iBufStart) {
				++iBufStart;
				if (iBufStart == iBufSize)
					iBufStart = 0;
			}
		}
		long	index = iBufPtr;
		while (index != target) {
			// scan forward as far as necessary, reading in required chars
			if (index == iBufSize - 1)
				index = 0;
			else
				++index;
			if (index == iBufEnd) {
				// stepped onto a slot that has not been filled yet: fetch one more character
				UInt32	ch = prevStage->getChar();
				if (ch == kNeedMoreInput || ch == kInvalidChar || ch == kUnmappedChar)
					return ch;
				iBuffer[iBufEnd++] = ch;
				if (iBufEnd == iBufSize)
					iBufEnd = 0;
				if (iBufEnd == iBufStart) {
					++iBufStart;
					if (iBufStart == iBufSize)
						iBufStart = 0;
				}
			}
		}
		return iBuffer[index];
	}
	// not reached: both branches above return
	return kEndOfText;
}

// Called by DoMapping to move forward in the input stream by numChars.
// Will only move forward over chars already examined by a rule;
//	therefore, getChar() can't return kEndOfText, kNeedMoreInput, etc.
//
// The ring-buffer bookkeeping follows the same order as inputChar(): the
// write position is wrapped first and only then compared with iBufStart.
// Comparing before wrapping would miss the collision when iBufStart is 0,
// leaving iBufEnd == iBufStart and making a full buffer look empty.
void
Pass::advanceInput(unsigned int numChars)
{
	for (unsigned int i = 0; i < numChars; ++i) {
		if (iBufPtr == iBufEnd) {
			// current character not buffered yet: fetch it
			iBuffer[iBufEnd++] = prevStage->getChar();
			if (iBufEnd == iBufSize)
				iBufEnd = 0;
			// write position caught up with the oldest character: drop that character
			if (iBufEnd == iBufStart) {
				++iBufStart;
				if (iBufStart == iBufSize)
					iBufStart = 0;
			}
		}
		iBufPtr++;
		if (iBufPtr == iBufSize)
			iBufPtr = 0;
	}
}

template<class T>
static const T*
binary_search(const T* array, UInt32 count, UInt32 value)
{
	while (count > 0) {
		const T*	i = array;
		UInt32	count2 = count / 2;
		i += count2;
		if (READ(*i) < value) {
			array = i + 1;
			count -= count2 + 1;
		}
		else
			count = count2;
	}
	return array;
}

// Test whether inChar belongs to match class number classNumber.
// Returns the zero-based index of inChar within the class, or -1 if it is not
// a member.
//
// The class table starts with an array of 32-bit offsets (one per class,
// relative to matchClassBase); each class is a 32-bit member count followed by
// the sorted members, stored as 32-, 16- or 8-bit values depending on the
// input encoding of the pass.
//
// binary_search() returns a lower bound, which is one past the last member
// when inChar is greater than every member (or the class is empty). That
// position is outside the class, so it is range-checked before being read.
long
Pass::classMatch(UInt32 classNumber, UInt32 inChar) const
{
	const UInt32*	classPtr = reinterpret_cast<const UInt32*>(matchClassBase + READ(*(reinterpret_cast<const UInt32*>(matchClassBase) + classNumber)));
	UInt32			memberCount = READ(*classPtr++);
	if (bInputIsUnicode) {
		if (bSupplementaryChars) {
			// classes are 32-bit
			const UInt32*	p = binary_search(classPtr, memberCount, inChar);
			if (p < classPtr + memberCount && READ(*p) == inChar)
				return p - classPtr;
		}
		else {
			// classes are 16-bit
			const UInt16*	members = reinterpret_cast<const UInt16*>(classPtr);
			const UInt16*	p = binary_search(members, memberCount, inChar);
			if (p < members + memberCount && READ(*p) == inChar)
				return p - members;
		}
	}
	else {
		// classes are 8-bit
		const UInt8*	members = reinterpret_cast<const UInt8*>(classPtr);
		const UInt8*	p = binary_search(members, memberCount, inChar);
		if (p < members + memberCount && READ(*p) == inChar)
			return p - members;
	}
	return -1;
}

// Return member number /index/ of replacement class /classNumber/.
// The replacement class table has the same layout as the match class table:
// 32-bit offsets from repClassBase, then for each class a 32-bit member count
// followed by members whose width depends on the output encoding of the pass.
// Returns 0 if index is out of range.
UInt32
Pass::repClassMember(UInt32 classNumber, UInt32 index) const
{
	const UInt32*	offsets = reinterpret_cast<const UInt32*>(repClassBase);
	const UInt32*	classHeader = reinterpret_cast<const UInt32*>(repClassBase + READ(*(offsets + classNumber)));
	const UInt32	memberCount = READ(classHeader[0]);
	const UInt32*	members = classHeader + 1;

	if (index >= memberCount)
		return 0;	// this can't happen if the compiler is right!

	if (!bOutputIsUnicode)
		return READ(reinterpret_cast<const UInt8*>(members)[index]);	// 8-bit members

	if (bSupplementaryChars)
		return READ(members[index]);									// 32-bit members

	return READ(reinterpret_cast<const UInt16*>(members)[index]);		// 16-bit members
}

#ifdef TRACING
static int _depth = 0;
#endif

#define RETURN(x)	do { _rval = (x); goto _return_label; } while (0)

#define matchYes	1
#define matchNo		0
// Backtracking matcher for one compiled pattern (the member /pattern/, of
// /patternLength/ elements), reading text through inputChar() at offsets that
// move by /direction/ per character. State for the caller is left in
// /matchedLength/ and in the /info/ array (spans and class indices for the
// first /infoLimit/ elements). Uses the RETURN macro to funnel every exit
// through _return_label.
UInt32
Pass::match(int index, int repeats, int textLoc)
{
/*
	attempt to match pattern starting at /index/
	initial repeat count is /repeats/
	text offset is /textLoc/

	recurses whenever we might need to backtrack

	returns
		matchYes	- succeeded
		matchNo		- can't match at this position
		other values, eg:
			kNeedMoreInput
			kInvalidChar
			kUnmappedChar
					- aborted without a definite decision
*/

#ifdef TRACING
cerr << "match(" << index << ", " << repeats << ", " << textLoc << ")\n";
#endif

	UInt32	_rval = matchNo;

	// we come back here to loop rather than recurse, with new values for the arguments
RESTART:

	// if this is the first attempt to match at this index, record where we are
	if (repeats == 0) {
		// reaching element /matchElems/ means the whole match part has been consumed
		if (index == matchElems)
			matchedLength = textLoc;
		if (index < infoLimit) {
			info[index].matchedSpan.start = textLoc;
#ifdef TRACING
cerr << "info[" << index << "].matchedSpan.start = " << textLoc << "\n";
#endif
		}
	}

	// if we're at the end of the pattern, we have a match
	if (index >= patternLength)
		RETURN(matchYes);

	if (index == 0 && repeats == 0)
		sgrStack = 0;	// ensure this is cleared at start of pattern (shouldn't be necessary?)

	{	// gcc complains about jumping past initializers (from RETURN above) without this
		UInt32				mr;
		const MatchElem&	m = pattern[index];
		// repeat byte packs the minimum count in the high nibble and the maximum in the low nibble
		int					repeatMin = READ(m.flags.repeat) >> 4;
		int					repeatMax = READ(m.flags.repeat) & 0x0f;
		UInt8				type      = READ(m.flags.type);
		bool				negate    = ((type & kMatchElem_Negate) != 0);

		// reduce to the element type proper; literals are represented as type 0
		type = ((type & kMatchElem_NonLit) != 0)
			? type & kMatchElem_TypeMask
			: 0;

		int		classIndex;
		bool	matches;
		UInt32	inChar;
		
		// start of group: try each alternative in turn
		if (type == kMatchElem_Type_BGroup) {
			// try matching one of the alternatives in the group (again)
			info[index].groupRepeats = repeats;
			if (repeats < repeatMax) {
				int	altIndex = index;
				while (true) {
					mr = match(altIndex + 1, 0, textLoc);
					if (mr != matchNo)
						RETURN(mr);
					// failed, so step ahead to next alternative or end of group
					altIndex += READ(pattern[altIndex].value.bgroup.dNext);
					if ((READ(pattern[altIndex].flags.type) & kMatchElem_TypeMask) != kMatchElem_Type_OR)
						break;
				}
			}
			// if the group has matched enough times...
			if (repeats >= repeatMin) {
				// try to match following stuff
#ifdef TRACING
cerr << "repeats >= repeatMin\n";
#endif
				mr = match(index + READ(m.value.bgroup.dAfter), 0, textLoc);
				if (mr == matchYes) {
					if (index < infoLimit) {
						info[index].matchedSpan.limit = textLoc;
#ifdef TRACING
cerr << "group returning matchYes; info[" << index << "].matchedSpan.limit = " << textLoc << "\n";
#endif
						// don't allow elements within the group to indicate matches beyond the span of the group itself
						for (int i = index + READ(m.value.bgroup.dAfter) - 1; i > index; --i)
							if (i < infoLimit) {
								if (info[i].matchedSpan.start > textLoc)
									info[i].matchedSpan.start = textLoc;
								if (info[i].matchedSpan.limit > textLoc)
									info[i].matchedSpan.limit = textLoc;
							}
					}
				}
				RETURN(mr);
			}
			// otherwise just backtrack
			RETURN(matchNo);
		}
		
		// reached end of an alternative
		else if (type == kMatchElem_Type_OR || type == kMatchElem_Type_EGroup) {
			// go back to the group start, counting one more completed repetition of the group
			int	startIndex = index - READ(m.value.egroup.dStart);
			mr = match(startIndex, info[startIndex].groupRepeats + 1, textLoc);
			RETURN(mr);
		}
		
		// not a group, so we loop rather than recurse until optionality strikes
		else {
			// ensure that item matches at least repeatMin times
			while (repeats < repeatMin) {
				inChar = inputChar(textLoc);
				if (inChar == kInvalidChar || inChar == kNeedMoreInput || inChar == kUnmappedChar)
					RETURN(inChar);
				matches = false;
				switch (type) {
					case 0:	// literal
						matches = (READ(m.value.usv.data) & kUSVMask) == inChar;
						break;
					
					case kMatchElem_Type_Class:
						classIndex = classMatch(READ(m.value.cls.index), inChar);
						matches = (classIndex != -1);
						// remember which class member matched, on the first repetition only
						if (matches && repeats == 0 && index < infoLimit)
							info[index].classIndex = classIndex;
						break;
					
					case kMatchElem_Type_ANY:
						matches = (inChar != kEndOfText);
						break;
					
					case kMatchElem_Type_EOS:
						matches = (inChar == kEndOfText);
						break;
				}
				// a negated element succeeds exactly when the plain test fails
				matches = (matches != negate);
				if (!matches)
					RETURN(matchNo);
				++repeats;
				textLoc += direction;
			}
			
			if (index < infoLimit) {
				info[index].matchedSpan.limit = textLoc;
#ifdef TRACING
cerr << "info[" << index << "].matchedSpan.limit = " << textLoc << "\n";
#endif
			}

			if (repeatMin == repeatMax) {
				// no need to recurse, as no optionality
				++index;
				repeats = 0;
				goto RESTART;
			}
			
			// try for another repeat if allowed
			if (repeats < repeatMax) {
				inChar = inputChar(textLoc);
				if (inChar == kInvalidChar || inChar == kNeedMoreInput || inChar == kUnmappedChar)
					RETURN(inChar);
				matches = false;
				switch (type) {
					case 0:	// literal
						matches = (READ(m.value.usv.data) & kUSVMask) == inChar;
						break;
					
					case kMatchElem_Type_Class:
						classIndex = classMatch(READ(m.value.cls.index), inChar);
						matches = (classIndex != -1);
						if (matches && repeats == 0 && index < infoLimit)
							info[index].classIndex = classIndex;
						break;
					
					case kMatchElem_Type_ANY:
						matches = (inChar != kEndOfText);
						break;
					
					case kMatchElem_Type_EOS:
						matches = (inChar == kEndOfText);
						break;
				}
				matches = (matches != negate);
				if (matches) {
					// greedy: take the extra repetition first, fall back to the shorter match if that fails
					mr = match(index, repeats + 1, textLoc + direction);
					if (mr != matchNo)
						RETURN(mr);
				}
			}
			
			// otherwise try to match the remainder of the pattern
			mr = match(index + 1, 0, textLoc);
			RETURN(mr);
		}
	}

_return_label:

	// on failure, close this element's span at the current location
	if (_rval == matchNo)
		if (index < infoLimit) {
			info[index].matchedSpan.limit = textLoc;
#ifdef TRACING
cerr << "rval == matchNo; setting info[" << index << "].matchedSpan.limit = " << textLoc << "\n";
#endif
		}

#ifdef TRACING
cerr << "RETURN(" << (_rval == matchYes ? "matchYes" : "matchNo") << ")\n";
#endif
    return _rval;
}

#undef RETURN

#ifdef TRACING
static void
printMatchElem(const MatchElem& m)
{
	string	rval;
	char	buf[20];
	if (m.flags.type & kMatchElem_Negate)
		rval += "!";
	if (m.flags.type & kMatchElem_NonLit) {
		switch (m.flags.type & kMatchElem_TypeMask) {
			case kMatchElem_Type_Class:
				sprintf(buf, "[%d]", m.value.cls.index);
				rval += buf;
				break;
			case kMatchElem_Type_BGroup:
				rval += "(";
				break;
			case kMatchElem_Type_EGroup:
				rval += ")";
				break;
			case kMatchElem_Type_OR:
				rval += "|";
				break;
			case kMatchElem_Type_ANY:
				rval += ".";
				break;
			case kMatchElem_Type_EOS:
				rval += "#";
				break;
			case kMatchElem_Type_Copy:
				rval += "@";
				break;
		}
	}
	else {
		UInt32	v = m.value.usv.data & kUSVMask;
		if (v >= ' ' && v < 0x7e) {
			sprintf(buf, "'%c'", (char)v);
			rval += buf;
		}
		else {
			sprintf(buf, "0x%04X", (UInt32)v);
			rval += buf;
		}
	}
	if (!(m.flags.type & kMatchElem_NonLit) || (m.flags.type & kMatchElem_TypeMask) != kMatchElem_Type_BGroup)
		switch (m.flags.repeat) {
			case 0x01:
				rval += "?";
				break;
			case 0x11:
				break;
			case 0x0F:
				rval += "*";
				break;
			case 0x1F:
				rval += "+";
				break;
			default:
				sprintf(buf, "{%d,%d}", m.flags.repeat >> 4, m.flags.repeat & 0x0F);
				rval += buf;
				break;
		}
	cerr << rval;
}

static void
printMatch(const StringRule* rule)
{
	for (int i = 0; i < READ(rule->matchLength); ++i) {
		cerr << " ";
		printMatchElem(((MatchElem*)(rule + 1))[i]);
//		cerr << "<" << i << ">";
	}
	if (READ(rule->preLength) > 0 || READ(rule->postLength) > 0) {
		cerr << " /";
		for (int i = READ(rule->preLength) - 1; i >= 0; --i) {
			cerr << " ";
			printMatchElem(((MatchElem*)(rule + 1))[READ(rule->matchLength) + READ(rule->postLength) + i]);
		}
		cerr << " _";
		for (int i = 0; i < READ(rule->postLength); ++i) {
			cerr << " ";
			printMatchElem(((MatchElem*)(rule + 1))[READ(rule->matchLength) + i]);
		}
	}
}

// Tracing aid: write the replacement side of a rule to cerr.
// The replacement elements follow all match elements (match, pre- and
// post-context) after the rule header. The rule's length fields are read
// through READ(), consistent with printMatch() and DoMapping().
static void
printRep(const StringRule* rule)
{
	const RepElem*	r = (const RepElem*)((const MatchElem*)(rule + 1)
							+ READ(rule->matchLength) + READ(rule->preLength) + READ(rule->postLength));
	for (int i = 0; i < READ(rule->repLength); ++i, ++r) {
		cerr << " ";
		switch (READ(r->flags.type)) {
			case kRepElem_Literal:
				{
					// printable ASCII as a quoted character, anything else in hex
					UInt32	v;
					char	buf[20];
					v = READ(r->value);
					if (v >= ' ' && v <= 0x7e) {
						sprintf(buf, "'%c'", (char)v);
						cerr << buf;
					}
					else {
						sprintf(buf, "0x%04X", v);
						cerr << buf;
					}
				}
				break;

			case kRepElem_Class:
				cerr << "[" << (int)READ(r->flags.repClass) << "," << (int)READ(r->flags.matchIndex) << "]";
				break;

			case kRepElem_Copy:
				cerr << "@" << (int)READ(r->flags.matchIndex);
				break;

			case kRepElem_Unmapped:
				cerr << "?";
				break;
		}
	}
}
#endif

// Map the character(s) at the current input position and append the result
// to the output buffer, then advance the input.
//
// Steps:
//   1. fetch the current input character and find its Lookup entry, via the
//      plane/page/char maps for Unicode input or the (optionally DBCS)
//      byte tables otherwise;
//   2. depending on the lookup type, either try the string rules listed for
//      this character in order, emit the unmapped default, or emit the
//      directly stored output;
//   3. advance the input by matchedLength.
//
// Returns 0 after a normal mapping step, kEndOfText once end of text has been
// forwarded, or kNeedMoreInput / kInvalidChar / kUnmappedChar when the step
// could not be completed (the input position is not advanced in that case).
UInt32
Pass::DoMapping()
{
	UInt32	inChar = inputChar(0);
	if (inChar == kNeedMoreInput || inChar == kInvalidChar || inChar == kUnmappedChar)
		return inChar;
	if (inChar == kEndOfText) {
		outputChar(kEndOfText);
		return inChar;
	}
	matchedLength = 1;

	const Lookup*	lookup;
	if (bInputIsUnicode) {
		// Unicode lookup
		UInt16	charIndex = 0;
		if (reinterpret_cast<const UInt8*>(lookupBase) == pageBase) {
			// leave charIndex == 0 : pass with no rules
		}
		else {
			UInt8	plane = inChar >> 16;
			const UInt8*	pageMap = 0;
			if (bSupplementaryChars) {
				// 0xff in the plane map means the plane has no page map
				if ((plane < 17) && (READ(planeMap[plane]) != 0xff)) {
					pageMap = reinterpret_cast<const UInt8*>(pageBase + 256 * READ(planeMap[plane]));
					goto GOT_PAGE_MAP;
				}
			}
			else if (plane == 0) {
				pageMap = pageBase;
			GOT_PAGE_MAP:
				UInt8	page = (inChar >> 8) & 0xff;
				// 0xff in the page map means the page has no char map
				if (READ(pageMap[page]) != 0xff) {
					// char maps (256 16-bit entries each) follow the page maps
					const UInt16*	charMapBase = reinterpret_cast<const UInt16*>(pageBase + 256 * numPageMaps);
					const UInt16*	charMap = charMapBase + 256 * READ(pageMap[page]);
					charIndex = READ(charMap[inChar & 0xff]);
				}
			}
		}
		lookup = lookupBase + charIndex;
	}
	else {
		// byte-oriented lookup
		if (pageBase != reinterpret_cast<const Byte*>(tableHeader)) {
			// dbcsPage present
			long	pageNumber = READ(pageBase[inChar]);
			if (pageNumber == 0)
				// not a valid DBCS lead byte
				lookup = lookupBase + inChar;
			else {
				UInt32	nextChar = inputChar(1);
				if (nextChar == kNeedMoreInput || nextChar == kInvalidChar || nextChar == kUnmappedChar)
					return nextChar;
				if (nextChar == kEndOfText)
					lookup = lookupBase + inChar;
				else {
					lookup = lookupBase + pageNumber * 256 + nextChar;
					if (READ(lookup->rules.type) == kLookupType_IllegalDBCS)
						// illegal DBCS sequence; map lead byte alone
						lookup = lookupBase + inChar;
					else
						matchedLength = 2;
				}
			}
		}
		else
			// single-byte only
			lookup = lookupBase + inChar;
	}

	UInt8	ruleType = READ(lookup->rules.type);
	if (ruleType == kLookupType_StringRules || (ruleType & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules) {
		// process string rule list
		const UInt32*	ruleList = reinterpret_cast<const UInt32*>(stringListBase) + READ(lookup->rules.ruleIndex);
		bool			matched = false;
		bool			allowInsertion = true;
		int ruleCount = READ(lookup->rules.ruleCount);
		// extended rule lists keep the high part of the rule count in the type byte
		if ((ruleType & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules)
			ruleCount += 256 * (ruleType & kLookupType_ExtRuleCountMask);
		for ( ; ruleCount > 0; --ruleCount) {
			const StringRule*	rule = reinterpret_cast<const StringRule*>(stringRuleData + READ(*ruleList));
#ifdef TRACING
if (traceLevel > 0) {
	cerr << "** trying match: ";
	printMatch(rule);
	cerr << "\n";
}
#endif
			ruleList++;

			matchElems = READ(rule->matchLength);
			// a rule with an empty match part is an insertion rule; skip it once one has fired
			if (matchElems == 0 && allowInsertion == false)
				continue;
			// first run: match part followed by post-context, scanning forwards from the current position
			patternLength = matchElems + READ(rule->postLength);
			pattern = reinterpret_cast<const MatchElem*>(rule + 1);	// point past the defined struct for the rule header
			direction = 1;
			infoLimit = matchElems;

			// clear junk...
			for (int i = 0; i < infoLimit; ++i)
				info[i].matchedSpan.start = info[i].matchedSpan.limit = 0;
                        
			UInt32	mr = match(0, 0, 0);
			if (mr == matchYes) {
				if (matchedLength == 0 && allowInsertion == false)
					continue;
				// second run: pre-context, scanning backwards from the character before the current position
				pattern += patternLength;
				patternLength = READ(rule->preLength);
				if (patternLength > 0) {
					direction = -1;
					infoLimit = 0;
					matchElems = -1;	// no element index equals -1, so matchedLength is left untouched
					mr = match(0, 0, -1);
				}
				if (mr == matchYes) {
					// RULE MATCHED! execute it
#ifdef TRACING
if (traceLevel > 0) {
	cerr << "** MATCHED:";
	printMatch(rule);
	cerr << "\n";
	
	cerr << "** RANGES:";
	for (int i = 0; i < READ(rule->matchLength); ++i) {
		cerr << " <" << info[i].matchedSpan.start << ":" << info[i].matchedSpan.limit << ">";
	}
	cerr << "\n";
	
	cerr << "** REPLACEMENT:";
	printRep(rule);
	cerr << "\n";

	cerr << "** GENERATES:";
}
#endif
					// replacement elements follow the pre-context elements
					const RepElem*	r = reinterpret_cast<const RepElem*>(pattern + patternLength);
					for (int i = 0; i < READ(rule->repLength); ++i, ++r) {
#ifdef TRACING
if (traceLevel > 0)
	cerr << " <";
#endif
						switch (READ(r->flags.type)) {
							case kRepElem_Literal:
								outputChar(READ(r->value));
#ifdef TRACING
if (traceLevel > 0)
	cerr << (int)READ(r->value);
#endif
								break;

							case kRepElem_Class:
								{
									// emit the replacement-class member at the position the matched char had in its match class
									const MatchInfo&	myInfo = info[READ(r->flags.matchIndex)];
									// nothing is emitted if the referenced element matched no text
									if (myInfo.matchedSpan.start < myInfo.matchedSpan.limit) {
										outputChar(repClassMember(READ(r->flags.repClass), myInfo.classIndex));
#ifdef TRACING
if (traceLevel > 0)
	cerr << (int)repClassMember(READ(r->flags.repClass), myInfo.classIndex);
#endif
									}
								}
								break;

							case kRepElem_Copy:
								{
									// copy the input text matched by the referenced element
									const MatchInfo*	myInfo = &info[READ(r->flags.matchIndex)];
									for (int i = myInfo->matchedSpan.start; i < myInfo->matchedSpan.limit; ++i) {
										outputChar(inputChar(i));
#ifdef TRACING
if (traceLevel > 0)
	cerr << (i > myInfo->matchedSpan.start ? "," : "") << (int)inputChar(i);
#endif
									}
								}
								break;

							case kRepElem_Unmapped:
								if (bOutputIsUnicode == bInputIsUnicode) {
									outputChar(inChar);
#ifdef TRACING
if (traceLevel > 0)
	cerr << (int)inChar;
#endif
								}
								else {
									switch (converter->unmappedBehavior) {
										case kOptionsUnmapped_DontUseReplacementChar:
											return kUnmappedChar;

										case kOptionsUnmapped_UseReplacementCharWithWarning:
											converter->warningStatus |= kStatus_UsedReplacement;
											// fall through

										default:	// case kOptionsUnmapped_UseReplacementCharSilently:
											outputChar(READ(tableHeader->replacementChar));
											break;
									}
#ifdef TRACING
if (traceLevel > 0)
	cerr << (int)READ(tableHeader->replacementChar);
#endif
								}
								break;
						}
#ifdef TRACING
if (traceLevel > 0)
	cerr << ">";
#endif
					}
#ifdef TRACING
if (traceLevel > 0)
	cerr << endl;
#endif
					if (matchedLength > 0) {
						// we've matched the current input character, so break the loop
						matched = true;
						break;
					}
					else {
						// must have been an insertion (or null!) rule, so skip any further insertion rules
						allowInsertion = false;
					}
				}
				else if (mr != matchNo) {
					// pre-context match was aborted (more input needed, or an error)
					return mr;
				}
			}
			else if (mr != matchNo) {
				// match/post-context match was aborted (more input needed, or an error)
				return mr;
			}
		}
		if (!matched) {
			// no rule matched the current input char, so we simulate a default "Unmapped" lookup
			if (bOutputIsUnicode == bInputIsUnicode)
				// B->B or U->U simply copies the input to the output
				outputChar(inChar);
			else {
				// B->U or U->B uses the replacement char or fails, depending on options
				switch (converter->unmappedBehavior) {
					case kOptionsUnmapped_DontUseReplacementChar:
						return kUnmappedChar;

					case kOptionsUnmapped_UseReplacementCharWithWarning:
						converter->warningStatus |= kStatus_UsedReplacement;
						// fall through

					default:	// case kOptionsUnmapped_UseReplacementCharSilently:
						outputChar(READ(tableHeader->replacementChar));
						break;
				}
			}
			matchedLength = 1;
		}
	}
	else if (ruleType == kLookupType_Unmapped) {
		// same default handling as above: copy through, or use the replacement char / fail
		if (bOutputIsUnicode == bInputIsUnicode)
			outputChar(inChar);
		else {
			switch (converter->unmappedBehavior) {
				case kOptionsUnmapped_DontUseReplacementChar:
					return kUnmappedChar;

				case kOptionsUnmapped_UseReplacementCharWithWarning:
					converter->warningStatus |= kStatus_UsedReplacement;
					// fall through

				default:	// case kOptionsUnmapped_UseReplacementCharSilently:
					outputChar(READ(tableHeader->replacementChar));
					break;
			}
		}
	}
	else {
		// direct character output
		if (bOutputIsUnicode) {
			UInt32	usv = READ(lookup->usv);
			// values beyond the Unicode range produce no output
			if (usv <= 0x0010ffff)
				outputChar(usv);
		}
		else {
			for (int i = 0; i < READ(lookup->bytes.count); ++i)
				outputChar(READ(lookup->bytes.data[i]));
		}
	}
	
	advanceInput(matchedLength);
	return 0;
}

// Builds a converter around a compiled mapping table, or -- when inTable is
// null -- a table-less Unicode->Unicode converter.
//
//	inTable, inTableSize	the mapping, either raw (kMagicNumber) or in the
//							compressed wrapper (kMagicNumberCmp); the data is
//							always copied or decompressed into a private buffer
//	inForward				true to run the forward tables, false for the reverse ones
//	inForm, outForm			encoding forms (masked with kForm_EncodingFormMask);
//							outForm's kForm_NormalizationMask bits may also
//							request a final normalization stage
//
// The checks made here report failure by setting the 'status' member and
// returning early; the object is then only partially set up.
Converter::Converter(const Byte* inTable, UInt32 inTableSize, bool inForward,
						UInt16 inForm, UInt16 outForm)
	: table(0)
	, finalStage(0)
	, forward(inForward)
	, inputForm(inForm & kForm_EncodingFormMask)
	, outputForm(outForm & kForm_EncodingFormMask)
	, savedCount(0)
	, pendingOutputChar(kInvalidChar)
	, status(kStatus_NoError)
	, warningStatus(0)
{
	// the converter itself is the first stage of the pipeline; later stages chain back to it
	finalStage = this;
	UInt16	normForm = 0;	// normalization form guaranteed by the last stage added so far (0 = none)
	if (inTable != 0) {
		// both the magic number and the following word are read below, and the
		// compressed payload is taken to start after them, so a shorter buffer
		// cannot be a mapping (and would make the size subtraction wrap around)
		if (inTableSize < 2 * sizeof(UInt32)) {
			status = kStatus_InvalidMapping;
			return;
		}

		const FileHeader*	fh = reinterpret_cast<const FileHeader*>(inTable);
		if (READ(fh->type) == kMagicNumberCmp) {
			// the table is compressed; allocate a new buffer and decompress
			// (in the compressed wrapper the 'version' slot holds the uncompressed length)
			unsigned long	uncompressedLen = READ(fh->version);
			table = static_cast<Byte*>(malloc(uncompressedLen));
			if (table == 0) {
				status = kStatus_OutOfMemory;
				return;
			}
			int	result = uncompress(table, &uncompressedLen, inTable + 2 * sizeof(UInt32), inTableSize - 2 * sizeof(UInt32));
			if (result != Z_OK) {
				status = kStatus_InvalidMapping;
				return;
			}
			// uncompress() has updated uncompressedLen to the number of bytes produced;
			// the header fields read below must all lie inside that data
			if (uncompressedLen < sizeof(FileHeader)) {
				status = kStatus_InvalidMapping;
				return;
			}
			fh = reinterpret_cast<const FileHeader*>(table);
		}
		
		if (READ(fh->type) != kMagicNumber) {
			status = kStatus_InvalidMapping;
			return;
		}
		// only the upper 16 bits of the version decide whether we can read the file
		if ((READ(fh->version) & 0xFFFF0000) > (kCurrentFileVersion & 0xFFFF0000)) {
			status = kStatus_BadMappingVersion;
			return;
		}

		if (table == 0) {
			// uncompressed mapping: it must at least contain a complete header,
			// then we take a private copy of it
			if (inTableSize < sizeof(FileHeader)) {
				status = kStatus_InvalidMapping;
				return;
			}
			table = static_cast<Byte*>(malloc(inTableSize));
			if (table == 0) {
				status = kStatus_OutOfMemory;
				return;
			}
			memcpy(table, inTable, inTableSize);
		}

		// the offsets of the names, then of the forward tables, then of the
		// reverse tables, follow the fixed header
		fh = reinterpret_cast<const FileHeader*>(table);
		const UInt32*	nameOffsets = reinterpret_cast<const UInt32*>(table + sizeof(FileHeader));
		const UInt32*	tableBase = nameOffsets + READ(fh->numNames);
		UInt32			numTables = READ(fh->numFwdTables);
		if (!forward) {
			tableBase += numTables;
			numTables = READ(fh->numRevTables);
		}
		
		// check that the outputForm matches the output of the mapping
		UInt32	targetFlags = forward ? READ(fh->formFlagsRHS) : READ(fh->formFlagsLHS);
		if ((targetFlags & kFlags_Unicode) != 0) {
			if (outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) {
				status = kStatus_InvalidForm;
				return;
			}
		}
		else {
			if (outputForm != kForm_Bytes) {
				status = kStatus_InvalidForm;
				return;
			}
		}
		
		// if converting from Unicode, prefix a Normalizer if the mapping wants it
		UInt32	sourceFlags = forward ? READ(fh->formFlagsLHS) : READ(fh->formFlagsRHS);
		if ((sourceFlags & kFlags_Unicode) != 0) {
			// check that the inputForm is a Unicode form
			if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE) {
				status = kStatus_InvalidForm;
				return;
			}
			Stage*	n = 0;
			if ((sourceFlags & kFlags_ExpectsNFD) != 0) {
				n = new Normalizer(false);
				normForm = kForm_NFD;
			}
			else if ((sourceFlags & kFlags_ExpectsNFC) != 0) {
				n = new Normalizer(true);
				normForm = kForm_NFC;
			}
			if (n != 0) {
				n->prevStage = finalStage;
				finalStage = n;
			}
		}
		else {
			// check that the inputForm is bytes
			if (inputForm != kForm_Bytes) {
				status = kStatus_InvalidForm;
				return;
			}
		}

		// create the processing pipeline
		for (UInt32 i = 0; i < numTables; ++i) {
			const TableHeader*	t = reinterpret_cast<const TableHeader*>(table + READ(tableBase[i]));
			Stage*	p = 0;
			switch (READ(t->type)) {
				case kTableType_BB:
				case kTableType_BU:
				case kTableType_UU:
				case kTableType_UB:
					p = new Pass(t, this);
					normForm = 0;	// a mapping pass gives no guarantee about normalization
					break;
				case kTableType_NFC:
					p = new Normalizer(true);
					normForm = kForm_NFC;
					break;
				case kTableType_NFD:
					p = new Normalizer(false);
					normForm = kForm_NFD;
					break;
			}
			if (p == 0) {
				// unrecognized table type
				status = kStatus_InvalidMapping;
				return;
			}
			p->prevStage = finalStage;
			finalStage = p;
		}
	}
	else {
		// No mapping table provided, so we're mapping Unicode->Unicode,
		// possibly doing normalization and/or encoding form change.
		// Just check here that the input and output encoding forms are valid.
		if (inputForm < kForm_UTF8 || inputForm > kForm_UTF32LE || outputForm < kForm_UTF8 || outputForm > kForm_UTF32LE) {
			status = kStatus_InvalidForm;
			return;
		}
	}

	// if converting to Unicode, add a Normalizer pass at the end if requested
	// (skipped when the last stage already produces the requested form)
	if (outputForm >= kForm_UTF8 && outputForm <= kForm_UTF32LE) {
		Stage*	n = 0;
		if ((outForm & kForm_NormalizationMask) == kForm_NFD && normForm != kForm_NFD)
			n = new Normalizer(false);
		else if ((outForm & kForm_NormalizationMask) == kForm_NFC && normForm != kForm_NFC)
			n = new Normalizer(true);
		if (n != 0) {
			n->prevStage = finalStage;
			finalStage = n;
		}
	}
}

// Destructor: disposes of the last pipeline stage (when any stage was added)
// and releases the converter's private copy of the mapping table.
Converter::~Converter()
{
	// finalStage still equal to 'this' means no further stage was ever chained on
	const bool	hasStages = (finalStage != this);
	if (hasStages)
		delete finalStage;

	// free() is a no-op on a null pointer, so no separate test is required
	free(table);
	table = 0;
}

// Indexed by the number of trailing bytes in a UTF-8 sequence: the value the
// UTF-8 decoders subtract from the raw shifted-and-summed bytes of the
// sequence (rval -= offsetsFromUTF8[extraBytes]) to obtain the code point.
static UInt32
offsetsFromUTF8[6] =	{
	0x00000000UL,
	0x00003080UL,
	0x000E2080UL, 
	0x03C82080UL,
	0xFA082080UL,
	0x82082080UL
};

// Indexed by the first byte of a UTF-8 sequence: the number of additional
// bytes the decoders read after it (0 for 0x00..0xBF, up to 5 for 0xFC..0xFF).
static UInt8
bytesFromUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

// Indexed by the total number of bytes in a UTF-8 sequence being written:
// the marker bits OR'ed into the first byte of the sequence by the encoder.
static UInt8
firstByteMark[7] = {
	0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};

// Constants for the UTF-16 surrogate arithmetic: a supplementary-plane
// character is halfBase plus a 10-bit (halfShift) high part and a 10-bit
// (halfMask) low part, carried by the high and low surrogates respectively.
const int halfShift					= 10;
const UInt32 halfBase				= 0x0010000UL;
const UInt32 halfMask				= 0x3FFUL;
const UInt32 kSurrogateHighStart	= 0xD800UL;
const UInt32 kSurrogateHighEnd		= 0xDBFFUL;
const UInt32 kSurrogateLowStart		= 0xDC00UL;
// Used when writing UTF-8 trailing bytes: (c | byteMark) & byteMask sets
// bit 7 and clears bit 6, leaving the low six bits of c.
const UInt32 byteMask				= 0x000000BFUL;
const UInt32 byteMark				= 0x00000080UL;

// Returns the next input character of the converter's source data.
// When every available byte (saved bytes plus the current buffer) has been
// consumed, returns kEndOfText if the input is complete and kNeedMoreInput
// otherwise.  Byte-form input is returned one byte at a time; all Unicode
// forms are decoded by _getCharFn().
UInt32
Converter::getChar()
{
	const bool	exhausted = (dataPtr >= savedCount + dataLen);
	if (exhausted) {
		if (inputComplete)
			return kEndOfText;
		return kNeedMoreInput;
	}

	if (inputForm != kForm_Bytes)
		return _getCharFn();

	return data[dataPtr++];
}

// Decodes one character from the current input buffer according to inputForm
// (UTF-8, UTF-16BE/LE or UTF-32BE/LE) and advances dataPtr past it.
// Returns the decoded value, or kNeedMoreInput / kInvalidChar when the buffer
// ends in the middle of a character (see CHECK_AVAIL below).
UInt32
Converter::_getCharFn()
{
//	This is ONLY called from the public getChar() function, which has already done these tests:
//
//	if (dataPtr >= savedCount + dataLen)
//		return inputComplete ? kEndOfText : kNeedMoreInput;
//	
//	if (inputForm == kForm_Bytes)
//		return data[dataPtr++];
	
	UInt32	rval = 0;

	if (savedCount > 0) {	// the less efficient version is only called if really needed
		rval = _getCharWithSavedBytes();
		return rval;
	}

// CHECK_AVAIL(x): make sure x bytes are available starting at dataPtr.
// If they are not, return from the enclosing function: kInvalidChar when no
// more input will come, otherwise stash the remaining bytes for the next call
// and return kNeedMoreInput.
#define CHECK_AVAIL(x)				\
	if (dataPtr + (x) > dataLen) {	\
		if (inputComplete)			\
			return kInvalidChar;	\
		else {						\
			_savePendingBytes();	\
			return kNeedMoreInput;	\
		}							\
	}
	
	switch (inputForm) {
		case kForm_UTF8:
			{
				// the first byte determines how many more bytes belong to this character
				UInt16 extraBytes = bytesFromUTF8[data[dataPtr]];
				CHECK_AVAIL(extraBytes + 1);
				// sum the bytes, shifting by 6 bits between them; the marker bits
				// that get added in are removed by the offsetsFromUTF8 subtraction
				switch (extraBytes) {	// note: code falls through cases!
					case 5:	rval += data[dataPtr++]; rval <<= 6;
					case 4:	rval += data[dataPtr++]; rval <<= 6;
					case 3:	rval += data[dataPtr++]; rval <<= 6;
					case 2:	rval += data[dataPtr++]; rval <<= 6;
					case 1:	rval += data[dataPtr++]; rval <<= 6;
					case 0:	rval += data[dataPtr++];
				};
				rval -= offsetsFromUTF8[extraBytes];
			}
			break;

		case kForm_UTF16BE:
			CHECK_AVAIL(2);
			rval = data[dataPtr++] << 8;
			rval += data[dataPtr++];
			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
				// check that 2 more bytes are available
				dataPtr -= 2;
				CHECK_AVAIL(4);	// if we don't have 4 bytes available, this will return with kNeedMoreInput,
								// and we'll retry from the beginning of the high surrogate once more is available
				dataPtr += 2;
				UInt32	low = data[dataPtr++] << 8;
				low += data[dataPtr++];
				// combine the surrogate pair into a single supplementary-plane value
				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
			}
			break;

		case kForm_UTF16LE:
			// same as UTF16BE, with the two bytes of each code unit swapped
			CHECK_AVAIL(2);
			rval = data[dataPtr++];
			rval += data[dataPtr++] << 8;
			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
				dataPtr -= 2;
				CHECK_AVAIL(4);
				dataPtr += 2;
				UInt32	low = data[dataPtr++];
				low += data[dataPtr++] << 8;
				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
			}
			break;

		case kForm_UTF32BE:
			CHECK_AVAIL(4);
			rval = data[dataPtr++] << 24;
			rval += data[dataPtr++] << 16;
			rval += data[dataPtr++] << 8;
			rval += data[dataPtr++];
			break;

		case kForm_UTF32LE:
			CHECK_AVAIL(4);
			rval = data[dataPtr++];
			rval += data[dataPtr++] << 8;
			rval += data[dataPtr++] << 16;
			rval += data[dataPtr++] << 24;
			break;
	}

	return rval;
}

UInt32
Converter::_getCharWithSavedBytes()
	// This is a version of _getCharFn() that respects "saved bytes";
	// only call this if (savedCount > 0) because it has additional overhead for every byte read
	//
	// While saved bytes exist, dataPtr indexes the logical sequence formed by
	// savedBytes[0..savedCount) followed by the current data buffer.
{
	UInt32	rval = 0;

// Same contract as the CHECK_AVAIL in _getCharFn(), but the number of
// available bytes includes the saved ones.
#undef CHECK_AVAIL
#define CHECK_AVAIL(x)							\
	if (dataPtr + (x) > savedCount + dataLen) {	\
		if (inputComplete)						\
			return kInvalidChar;				\
		else {									\
			_savePendingBytes();				\
			return kNeedMoreInput;				\
		}										\
	}

// DATA(x): byte x of the logical sequence -- taken from savedBytes for the
// first savedCount positions and from the data buffer after that.
#define DATA(x)	(x < savedCount ? savedBytes[x] : data[x - savedCount])

	switch (inputForm) {
		case kForm_UTF8:
			{
				// the first byte determines how many more bytes belong to this character
				UInt16 extraBytes = bytesFromUTF8[DATA(dataPtr)];
				CHECK_AVAIL(extraBytes + 1);
				switch (extraBytes) {	// note: code falls through cases!
					case 5:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;
					case 4:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;
					case 3:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;
					case 2:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;
					case 1:	rval += DATA(dataPtr); dataPtr++; rval <<= 6;
					case 0:	rval += DATA(dataPtr); dataPtr++;
				};
				rval -= offsetsFromUTF8[extraBytes];
			}
			break;

		case kForm_UTF16BE:
			CHECK_AVAIL(2);
			rval = DATA(dataPtr) << 8; dataPtr++;
			rval += DATA(dataPtr); dataPtr++;
			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
				// rewind to the start of the high surrogate before checking, so that
				// a kNeedMoreInput return leaves the whole pair unconsumed
				dataPtr -= 2;
				CHECK_AVAIL(4);
				dataPtr += 2;
				UInt32	low = DATA(dataPtr) << 8; dataPtr++;
				low += DATA(dataPtr); dataPtr++;
				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
			}
			break;

		case kForm_UTF16LE:
			CHECK_AVAIL(2);
			rval = DATA(dataPtr); dataPtr++;
			rval += DATA(dataPtr) << 8; dataPtr++;
			if (rval >= kSurrogateHighStart && rval <= kSurrogateHighEnd) {
				dataPtr -= 2;
				CHECK_AVAIL(4);
				dataPtr += 2;
				UInt32	low = DATA(dataPtr); dataPtr++;
				low += DATA(dataPtr) << 8; dataPtr++;
				rval = ((rval - kSurrogateHighStart) << halfShift) + (low - kSurrogateLowStart) + halfBase;
			}
			break;

		case kForm_UTF32BE:
			CHECK_AVAIL(4);
			rval = DATA(dataPtr) << 24; dataPtr++;
			rval += DATA(dataPtr) << 16; dataPtr++;
			rval += DATA(dataPtr) << 8; dataPtr++;
			rval += DATA(dataPtr); dataPtr++;
			break;

		case kForm_UTF32LE:
			CHECK_AVAIL(4);
			rval = DATA(dataPtr); dataPtr++;
			rval += DATA(dataPtr) << 8; dataPtr++;
			rval += DATA(dataPtr) << 16; dataPtr++;
			rval += DATA(dataPtr) << 24; dataPtr++;
			break;
	}
	
	// once the read position has moved past all the saved bytes, turn dataPtr
	// back into a plain index into the data buffer and discard the saved bytes
	if (dataPtr >= savedCount) {
		dataPtr -= savedCount;
		savedCount = 0;
	}
	
	return rval;
}

// Appends every not-yet-consumed byte of the current data buffer to
// savedBytes, so that a character split across input buffers can be completed
// on a later call.  On return dataPtr equals dataLen (the whole buffer has
// been taken).
void
Converter::_savePendingBytes()
{
	// While saved bytes exist, dataPtr indexes "savedBytes followed by data";
	// convert it to an index into data alone.  When it still points inside the
	// saved bytes (the pending character has been fed in more than two pieces)
	// nothing of the new buffer has been consumed yet, so start from its first
	// byte -- the plain subtraction would wrap below zero here, dropping the
	// new bytes and leaving dataPtr meaningless.
	if (dataPtr >= savedCount)
		dataPtr -= savedCount;
	else
		dataPtr = 0;

	while (dataPtr < dataLen)
		savedBytes[savedCount++] = data[dataPtr++];
}

bool
Converter::IsForward() const
{
	return forward;
}

// Returns the form flags of the converter's input side (sourceFlags) and
// output side (targetFlags), taking the mapping direction into account:
// for a forward converter the source is the mapping's LHS, for a reverse
// converter it is the RHS.
void
Converter::GetFlags(UInt32& sourceFlags, UInt32& targetFlags) const
{
	if (table == 0) {
		// A converter built without a mapping table has no header to read;
		// such a converter maps Unicode->Unicode, so report both sides as Unicode
		// rather than dereferencing the null table.
		sourceFlags = kFlags_Unicode;
		targetFlags = kFlags_Unicode;
		return;
	}

	const FileHeader*	fh = reinterpret_cast<const FileHeader*>(table);
	if (forward) {
		sourceFlags = READ(fh->formFlagsLHS);
		targetFlags = READ(fh->formFlagsRHS);
	}
	else {
		sourceFlags = READ(fh->formFlagsRHS);
		targetFlags = READ(fh->formFlagsLHS);
	}
}

static bool
getNamePtrFromTable(const Byte* table, UInt16 nameID, const Byte*& outNamePtr, UInt32& outNameLen)
{
	const FileHeader*	fh = reinterpret_cast<const FileHeader*>(table);
	const UInt32*		nameOffsets = reinterpret_cast<const UInt32*>(table + sizeof(FileHeader));
	for (UInt32 i = 0; i < READ(fh->numNames); ++i) {
		const NameRec*	n = reinterpret_cast<const NameRec*>(table + READ(nameOffsets[i]));
		if (READ(n->nameID) == nameID) {
			outNameLen = READ(n->nameLength);
			outNamePtr = reinterpret_cast<const Byte*>(n) + sizeof(NameRec);
			return true;
		}
	}
	return false;
}

// Looks up the name with the given ID in this converter's mapping table.
// Returns true and fills outNamePtr/outNameLen when found; returns false
// when the name is absent or the converter has no mapping table at all.
bool
Converter::GetNamePtr(UInt16 nameID, const Byte*& outNamePtr, UInt32& outNameLen) const
{
	// a converter created without a mapping (Unicode->Unicode only) has no
	// names; the lookup below would dereference the null table
	if (table == 0)
		return false;
	return getNamePtrFromTable(table, nameID, outNamePtr, outNameLen);
}

// Converts as much of inBuffer as possible into outBuffer.
//
// Characters are pulled from the last pipeline stage (finalStage->getChar())
// and written to outBuffer in the converter's outputForm until the pipeline
// reports end of text, need for more input or an error, or until the next
// character does not fit.  A character that does not fit is parked in
// pendingOutputChar and is written first on the next call.
//
//	inBuffer, inLength	input bytes for this call
//	inUsed				if non-null, receives the number of input bytes consumed
//	outBuffer, outLength	output buffer and its capacity in bytes
//	outUsed				if non-null, receives the number of bytes written
//	inOptions			kOptionsMask_InputComplete / kOptionsMask_UnmappedBehavior bits
//	lookaheadCount		if non-null, receives the sum of lookaheadCount() over all stages
//
// Returns the basic status OR'ed with any accumulated warning bits; when the
// basic status is kStatus_NoError the converter is Reset() before returning.
TECkit_Status
Converter::ConvertBufferOpt(
	const Byte* inBuffer, UInt32 inLength, UInt32* inUsed,
	Byte* outBuffer, UInt32 outLength, UInt32* outUsed,
	UInt32 inOptions, UInt32* lookaheadCount)
{
	TECkit_Status	rval;
// RETURN: record the status and jump to the common exit code.
// NB: expands to two statements, so only use it inside braces or as a whole case body.
#undef RETURN
#define RETURN(returnStatus)	rval = returnStatus; goto RETURN_LABEL

	UInt32	outPtr = 0;
	
	data = inBuffer;
	dataLen = inLength;
	dataPtr = 0;
	inputComplete = ((inOptions & kOptionsMask_InputComplete) == kOptionsComplete_InputIsComplete);
	unmappedBehavior = (inOptions & kOptionsMask_UnmappedBehavior);
	
	UInt32	c;
	if (pendingOutputChar != kInvalidChar) {
		// a character left over from a previous kStatus_OutputBufferFull return goes out first
		c = pendingOutputChar;
		pendingOutputChar = kInvalidChar;
		goto GOT_CHAR;
	}
	while (1) {
		c = finalStage->getChar();
	GOT_CHAR:
		switch (c) {
			case kEndOfText:
				RETURN(kStatus_NoError);

			case kNeedMoreInput:
				RETURN(kStatus_NeedMoreInput);

			case kInvalidChar:
				RETURN(kStatus_IncompleteChar);

			case kUnmappedChar:
				RETURN(kStatus_UnmappedChar);

			default:
				switch (outputForm) {
					case kForm_Bytes:
						if (outPtr == outLength) {
							pendingOutputChar = c;
							RETURN(kStatus_OutputBufferFull);
						}
						outBuffer[outPtr++] = c;
						break;

					case kForm_UTF8:
						{
							int	bytesToWrite;
							if (c < 0x80) {				bytesToWrite = 1;
							} else if (c < 0x800) {		bytesToWrite = 2;
							} else if (c < 0x10000) {	bytesToWrite = 3;
							} else if (c < 0x200000) {	bytesToWrite = 4;
							} else {
								// value cannot be encoded: substitute U+FFFD, which needs
								// THREE bytes in UTF-8 (EF BF BD); writing it as a two-byte
								// sequence would emit an invalid lead byte
								bytesToWrite = 3;
								c = 0x0000fffd;
							}
							if (outPtr + bytesToWrite > outLength) {
								pendingOutputChar = c;
								RETURN (kStatus_OutputBufferFull);
							}
							// the bytes are written back to front, so step past the
							// sequence first and restore the position afterwards
							outPtr += bytesToWrite;
							switch (bytesToWrite) {	/* note: code falls through cases! */
								case 4:	outBuffer[--outPtr] = (c | byteMark) & byteMask; c >>= 6;
								case 3:	outBuffer[--outPtr] = (c | byteMark) & byteMask; c >>= 6;
								case 2:	outBuffer[--outPtr] = (c | byteMark) & byteMask; c >>= 6;
								case 1:	outBuffer[--outPtr] =  c | firstByteMark[bytesToWrite];
							}
							outPtr += bytesToWrite;
						}
						break;

					case kForm_UTF16BE:
						if (c < 0x00010000) {
							if (outPtr + 2 > outLength) {
								pendingOutputChar = c;
								RETURN (kStatus_OutputBufferFull);
							}
							outBuffer[outPtr++] = c >> 8;
							outBuffer[outPtr++] = c;
						}
						else {
							// supplementary-plane character: write a surrogate pair
							if (outPtr + 4 > outLength) {
								pendingOutputChar = c;
								RETURN (kStatus_OutputBufferFull);
							}
							c -= halfBase;
							UInt32	hi = (c >> halfShift) + kSurrogateHighStart;
							UInt32	lo = (c & halfMask) + kSurrogateLowStart;
							outBuffer[outPtr++] = hi >> 8;
							outBuffer[outPtr++] = hi;
							outBuffer[outPtr++] = lo >> 8;
							outBuffer[outPtr++] = lo;
						}
						break;

					case kForm_UTF16LE:
						if (c < 0x00010000) {
							if (outPtr + 2 > outLength) {
								pendingOutputChar = c;
								RETURN (kStatus_OutputBufferFull);
							}
							outBuffer[outPtr++] = c;
							outBuffer[outPtr++] = c >> 8;
						}
						else {
							// supplementary-plane character: write a surrogate pair
							if (outPtr + 4 > outLength) {
								pendingOutputChar = c;
								RETURN (kStatus_OutputBufferFull);
							}
							c -= halfBase;
							UInt32	hi = (c >> halfShift) + kSurrogateHighStart;
							UInt32	lo = (c & halfMask) + kSurrogateLowStart;
							outBuffer[outPtr++] = hi;
							outBuffer[outPtr++] = hi >> 8;
							outBuffer[outPtr++] = lo;
							outBuffer[outPtr++] = lo >> 8;
						}
						break;

					case kForm_UTF32BE:
						if (outPtr + 4 > outLength) {
							pendingOutputChar = c;
							RETURN (kStatus_OutputBufferFull);
						}
						outBuffer[outPtr++] = c >> 24;
						outBuffer[outPtr++] = c >> 16;
						outBuffer[outPtr++] = c >> 8;
						outBuffer[outPtr++] = c;
						break;

					case kForm_UTF32LE:
						if (outPtr + 4 > outLength) {
							pendingOutputChar = c;
							RETURN (kStatus_OutputBufferFull);
						}
						outBuffer[outPtr++] = c;
						outBuffer[outPtr++] = c >> 8;
						outBuffer[outPtr++] = c >> 16;
						outBuffer[outPtr++] = c >> 24;
						break;
				}
				break;
		}
	}
	RETURN (kStatus_NoError);

RETURN_LABEL:
	if (inUsed)
		*inUsed = dataPtr;
	if (outUsed)
		*outUsed = outPtr;
	if (lookaheadCount) {
		// total lookahead over every stage chained after the converter itself
		*lookaheadCount = 0;
		Stage*	s = finalStage;
		while (s != this) {
			*lookaheadCount += s->lookaheadCount();
			s = s->prevStage;
		}
	}

	rval |= warningStatus;
	if ((rval & kStatusMask_Basic) == kStatus_NoError)
		Reset();
	
	return rval;
}

void
Converter::Reset()
{
	pendingOutputChar = kInvalidChar;
	savedCount = 0;
	dataPtr = 0;
	dataLen = 0;
	warningStatus = 0;
	Stage*	s = finalStage;
	while (s != this) {
		s->Reset();
		s = s->prevStage;
	}
}

bool
Converter::Validate(const Converter* cnv)
{
	if (!cnv)
		return false;
	if (cnv->status != kStatus_NoError)
		return false;
	if (cnv->table != 0) {
		const FileHeader*	fh = reinterpret_cast<const FileHeader*>(cnv->table);
		if (READ(fh->type) != kMagicNumber)
			return false;
	}
	return true;
}

// Creates a converter for the given mapping and encoding forms.
// On success stores the new converter handle in *converter and returns
// kStatus_NoError.  On failure *converter is left null and the converter's
// creation status, the error code of a Converter::Exception, or
// kStatus_Exception (for any other exception) is returned.
TECkit_Status
WINAPI
TECkit_CreateConverter(
	Byte*				mapping,
	UInt32				mappingSize,
	Byte				mapForward,
	UInt16				inputForm,
	UInt16				outputForm,
	TECkit_Converter*	converter)
{
	TECkit_Status	status = kStatus_NoError;
	Converter*	cnv = 0;
	*converter = 0;
	try {
		cnv = new Converter(mapping, mappingSize, mapForward, inputForm, outputForm);
		status = cnv->creationStatus();
		if (status == kStatus_NoError)
			*converter = reinterpret_cast<TECkit_Converter>(cnv);
		else
			delete cnv;	// construction reported an error: do not hand out the object
	}
	catch (const Converter::Exception& e) {
		// caught by const reference: no copy of the exception object is made
		status = e.errorCode;
	}
	catch (...) {
		// exceptions must not propagate out of the API function
		status = kStatus_Exception;
	}
	return status;
}

// Destroys a converter created by TECkit_CreateConverter.
// Returns kStatus_InvalidConverter (and deletes nothing) when the handle
// does not pass Converter::Validate, kStatus_NoError otherwise.
TECkit_Status
WINAPI
TECkit_DisposeConverter(
	TECkit_Converter	converter)
{
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (!Converter::Validate(cnv))
		return kStatus_InvalidConverter;

	delete cnv;
	return kStatus_NoError;
}

// Retrieves a name from the converter's mapping.
// *nameLength receives the full length of the name; at most bufferSize bytes
// of it are copied into nameBuffer, so a caller can detect truncation by
// comparing the two.  Returns kStatus_InvalidConverter for a bad handle and
// kStatus_NameNotFound when the mapping has no name with this ID.
TECkit_Status
WINAPI
TECkit_GetConverterName(
	TECkit_Converter	converter,
	UInt16				nameID,
	Byte*				nameBuffer,
	UInt32				bufferSize,
	UInt32*				nameLength)
{
	TECkit_Status	status = kStatus_NoError;
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (!Converter::Validate(cnv))
		status = kStatus_InvalidConverter;
	else {
		const Byte*	namePtr;
		if (cnv->GetNamePtr(nameID, namePtr, *nameLength)) {
			// both operands are 32-bit, so the minimum must be too: a 16-bit
			// variable would silently truncate counts of 65536 or more
			UInt32	copyBytes = *nameLength < bufferSize ? *nameLength : bufferSize;
			if (copyBytes > 0)
				memcpy(nameBuffer, namePtr, copyBytes);
		}
		else
			status = kStatus_NameNotFound;
	}
	return status;
}

// Retrieves the source-side and target-side form flags of a converter.
// Returns kStatus_InvalidConverter, leaving the outputs untouched, when the
// handle does not pass Converter::Validate.
TECkit_Status
WINAPI
TECkit_GetConverterFlags(
	TECkit_Converter	converter,
	UInt32*				sourceFlags,
	UInt32*				targetFlags)
{
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (!Converter::Validate(cnv))
		return kStatus_InvalidConverter;

	cnv->GetFlags(*sourceFlags, *targetFlags);
	return kStatus_NoError;
}

// Resets a converter to its initial state (see Converter::Reset).
// Returns kStatus_InvalidConverter when the handle does not pass
// Converter::Validate, kStatus_NoError otherwise.
TECkit_Status
WINAPI
TECkit_ResetConverter(
	TECkit_Converter	converter)
{
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (!Converter::Validate(cnv))
		return kStatus_InvalidConverter;

	cnv->Reset();
	return kStatus_NoError;
}

// Public entry point for buffer conversion: validates the handle and
// forwards every argument unchanged to Converter::ConvertBufferOpt.
// Returns kStatus_InvalidConverter when the handle is not valid, otherwise
// the status produced by the conversion.
TECkit_Status
WINAPI
TECkit_ConvertBufferOpt(
	TECkit_Converter	converter,
	const Byte*			inBuffer,
	UInt32				inLength,
	UInt32*				inUsed,
	Byte*				outBuffer,
	UInt32				outLength,
	UInt32*				outUsed,
	UInt32				inOptions,
	UInt32*				lookaheadCount)
{
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (Converter::Validate(cnv))
		return cnv->ConvertBufferOpt(inBuffer, inLength, inUsed, outBuffer, outLength, outUsed, inOptions, lookaheadCount);

	return kStatus_InvalidConverter;
}

// Simplified form of TECkit_ConvertBufferOpt: unmapped characters silently
// use the replacement character, the input-complete option is derived from
// the inputIsComplete flag, and no lookahead count is requested.
TECkit_Status
WINAPI
TECkit_ConvertBuffer(
	TECkit_Converter	converter,
	const Byte*			inBuffer,
	UInt32				inLength,
	UInt32*				inUsed,
	Byte*				outBuffer,
	UInt32				outLength,
	UInt32*				outUsed,
	Byte				inputIsComplete)
{
	UInt32	options = kOptionsUnmapped_UseReplacementCharSilently;
	if (inputIsComplete)
		options += kOptionsComplete_InputIsComplete;

	return TECkit_ConvertBufferOpt(converter, inBuffer, inLength, inUsed,
			outBuffer, outLength, outUsed, options, 0);
}

// Flushes whatever output the converter can still produce: runs a conversion
// with no input at all and with the input-complete option forced on.
// Returns kStatus_InvalidConverter when the handle is not valid, otherwise
// the status produced by the conversion.
TECkit_Status
WINAPI
TECkit_FlushOpt(
	TECkit_Converter	converter,
	Byte*				outBuffer,
	UInt32				outLength,
	UInt32*				outUsed,
	UInt32				inOptions,
	UInt32*				lookaheadCount)
{
	Converter*	cnv = reinterpret_cast<Converter*>(converter);
	if (!Converter::Validate(cnv))
		return kStatus_InvalidConverter;

	// no input buffer, zero length, and no interest in the consumed count
	return cnv->ConvertBufferOpt(0, 0, 0, outBuffer, outLength, outUsed,
		inOptions | kOptionsComplete_InputIsComplete, lookaheadCount);
}

// Simplified form of TECkit_FlushOpt: unmapped characters silently use the
// replacement character and no lookahead count is requested.
TECkit_Status
WINAPI
TECkit_Flush(
	TECkit_Converter	converter,
	Byte*				outBuffer,
	UInt32				outLength,
	UInt32*				outUsed)
{
	const UInt32	options = kOptionsUnmapped_UseReplacementCharSilently;
	return TECkit_FlushOpt(converter, outBuffer, outLength, outUsed, options, 0);
}

// Reads the LHS and RHS form flags straight from a mapping (raw or
// compressed) without creating a converter.
// Returns kStatus_InvalidMapping for a null, unrecognized or undecodable
// mapping and kStatus_BadMappingVersion for a mapping newer than this
// library understands; the outputs are written only on success.
TECkit_Status
WINAPI
TECkit_GetMappingFlags(
	Byte*				mapping,
	UInt32				mappingSize,
	UInt32*				lhsFlags,
	UInt32*				rhsFlags)
{
	TECkit_Status	status = kStatus_NoError;
	if (mapping == 0)
		status = kStatus_InvalidMapping;
	else {
		const FileHeader*	fh = reinterpret_cast<const FileHeader*>(mapping);
		FileHeader			header;
		if (READ(fh->type) == kMagicNumberCmp) {
			if (mappingSize <= 2 * sizeof(UInt32)) {
				// no compressed data follows the two leading words; without this
				// check the size subtraction below would wrap around
				status = kStatus_InvalidMapping;
			}
			else {
				// compressed mapping, so we need to decompress enough of it to read the flags
				unsigned long	uncompressedLen = sizeof(FileHeader);
				int	result = uncompress(reinterpret_cast<Byte*>(&header), &uncompressedLen, mapping + 2 * sizeof(UInt32), mappingSize - 2 * sizeof(UInt32));
				// the destination only has room for the header, so the expected
				// outcome is "output buffer too small", not success
				if (result != Z_BUF_ERROR)
					status = kStatus_InvalidMapping;
				fh = &header;
			}
		}
		if (status == kStatus_NoError && READ(fh->type) == kMagicNumber) {
			if ((READ(fh->version) & 0xFFFF0000) > (kCurrentFileVersion & 0xFFFF0000))
				status = kStatus_BadMappingVersion;
			else {
				*lhsFlags = READ(fh->formFlagsLHS);
				*rhsFlags = READ(fh->formFlagsRHS);
			}
		}
		else
			status = kStatus_InvalidMapping;
	}
	return status;
}

// Retrieves a name straight from a mapping (raw or compressed) without
// creating a converter.
// *nameLength receives the full length of the name; at most bufferSize bytes
// of it are copied into nameBuffer.  Returns kStatus_InvalidMapping for a
// null, unrecognized or undecodable mapping, kStatus_OutOfMemory when the
// temporary header buffer cannot be allocated, kStatus_BadMappingVersion for
// a mapping newer than this library understands, and kStatus_NameNotFound
// when the mapping has no name with this ID.
TECkit_Status
WINAPI
TECkit_GetMappingName(
	Byte*				mapping,
	UInt32				mappingSize,
	UInt16				nameID,
	Byte*				nameBuffer,
	UInt32				bufferSize,
	UInt32*				nameLength)
{
	void*	buf = 0;	// decompressed header + names of a compressed mapping, if any
	TECkit_Status	status = kStatus_NoError;
	if (mapping == 0)
		status = kStatus_InvalidMapping;
	else {
		const FileHeader*	fh = reinterpret_cast<const FileHeader*>(mapping);
		FileHeader			header;
		if (READ(fh->type) == kMagicNumberCmp) {
			if (mappingSize <= 2 * sizeof(UInt32)) {
				// no compressed data follows the two leading words; without this
				// check the size subtractions below would wrap around
				status = kStatus_InvalidMapping;
			}
			else {
				// compressed mapping, so we need to decompress the fixed header to read the headerLength field,
				// and then decompress the complete header to get the names
				unsigned long	uncompressedLen = sizeof(FileHeader);
				int	result = uncompress(reinterpret_cast<Byte*>(&header), &uncompressedLen, mapping + 2 * sizeof(UInt32), mappingSize - 2 * sizeof(UInt32));
				// only part of the data fits, so "buffer too small" is the expected result
				if (result != Z_BUF_ERROR)
					status = kStatus_InvalidMapping;
				else {
					fh = &header;
					uncompressedLen = READ(fh->headerLength);
					if (uncompressedLen < sizeof(FileHeader)) {
						// the complete header cannot be smaller than its fixed part;
						// a shorter buffer would be over-read below
						status = kStatus_InvalidMapping;
					}
					else {
						buf = malloc(uncompressedLen);
						if (buf == 0)
							status = kStatus_OutOfMemory;
						else {
							result = uncompress(static_cast<Byte*>(buf), &uncompressedLen, mapping + 2 * sizeof(UInt32), mappingSize - 2 * sizeof(UInt32));
							if (result != Z_BUF_ERROR)
								status = kStatus_InvalidMapping;
							fh = static_cast<const FileHeader*>(buf);
						}
					}
				}
			}
		}
		// an error recorded above (including kStatus_OutOfMemory) is reported
		// as it is rather than being replaced by kStatus_InvalidMapping
		if (status == kStatus_NoError) {
			if (READ(fh->type) != kMagicNumber)
				status = kStatus_InvalidMapping;
			else if ((READ(fh->version) & 0xFFFF0000) > (kCurrentFileVersion & 0xFFFF0000))
				status = kStatus_BadMappingVersion;
			else {
				const Byte*	namePtr;
				if (getNamePtrFromTable(reinterpret_cast<const Byte*>(fh), nameID, namePtr, *nameLength)) {
					// both operands are 32-bit, so the minimum must be too: a 16-bit
					// variable would silently truncate counts of 65536 or more
					UInt32	copyBytes = *nameLength < bufferSize ? *nameLength : bufferSize;
					if (copyBytes > 0)
						memcpy(nameBuffer, namePtr, copyBytes);
				}
				else
					status = kStatus_NameNotFound;
			}
		}
		if (buf != 0)
			free(buf);
	}
	return status;
}

// Reports the version of the TECkit engine (kCurrentTECkitVersion).
UInt32
WINAPI
TECkit_GetVersion()
{
	const UInt32	engineVersion = kCurrentTECkitVersion;
	return engineVersion;
}