Package list teckit / debian/2.5.8+ds2-5 source / TECkit_Format.h
debian/2.5.8+ds2-5

Tree @debian/2.5.8+ds2-5 (Download .tar.gz)

TECkit_Format.h @debian/2.5.8+ds2-5raw · history · blame

/*------------------------------------------------------------------------
Copyright (C) 2002-2016 SIL International. All rights reserved.

Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.

File: TECkit_Format.h
Responsibility: Jonathan Kew
Last reviewed: Not yet.

Description:
    Definitions used in the TECkit binary table format

	2006-06-02	jk	added support for extended string rules (>255 per initial char)
-------------------------------------------------------------------------*/

#ifndef __TECkit_Format_H__
#define __TECkit_Format_H__

#include "TECkit_Common.h"

/* Magic numbers for FileHeader.type; each is a four-character ASCII code packed into 32 bits. */
#define kMagicNumber			0x714d6170	/* 'qMap' */
#define kMagicNumberCmp			0x7a516d70	/* 'zQmp' -- NOTE(review): presumably marks a compressed mapping file; confirm against the engine */

/* Values for FileHeader.version; judging by kFileVersion2_1 the encoding is (major << 16) | minor. */
#define kFileVersion2_1			0x00020001	/* version before tables with ExtStringRules */
#define kCurrentFileVersion		0x00030000	/* current version */

/* Values for TableHeader.version. */
#define kTableVersion2			0x00020000
#define kCurrentTableVersion	0x00030000	/* actually, the engine doesn't check this,
												it only looks at the file version */

/*
	Fixed-size prefix of one record in the names table (see the names[] array
	sketched at the end of FileHeader). The name bytes themselves are not
	declared as a member; they follow this prefix directly in the file.

	NOTE(review): unlike the later aggregates in this header, no C typedef is
	provided here, so C code has to write "struct NameRec".
*/
struct NameRec {
	UInt16	nameID;					/* identifier of this name */
	UInt16	nameLength;				/* number of bytes of name data following this prefix */
/*
	Byte	data[nameLength];
	pad to 2-byte boundary
*/
};

/*
	Header at the start of a TECkit binary mapping file.

	Only the fixed-size part is declared as real members. The variable-length
	offset arrays and name records that follow it are sketched in the "#if 0"
	section purely as documentation of the layout; they are never compiled.
	headerLength covers both the fixed part and that variable-length tail.

	NOTE(review): no C typedef is provided for this struct, unlike TableHeader
	and the unions below, so C code has to write "struct FileHeader".
*/
struct FileHeader {
	UInt32	type;					/* magic number = 'qMap' (kMagicNumber) */
	UInt32	version;				/* version = kCurrentFileVersion */
	UInt32	headerLength;			/* length of this header including offset arrays and name records */
	UInt32	formFlagsLHS;			/* flags for normalization form, Unicode/byte encoding on LHS of mapping */
	UInt32	formFlagsRHS;			/* flags for normalization form, Unicode/byte encoding on RHS of mapping */
	UInt32	numNames;				/* number of strings in the names table */
	UInt32	numFwdTables;			/* number of tables in forward pipeline */
	UInt32	numRevTables;			/* number of tables in reverse pipeline */
#if 0
	UInt32	nameOffsets[numNames];	/* offsets from FileHeader to each NameRec */
	UInt32	fwdBase[numFwdTables];	/* offsets from FileHeader to forward tables */
	UInt32	revBase[numRevTables];	/* offsets from FileHeader to reverse tables */
	NameRec	names[numNames];		/* the name records */
#endif
};

/*
	Header of one mapping table in the forward or reverse pipeline.

	The first field, "type", selects the kind of table (see the kTableType_...
	constants). For the normalization types 'NFC ' and 'NFD ' the header stops
	after "type": none of the remaining fields are present in the file, so
	they must not be read for such tables.

	All "...Base" fields and stringRuleData are byte offsets measured from the
	start of this table header.
*/
struct TableHeader {
	UInt32	type;					/* type = 'B->B', 'B->U', 'U->B', 'U->U' */
									/* or type = 'NFC ', 'NFD ', and no additional header fields are present */
	UInt32	version;				/* version = kCurrentTableVersion */
	UInt32	length;					/* total length of this table */
	UInt32	flags;					/* flags (see kTableFlags_...):
										0x00000001:	supplementary-plane Unicode characters supported in mapping and classes
										0x00000002:	DBCS support (BB/BU tables only) in lookup table
									*/
	UInt32	pageBase;				/* offset from table header to page table (Ux tables) or dbcsPage table (Bx tables) */
	UInt32	lookupBase;				/* offset from table header to lookup table(s) */
	UInt32	matchClassBase;			/* offset from table header to match class definitions */
	UInt32	repClassBase;			/* offset from table header to replacement class definitions */
	UInt32	stringListBase;			/* offset from table header to string rule lists */
	UInt32	stringRuleData;			/* offset from table header to string rule data */
	UInt8	maxMatch;				/* max number of input code units matched by a rule */
	UInt8	maxPre;					/* max number of input code units matched by pre-context */
	UInt8	maxPost;				/* max number of input code units matched by post-context */
	UInt8	maxOutput;				/* max number of output code units generated by a rule */
	UInt32	replacementChar;		/* default output for unmapped codes */
};
/* C has no implicit type name for a struct tag; C++ does, so the typedef is C-only. */
#ifndef __cplusplus
typedef struct TableHeader		TableHeader;
#endif

#define kTableType_BB				0x422d3e42
#define kTableType_BU				0x422d3e55
#define kTableType_UB				0x552d3e42
#define kTableType_UU				0x552d3e55

#define kTableType_NFC				0x4e464320
#define kTableType_NFD				0x4e464420

#define	kTableFlags_Supplementary	0x0001
#define	kTableFlags_DBCS			0x0002

/*
	One entry of a table's lookup table. The three members are alternative
	views of the same storage:

		rules	- the entry refers to a list of string rules, or is one of the
				  special markers (illegal DBCS trail byte, unmapped);
		bytes	- the entry holds up to three output bytes directly;
		usv		- the entry holds an output Unicode scalar value directly.

	rules.type and bytes.count are both the first byte of the entry, which is
	why the values 0x00-0x03 in the "type" list below mean "direct lookup":
	they are a byte count rather than a special code.

	NOTE(review): how the type byte is told apart from a usv value is not
	visible here; it presumably relies on the table being stored big-endian,
	so that the type byte is the (always zero) top byte of a scalar value --
	confirm against the engine.
*/
union Lookup {
	/* for any table when string rules are used */
	struct {
		UInt8	type;				/*
										0xff: use string rules
										0xfe: illegal DBCS trailing byte
										0xfd: unmapped character: copy (BB/UU) or output default (UB/BU)
										0x00-0x03: direct lookup
										(see kLookupType_... for these and the extended string rule encoding)
									*/
		UInt8	ruleCount;			/* number of rules for this code */
		UInt16	ruleIndex;			/* index into stringList of start of rule list for this code */
	}	rules;
	/* for UB and BB tables with direct byte output */
	struct {
		UInt8	count;				/* count of bytes present in data[]: 0-3 */
		UInt8	data[3];			/* the output bytes; only the first "count" are meaningful */
	}	bytes;
	/* for BU and UU tables with direct Unicode output */
	UInt32		usv;				/* unicode scalar value */
};
/* C has no implicit type name for a union tag; C++ does, so the typedef is C-only. */
#ifndef __cplusplus
typedef union Lookup			Lookup;
#endif

/* Special values of Lookup.rules.type. */
#define kLookupType_StringRules		0xff	/* use string rules */
#define kLookupType_IllegalDBCS		0xfe	/* illegal DBCS trailing byte */
#define kLookupType_Unmapped		0xfd	/* unmapped character: copy (BB/UU) or output default (UB/BU) */

/*
	Bit fields of Lookup.rules.type used for extended string rules (more than
	255 rules per initial character).
	NOTE(review): presumably an entry is "extended" when
	(type & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules, and the
	bits selected by kLookupType_ExtRuleCountMask then supply the high-order
	part of the rule count on top of rules.ruleCount -- confirm against the
	engine before relying on this.
*/
#define kLookupType_RuleTypeMask		0xc0
#define kLookupType_ExtStringRules		0x80
#define kLookupType_ExtRuleCountMask	0x3f

/*
	/rules.ruleIndex/ is the index, within the string rule lists found at stringListBase,
	of an array of /rules.ruleCount/ UInt32 values which are the offsets
	from stringRuleData to each rule to test for this character
*/

/*
	Fixed-size prefix of one string rule in the string rule data.

	The four length bytes give the element counts of the four variable-length
	arrays that follow the prefix in the file, in the order shown in the
	"#if 0" sketch: match string, post-context, pre-context (stored reversed),
	then the replacement string. Note that the length fields are declared in
	that same order (match, post, pre, rep), not in pre/match/post order.
	The sketch is documentation only and is never compiled.
*/
struct StringRule {
	UInt8	matchLength;			/* length of match string in matchElements */
	UInt8	postLength;				/* length of post-context in matchElements */
	UInt8	preLength;				/* length of pre-context in matchElements */
	UInt8	repLength;				/* length of replacement string in repElements */
#if 0
	MatchElem	matchString[];
	MatchElem	postContext[];
	MatchElem	preContext[];		/* reversed */
	RepElem		repString[];
#endif
};
/* C has no implicit type name for a struct tag; C++ does, so the typedef is C-only. */
#ifndef __cplusplus
typedef struct StringRule		StringRule;
#endif

/*
	One element of a rule's match string or context.

	"flags" and "value" are two views of the same storage. "flags" exposes
	the repeat count and the type/negate bits in the first two bytes; "value"
	exposes the payload, whose meaning depends on the element type (see the
	kMatchElem_... constants):

		value.bgroup	- begin-group element
		value.egroup	- end-group element, also used for OR elements
		value.cls		- character class match
		value.byte		- literal byte
		value.usv		- literal Unicode scalar value

	In every payload struct except usv, the leading "reserved" bytes are the
	space occupied by flags.repeat and flags.type. The usv payload instead
	uses the whole word, so its top bits overlap those flag bytes and the
	value must be masked with kUSVMask before use.
*/
union MatchElem {
#ifdef __cplusplus
				/* empty user-provided default constructor (C++ only); it initializes no member */
				MatchElem()
					{ }
#endif
	struct {
		UInt8	repeat;				/* repeat count: (min << 4) + max */
		UInt8	type;				/* 
										0x80:	negate flag (not allowed with group)
										0x40:	non-literal flag--if set, bits 0x3f indicate specific type (value must not be zero)
												Note that if 'non-literal' flag is NOT set, remaining bits are not used as type code
												but are part of a USV value (or must be set to zero for literal byte data).
									*/
		UInt16	reserved;			/* space occupied by the payload in the "value" view */
	}	flags;
	union {
		struct {
			UInt16		reserved;
			UInt8		dNext;		/* offset to following OR or EGroup element */
			UInt8		dAfter;		/* offset to element after the group for BGroup */
		}	bgroup;
		struct {
			UInt16		reserved;
			UInt8		dNext;		/* offset to following OR or EGroup element (for OR only) */
			UInt8		dStart;		/* reverse offset to corresponding BGroup */
		}	egroup;					/* (also used for OR elements) */
		struct {
			UInt16		reserved;
			UInt16		index;		/* index of character class */
		}	cls;
		struct {
			UInt8		reserved[3];
			UInt8		data;		/* literal byte */
		}	byte;
		struct {
			UInt32		data;		/* literal Unicode scalar: must mask with kUSVMask, as top bits overlap flags.repeat and "negate" bit in flags.type */
		}	usv;
	}	value;
};
/* C has no implicit type name for a union tag; C++ does, so the typedef is C-only. */
#ifndef __cplusplus
typedef union MatchElem			MatchElem;
#endif

/* Flag bits of MatchElem.flags.type. */
#define	kMatchElem_Negate			0x80	/* negated test */
#define kMatchElem_NonLit			0x40	/* test value is not a literal character; need to check type */

/* Element type codes, valid in the kMatchElem_TypeMask bits only when kMatchElem_NonLit is set. */
#define kMatchElem_TypeMask			0x3f	/* Mask for type value. Note that type 0 must not be used (=literal) */
#define	kMatchElem_Type_Class		0x01	/* class match */
#define kMatchElem_Type_BGroup		0x02	/* begin group */
#define kMatchElem_Type_EGroup		0x03	/* end group */
#define	kMatchElem_Type_OR			0x04	/* special code: OR */
#define	kMatchElem_Type_ANY			0x05	/* special code: ANY */
#define	kMatchElem_Type_EOS			0x06	/* special code: EOS */
#define kMatchElem_Type_Copy		0x07	/* copy matched item (invalid; for internal compiler use) */

/* Mask selecting the 21 low-order bits that hold a literal Unicode scalar value in MatchElem and RepElem. */
#define kUSVMask					0x001fffff

/*
	One element of a rule's replacement string.

	"flags" and "value" are two views of the same storage. flags.type (see
	the kRepElem_... constants) says how to interpret the element: as a
	literal, in which case "value" masked with kUSVMask is the output, or as
	a class/copy reference, in which case matchIndex (and, for a class,
	repClass) identify what to output.
*/
union RepElem {
	struct {
		UInt8	type;				/* see kRepElem_... below */
		UInt8	matchIndex;			/* index of corresponding item in matchString for type == kRepElem_Class or kRepElem_Copy */
		UInt16	repClass;			/* repClass if type == kRepElem_Class */
	}	flags;
	UInt32	value;					/* literal value (mask with kUSVMask) if flags.type == kRepElem_Literal */
};
/* C has no implicit type name for a union tag; C++ does, so the typedef is C-only. */
#ifndef __cplusplus
typedef union RepElem			RepElem;
#endif

/* Values for RepElem.flags.type; Class and Copy deliberately share their codes with the matching kMatchElem_Type_... values. */
#define kRepElem_Literal			0x00	/* literal output: use RepElem.value masked with kUSVMask */
#define kRepElem_Class				kMatchElem_Type_Class	/* output via replacement class repClass, keyed by item matchIndex */
#define kRepElem_Copy				kMatchElem_Type_Copy	/* copy the matched item at matchIndex */
#define kRepElem_Unmapped			0x0f	/* used in default terminator rules */

#endif	/* __TECkit_Format_H__ */