/*------------------------------------------------------------------------
Copyright (C) 2002-2016 SIL International. All rights reserved.
Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.
File: TECkit_Format.h
Responsibility: Jonathan Kew
Last reviewed: Not yet.
Description:
Definitions used in the TECkit binary table format
2006-06-02 jk added support for extended string rules (>255 per initial char)
-------------------------------------------------------------------------*/
#ifndef __TECkit_Format_H__
#define __TECkit_Format_H__
#include "TECkit_Common.h"
/*
File-level identification constants.
The magic numbers are four ASCII characters packed into a 32-bit value,
first character in the most significant byte.
*/
#define kMagicNumber 0x714d6170 /* 'qMap': 0x71 'q', 0x4d 'M', 0x61 'a', 0x70 'p' */
#define kMagicNumberCmp 0x7a516d70 /* 'zQmp' (NOTE(review): presumably identifies a compressed mapping file -- confirm against the engine) */
/*
Version values. Judging by kFileVersion2_1 == 0x00020001, the major version
appears to occupy the upper 16 bits and the minor version the lower 16 bits.
*/
#define kFileVersion2_1 0x00020001 /* version before tables with ExtStringRules */
#define kCurrentFileVersion 0x00030000 /* current version */
#define kTableVersion2 0x00020000
#define kCurrentTableVersion 0x00030000 /* actually, the engine doesn't check this,
it only looks at the file version */
/*
One entry of the names table referenced from FileHeader.
Only the fixed-size prefix is declared here; the name bytes follow it
directly in the file and are described in the trailing comment.
*/
struct NameRec {
	UInt16 nameID; /* identifier of this name */
	UInt16 nameLength; /* number of bytes of name data that follow */
	/*
	Byte data[nameLength];
	pad to 2-byte boundary
	*/
};
/* C needs an explicit typedef to use the bare name, as for the other record types in this header */
#ifndef __cplusplus
typedef struct NameRec NameRec;
#endif
/*
Header found at the start of a TECkit binary mapping file.
The eight fixed fields declared here are followed by variable-length
offset arrays and the name records; that trailing layout cannot be
expressed as a C declaration, so it is shown inside "#if 0" purely as
documentation.
*/
struct FileHeader {
	UInt32 type; /* magic number = 'qMap' (kMagicNumber) */
	UInt32 version; /* version = kCurrentFileVersion */
	UInt32 headerLength; /* length of this header including offset arrays and name records */
	UInt32 formFlagsLHS; /* flags for normalization form, Unicode/byte encoding on LHS of mapping */
	UInt32 formFlagsRHS; /* flags for normalization form, Unicode/byte encoding on RHS of mapping */
	UInt32 numNames; /* number of strings in the names table */
	UInt32 numFwdTables; /* number of tables in forward pipeline */
	UInt32 numRevTables; /* number of tables in reverse pipeline */
#if 0
	/* documentation only: variable-length data that follows the fixed fields */
	UInt32 nameOffsets[numNames]; /* offsets from FileHeader to each NameRec */
	UInt32 fwdBase[numFwdTables]; /* offsets from FileHeader to forward tables */
	UInt32 revBase[numRevTables]; /* offsets from FileHeader to reverse tables */
	NameRec names[numNames]; /* the name records */
#endif
};
/* C needs an explicit typedef to use the bare name, as for the other record types in this header */
#ifndef __cplusplus
typedef struct FileHeader FileHeader;
#endif
/*
Header of one mapping table in the forward or reverse pipeline.
For the normalization table types ('NFC ', 'NFD ') only the "type" field is
meaningful: as noted below, no additional header fields are present.
All "...Base" fields and stringRuleData are byte offsets measured from the
start of this TableHeader.
*/
struct TableHeader {
UInt32 type; /* type = 'B->B', 'B->U', 'U->B', 'U->U' (see kTableType_...) */
/* or type = 'NFC ', 'NFD ', and no additional header fields are present */
UInt32 version; /* version = kCurrentTableVersion */
UInt32 length; /* total length of this table */
UInt32 flags; /* flags (see kTableFlags_...):
0x00000001: supplementary-plane Unicode characters supported in mapping and classes
0x00000002: DBCS support (BB/BU tables only) in lookup table
*/
UInt32 pageBase; /* offset from table header to page table (Ux tables) or dbcsPage table (Bx tables) */
UInt32 lookupBase; /* offset from table header to lookup table(s) */
UInt32 matchClassBase; /* offset from table header to match class definitions */
UInt32 repClassBase; /* offset from table header to replacement class definitions */
UInt32 stringListBase; /* offset from table header to string rule lists */
UInt32 stringRuleData; /* offset from table header to string rule data */
/* the four limits below are counted in code units, not bytes */
UInt8 maxMatch; /* max number of input code units matched by a rule */
UInt8 maxPre; /* max number of input code units matched by pre-context */
UInt8 maxPost; /* max number of input code units matched by post-context */
UInt8 maxOutput; /* max number of output code units generated by a rule */
UInt32 replacementChar; /* default output for unmapped codes */
};
/* C needs an explicit typedef to use the bare name; C++ provides it implicitly */
#ifndef __cplusplus
typedef struct TableHeader TableHeader;
#endif
/*
Values for TableHeader.type: four ASCII characters packed into 32 bits,
first character in the most significant byte (0x42 'B', 0x55 'U',
0x2d '-', 0x3e '>').
*/
#define kTableType_BB 0x422d3e42 /* 'B->B' */
#define kTableType_BU 0x422d3e55 /* 'B->U' */
#define kTableType_UB 0x552d3e42 /* 'U->B' */
#define kTableType_UU 0x552d3e55 /* 'U->U' */
#define kTableType_NFC 0x4e464320 /* 'NFC ' (trailing space) */
#define kTableType_NFD 0x4e464420 /* 'NFD ' (trailing space) */
/* Bit values for TableHeader.flags */
#define kTableFlags_Supplementary 0x0001 /* supplementary-plane Unicode characters supported in mapping and classes */
#define kTableFlags_DBCS 0x0002 /* DBCS support (BB/BU tables only) in lookup table */
/*
One 32-bit entry of a table's lookup table. The same four bytes are
interpreted in one of three ways, depending on the table type and on
whether string rules apply to the code being looked up.
Note that rules.type and bytes.count are both the first byte of the entry,
so the values 0x00-0x03 of "type" are the byte counts of a direct lookup.
*/
union Lookup {
/* for any table when string rules are used */
struct {
UInt8 type; /*
0xff: use string rules
0xfe: illegal DBCS trailing byte
0xfd: unmapped character: copy (BB/UU) or output default (UB/BU)
0x00-0x03: direct lookup
(see kLookupType_...; NOTE(review): values matching
kLookupType_ExtStringRules under kLookupType_RuleTypeMask
presumably select extended string rules -- confirm against the engine)
*/
UInt8 ruleCount; /* number of rules for this code */
UInt16 ruleIndex; /* index into stringList of start of rule list for this code */
} rules;
/* for UB and BB tables with direct byte output */
struct {
UInt8 count; /* count of bytes present in data[]: 0-3 */
UInt8 data[3]; /* output bytes; only the first "count" are meaningful */
} bytes;
/* for BU and UU tables with direct Unicode output */
UInt32 usv; /* unicode scalar value */
};
/* C needs an explicit typedef to use the bare name; C++ provides it implicitly */
#ifndef __cplusplus
typedef union Lookup Lookup;
#endif
/* Special values of Lookup.rules.type */
#define kLookupType_StringRules 0xff /* use string rules */
#define kLookupType_IllegalDBCS 0xfe /* illegal DBCS trailing byte */
#define kLookupType_Unmapped 0xfd /* unmapped character: copy (BB/UU) or output default (UB/BU) */
/*
Masks for extended string rules (more than 255 rules per initial character).
NOTE(review): presumably an entry uses extended string rules when
(type & kLookupType_RuleTypeMask) == kLookupType_ExtStringRules, with
(type & kLookupType_ExtRuleCountMask) supplying additional high-order bits
of the rule count beyond the 8-bit ruleCount field -- confirm against the engine.
*/
#define kLookupType_RuleTypeMask 0xc0
#define kLookupType_ExtStringRules 0x80
#define kLookupType_ExtRuleCountMask 0x3f
/*
/rules.ruleIndex/ is the index, within the string rule lists at stringListBase, of the
first of /rules.ruleCount/ UInt32 values; these values are the offsets from stringRuleData
to each rule to test for this character
*/
/*
Fixed-size prefix of one string rule within the string rule data.
The four length bytes give the element counts of the variable-length arrays
that follow the prefix; those arrays cannot be declared in C, so they are
shown inside "#if 0" purely as documentation. Note that the arrays follow in
the order match, post-context, pre-context, replacement -- the same order as
the length fields.
*/
struct StringRule {
UInt8 matchLength; /* length of match string in matchElements */
UInt8 postLength; /* length of post-context in matchElements */
UInt8 preLength; /* length of pre-context in matchElements */
UInt8 repLength; /* length of replacement string in repElements */
#if 0
/* documentation only: variable-length data that follows the fixed fields */
MatchElem matchString[];
MatchElem postContext[];
MatchElem preContext[]; /* reversed */
RepElem repString[];
#endif
};
/* C needs an explicit typedef to use the bare name; C++ provides it implicitly */
#ifndef __cplusplus
typedef struct StringRule StringRule;
#endif
/*
One 32-bit element of a rule's match string or context.
"flags" and "value" are two views of the same four bytes: "flags" exposes the
repeat count and type byte, while "value" exposes the payload appropriate to
the element type. The "reserved" members at the start of each "value" variant
are placeholders for the bytes that "flags" uses.
*/
union MatchElem {
#ifdef __cplusplus
/* empty constructor: leaves the members uninitialized */
MatchElem()
{ }
#endif
struct {
UInt8 repeat; /* repeat count: (min << 4) + max, i.e. min in the high nibble, max in the low nibble */
UInt8 type; /*
0x80: negate flag (not allowed with group)
0x40: non-literal flag--if set, bits 0x3f indicate specific type (value must not be zero)
Note that if 'non-literal' flag is NOT set, remaining bits are not used as type code
but are part of a USV value (or must be set to zero for literal byte data).
(see kMatchElem_...)
*/
UInt16 reserved; /* placeholder for the bytes holding the type-specific payload in "value" */
} flags;
union {
/* begin-group element */
struct {
UInt16 reserved;
UInt8 dNext; /* offset to following OR or EGroup element */
UInt8 dAfter; /* offset to element after the group for BGroup */
} bgroup;
/* end-group element */
struct {
UInt16 reserved;
UInt8 dNext; /* offset to following OR or EGroup element (for OR only) */
UInt8 dStart; /* reverse offset to corresponding BGroup */
} egroup; /* (also used for OR elements) */
/* character class match */
struct {
UInt16 reserved;
UInt16 index; /* index of character class */
} cls;
/* literal byte match */
struct {
UInt8 reserved[3];
UInt8 data; /* literal byte */
} byte;
/* literal Unicode match */
struct {
UInt32 data; /* literal Unicode scalar: must mask with kUSVMask, as top bits overlap flags.repeat and "negate" bit in flags.type */
} usv;
} value;
};
/* C needs an explicit typedef to use the bare name; C++ provides it implicitly */
#ifndef __cplusplus
typedef union MatchElem MatchElem;
#endif
/* Bit flags within MatchElem.flags.type */
#define kMatchElem_Negate 0x80 /* negated test */
#define kMatchElem_NonLit 0x40 /* test value is not a literal character; need to check type */
#define kMatchElem_TypeMask 0x3f /* Mask for type value. Note that type 0 must not be used (=literal) */
/* Type codes, found under kMatchElem_TypeMask when kMatchElem_NonLit is set */
#define kMatchElem_Type_Class 0x01 /* class match */
#define kMatchElem_Type_BGroup 0x02 /* begin group */
#define kMatchElem_Type_EGroup 0x03 /* end group */
#define kMatchElem_Type_OR 0x04 /* special code: OR */
#define kMatchElem_Type_ANY 0x05 /* special code: ANY */
#define kMatchElem_Type_EOS 0x06 /* special code: EOS */
#define kMatchElem_Type_Copy 0x07 /* copy matched item (invalid; for internal compiler use) */
/* Low 21 bits: extracts a literal Unicode scalar value from a 32-bit element whose upper bits carry flags */
#define kUSVMask 0x001fffff
/*
One 32-bit element of a rule's replacement string.
"flags" and "value" are two views of the same four bytes; flags.type selects
how the element is to be interpreted.
*/
union RepElem {
struct {
UInt8 type; /* see kRepElem_... below */
UInt8 matchIndex; /* index of corresponding item in matchString for type == kRepElem_Class or kRepElem_Copy */
UInt16 repClass; /* repClass if type == kRepElem_Class */
} flags;
UInt32 value; /* literal value (mask with kUSVMask) if flags.type == kRepElem_Literal */
};
/* C needs an explicit typedef to use the bare name; C++ provides it implicitly */
#ifndef __cplusplus
typedef union RepElem RepElem;
#endif
/*
Values for RepElem.flags.type.
Class and Copy deliberately reuse the numeric codes of the corresponding
kMatchElem_Type_... constants.
*/
#define kRepElem_Literal 0x00 /* element is a literal value (see RepElem.value) */
#define kRepElem_Class kMatchElem_Type_Class /* uses RepElem.flags.repClass and matchIndex */
#define kRepElem_Copy kMatchElem_Type_Copy /* uses RepElem.flags.matchIndex */
#define kRepElem_Unmapped 0x0f /* used in default terminator rules */
#endif /* __TECkit_Format_H__ */