// -*- coding:unix; mode:c++; tab-width:4; c-basic-offset:4; indent-tabs-mode:nil -*-
/*------------------------------------------------------------------------------
VnConv: Vietnamese Encoding Converter Library
UniKey Project: http://unikey.sourceforge.net
Copyleft (C) 1998-2002 Pham Kim Long
Contact: longp@cslab.felk.cvut.cz
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
--------------------------------------------------------------------------------*/
#include <stddef.h>
#include <search.h>
#include <memory.h>
#include <ctype.h>
#include <stdlib.h>
#include "charset.h"
#include "data.h"
int LoVowel['z'-'a'+1];
int HiVowel['Z'-'A'+1];
#define IS_VOWEL(x) ((x >= 'a' && x <= 'z' && LoVowel[x-'a']) || (x >= 'A' && x <= 'Z' && HiVowel[x-'A']))
SingleByteCharset *SgCharsets[CONV_TOTAL_SINGLE_CHARSETS];
DoubleByteCharset *DbCharsets[CONV_TOTAL_DOUBLE_CHARSETS];
DllExport CVnCharsetLib VnCharsetLibObj;
//////////////////////////////////////////////////////
// Generic VnCharset class
//////////////////////////////////////////////////////
int VnCharset::elementSize()
{
return 1;
}
//-------------------------------------------
int VnInternalCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
if (!is.getNextDW(stdChar)) {
bytesRead = 0;
return 0;
}
bytesRead = sizeof(UKDWORD);
return 1;
}
//-------------------------------------------
int VnInternalCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
outLen = sizeof(StdVnChar);
os.putW((UKWORD)stdChar);
return os.putW((UKWORD)(stdChar>>(sizeof(UKWORD)*8)));
}
//-------------------------------------------
int VnInternalCharset::elementSize()
{
return 4;
}
//-------------------------------------------
SingleByteCharset::SingleByteCharset(unsigned char * vnChars)
{
int i;
m_vnChars = vnChars;
memset(m_stdMap, 0, 256*sizeof(UKWORD));
for (i=0; i<TOTAL_VNCHARS; i++) {
if (vnChars[i] != 0 && (i==TOTAL_VNCHARS-1 || vnChars[i] != vnChars[i+1]))
m_stdMap[vnChars[i]] = i + 1;
}
}
//-------------------------------------------
int SingleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch;
if (!is.getNext(ch)) {
bytesRead = 0;
return 0;
}
stdChar = (m_stdMap[ch])? (VnStdCharOffset + m_stdMap[ch] - 1) : ch;
bytesRead = 1;
return 1;
}
//-------------------------------------------
int SingleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
int ret;
unsigned char ch;
if (stdChar >= VnStdCharOffset) {
outLen = 1;
ch = m_vnChars[stdChar - VnStdCharOffset];
if (ch == 0)
ch = (stdChar == StdStartQuote)? PadStartQuote :
((stdChar == StdEndQuote)? PadEndQuote :
((stdChar == StdEllipsis)? PadEllipsis: PadChar) );
ret = os.putB(ch);
}
else {
if (stdChar > 255 || m_stdMap[stdChar]) {
//this character is missing in the charset
// output padding character
outLen = 1;
ret = os.putB(PadChar);
}
else {
outLen = 1;
ret = os.putB((UKBYTE)stdChar);
}
}
return ret;
}
//-------------------------------------------
int wideCharCompare(const void *ele1, const void *ele2)
{
UKWORD ch1 = LOWORD(*((UKDWORD *)ele1));
UKWORD ch2 = LOWORD(*((UKDWORD *)ele2));
return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
}
//-------------------------------------------
UnicodeCharset::UnicodeCharset(UnicodeChar *vnChars)
{
UKDWORD i;
m_toUnicode = vnChars;
for (i=0; i<TOTAL_VNCHARS; i++)
m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for index
qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
}
//-------------------------------------------
int UnicodeCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
UnicodeChar uniCh;
if (!is.getNextW(uniCh)) {
bytesRead = 0;
return 0;
}
bytesRead = sizeof(UnicodeChar);
UKDWORD key = uniCh;
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
if (pChar)
stdChar = VnStdCharOffset + HIWORD(*pChar);
else
stdChar = uniCh;
return 1;
}
//-------------------------------------------
int UnicodeCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
outLen = sizeof(UnicodeChar);
return os.putW((stdChar >= VnStdCharOffset)?
m_toUnicode[stdChar-VnStdCharOffset] : (UnicodeChar)stdChar);
}
//-------------------------------------------
int UnicodeCharset::elementSize()
{
return 2;
}
////////////////////////////////////////
// Unicode decomposed
////////////////////////////////////////
//-------------------------------------------
int uniCompInfoCompare(const void *ele1, const void *ele2)
{
UKDWORD ch1 = ((UniCompCharInfo *)ele1)->compChar;
UKDWORD ch2 = ((UniCompCharInfo *)ele2)->compChar;
return (ch1 == ch2)? 0 : ((ch1 > ch2)? 1 : -1);
}
UnicodeCompCharset::UnicodeCompCharset(UnicodeChar *uniChars, UKDWORD *uniCompChars)
{
int i,k;
m_uniCompChars = uniCompChars;
m_totalChars = 0;
for (i=0; i<TOTAL_VNCHARS; i++) {
m_info[i].compChar = uniCompChars[i];
m_info[i].stdIndex = i;
m_totalChars++;
}
for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
if (uniChars[k] != uniCompChars[k]) {
m_info[i].compChar = uniChars[k];
m_info[i].stdIndex = k;
m_totalChars++;
i++;
}
qsort(m_info, m_totalChars, sizeof(UniCompCharInfo), uniCompInfoCompare);
}
//---------------------------------------------
int UnicodeCompCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
// read first char
UniCompCharInfo key;
UKWORD w;
if (!is.getNextW(w)) {
bytesRead = 0;
return 0;
}
key.compChar = w;
bytesRead = 2;
UniCompCharInfo *pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
sizeof(UniCompCharInfo), uniCompInfoCompare);
if (!pInfo)
stdChar = key.compChar;
else {
stdChar = pInfo->stdIndex + VnStdCharOffset;
if (is.peekNextW(w)) {
UKDWORD hi = w;
if (hi > 0) {
key.compChar += hi << 16;
pInfo = (UniCompCharInfo *)bsearch(&key, m_info, m_totalChars,
sizeof(UniCompCharInfo), uniCompInfoCompare);
if (pInfo) {
stdChar = pInfo->stdIndex + VnStdCharOffset;
bytesRead += 2;
is.getNextW(w);
}
}
}
}
return 1;
}
//---------------------------------------------
int UnicodeCompCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
int ret;
if (stdChar >= VnStdCharOffset) {
UKDWORD uniCompCh = m_uniCompChars[stdChar-VnStdCharOffset];
UKWORD lo = LOWORD(uniCompCh);
UKWORD hi = HIWORD(uniCompCh);
outLen = 2;
ret = os.putW(lo);
if (hi > 0) {
outLen += 2;
ret = os.putW(hi);
}
}
else {
outLen = 2;
ret = os.putW((UKWORD)stdChar);
}
return ret;
}
//-------------------------------------------
int UnicodeCompCharset::elementSize()
{
return 2;
}
////////////////////////////////
// Unicode UTF-8 //
////////////////////////////////
int UnicodeUTF8Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
UKWORD w1, w2, w3;
UKBYTE first, second, third;
UnicodeChar uniCh;
bytesRead = 0;
if (!is.getNext(first))
return 0;
bytesRead = 1;
if (first < 0x80)
uniCh = first; // 1-byte sequence
else if ((first & 0xE0) == 0xC0) {
//2-byte sequence
if (!is.peekNext(second))
return 0;
if ((second & 0xC0) != 0x80) {
stdChar = INVALID_STD_CHAR;
return 1;
}
is.getNext(second);
bytesRead = 2;
w1 = first;
w2 = second;
uniCh = ((w1 & 0x001F) << 6) | (w2 & 0x3F);
}
else if ((first & 0xF0) == 0xE0) {
//3-byte sequence
if (!is.peekNext(second))
return 0;
if ((second & 0xC0) != 0x80) {
stdChar = INVALID_STD_CHAR;
return 1;
}
is.getNext(second);
bytesRead = 2;
if (!is.peekNext(third))
return 0;
if ((third & 0xC0) != 0x80) {
stdChar = INVALID_STD_CHAR;
return 1;
}
is.getNext(third);
bytesRead = 3;
w1 = first;
w2 = second;
w3 = third;
uniCh = ((w1 & 0x000F) << 12) | ((w2 & 0x003F) << 6) | (w3 & 0x003F);
}
else {
stdChar = INVALID_STD_CHAR;
return 1;
}
// translate to StdVnChar
UKDWORD key = uniCh;
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
if (pChar)
stdChar = VnStdCharOffset + HIWORD(*pChar);
else stdChar = uniCh;
return 1;
}
//-------------------------------------------
int UnicodeUTF8Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
UnicodeChar uChar = (stdChar < VnStdCharOffset)?
(UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
int ret;
if (uChar < 0x0080) {
outLen = 1;
ret = os.putB((UKBYTE)uChar);
} else if (uChar < 0x0800) {
outLen = 2;
os.putB(0xC0 | (UKBYTE)(uChar >> 6));
ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
} else {
outLen = 3;
os.putB(0xE0 | (UKBYTE)(uChar >> 12));
os.putB(0x80 | (UKBYTE)((uChar >> 6) & 0x003F));
ret = os.putB(0x80 | (UKBYTE)(uChar & 0x003F));
}
return ret;
}
////////////////////////////////////////
// Unicode character reference &#D; //
////////////////////////////////////////
int hexDigitValue(unsigned char digit)
{
if (digit >= 'a' && digit <= 'f')
return digit-'a'+10;
if (digit >= 'A' && digit <= 'F')
return digit-'A'+10;
if (digit >= '0' && digit <= '9')
return digit-'0';
return 0;
}
//--------------------------------------
int UnicodeRefCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch;
UnicodeChar uniCh;
bytesRead = 0;
if (!is.getNext(ch))
return 0;
bytesRead = 1;
uniCh = ch;
if (ch == '&') {
if (is.peekNext(ch) && ch == '#') {
is.getNext(ch);
bytesRead++;
if (!is.eos()) {
is.peekNext(ch);
if (ch != 'x' && ch != 'X') {
UKWORD code = 0;
int digits = 0;
while (is.peekNext(ch) && isdigit(ch) && digits < 5) {
is.getNext(ch);
bytesRead++;
code = code*10 + (ch - '0');
digits++;
}
if (is.peekNext(ch) && ch == ';') {
is.getNext(ch);
bytesRead++;
uniCh = code;
}
}
else {
is.getNext(ch);
bytesRead++;
UKWORD code = 0;
int digits = 0;
while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
is.getNext(ch);
bytesRead++;
code = (code << 4) + hexDigitValue(ch);
digits++;
}
if (is.peekNext(ch) && ch == ';') {
is.getNext(ch);
bytesRead++;
uniCh = code;
}
} // hex digits
}
}
}
// translate to StdVnChar
UKDWORD key = uniCh;
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
if (pChar)
stdChar = VnStdCharOffset + HIWORD(*pChar);
else stdChar = uniCh;
return 1;
}
//--------------------------------
int UnicodeRefCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
UnicodeChar uChar = (stdChar < VnStdCharOffset)?
(UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
int ret;
if (uChar < 128) {
outLen = 1;
ret = os.putB((UKBYTE)uChar);
}
else {
outLen = 2;
os.putB((UKBYTE)'&');
os.putB((UKBYTE)'#');
int i, digit, prev, base;
prev = 0;
base = 10000;
for (i=0; i < 5; i++) {
digit = uChar / base;
if (digit || prev) {
prev = 1;
outLen++;
os.putB('0' + (unsigned char)digit);
}
uChar %= base;
base /= 10;
}
ret = os.putB((UKBYTE)';');
outLen++;
}
return ret;
}
#define HEX_DIGIT(x) ((x < 10)? ('0'+x) : ('A'+x-10))
//--------------------------------
int UnicodeHexCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
UnicodeChar uChar = (stdChar < VnStdCharOffset)?
(UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
int ret;
if (uChar < 256) {
outLen = 1;
ret = os.putB((UKBYTE)uChar);
}
else {
outLen = 3;
os.putB('&');
os.putB('#');
os.putB('x');
int i, digit;
int prev = 0;
int shifts = 12;
for (i=0; i < 4; i++) {
digit = ((uChar >> shifts) & 0x000F);
if (digit > 0 || prev) {
prev = 1;
outLen++;
os.putB((UKBYTE)HEX_DIGIT(digit));
}
shifts -= 4;
}
ret = os.putB(';');
outLen++;
}
return ret;
}
/////////////////////////////////
// Class UnicodeCStringCharset /
/////////////////////////////////
void UnicodeCStringCharset::startInput()
{
m_prevIsHex = 0;
}
//----------------------------------------
int UnicodeCStringCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch;
UnicodeChar uniCh;
bytesRead = 0;
if (!is.getNext(ch))
return 0;
bytesRead = 1;
uniCh = ch;
if (ch == '\\') {
if (is.peekNext(ch) && (ch=='x' || ch=='X')) {
is.getNext(ch);
bytesRead++;
UKWORD code = 0;
int digits = 0;
while (is.peekNext(ch) && isxdigit(ch) && digits < 4) {
is.getNext(ch);
bytesRead++;
code = (code << 4) + hexDigitValue(ch);
digits++;
}
uniCh = code;
}
}
// translate to StdVnChar
UKDWORD key = uniCh;
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
if (pChar)
stdChar = VnStdCharOffset + HIWORD(*pChar);
else stdChar = uniCh;
return 1;
}
//------------------------------------
int UnicodeCStringCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
UnicodeChar uChar = (stdChar < VnStdCharOffset)?
(UnicodeChar)stdChar : m_toUnicode[stdChar-VnStdCharOffset];
int ret;
if (uChar < 128 && !isxdigit(uChar) && uChar != 'x' && uChar != 'X') {
outLen = 1;
ret = os.putB((UKBYTE)uChar);
}
else {
outLen = 2;
os.putB('\\');
os.putB('x');
int i, digit;
int prev = 0;
int shifts = 12;
for (i=0; i < 4; i++) {
digit = ((uChar >> shifts) & 0x000F);
if (digit > 0 || prev) {
prev = 1;
outLen++;
os.putB((UKBYTE)HEX_DIGIT(digit));
}
shifts -= 4;
}
ret = os.isOK();
m_prevIsHex = 1;
}
return ret;
}
/////////////////////////////////
// Double-byte charsets //
/////////////////////////////////
DoubleByteCharset::DoubleByteCharset(UKWORD *vnChars)
{
m_toDoubleChar = vnChars;
memset(m_stdMap, 0, 256*sizeof(UKWORD));
for (int i=0; i<TOTAL_VNCHARS; i++) {
if (vnChars[i] >> 8) // a 2-byte character
m_stdMap[vnChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
else if (m_stdMap[vnChars[i]] == 0)
m_stdMap[vnChars[i]] = i+1;
m_vnChars[i] = (i << 16) + vnChars[i]; // high word is used for StdChar index
}
qsort(m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
}
//---------------------------------------------
int DoubleByteCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch;
// read first byte
bytesRead = 0;
if (!is.getNext(ch))
return 0;
bytesRead = 1;
stdChar = m_stdMap[ch];
if (stdChar == 0)
stdChar = ch;
else if (stdChar == 0xFFFF)
stdChar = INVALID_STD_CHAR;
else {
stdChar += VnStdCharOffset - 1;
UKBYTE hi;
if (is.peekNext(hi) && hi > 0) {
//test if a double-byte character is encountered
UKDWORD key = MAKEWORD(ch,hi);
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, TOTAL_VNCHARS, sizeof(UKDWORD), wideCharCompare);
if (pChar) {
stdChar = VnStdCharOffset + HIWORD(*pChar);
bytesRead = 2;
is.getNext(hi);
}
}
}
return 1;
}
//---------------------------------------------
int DoubleByteCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
int ret;
if (stdChar >= VnStdCharOffset) {
UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
if (wCh & 0xFF00) {
outLen = 2;
os.putB((UKBYTE)(wCh & 0x00FF));
ret = os.putB((UKBYTE)(wCh >> 8));
}
else {
unsigned char b = (unsigned char)wCh;
if (m_stdMap[b] == 0xFFFF)
b = PadChar;
outLen = 1;
ret = os.putB(b);
}
/*
outLen = 1;
ret = os.putB((UKBYTE)(wCh & 0x00FF));
if (wCh & 0xFF00) {
outLen = 2;
ret = os.putB((UKBYTE)(wCh >> 8));
}
*/
}
else {
if (stdChar > 255 || m_stdMap[stdChar]) {
outLen = 1;
ret = os.putB((UKBYTE)PadChar);
}
else {
outLen = 1;
ret = os.putB((UKBYTE)stdChar);
}
}
return ret;
}
/////////////////////////////////////////////
// Class: VIQRCharset //
/////////////////////////////////////////////
unsigned char VIQRTones[] = {'\'','`','?','~','.'};
const char *VIQREscapes[] = {
"://",
"/",
"@",
"mailto:",
"email:",
"news:",
"www",
"ftp"
};
const int VIQREscCount = sizeof(VIQREscapes) / sizeof(char*);
VIQRCharset::VIQRCharset(UKDWORD *vnChars)
{
memset(m_stdMap, 0, 256*sizeof(UKWORD));
int i;
UKDWORD dw;
m_vnChars = vnChars;
for (i=0; i<TOTAL_VNCHARS; i++) {
dw = m_vnChars[i];
if (!(dw & 0xffffff00)) { //single byte
//ch = (unsigned char)(dw & 0xff);
m_stdMap[dw] = i+256;
}
}
// set offset from base characters according to tone marks
m_stdMap[(unsigned char)'\''] = 2;
m_stdMap[(unsigned char)'`'] = 4;
m_stdMap[(unsigned char)'?'] = 6;
m_stdMap[(unsigned char)'~'] = 8;
m_stdMap[(unsigned char)'.'] = 10;
m_stdMap[(unsigned char)'^'] = 12;
m_stdMap[(unsigned char)'('] = 24;
m_stdMap[(unsigned char)'+'] = 26;
m_stdMap[(unsigned char)'*'] = 26;
}
//---------------------------------------------------
void VIQRCharset::startInput()
{
m_suspicious = 0;
m_atWordBeginning = 1;
m_gotTone = 0;
m_escAll = 0;
if (VnCharsetLibObj.m_options.viqrEsc)
VnCharsetLibObj.m_VIQREscPatterns.reset();
}
//---------------------------------------------------
int VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch1;
bytesRead = 0;
if (!is.getNext(ch1))
return 0;
bytesRead = 1;
stdChar = m_stdMap[ch1];
if (VnCharsetLibObj.m_options.viqrEsc) {
if (VnCharsetLibObj.m_VIQREscPatterns.foundAtNextChar(ch1)!=-1) {
m_escAll = 1;
}
}
if (m_escAll && (ch1==' ' || ch1=='\t' || ch1=='\r' || ch1=='\n'))
m_escAll = 0;
if (ch1 == '\\') {
// ecape character , try to read next
if (!is.getNext(ch1)) {
bytesRead++;
stdChar = m_stdMap[ch1];
}
}
if (stdChar < 256) {
stdChar = ch1;
}
else if (!m_escAll && !is.eos()) {
// try to read the next byte
unsigned char ch2;
is.peekNext(ch2);
unsigned char upper = toupper(ch1);
if ((!VnCharsetLibObj.m_options.smartViqr || m_atWordBeginning) &&
upper == 'D' && (ch2 == 'd' || ch2 == 'D'))
{
is.getNext(ch2);
bytesRead++;
stdChar += 2; // dd is 2 positions after d.
}
else {
StdVnChar index = m_stdMap[ch2];
int cond;
if (m_suspicious) {
cond = IS_VOWEL(ch1) &&
( index == 2 || index == 4 || index == 8 || //not accepting ? . in suspicious mode
(index == 12 && (upper == 'A' || upper == 'E' || upper == 'O')) ||
(m_stdMap[ch2] == 24 && upper== 'A') ||
(m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
if (cond)
m_suspicious = 0;
}
else
cond = IS_VOWEL(ch1) &&
((index <= 10 && index > 0 && (!m_gotTone || (index!=6 && index!=10)) ) ||
(index == 12 && (upper == 'A' || upper == 'E' || upper == 'O')) ||
(m_stdMap[ch2] == 24 && upper== 'A') ||
(m_stdMap[ch2] == 26 && (upper == 'O' || upper == 'U')) );
if (cond) {
if (index > 0)
m_gotTone = 1; //we have a tone/breve/hook in the current word
// ok, take this byte
is.getNext(ch2);
bytesRead++;
int offset = m_stdMap[ch2];
if (offset == 26) offset = 24;
if (offset == 24 && (ch1 == 'u' || ch1 == 'U'))
offset = 12;
stdChar += offset;
// check next byte
if (is.peekNext(ch2)) {
if (index > 10 && m_stdMap[ch2] > 0 && m_stdMap[ch2] <= 10) {
// ok, take one more byte
is.getNext(ch2);
bytesRead++;
stdChar += m_stdMap[ch2];
}
}
}
}
}
m_atWordBeginning = (stdChar < 256);
if (stdChar < 256) {
m_gotTone = 0; //reset this flag because we are at the beginning of a new word
}
// adjust stdChar
if (stdChar >= 256)
stdChar += VnStdCharOffset - 256;
return 1;
}
//---------------------------------------------------
void VIQRCharset::startOutput()
{
m_escapeBowl = 0;
m_escapeRoof = 0;
m_escapeHook = 0;
m_escapeTone = 0;
m_noOutEsc = 0;
VnCharsetLibObj.m_VIQROutEscPatterns.reset();
}
//---------------------------------------------------
int VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
int ret;
UKBYTE b;
if (stdChar >= VnStdCharOffset) {
outLen = 1;
UKDWORD dw = m_vnChars[stdChar-VnStdCharOffset];
unsigned char first = (unsigned char)dw;
unsigned char firstUpper = toupper(first);
b = (UKBYTE)dw;
ret = os.putB(b);
if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
m_noOutEsc = 1;
if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
m_noOutEsc = 0;
if (dw & 0x0000FF00) {
// second byte is present
unsigned char second = (UKBYTE)(dw >> 8);
outLen++;
ret = os.putB(second);
if (dw & 0x00FF0000) {
//third byte is present
outLen++;
ret = os.putB((UKBYTE)(dw >> 16));
m_escapeTone = 0;
}
else {
UKWORD index = m_stdMap[second];
m_escapeTone = (index == 12 || index == 24 || index == 26);
}
VnCharsetLibObj.m_VIQROutEscPatterns.reset();
m_escapeBowl = 0;
m_escapeHook = 0;
m_escapeRoof = 0;
}
else {
m_escapeTone = IS_VOWEL(first);
m_escapeBowl = (firstUpper == 'A');
m_escapeHook = (firstUpper == 'U' || firstUpper == 'O');
m_escapeRoof = (firstUpper == 'A' || firstUpper == 'E' || firstUpper == 'O');
}
}
else {
if (stdChar > 255) {
outLen = 1;
ret = os.putB((UKBYTE)PadChar);
if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar((UKBYTE)PadChar) != -1)
m_noOutEsc = 1;
}
else {
outLen = 1;
UKWORD index = m_stdMap[stdChar];
if (!VnCharsetLibObj.m_options.viqrMixed && !m_noOutEsc &&
(stdChar=='\\' ||
(index > 0 && index <= 10 && m_escapeTone) ||
(index == 12 && m_escapeRoof) ||
(index == 24 && m_escapeBowl) ||
(index == 26 && m_escapeHook))) {
//(m_stdMap[stdChar] > 0 && m_stdMap[stdChar] <= 26)) {
// tone mark, needs an escape character
outLen++;
ret = os.putB('\\');
if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar('\\') != -1)
m_noOutEsc = 1;
}
b = (UKBYTE)stdChar;
ret = os.putB(b);
if (VnCharsetLibObj.m_VIQROutEscPatterns.foundAtNextChar(b) != -1)
m_noOutEsc = 1;
if (m_noOutEsc && (b==' ' || b=='\t' || b=='\r' || b=='\n'))
m_noOutEsc = 0;
}
// reset escape marks
m_escapeBowl = 0;
m_escapeRoof = 0;
m_escapeHook = 0;
m_escapeTone = 0;
}
return ret;
}
/////////////////////////////////////////////
// Class: UTF8VIQRCharset //
/////////////////////////////////////////////
//-----------------------------------------
UTF8VIQRCharset::UTF8VIQRCharset(UnicodeUTF8Charset *pUtf, VIQRCharset *pViqr)
{
m_pUtf = pUtf;
m_pViqr = pViqr;
}
//-----------------------------------------
void UTF8VIQRCharset::startInput()
{
m_pUtf->startInput();
m_pViqr->startInput();
}
//-----------------------------------------
void UTF8VIQRCharset::startOutput()
{
m_pUtf->startOutput();
m_pViqr->startOutput();
}
//-----------------------------------------
int UTF8VIQRCharset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
UKBYTE ch;
if (!is.peekNext(ch))
return 0;
if (ch > 0xBF && ch < 0xFE) {
m_pViqr->startInput(); // just to reset the VIQR object state
m_pViqr->m_suspicious = 1;
return m_pUtf->nextInput(is, stdChar, bytesRead);
}
return m_pViqr->nextInput(is, stdChar, bytesRead);
}
//-----------------------------------------
int UTF8VIQRCharset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
return m_pViqr->putChar(os, stdChar, outLen);
}
//-----------------------------------------
CVnCharsetLib::CVnCharsetLib()
{
unsigned char ch;
for (ch = 'a'; ch < 'z'; ch++)
LoVowel[ch-'a'] = 0;
LoVowel['a'-'a'] = 1;
LoVowel['e'-'a'] = 1;
LoVowel['i'-'a'] = 1;
LoVowel['o'-'a'] = 1;
LoVowel['u'-'a'] = 1;
LoVowel['y'-'a'] = 1;
for (ch = 'A'; ch < 'Z'; ch++)
HiVowel[ch-'A'] = 0;
HiVowel['A'-'A'] = 1;
HiVowel['E'-'A'] = 1;
HiVowel['I'-'A'] = 1;
HiVowel['O'-'A'] = 1;
HiVowel['U'-'A'] = 1;
HiVowel['Y'-'A'] = 1;
m_pUniCharset = NULL;
m_pUniCompCharset = NULL;
m_pUniUTF8 = NULL;
m_pUniRef = NULL;
m_pUniHex = NULL;
m_pVIQRCharObj = NULL;
m_pUVIQRCharObj = NULL;
m_pWinCP1258 = NULL;
m_pVnIntCharset = NULL;
int i;
for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
m_sgCharsets[i] = NULL;
for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
m_dbCharsets[i] = NULL;
VnConvResetOptions(&m_options);
m_VIQREscPatterns.init((char**)VIQREscapes, VIQREscCount);
m_VIQROutEscPatterns.init((char**)VIQREscapes, VIQREscCount);
}
//-----------------------------------------
CVnCharsetLib::~CVnCharsetLib()
{
if (m_pUniCharset)
delete m_pUniCharset;
if (m_pUniUTF8)
delete m_pUniUTF8;
if (m_pUniRef)
delete m_pUniRef;
if (m_pUniHex)
delete m_pUniHex;
if (m_pVIQRCharObj)
delete m_pVIQRCharObj;
if (m_pUVIQRCharObj)
delete m_pUVIQRCharObj;
if (m_pWinCP1258)
delete m_pWinCP1258;
if (m_pUniCString)
delete m_pUniCString;
if (m_pVnIntCharset)
delete m_pVnIntCharset;
int i;
for (i = 0; i < CONV_TOTAL_SINGLE_CHARSETS; i++)
if (m_sgCharsets[i]) delete m_sgCharsets[i];
for (i = 0; i < CONV_TOTAL_DOUBLE_CHARSETS; i++)
if (m_dbCharsets[i]) delete m_dbCharsets[i];
}
//-----------------------------------------
VnCharset * CVnCharsetLib::getVnCharset(int charsetIdx)
{
switch (charsetIdx) {
case CONV_CHARSET_UNICODE:
if (m_pUniCharset == NULL)
m_pUniCharset = new UnicodeCharset(UnicodeTable);
return m_pUniCharset;
case CONV_CHARSET_UNIDECOMPOSED:
if (m_pUniCompCharset == NULL)
m_pUniCompCharset = new UnicodeCompCharset(UnicodeTable, UnicodeComposite);
return m_pUniCompCharset;
case CONV_CHARSET_UNIUTF8:
case CONV_CHARSET_XUTF8:
if (m_pUniUTF8 == NULL)
m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
return m_pUniUTF8;
case CONV_CHARSET_UNIREF:
if (m_pUniRef == NULL)
m_pUniRef = new UnicodeRefCharset(UnicodeTable);
return m_pUniRef;
case CONV_CHARSET_UNIREF_HEX:
if (m_pUniHex == NULL)
m_pUniHex = new UnicodeHexCharset(UnicodeTable);
return m_pUniHex;
case CONV_CHARSET_UNI_CSTRING:
if (m_pUniCString == NULL)
m_pUniCString = new UnicodeCStringCharset(UnicodeTable);
return m_pUniCString;
case CONV_CHARSET_WINCP1258:
if (m_pWinCP1258 == NULL)
m_pWinCP1258 = new WinCP1258Charset(WinCP1258, WinCP1258Pre);
return m_pWinCP1258;
case CONV_CHARSET_VIQR:
if (m_pVIQRCharObj == NULL)
m_pVIQRCharObj = new VIQRCharset(VIQRTable);
return m_pVIQRCharObj;
case CONV_CHARSET_VNSTANDARD:
if (m_pVnIntCharset == NULL)
m_pVnIntCharset = new VnInternalCharset();
return m_pVnIntCharset;
case CONV_CHARSET_UTF8VIQR:
if (m_pUVIQRCharObj == NULL) {
if (m_pVIQRCharObj == NULL)
m_pVIQRCharObj = new VIQRCharset(VIQRTable);
if (m_pUniUTF8 == NULL)
m_pUniUTF8 = new UnicodeUTF8Charset(UnicodeTable);
m_pUVIQRCharObj = new UTF8VIQRCharset(m_pUniUTF8, m_pVIQRCharObj);
}
return m_pUVIQRCharObj;
default:
if (IS_SINGLE_BYTE_CHARSET(charsetIdx)) {
int i = charsetIdx - CONV_CHARSET_TCVN3;
if (m_sgCharsets[i] == NULL)
m_sgCharsets[i] = new SingleByteCharset(SingleByteTables[i]);
return m_sgCharsets[i];
}
else if (IS_DOUBLE_BYTE_CHARSET(charsetIdx)) {
int i = charsetIdx - CONV_CHARSET_VNIWIN;
if (m_dbCharsets[i] == NULL)
m_dbCharsets[i] = new DoubleByteCharset(DoubleByteTables[i]);
return m_dbCharsets[i];
}
}
return NULL;
}
//-------------------------------------------------
DllExport void VnConvSetOptions(VnConvOptions *pOptions)
{
VnCharsetLibObj.m_options = *pOptions;
}
//-------------------------------------------------
DllExport void VnConvGetOptions(VnConvOptions *pOptions)
{
*pOptions = VnCharsetLibObj.m_options;
}
//-------------------------------------------------
DllExport void VnConvResetOptions(VnConvOptions *pOptions)
{
pOptions->viqrEsc = 1;
pOptions->viqrMixed = 0;
pOptions->toUpper = 0;
pOptions->toLower = 0;
pOptions->removeTone = 0;
pOptions->smartViqr = 1;
}
/////////////////////////////////////////////
// Class WinCP1258Charset
/////////////////////////////////////////////
WinCP1258Charset::WinCP1258Charset(UKWORD *compositeChars, UKWORD *precomposedChars)
{
int i,k;
m_toDoubleChar = compositeChars;
memset(m_stdMap, 0, 256*sizeof(UKWORD));
// encode composite chars
for (i=0; i<TOTAL_VNCHARS; i++) {
if (compositeChars[i] >> 8) // a 2-byte character
m_stdMap[compositeChars[i] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
else if (m_stdMap[compositeChars[i]] == 0)
m_stdMap[compositeChars[i]] = i+1;
m_vnChars[i] = (i << 16) + compositeChars[i]; // high word is used for StdChar index
}
m_totalChars = TOTAL_VNCHARS;
//add precomposed chars to the table
for (k=0, i=TOTAL_VNCHARS; k<TOTAL_VNCHARS; k++)
if (precomposedChars[k] != compositeChars[k]) {
if (precomposedChars[k] >> 8) // a 2-byte character
m_stdMap[precomposedChars[k] >> 8] = 0xFFFF; //INVALID_STD_CHAR;
else if (m_stdMap[precomposedChars[k]] == 0)
m_stdMap[precomposedChars[k]] = k+1;
m_vnChars[i] = (k << 16) + precomposedChars[k];
m_totalChars++;
i++;
}
qsort(m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
}
//---------------------------------------------------------------------
// This fuction is basically the same as that of DoubleByteCharset
// with m_totalChars is used instead of constant TOTAL_VNCHARS
//---------------------------------------------------------------------
int WinCP1258Charset::nextInput(ByteInStream & is, StdVnChar & stdChar, int & bytesRead)
{
unsigned char ch;
// read first byte
bytesRead = 0;
if (!is.getNext(ch))
return 0;
bytesRead = 1;
stdChar = m_stdMap[ch];
if (stdChar == 0)
stdChar = ch;
else if (stdChar == 0xFFFF)
stdChar = INVALID_STD_CHAR;
else {
stdChar += VnStdCharOffset - 1;
UKBYTE hi;
if (is.peekNext(hi) && hi > 0) {
//test if a double-byte character is encountered
UKDWORD key = MAKEWORD(ch,hi);
UKDWORD *pChar = (UKDWORD *)bsearch(&key, m_vnChars, m_totalChars, sizeof(UKDWORD), wideCharCompare);
if (pChar) {
stdChar = VnStdCharOffset + HIWORD(*pChar);
bytesRead = 2;
is.getNext(hi);
}
}
}
return 1;
}
//---------------------------------------------------------------------
// This fuction is exactly the same as that of DoubleByteCharset
//---------------------------------------------------------------------
int WinCP1258Charset::putChar(ByteOutStream & os, StdVnChar stdChar, int & outLen)
{
int ret;
if (stdChar >= VnStdCharOffset) {
UKWORD wCh = m_toDoubleChar[stdChar-VnStdCharOffset];
if (wCh & 0xFF00) {
outLen = 2;
os.putB((UKBYTE)(wCh & 0x00FF));
ret = os.putB((UKBYTE)(wCh >> 8));
}
else {
unsigned char b = (unsigned char)wCh;
if (m_stdMap[b] == 0xFFFF)
b = PadChar;
outLen = 1;
ret = os.putB(b);
}
}
else {
if (stdChar > 255 || m_stdMap[stdChar]) {
outLen = 1;
ret = os.putB((UKBYTE)PadChar);
}
else {
outLen = 1;
ret = os.putB((UKBYTE)stdChar);
}
}
return ret;
}
#define IS_ODD(x) (x & 1)
#define IS_EVEN(x) (!(x & 1))
StdVnChar StdVnToUpper(StdVnChar ch)
{
if (ch >= VnStdCharOffset &&
ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
IS_ODD(ch))
ch -= 1;
return ch;
}
//----------------------------------------
StdVnChar StdVnToLower(StdVnChar ch)
{
if (ch >= VnStdCharOffset &&
ch<(VnStdCharOffset + TOTAL_ALPHA_VNCHARS) &&
IS_EVEN(ch))
ch += 1;
return ch;
}
//----------------------------------------
StdVnChar StdVnGetRoot(StdVnChar ch)
{
if (ch >= VnStdCharOffset && ch<VnStdCharOffset+TOTAL_VNCHARS)
ch = VnStdCharOffset + StdVnRootChar[ch-VnStdCharOffset];
return ch;
}