Codebase list teckit / fb3258bf-40fa-4dbe-b05e-93ab5429efe7/main SFconv / sfReader.h
fb3258bf-40fa-4dbe-b05e-93ab5429efe7/main

Tree @fb3258bf-40fa-4dbe-b05e-93ab5429efe7/main (Download .tar.gz)

sfReader.h @fb3258bf-40fa-4dbe-b05e-93ab5429efe7/mainraw · history · blame

#include <string>
#include <cstdio>

#include "ushort_chartraits.h"

// Token kinds returned by sfReader<C>::next().
// END_OF_FILE: no more input was available when next() was called.
#define END_OF_FILE		-1
// BODY_TEXT: a run of ordinary text (everything up to the next escape,
// inline escape, optional inline-end character, or end of input).
#define BODY_TEXT		0
// SFM: a marker introduced by sfReader::escapeChar.
#define SFM				1
// INLINE_START: an inline marker that was immediately followed by
// sfReader::startInline.
#define INLINE_START	2
// INLINE_END: the sfReader::endInline character (only reported when
// next() is called with checkInlineEnd == true).
#define INLINE_END		3
// INLINE_MARKER: a marker introduced by sfReader::inlineEscapeChar that
// was not followed by sfReader::startInline.
#define INLINE_MARKER	4
// Unicode values used by sfReader<UniChar>::get() when decoding UTF-8:
// the substitute returned for values above UNI_MAX_UTF16, the largest value
// that fits in a single 16-bit unit, and the largest value representable
// as a surrogate pair.
#define UNI_REPLACEMENT_CHAR    uint32_t(0x0000FFFD)
#define UNI_MAX_BMP             uint32_t(0x0000FFFF)
#define UNI_MAX_UTF16           uint32_t(0x0010FFFF)

// Tokenizer over a std::FILE stream. Each call to next() consumes one token
// (a marker, an inline marker, an inline-end character, or a run of body
// text), returns its kind, and leaves its characters in `text`.
//
// C is the code-unit type stored in `text`. get() has no generic definition
// in this header; it is provided only as explicit specializations for char
// and UniChar.
template<class C>
class sfReader
{
public:
	typedef std::basic_string<C>			stringT;
	typedef std::char_traits<C>				traitsT;

	// inFile: stream to read from; it is stored, not opened or closed here.
	// inForm: encoding form of the stream (kForm_Bytes is declared outside
	// this header); consulted by the UniChar specialization of get().
							sfReader(std::FILE* inFile, char inForm = kForm_Bytes);
							~sfReader();

	// Delimiter configuration, freely adjustable by the caller.
	// The constructor sets escapeChar to '\\' and the other three to -1.
	long					escapeChar;			// introduces an SFM marker
	long					inlineEscapeChar;	// introduces an inline marker
	long					startInline;		// if it directly follows an inline marker, next() returns INLINE_START
	long					endInline;			// reported as INLINE_END when next(true) is used
	
	// Characters allowed in a marker name after escapeChar / inlineEscapeChar.
	// Both start empty; with an empty set the marker is the single character
	// that follows the escape.
	stringT					sfmChars;
	stringT					inlineChars;
	
	// Reads the next token and returns END_OF_FILE, BODY_TEXT, SFM,
	// INLINE_START, INLINE_END or INLINE_MARKER. endInline is only
	// recognised when checkInlineEnd is true.
	int						next(bool checkInlineEnd);
	
	// Characters of the token produced by the most recent next() call
	// (marker name without its escape character, or the body text).
	stringT					text;
	
protected:
	// Returns the pending saved value if there is one, otherwise the next
	// unit read from inFile; -1 signals end of input.
	long					get();
	// Stores c so that the following get() returns it.
	void					putback(long c);

	// Single pending value for get(); -1 means nothing is pending.
	long					fSavedChar;

	// Encoding form passed to the constructor.
	char					inForm;

	// Input stream passed to the constructor (not owned: never closed here).
	std::FILE*				inFile;
};

// Binds the reader to an already-open stream `f` read in encoding form `i`.
// Defaults: backslash as the marker escape, the three inline delimiters
// disabled (-1), no pending saved character, and 10000 units of capacity
// pre-reserved for `text`.
template<class C>
sfReader<C>::sfReader(std::FILE* f, char i)
	: escapeChar('\\')
	, inlineEscapeChar(-1)
	, startInline(-1)
	, endInline(-1)
	, fSavedChar(-1)
	, inForm(i)
	, inFile(f)
{
	// initializers above are listed in member declaration order
	text.reserve(10000);
}

// Nothing to release: the stream passed to the constructor is left open.
template<class C>
sfReader<C>::~sfReader() {}

// Byte reader. Returns the pending saved value if one exists (and clears
// it); otherwise returns the next byte of inFile as delivered by getc,
// which is getc's end-of-file value once the input is exhausted.
//
// Declared `inline`: an explicit specialization of a member function is an
// ordinary function, so without it every translation unit that includes
// this header would emit its own external definition and the link would fail.
template<> inline long
sfReader<char>::get()
{
	if (fSavedChar != -1) {
		const long	saved = fSavedChar;
		fSavedChar = -1;	// the pending slot holds a single value
		return saved;
	}
	return std::getc(inFile);
}

// Indexed by the first byte of a UTF-8 sequence: how many further bytes
// the decoder reads for that sequence.
static char bytesFromUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x00-0x0F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x10-0x1F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x20-0x2F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x30-0x3F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x40-0x4F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x50-0x5F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x60-0x6F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x70-0x7F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x80-0x8F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0x90-0x9F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0xA0-0xAF
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	// 0xB0-0xBF
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0xC0-0xCF
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	// 0xD0-0xDF
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	// 0xE0-0xEF
	3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5		// 0xF0-0xFF
};

// Indexed by the extra-byte count from bytesFromUTF8: the amount subtracted
// from the raw shift-and-add accumulation of a sequence's bytes. Each entry
// equals the sum of the lead-byte prefix bits and the 0x80 tag of every
// following byte at their shifted positions.
static unsigned long offsetsFromUTF8[6] = {
	0x00000000UL,	// 0 extra bytes
	0x00003080UL,	// 1: (0xC0 << 6) + 0x80
	0x000E2080UL,	// 2: (0xE0 << 12) + (0x80 << 6) + 0x80
	0x03C82080UL,	// 3: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
	0xFA082080UL,	// 4: (0xF8 << 24) + (0x80 << 18) + ... + 0x80
	0x82082080UL	// 5: low 32 bits of (0xFC << 30) + (0x80 << 24) + ... + 0x80
};
/*
const unsigned long kReplacementCharacter =	0x0000FFFDUL;
const unsigned long kMaximumUCS2 =			0x0000FFFFUL;
const unsigned long kMaximumUTF16 =			0x0010FFFFUL;
const unsigned long kMaximumUCS4 =			0x7FFFFFFFUL;
*/
// Constants for turning a value above 0xFFFF into a UTF-16 surrogate pair:
// subtract halfBase, then the upper bits (>> halfShift) are added to
// kSurrogateHighStart and the lower bits (& halfMask) to kSurrogateLowStart.
const int			halfShift			= 10;
const unsigned long	halfBase			= 1UL << 16;				// 0x10000
const unsigned long	halfMask			= (1UL << halfShift) - 1UL;	// 0x3FF
// Inclusive bounds of the high and low surrogate ranges.
const unsigned long	kSurrogateHighStart	= 0xD800UL;
const unsigned long	kSurrogateHighEnd	= 0xDBFFUL;
const unsigned long	kSurrogateLowStart	= 0xDC00UL;
const unsigned long	kSurrogateLowEnd	= 0xDFFFUL;

// 16-bit reader. Returns the next UTF-16 code unit of the input, or -1 when
// the input ends (including in the middle of a multi-byte unit or sequence).
//
//  - A pending value (from putback(), or the low surrogate left over from
//    the previous call) is returned first and the pending slot is cleared.
//  - inForm == kForm_UTF8: one UTF-8 sequence is decoded. Values up to
//    UNI_MAX_BMP are returned directly; values above UNI_MAX_UTF16 become
//    UNI_REPLACEMENT_CHAR; anything in between is split into a surrogate
//    pair, returning the high half now and saving the low half for the next
//    call. Continuation bytes are not validated.
//  - inForm == kForm_UTF16BE: two bytes are combined high byte first.
//  - any other inForm: two bytes are combined low byte first.
//
// Declared `inline`: an explicit specialization of a member function is an
// ordinary function, so without it every translation unit that includes
// this header would emit its own external definition and the link would fail.
template<> inline long
sfReader<UniChar>::get()
{
	if (fSavedChar != -1) {
		const long	saved = fSavedChar;
		fSavedChar = -1;
		return saved;
	}

	const long	c1 = std::getc(inFile);
	if (c1 == -1)
		return -1;

	if (inForm == kForm_UTF8) {
		const unsigned short	extraBytes = bytesFromUTF8[c1];
		unsigned long	ch = c1;

		// Fold each following byte into the accumulator, six bits at a time.
		for (unsigned short i = 0; i < extraBytes; ++i) {
			const long	cn = std::getc(inFile);
			if (cn == -1)
				return -1;
			ch <<= 6;
			ch += cn;
		}
		// Strip the lead/continuation tag bits that were accumulated above.
		// Malformed input can wrap this below zero; the resulting huge value
		// falls into the replacement-character case.
		ch -= offsetsFromUTF8[extraBytes];

		if (ch <= UNI_MAX_BMP)
			return static_cast<long>(ch);
		if (ch > UNI_MAX_UTF16)
			return UNI_REPLACEMENT_CHAR;

		// Supplementary value: emit the high surrogate now, queue the low one.
		ch -= halfBase;
		fSavedChar = (ch & halfMask) + kSurrogateLowStart;
		return (ch >> halfShift) + kSurrogateHighStart;
	}

	const long	c2 = std::getc(inFile);
	if (c2 == -1)
		return -1;
	if (inForm == kForm_UTF16BE)
		return (c1 << 8) + c2;
	return (c2 << 8) + c1;
}

// Remembers `c` so that the following get() hands it back before reading
// any more input. Only one value is held: a second putback() before the
// next get() replaces the first. Storing -1 leaves nothing pending, since
// -1 is the "empty" marker tested by get().
template<class C>
void
sfReader<C>::putback(long c)
{
	this->fSavedChar = c;
}

// Reads one token from the input and returns its kind; the token's
// characters are left in `text`.
//
//   END_OF_FILE   - no input was left; `text` is NOT modified in this case.
//   SFM           - escapeChar was read; `text` is the run of sfmChars that
//                   followed it, or, if the first following character is not
//                   in sfmChars, that single character. `text` is empty if
//                   the input ended right after the escape.
//   INLINE_MARKER - same rules as SFM, using inlineEscapeChar / inlineChars.
//   INLINE_START  - an inline marker immediately followed by startInline
//                   (which is consumed).
//   INLINE_END    - endInline was read; only recognised when checkInlineEnd
//                   is true. `text` is empty.
//   BODY_TEXT     - everything up to (not including) the next escapeChar,
//                   inlineEscapeChar, endInline (if checkInlineEnd), or the
//                   end of input.
//
// A character that terminates a token without belonging to it is handed to
// putback() so the next call sees it first.
template<class C>
int
sfReader<C>::next(bool checkInlineEnd)
{
	const long	eof = traitsT::eof();
	long	c = get();
	if (c == eof)
		return END_OF_FILE;

	text.clear();

	if (c == escapeChar) {
		for (;;) {
			c = get();
			if (c == eof)
				return SFM;
			if (sfmChars.find(static_cast<C>(c)) != stringT::npos) {
				text.append(1, static_cast<C>(c));
				continue;
			}
			if (text.empty())
				text.append(1, static_cast<C>(c));	// single-character marker
			else
				putback(c);							// first character after the marker name
			return SFM;
		}
	}

	if (c == inlineEscapeChar) {
		for (;;) {
			c = get();
			if (c == eof)
				return INLINE_MARKER;
			if (inlineChars.find(static_cast<C>(c)) != stringT::npos) {
				text.append(1, static_cast<C>(c));
				continue;
			}
			if (text.empty()) {
				// single-character marker: look one further for startInline
				text.append(1, static_cast<C>(c));
				c = get();
				// The eof test matters: an unset startInline is -1, which
				// would otherwise compare equal to the end-of-input value
				// and report a spurious INLINE_START at the end of the file.
				if (c != eof && c == startInline)
					return INLINE_START;
			}
			else if (c == startInline) {
				// c cannot be eof here; that case returned above
				return INLINE_START;
			}
			putback(c);
			return INLINE_MARKER;
		}
	}

	if (checkInlineEnd && c == endInline)
		return INLINE_END;

	// Anything else is body text (this includes any "junk" before the
	// first SFM); collect it up to the next delimiter.
	text.append(1, static_cast<C>(c));

	for (;;) {
		c = get();
		if (c == eof || c == escapeChar || c == inlineEscapeChar || (checkInlineEnd && c == endInline)) {
			putback(c);
			return BODY_TEXT;
		}
		text.append(1, static_cast<C>(c));
	}
}