Codebase list unrtf / bd13ae3a-94e6-40bf-b472-38179327dae8/main src / unicode.c
bd13ae3a-94e6-40bf-b472-38179327dae8/main

Tree @bd13ae3a-94e6-40bf-b472-38179327dae8/main (Download .tar.gz)

unicode.c @bd13ae3a-94e6-40bf-b472-38179327dae8/main

d5eaff9
 
 
 
 
 
 
 
 
 
4329b02
 
 
d5eaff9
 
4329b02
 
d5eaff9
 
 
 
 
4329b02
d5eaff9
 
 
 
 
 
dff09ba
 
 
 
 
 
 
d5eaff9
 
 
4329b02
 
d5eaff9
4329b02
d5eaff9
 
4329b02
d5eaff9
dff09ba
d5eaff9
 
641be09
d5eaff9
 
 
 
 
dff09ba
d5eaff9
 
 
 
 
 
dff09ba
d5eaff9
 
 
 
 
 
 
dff09ba
d5eaff9
 
 
 
 
 
 
 
dff09ba
d5eaff9
 
 
 
 
 
 
 
 
dff09ba
d5eaff9
 
 
 
 
 
 
 
 
641be09
d5eaff9
 
 
4329b02
d5eaff9
dff09ba
 
d5eaff9
 
 
dff09ba
d5eaff9
 
4329b02
 
d5eaff9
dff09ba
 
 
 
d5eaff9
 
 
4329b02
d5eaff9
 
 
 
 
dff09ba
d5eaff9
dff09ba
 
d5eaff9
 
 
 
dff09ba
d5eaff9
dff09ba
d5eaff9
 
4329b02
 
d5eaff9
4329b02
 
 
dff09ba
 
d5eaff9
/*----------------------------------------------------------------------
 * Module name:    unicode
 * Author name:    Arkadiusz Firus
 * Create date:    09 Nov 08
 * Purpose:        unicode translations
 *----------------------------------------------------------------------
 * Changes:
 * 04 Jan 10, daved@physiol.usyd.edu.au: null terminate strings in
 *		unicode_to_string
 * 21 Aug 10, daved@physiol.usyd.edu.au: test feof() rather than EOF for
 *              AIX support
 * 07 Oct 11, jf@dockes.org: major changes including change of
 *	get_unicode_char to get_unicode_utf8
 *--------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#include "malloc.h"

/*========================================================================
 * Name		get_unicode
 * Purpose:	Converts a hexadecimal code point written as text, with or
 *		without a leading 'U'/'u' (e.g. "U221E" or "221E"), to its
 *		numeric value.
 * Args:	NUL-terminated string holding the hexadecimal digits.
 * Returns:	The parsed value converted to int. Parsing is done by
 *		strtoul() in base 16, so it stops at the first character
 *		that is not a hex digit and yields 0 when no digits are
 *		found.
 *=======================================================================*/
int
get_unicode(char *string)
{
	char *digits = string;

	/* Step over the optional 'U' / 'u' marker. */
	switch (*digits)
	{
	case 'U':
	case 'u':
		digits++;
		break;
	default:
		break;
	}

	return (int) strtoul(digits, NULL, 16);
}

/*========================================================================
 * Name		unicode_to_utf8
 * Purpose:	Translates unicode number to UTF-8 string
 * Args:	Unicode number, 0 .. 0x7FFFFFFF.
 * Returns:	NUL-terminated UTF-8 string obtained from my_malloc, or
 *		NULL when the number is above 0x7FFFFFFF or my_malloc
 *		returned NULL.
 * Notes:	Each value is encoded in the shortest form that can hold
 *		it: 1 byte up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF,
 *		4 up to 0x1FFFFF, 5 up to 0x3FFFFFF, 6 up to 0x7FFFFFFF.
 *		The 5 and 6 byte forms and values above 0x10FFFF are not
 *		valid UTF-8 under RFC 3629; they are still produced here
 *		so that existing callers see no new NULL returns.
 *=======================================================================*/
char *
unicode_to_utf8(unsigned int uc)
{
	unsigned char *string;
	unsigned int lead;	/* marker bits of the first byte */
	int trail;		/* number of continuation bytes */
	int i;

	/*
	 * Pick the shortest form. The limits are exclusive upper bounds of
	 * each length class, so the largest value of a class (0x7F, 0x7FF,
	 * 0xFFFF, ...) stays in that class instead of spilling into an
	 * overlong encoding of the next one.
	 */
	if (uc < 0x80)
	{
		lead = 0x00;
		trail = 0;
	}
	else if (uc < 0x800)
	{
		lead = 0xC0;
		trail = 1;
	}
	else if (uc < 0x10000)
	{
		lead = 0xE0;
		trail = 2;
	}
	else if (uc < 0x200000)
	{
		lead = 0xF0;
		trail = 3;
	}
	else if (uc < 0x4000000)
	{
		lead = 0xF8;
		trail = 4;
	}
	else if (uc <= 0x7FFFFFFF)
	{
		lead = 0xFC;
		trail = 5;
	}
	else
	{
		/* More than 31 bits: no UTF-8 form exists. */
		return NULL;
	}

	/* One lead byte, the continuation bytes, and the terminating NUL. */
	string = (unsigned char *)my_malloc((trail + 2) * sizeof(char));
	if (string == NULL)
	{
		/* my_malloc's failure behaviour is not visible from this file. */
		return NULL;
	}

	/* Fill continuation bytes from the last to the first, 6 bits each. */
	for (i = trail; i > 0; i--)
	{
		string[i] = (unsigned char) (0x80 | (uc & 0x3F));
		uc >>= 6;
	}

	/* Whatever bits remain belong in the lead byte. */
	string[0] = (unsigned char) (lead | uc);
	string[trail + 1] = '\0';

	return (char *)string;
}

/*========================================================================
 * Name		get_unicode_int
 * Purpose:	Reads unicode character (in format <UN...N>) and translates
 *		it to its integer code point. We are reading from our
 *		charmaps, not from the rtf file.
 * Caution:	This function should be executed after char '<' was read.
 *		It reads until char '>' is found, or end of line, or EOF /
 *		read error. A terminating '>' is consumed; a terminating
 *		newline is pushed back onto the stream.
 * Args:	File to read from.
 * Returns:	integer Unicode code point as parsed by get_unicode();
 *		0 (after a message on stderr) when the token reaches
 *		MAX_CHARMAP_UC_LEN-1 characters.
 *=======================================================================*/

int
get_unicode_int(FILE *file)
{
#define MAX_CHARMAP_UC_LEN 20
	char unicode_number[MAX_CHARMAP_UC_LEN];
	int len = 0;
	/*
	 * Must be int, not char: fgetc() returns EOF as an int value that
	 * is distinct from every byte. Held in a char, EOF either collides
	 * with byte 0xFF (signed char) or is unrecognisable (unsigned
	 * char), in which case the push-back below would put a spurious
	 * 0xFF byte into the stream.
	 */
	int c;

	c = fgetc(file);

	/* fgetc() yields EOF for both end of file and read errors. */
	while (c != EOF && c != '>' && c != '\n')
	{
		unicode_number[len] = (char) c;
		c = fgetc(file);
		len++;

		if (len == MAX_CHARMAP_UC_LEN-1)
		{
			fprintf(stderr, "Buffer overflow in get_unicode_int\n");
			return 0;
		}
	}

	/*
	 * Here c is '>', '\n' or EOF. Only the newline is a real character
	 * that has to go back to the stream; EOF must never be pushed back.
	 */
	if (c == '\n')
	{
		ungetc(c, file);
	}

	unicode_number[len] = '\0';
	return get_unicode(unicode_number);
}

/*========================================================================
 * Name		get_unicode_utf8
 * Purpose:	Reads a code point with get_unicode_int() and converts it
 *		to a UTF-8 string with unicode_to_utf8().
 * Args:	File to read from.
 * Returns:	The string returned by unicode_to_utf8() for that code
 *		point.
 *=======================================================================*/
char *
get_unicode_utf8(FILE *file)
{
	return unicode_to_utf8(get_unicode_int(file));
}