/*
* File: misc.c
*
* Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>,
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "utf8.hh"
#include "msg.h"
#include "misc.h"
/*
 * Escape characters as %XX sequences.
 *
 * Each byte of 'str' is replaced by '%' followed by its two upper-case
 * hexadecimal digits when it is:
 *   - a control character (0x01-0x1F) or DEL (0x7F),
 *   - a non-ASCII byte (0x80-0xFF), or
 *   - a member of 'esc_set'.
 * Every other byte is copied unchanged.
 *
 * Bytes are examined as unsigned values, so the result does not depend on
 * whether plain 'char' is signed on the target platform (with a signed
 * 'char' the comparison "str[i] <= 0x1F" is also true for bytes >= 0x80,
 * with an unsigned 'char' it is not).
 *
 * Return value: New string.
 */
char *a_Misc_escape_chars(const char *str, const char *esc_set)
{
   static const char *const hex = "0123456789ABCDEF";
   char *p = NULL;
   Dstr *dstr;
   int i;

   dstr = dStr_sized_new(64);
   for (i = 0; str[i]; ++i) {
      const unsigned char c = (unsigned char) str[i];

      if (c <= 0x1F || c >= 0x7F || strchr(esc_set, c)) {
         dStr_append_c(dstr, '%');
         dStr_append_c(dstr, hex[c >> 4]);
         dStr_append_c(dstr, hex[c & 15]);
      } else {
         dStr_append_c(dstr, str[i]);
      }
   }
   /* Save the character buffer before disposing of the Dstr with FALSE;
    * NOTE(review): presumably FALSE keeps the buffer alive -- confirm. */
   p = dstr->str;
   dStr_free(dstr, FALSE);

   return p;
}
#define TAB_SIZE 8
/*
 * Copy text into 'buf', replacing every tab with spaces.
 *
 * Input is read from '*start' up to (not including) 'end'; '*start' is
 * advanced past everything that was consumed. A tab is expanded with
 * spaces up to the next multiple of TAB_SIZE columns, where every other
 * decoded character counts as one column. No terminating NUL is written.
 *
 * Copying stops when the input is exhausted or when 'buf' is nearly full;
 * room is kept in reserve so that combining characters that directly
 * follow the stop point can still be appended.
 *
 * Return value: number of bytes stored in 'buf'.
 */
int
a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
{
   static const int combining_char_space = 32;
   const int main_limit = buflen - TAB_SIZE - combining_char_space;
   const int tail_limit = buflen - 4;
   int column = 0, written = 0, char_len, k;
   uint_t code;

   while (*start < end && written < main_limit) {
      code = a_Utf8_decode(*start, end, &char_len);

      if (code == '\t') {
         /* Pad with blanks up to the next tab stop. */
         int fill = TAB_SIZE - (column % TAB_SIZE);

         column += fill;
         while (fill-- > 0)
            buf[written++] = ' ';
      } else {
         /* Copy the encoded character verbatim; it occupies one column. */
         assert(char_len <= 4);
         k = 0;
         while (k < char_len)
            buf[written++] = (*start)[k++];
         column++;
      }
      *start += char_len;
   }

   /* Keep trailing combining chars (e.g. accents) together with the text
    * just copied, using the space reserved above. If there are more of
    * them than fit, the text is split anyway.
    */
   while (*start < end && written < tail_limit) {
      code = a_Utf8_decode(*start, end, &char_len);
      if (! a_Utf8_combining_char(code))
         break;

      assert(char_len <= 4);
      k = 0;
      while (k < char_len)
         buf[written++] = (*start)[k++];
      *start += char_len;
   }

   return written;
}
/* TODO: could use dStr ADT! */
/*
 * A MIME type string together with its length (without the NUL).
 */
typedef struct ContentType_ {
   const char *str;
   int len;
} ContentType_t;

/*
 * MIME types handled here, terminated by a { NULL, 0 } entry.
 *
 * The order of this table and the values of DetectedContentType must be
 * kept in sync: each enumerator is used directly as an index into it.
 */
static const ContentType_t MimeTypes[] = {
   { "application/octet-stream", 24 },
   { "application/xhtml+xml", 21 },
   { "text/html", 9 },
   { "text/plain", 10 },
   { "image/gif", 9 },
   { "image/png", 9 },
   { "image/jpeg", 10 },
   { NULL, 0 }
};

/*
 * Indices into MimeTypes[].
 *
 * DT_APPLICATION_XHTML occupies the slot of "application/xhtml+xml";
 * without it every enumerator after DT_OCTET_STREAM would select the
 * preceding table entry (e.g. DT_TEXT_HTML -> "application/xhtml+xml").
 */
typedef enum {
   DT_OCTET_STREAM = 0,
   DT_APPLICATION_XHTML,
   DT_TEXT_HTML,
   DT_TEXT_PLAIN,
   DT_IMAGE_GIF,
   DT_IMAGE_PNG,
   DT_IMAGE_JPG,
} DetectedContentType;
/*
* Detects 'Content-Type' from a data stream sample.
*
* It uses the magic(5) logic from file(1). Currently, it
* only checks the few mime types that Dillo supports.
*
* 'Data' is a pointer to the first bytes of the raw data.
*
* Return value: (0 on success, 1 on doubt, 2 on lack of data).
*/
int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
{
size_t i, non_ascci, non_ascci_text, bin_chars;
char *p = Data;
int st = 1; /* default to "doubt' */
DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
/* HTML try */
for (i = 0; i < Size && dIsspace(p[i]); ++i);
if ((Size - i >= 5 && !dStrncasecmp(p+i, "<html", 5)) ||
(Size - i >= 5 && !dStrncasecmp(p+i, "<head", 5)) ||
(Size - i >= 6 && !dStrncasecmp(p+i, "<title", 6)) ||
(Size - i >= 14 && !dStrncasecmp(p+i, "<!doctype html", 14)) ||
/* this line is workaround for FTP through the Squid proxy */
(Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) {
Type = DT_TEXT_HTML;
st = 0;
/* Images */
} else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) {
Type = DT_IMAGE_GIF;
st = 0;
} else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) {
Type = DT_IMAGE_PNG;
st = 0;
} else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) {
/* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
* at the character representation should be machine independent. */
Type = DT_IMAGE_JPG;
st = 0;
/* Text */
} else {
/* Heuristic for "text/plain"
* {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
* All in the above set regard [00-31] as control characters.
* LATIN1: [7F-9F] unused
* CP-1251 {7F,98} unused (two characters).
*
* We'll use [0-31] as indicators of non-text content.
* Better heuristics are welcomed! :-) */
non_ascci = non_ascci_text = bin_chars = 0;
Size = MIN (Size, 256);
for (i = 0; i < Size; i++) {
int ch = (uchar_t) p[i];
if (ch < 32 && !dIsspace(ch))
++bin_chars;
if (ch > 126)
++non_ascci;
if (ch > 190)
++non_ascci_text;
}
if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
/* Let's say text: if "rare" chars are <= 10% */
Type = DT_TEXT_PLAIN;
} else if (Size > 0) {
/* a special check for UTF-8 */
Size = a_Utf8_end_of_char(p, Size - 1) + 1;
if (a_Utf8_test(p, Size) > 0)
Type = DT_TEXT_PLAIN;
}
if (Size >= 256)
st = 0;
}
*PT = MimeTypes[Type].str;
return st;
}
/*
* Parse Content-Type string, e.g., "text/html; charset=utf-8".
* Content-Type is defined in RFC 2045 section 5.1.
*/
void a_Misc_parse_content_type(const char *type, char **major, char **minor,
char **charset)
{
static const char tspecials_space[] = "()<>@,;:\\\"/[]?= ";
const char *str, *s;
if (major)
*major = NULL;
if (minor)
*minor = NULL;
if (charset)
*charset = NULL;
if (!(str = type))
return;
for (s = str; *s && !iscntrl((uchar_t)*s) && !strchr(tspecials_space, *s);
s++) ;
if (major)
*major = dStrndup(str, s - str);
if (*s == '/') {
for (str = ++s;
*s && !iscntrl((uchar_t)*s) && !strchr(tspecials_space, *s); s++) ;
if (minor)
*minor = dStrndup(str, s - str);
}
if (charset && *s &&
(dStrncasecmp(type, "text/", 5) == 0 ||
dStrncasecmp(type, "application/xhtml+xml", 21) == 0)) {
/* "charset" parameter defined for text media type in RFC 2046,
* application/xhtml+xml in RFC 3236.
*
* Note that RFC 3023 lists some main xml media types and provides
* the convention of using the "+xml" minor type suffix for other
* xml types, so it would be reasonable to check for that suffix if
* we have need to care about various xml types someday.
*/
const char terminators[] = " ;\t";
const char key[] = "charset";
if ((s = dStristr(str, key)) &&
(s == str || strchr(terminators, s[-1]))) {
s += sizeof(key) - 1;
for ( ; *s == ' ' || *s == '\t'; ++s);
if (*s == '=') {
size_t len;
for (++s; *s == ' ' || *s == '\t'; ++s);
if ((len = strcspn(s, terminators))) {
if (*s == '"' && s[len-1] == '"' && len > 1) {
/* quoted string */
s++;
len -= 2;
}
*charset = dStrndup(s, len);
}
}
}
}
}
/*
 * Tell whether two Content-Type strings are equivalent.
 *
 * Two NULL/empty strings are equivalent; a NULL/empty string is never
 * equivalent to a non-empty one. Otherwise both strings are parsed and
 * are equivalent when major and minor types match (ignoring case) and
 * the charsets agree: both absent, both equal (ignoring case), or one
 * absent while the other is "UTF-8".
 *
 * Return 0 if they are equivalent, and 1 otherwise.
 */
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
{
   const int empty1 = (!ct1 || !*ct1);
   const int empty2 = (!ct2 || !*ct2);
   char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
   int differ = 1;

   if (empty1 || empty2)
      return (empty1 && empty2) ? 0 : 1;

   a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
   a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);

   if (major1 && major2 && !dStrcasecmp(major1, major2) &&
       minor1 && minor2 && !dStrcasecmp(minor1, minor2)) {
      if (charset1 && charset2) {
         differ = dStrcasecmp(charset1, charset2) ? 1 : 0;
      } else if (charset1 || charset2) {
         /* only one side names a charset: it has to be UTF-8 */
         differ = dStrcasecmp(charset1 ? charset1 : charset2, "UTF-8") ? 1 : 0;
      } else {
         differ = 0;
      }
   }

   dFree(major1);
   dFree(minor1);
   dFree(charset1);
   dFree(major2);
   dFree(minor2);
   dFree(charset2);

   return differ;
}
/*
 * Check the server-supplied 'Content-Type' against our detected type.
 * (some servers seem to default to "text/plain").
 *
 * Return value:
 *  0,  if they match
 *  -1, if a mismatch is detected
 *
 * There are many MIME types Dillo doesn't know, they're handled
 * as "application/octet-stream" (as the SPEC says); a server type that
 * is not listed in MimeTypes[] (entries 1 and up) never mismatches.
 * A missing server type or a missing detected type never mismatches.
 *
 * A mismatch happens when:
 *  - a "text/..." server type comes with data detected as
 *    "application/...", i.e. a binary stream sent as text. The exception
 *    is "application/xhtml+xml", which is the XML version of HTML and
 *    hence textual;
 *  - an "image/..." server type comes with data detected as an image of
 *    a different kind.
 *
 * Note: this is a basic security procedure.
 *
 * NOTE(review): the previous implementation could never report a
 * mismatch: its last branch was only reached with a "text/" server type
 * and an "application/" detected type, where both of its comparisons
 * were necessarily non-zero, so it always set the result to 0.
 */
int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
{
   int i;
   int st = 0;

   _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);

   if (!EntryType || !DetectedType)
      return 0; /* there's no mismatch without both types */

   /* Entry 0 (octet-stream) is skipped on purpose. */
   for (i = 1; MimeTypes[i].str; ++i)
      if (dStrncasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) == 0)
         break;

   if (!MimeTypes[i].str) {
      /* type not found, no mismatch */
      st = 0;
   } else if (dStrncasecmp(EntryType, "image/", 6) == 0) {
      /* An image: the data must not be an image of another kind */
      if (dStrncasecmp(DetectedType, "image/", 6) == 0 &&
          dStrncasecmp(DetectedType, MimeTypes[i].str, MimeTypes[i].len) != 0)
         st = -1;
   } else if (dStrncasecmp(EntryType, "text/", 5) == 0) {
      /* Text: the data must not be an application (binary) stream,
       * except for the XML version of HTML. */
      if (dStrncasecmp(DetectedType, "application/", 12) == 0 &&
          dStrncasecmp(DetectedType, "application/xhtml+xml", 21) != 0)
         st = -1;
   }
   /* else: "application/xhtml+xml" from the server, nothing to object */

   _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH");

   return st;
}
/*
 * Parse a geometry string of the form "<w>x<h>" optionally followed by
 * two more integers for the position (e.g. "800x600+10+20").
 *
 * The separator is the first 'x' in the string or, when there is none,
 * the first 'X'. '*w' and '*h' are stored only when both numbers are
 * present; '*x' and '*y' are stored only when, in addition, both
 * position numbers are present.
 *
 * Return value: 1 if width and height were parsed, 0 otherwise.
 */
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
{
   char *sep, *end_a, *end_b;
   int parsed = 0;

   sep = strchr(str, 'x');
   if (!sep)
      sep = strchr(str, 'X');

   if (sep) {
      char *height_str = sep + 1;
      int width = strtol(str, &end_a, 10);
      int height = strtol(height_str, &end_b, 10);

      if (end_a != str && end_b != height_str) {
         char *rest = end_b;
         int xpos, ypos;

         *w = width;
         *h = height;
         parsed = 1;

         /* the position follows right after the height */
         xpos = strtol(rest, &end_a, 10);
         ypos = strtol(end_a, &end_b, 10);
         if (end_a != rest && end_b != end_a) {
            *x = xpos;
            *y = ypos;
         }
      }
   }
   _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y);
   return parsed;
}
/*
 * Parse dillorc's search_url string ("[<label> ]<url>")
 *
 * With a label, everything before the last space is the label and the
 * rest is the URL. Without one, the label is made from the text between
 * the second character after the first '/' and the next '/' (the host
 * part of "scheme://host/..."). Labels are cut to 31 characters.
 *
 * The URL is accepted when it holds a '/' that is followed by at least
 * one character and, two positions after it or later, another '/'.
 *
 * '*label' always points to a static buffer owned by this function, so
 * it is overwritten by the next call. '*urlstr' points into 'source'
 * and is only set on success.
 *
 * Return value: -1 on error, 0 on success (and label and urlstr pointers)
 */
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
{
   static char buf[32];
   char *space, *slash, *host_end = NULL;
   int n;
   int ret = -1;

   space = strrchr(source, ' ');
   if (space) {
      /* label and url pair: the label is stored even for a bad url */
      n = MIN(space - source, 31);
      memcpy(buf, source, n);
      buf[n] = 0;
      source = space + 1;
   }

   slash = strchr(source, '/');
   if (slash && slash[1])
      host_end = strchr(slash + 2, '/');

   if (host_end) {
      if (!space) {
         /* url only, make a custom label */
         n = MIN(host_end - slash - 2, 31);
         memcpy(buf, slash + 2, n);
         buf[n] = 0;
      }
      *urlstr = source;
      ret = 0;
   }

   *label = buf;
   if (ret == -1)
      MSG("Invalid search_url: \"%s\"\n", source);
   return ret;
}
/*
 * Encodes string using base64 encoding.
 *
 * The input is the NUL-terminated string 'in'; the output is padded
 * with '=' to a multiple of four characters and NUL-terminated.
 *
 * Input bytes are read as unsigned values: with a signed plain 'char',
 * bytes >= 0x80 would otherwise produce negative table indices.
 *
 * Return value: new string (empty for an empty input), or NULL if 'in'
 * is NULL.
 */
char *a_Misc_encode_base64(const char *in)
{
   static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                         "abcdefghijklmnopqrstuvwxyz"
                                         "0123456789+/";
   const unsigned char *src = (const unsigned char *) in;
   char *out = NULL;
   size_t len, i = 0;

   if (in == NULL) return NULL;
   len = strlen(in);

   /* four output characters per (possibly partial) group of three */
   out = (char *)dMalloc((len + 2) / 3 * 4 + 1);

   for (; len >= 3; len -= 3) {
      out[i++] = base64_hex[src[0] >> 2];
      out[i++] = base64_hex[((src[0] << 4) & 0x30) | (src[1] >> 4)];
      out[i++] = base64_hex[((src[1] << 2) & 0x3c) | (src[2] >> 6)];
      out[i++] = base64_hex[src[2] & 0x3f];
      src += 3;
   }

   if (len > 0) {
      /* one or two bytes left: encode them and pad with '=' */
      unsigned char fragment;

      out[i++] = base64_hex[src[0] >> 2];
      fragment = (src[0] << 4) & 0x30;
      if (len > 1) fragment |= src[1] >> 4;
      out[i++] = base64_hex[fragment];
      out[i++] = (len < 2) ? '=' : base64_hex[(src[1] << 2) & 0x3c];
      out[i++] = '=';
   }
   out[i] = '\0';
   return out;
}
/*
 * Read a whole local file into a newly created dStr.
 *
 * The file is opened in "r" mode and read in 4096-byte chunks until
 * fread() returns nothing more.
 *
 * Return value: dStr on success, NULL if the file can't be opened.
 * TODO: a filesize threshold may be implemented.
 */
Dstr *a_Misc_file2dstr(const char *filename)
{
   char chunk[4096];
   Dstr *contents;
   FILE *fp;
   int got;

   fp = fopen(filename, "r");
   if (!fp)
      return NULL;

   contents = dStr_sized_new(4096);
   for (;;) {
      got = fread(chunk, 1, 4096, fp);
      if (got <= 0)
         break;
      dStr_append_l(contents, chunk, got);
   }
   fclose(fp);

   return contents;
}