hxnormalize.c - html-xml-utils (HEAD)

Tree @HEAD (Download .tar.gz)

hxnormalize.c @HEAD — raw · history · blame

/*
 * Format an HTML source in a consistent manner.
 *
 * Copyright © 1994-2012 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
 *
 * Created 9 May 1998
 * Bert Bos <bert@w3.org>
 * $Id: hxnormalize.c,v 1.22 2017/11/24 09:50:25 bbos Exp $
 */
#include "config.h"
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#ifdef HAVE_UNISTD_H
#  include <unistd.h>
#endif
#ifdef HAVE_STRING_H
#  include <string.h>
#elif HAVE_STRINGS_H
#  include <strings.h>
#endif
#include <assert.h>
#include <stdbool.h>
#include "export.h"
#include "types.e"
#include "tree.e"
#include "html.e"
#include "scan.e"
#include "textwrap.e"
#include "dict.e"
#include "openurl.e"
#include "errexit.e"

/* State shared between the parser callbacks, pp() and main() */
static Tree tree;			/* Document tree built by the callbacks */
static bool do_xml = false;		/* -x: write XML syntax (quoted attributes, " />", all end tags) */
static bool do_endtag = false;		/* -e: always write end tags */
static bool has_errors = false;		/* Set by handle_error(); makes main() return 1 */
static bool do_doctype = true;		/* Cleared by -d: do not write the DOCTYPE */
static bool clean_span = false;		/* -s: drop <span> tags that have no attributes */
static string long_comment = NULL;	/* -c: comments containing this text get an empty line before them */
static bool do_lang = false;		/* -L: remove language attributes equal to the inherited language */


/* handle_error -- called when a parse error occurred
 *
 * Writes the message s, preceded by the line number, to stderr and
 * remembers that an error was seen (main() bases its return value on
 * that).
 */
void handle_error(void *clientdata, const string s, int lineno)
{
  (void) clientdata;				/* Not used */
  has_errors = true;
  fprintf(stderr, "%d: %s\n", lineno, s);
}

/* start -- called before the first event is reported
 *
 * Replaces the global tree by a freshly created one. Always returns
 * NULL.
 */
void *start(void)
{
  tree = create();

  return NULL;
}
  
/* end -- called after the last event is reported
 *
 * Nothing needs to be done at the end of the input.
 */
void end(void *clientdata)
{
  (void) clientdata;				/* Not used */
}

/* handle_comment -- called after a comment is parsed
 *
 * Adds the comment text to the global tree via append_comment().
 */
void handle_comment(void *clientdata, string commenttext)
{
  (void) clientdata;				/* Not used */
  tree = append_comment(tree, commenttext);
}

/* handle_text -- called after a text chunk is parsed
 *
 * Adds the text to the global tree via append_text().
 */
void handle_text(void *clientdata, string text)
{
  (void) clientdata;				/* Not used */
  tree = append_text(tree, text);
}

/* handle_decl -- called after a declaration is parsed
 *
 * Adds the declaration (name gi, public identifier fpi and url) to the
 * global tree via append_declaration().
 */
void handle_decl(void *clientdata, string gi, string fpi, string url)
{
  (void) clientdata;				/* Not used */
  tree = append_declaration(tree, gi, fpi, url);
}

/* handle_pi -- called after a PI is parsed
 *
 * Adds the processing instruction to the global tree via
 * append_procins().
 */
void handle_pi(void *clientdata, string pi_text)
{
  (void) clientdata;				/* Not used */
  tree = append_procins(tree, pi_text);
}

/* handle_starttag -- called after a start tag is parsed
 *
 * Hands the element name and its attributes to html_push(), which
 * yields the new value of the global tree.
 */
void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  (void) clientdata;				/* Not used */
  tree = html_push(tree, name, attribs);
}

/* handle_emptytag -- called after an empty tag is parsed
 *
 * Treated exactly like a start tag: the element is handed to
 * html_push().
 */
void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
  (void) clientdata;				/* Not used */
  tree = html_push(tree, name, attribs);
}

/* handle_endtag -- called after an endtag is parsed (name may be "")
 *
 * Hands the name to html_pop() and then frees it; the name must
 * therefore be heap-allocated and is no longer valid after this call.
 */
void handle_endtag(void *clientdata, string name)
{
  (void) clientdata;				/* Not used */
  tree = html_pop(tree, name);
  free(name);					/* Only after html_pop() is done with it */
}

/* insert -- insert an attribute into a sorted list of attributes
 *
 * Links x into list in front of the first entry whose name does not
 * compare lower than x->name (so also in front of an entry with an
 * equal name), or at the end if there is no such entry. Returns the
 * head of the resulting list.
 */
static pairlist insert(pairlist x, pairlist list)
{
  pairlist *link = &list;			/* The link that will point to x */

  while (*link && strcmp(x->name, (*link)->name) > 0)
    link = &(*link)->next;

  x->next = *link;
  *link = x;
  return list;
}

/* sort_list -- sort a linked list of attributes, return reordered list
 *
 * Insertion sort on the attribute names. The entries are inserted
 * starting with the last one, so that, together with insert() putting
 * an entry in front of equal names, entries with equal names keep
 * their original relative order.
 */
static pairlist sort_list(pairlist list)
{
  pairlist reversed = NULL, sorted = NULL, node;

  /* Reverse the list, so that the last entry can be taken first */
  while (list) {
    node = list;
    list = list->next;
    node->next = reversed;
    reversed = node;
  }

  /* Move the entries one by one into the sorted list */
  while (reversed) {
    node = reversed;
    reversed = reversed->next;
    sorted = insert(node, sorted);
  }
  return sorted;
}

/* next_ambiguous -- check if omitting end changes the meaning
 *
 * Looks at the first sibling after n that is not a text node with only
 * white space. Returns true if that sibling is text, a comment or a
 * processing instruction, or if it is an element for which
 * has_parent(sibling name, n->name) holds. Returns false if there is
 * no such sibling or if it is a declaration.
 */
static bool next_ambiguous(Node *n)
{
  Node *next = n->sister;

  /* Skip text nodes with only white space */
  while (next && next->tp == Text && only_space(next->text))
    next = next->sister;

  if (!next) return false;

  switch (next->tp) {
    case Text:
    case Comment:
    case Procins:
      return true;
    case Declaration:				/* Should not occur */
      return false;
    default:
      assert(next->tp == Element);		/* Cannot be Root */
      return has_parent(next->name, n->name);
  }
}

/* needs_quotes -- check if the attribute value must be printed in quotes
 *
 * Returns false only if s is not empty and consists entirely of
 * alphanumeric characters, '-' and '.'; returns true in all other
 * cases, including the empty string. s must not be NULL.
 */
static bool needs_quotes(const string s)
{
  const unsigned char *p;

  assert(s);
  if (!s[0]) return true;			/* Empty string */

  /* The <ctype.h> functions are only defined for values that fit in an
     unsigned char (or EOF), so the bytes must not be passed as plain,
     possibly negative, char. */
  for (p = (const unsigned char *)s; *p; p++)
    if (!isalnum(*p) && *p != '-' && *p != '.') return true;
  return false;
}

/* pp -- print the document normalized
 *
 * Writes node n, and recursively its children, via the out...()
 * routines.
 *
 *   n             the node to print
 *   preformatted  passed on to the output routines; when true, no line
 *                 breaks are added around elements and comments
 *   allow_text    whether text may occur at this place; if false, a
 *                 text node must consist of white space only and is
 *                 not printed
 *   lang          the language inherited from the ancestors, or NULL
 *                 if none; only looked at when do_lang is set
 *
 * May modify n->attribs: the attributes are sorted and, with do_lang,
 * redundant language attributes are removed.
 */
static void pp(Tree n, bool preformatted, bool allow_text,
               conststring lang)
{
  bool pre, mixed;
  conststring lang2;
  string s;
  pairlist h;
  size_t i, j;
  Tree l;

  switch (n->tp) {
    case Text:
      if (!allow_text) {
        assert(only_space(n->text));
      } else {
        s = n->text;
        i = strlen(s);
        outn(s, i, preformatted);
      }
      break;
    case Comment:
      if (long_comment && strstr(n->text, long_comment) && !preformatted) {
        /* Found a comment that should have an empty line before it */
        outbreak();
        outln(NULL, true);
      }
      out("<!--", true); out(n->text, true);
      if (allow_text || preformatted) out("-->", true);
      else outln("-->", preformatted);
      break;
    case Declaration:
      if (do_doctype) {
        out("<!DOCTYPE ", false);
        out(n->name, false);
        if (n->text) {
          out(" PUBLIC \"", false);
          out(n->text, false);
          out("\"", false);
        }
        if (n->url) {
          if (!n->text) out(" SYSTEM", false);
          out(" \"", false);
          out(n->url, false);
          out("\"", false);
        } else if (n->text && do_xml) { /* XML cannot omit the system literal */
          out(" \"\"", false);
        }
        outln(">", false);
      }
      break;
    case Procins:
      out("<?", false); out(n->text, true);
      if (allow_text || preformatted) out(">", false);
      else outln(">", false);
      break;
    case Element:
      if (clean_span && eq(n->name, "span") && ! n->attribs) {
        /* Omit start and end tags, print just the children. */
        for (l = n->children; l != NULL; l = l->sister)
          pp(l, preformatted, true, lang);
        break;
      }
      /* Determine language, remove redundant language attribute */
      if (do_lang) {
        if ((lang2 = pairlist_get(n->attribs, "lang")) ||
            (lang2 = pairlist_get(n->attribs, "xml:lang"))) {
          if (lang && eq(lang, lang2)) {
            /* Same as the inherited language. Keep using the inherited
               string for the children: lang2 presumably points into
               the attribute that is removed here and may not survive
               pairlist_unset(). */
            pairlist_unset(&n->attribs, "lang");
            pairlist_unset(&n->attribs, "xml:lang");
          } else {
            lang = lang2;
          }
        }
      }
      if (!preformatted && break_before(n->name)) outln(NULL, false);
      out("<", preformatted); out(n->name, preformatted);
      if (break_before(n->name)) inc_indent();
      n->attribs = sort_list(n->attribs);
      for (h = n->attribs; h != NULL; h = h->next) {
        out(" ", false); out(h->name, false);
        if (do_xml) {
          /* XML needs a quoted value; an attribute without a value
             gets its own name as value */
          out("=\"", false);
          out(h->value ? h->value : h->name, true);
          outc('"', false);
        } else if (h->value == NULL) {
          /* The h->name *is* the value (and the attribute name is implicit) */
        } else if (!needs_quotes(h->value)) {
          out("=", false); /* Omit the quotes */
          out(h->value, true);
        } else {
          out("=\"", false);
          out(h->value, true);
          outc('"', false);
        }
      }
      if (is_empty(n->name)) {
        assert(n->children == NULL);
        outbreakpoint();
        out(do_xml ? " />" : ">", true);
        if (break_before(n->name)) dec_indent();
        if (!preformatted && break_after(n->name)) outln(NULL, false);

      } else if (do_xml && is_cdata_elt(n->name)) {
        /* Insert <![CDATA[...]]>, but only if input was HTML, not XML */
        if (!n->children) {
          out(" />", true);
          if (break_before(n->name)) dec_indent();
        } else {
          out(">", preformatted);
          /* TODO: Strictly speaking, if the input is HTML (not XML),
             then the string "<![CDATA[" in <style> or <script> is to
             be taken as literal text. In practice, the string
             "<![CDATA[" is nearly always preceded by "<!--" or "//"
             and so this simplistic check will usually work... */
          assert(n->children->tp == Text);
          if (!hasprefix(n->children->text, "<![CDATA[")) out("<![CDATA[",true);
          for (l = n->children; l; l = l->sister) {
            assert(l->tp == Text);      /* Every child, not just the first */
            out(l->text, true);
          }
          if (!hasprefix(n->children->text, "<![CDATA[")) out("]]>", true);
          if (break_before(n->name)) dec_indent();
          out("</", preformatted);
          out(n->name, preformatted);
          outbreakpoint();
          out(">", preformatted);
        }
        if (!preformatted && break_after(n->name)) outbreak();

      } else if (!do_xml && is_cdata_elt(n->name) && n->children &&
                 n->children->tp == Text && !n->children->sister &&
                 hasprefix(n->children->text, "<![CDATA[")) {
        /* Remove <![CDATA[...]]>, but only if input was XML, not HTML */
        assert(hasaffix(n->children->text, "]]>"));
        out(">", preformatted);
        s = n->children->text + 9; /* Skip "<![CDATA[" */
        i = strlen(s) - 3;         /* Omit "]]>" */
        for (j = 0; j < i; j++) outc(s[j], true);
        if (break_before(n->name)) dec_indent();
        out("</", preformatted);
        out(n->name, preformatted);
        outbreakpoint();
        out(">", preformatted);
        if (!preformatted && break_after(n->name)) outbreak();

      } else {
        outbreakpoint();
        out(">", preformatted);
        pre = preformatted || is_pre(n->name);
        mixed = is_mixed(n->name);
        for (l = n->children; l != NULL; l = l->sister)
          pp(l, pre, mixed, lang);
        if (break_before(n->name)) dec_indent();
        /* The end tag is only omitted if that is allowed and harmless */
        if (do_xml || do_endtag || need_etag(n->name) || next_ambiguous(n)) {
          out("</", pre); out(n->name, pre);
          outbreakpoint();
          out(">", preformatted);
        }
        if (!preformatted && break_after(n->name)) outbreak();
      }
      break;
    default:
      assert(!"Cannot happen");
  }
}

/* prettyprint -- print the tree normalized
 *
 * t must be the root of the tree. Prints each of its children with
 * pp() (not preformatted, no text allowed at the top level, no
 * inherited language) and then flushes the output.
 */
static void prettyprint(Tree t)
{
  Tree child;

  assert(t->tp == Root);
  child = t->children;
  while (child) {
    pp(child, false, false, NULL);
    child = child->sister;
  }
  flush();
}

/* usage -- print usage message and exit
 *
 * Writes the program name, the version and a summary of all options
 * accepted by main() to stderr and exits with status 1. Does not
 * return.
 */
static void usage(string prog)
{
  fprintf(stderr, "%s version %s\n", prog, VERSION);
  /* Keep this list in line with the getopt() string in main() */
  fprintf(stderr, "Usage: %s [-e] [-d] [-x] [-L] [-s] [-i indent] [-l linelen] "
          "[-c commentmagic] [file_or_url]\n", prog);
  exit(1);
}

/* parse_int_option -- convert an option argument to an int, false if invalid */
static bool parse_int_option(const char *s, int *result)
{
  char *end;
  long value;

  errno = 0;
  value = strtol(s, &end, 10);
  if (end == s || *end != '\0') return false;	/* No digits or trailing junk */
  if (errno == ERANGE || value < INT_MIN || value > INT_MAX) return false;
  *result = (int)value;
  return true;
}

/* main -- main body
 *
 * Parses the options, reads the document from the file or URL given as
 * argument (or from stdin if there is none), and prints it normalized.
 *
 * Returns 0 on success and 1 if parse errors were reported via
 * handle_error(). Exits with 1 after printing the usage message for
 * wrong arguments, with 2 if the input cannot be opened and with 3 if
 * yyparse() fails.
 */
int main(int argc, char *argv[])
{
  int c, status = 200, number = 0;

  /* Bind the parser callback routines to our handlers */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);

  while ((c = getopt(argc, argv, "edxi:l:sc:L")) != -1)
    switch (c) {
    case 'e': do_endtag = true; break;
    case 'x': do_xml = true; break;
    case 'd': do_doctype = false; break;
    case 'i':
      /* Reject arguments that are not a number instead of silently
         using whatever atoi() would make of them; usage() exits */
      if (!parse_int_option(optarg, &number)) usage(argv[0]);
      set_indent(number);
      break;
    case 'l':
      if (!parse_int_option(optarg, &number)) usage(argv[0]);
      set_linelen(number);
      break;
    case 's': clean_span = true; break;
    case 'c': long_comment = optarg; break;
    case 'L': do_lang = true; break;
    default: usage(argv[0]);
    }
  if (optind == argc) yyin = stdin;
  else if (optind == argc - 1) yyin = fopenurl(argv[optind], "r", &status);
  else usage(argv[0]);
  if (yyin == NULL) {perror(argv[optind]); exit(2);}
  /* status is only changed by fopenurl(); it stays 200 for stdin */
  if (status != 200) errexit("%s : %s\n", argv[optind], http_strerror(status));
  if (yyparse() != 0) {exit(3);}
  tree = get_root(tree);
  prettyprint(tree);
  return has_errors ? 1 : 0;
}