hxmkbib.c - html-xml-utils (5b98a3a6-d5ac-491c-8cfe-5a693ac63d92/upstream/8.6)

Tree @5b98a3a6-d5ac-491c-8cfe-5a693ac63d92/upstream/8.6 (Download .tar.gz)

hxmkbib.c @5b98a3a6-d5ac-491c-8cfe-5a693ac63d92/upstream/8.6 — raw · history · blame

/*
 * mkbib - extract database entries from a db and format them
 *
 * mkbib reads a refer-style database of bibliographic entries, a list
 * of keys and a pattern file and outputs a list of citations
 * formatted according to the pattern and optionally sorted.
 *
 * The keys must correspond to %L fields in the refer database.
 *
 * The pattern file has the following structure:
 *
 * pattern: PREAMBLE entry POSTAMBLE;
 * entry: "{L:" [ TEXT | FIELD | conditional ]* "}";
 * conditional: "{" "!"? F ":" [ TEXT | FIELD | conditional ]* "}";
 *
 * In the output, the entry will be repeated as often as there are
 * unique keys. A FIELD is of the form "%x" and will be replaced by
 * field x of the entry.
 *
 * A part of the form "{x:ZZZ}" will be replaced by ZZZ if field x
 * exists and by nothing otherwise. A part of the form "{!x:ZZZ}" will
 * be replaced by ZZZ if field x does not exist.
 *
 * Occurrences of %x in the preamble (where x is a field name) will
 * not be output, but serve to build up the sort order. The default
 * sort order is to keep entries in the order they occur in the
 * auxfile, but if, e.g., "%A%D%T" occurs in the preamble, entries
 * will be sorted on author, date and title.
 *
 * To insert a literal "{", "}" or "%" in the preamble or in an entry,
 * prefix them with "%": "%{", "%}" and "%%".
 *
 * Usage: mkbib [-a auxfile] [-s sep] [-n maxauthors] [-r moreauthors] bibfile [inputfile]
 *
 * bibfile is a refer-style database.
 *
 * inputfile is the file that serves as template. If absent, stdin
 * is read.
 *
 * -a auxfile gives the name of the list of keys. If absent, the name
 * will be the same as inputfile with the extension (if any)
 * changed to ".aux". If no inputfile is given the default auxfile
 * is "aux.aux". Duplicate keys will only be used once.
 *
 * Note: When the "{x:" and "}" are inside an HTML file, they may be
 * in places where data is not allowed. To make the input file
 * itself valid HTML, it may be necessary to put them inside comments:
 * <!--{x:--> and <!--}-->. If one of them is put inside a comment,
 * the other must be as well.
 *
 * Here is an example of an input file:
 *
 * <html>
 * <title>Bibliography</title>
 * <!-- sort order is Author, Date, Title %A%D%T-->
 * <dl>
 * <!--{L:--><dt id="%L">%L
 * <dd>{A:%A.} <em>{T:%T.}</em> {D:%D. }
 * <!--}--></dl>
 * </html>
 *
 * To do: if the template adds something like "(eds)", allow it to be
 * changed to "(ed)" if there is only one editor.
 *
 * Copyright © 1994-2004 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/copyright-software
 *
 * Author: Bert Bos <bert@w3.org>
 * Created: 19 March 2000
 * Version: $Id: hxmkbib.c,v 1.9 2023/01/23 21:19:41 bbos Exp $
 **/
#include "config.h"
#ifdef HAVE_ERRNO_H
#  include <errno.h>
#endif
#ifdef HAVE_UNISTD_H
#  include <unistd.h>
#endif
#include <stdio.h>
#if STDC_HEADERS
# include <string.h>
#else
# ifndef HAVE_STRCHR
#  define strchr index
#  define strrchr rindex
# endif
#endif
#include <stdlib.h>
#include <assert.h>

#ifdef HAVE_SEARCH_H
#  include <search.h>
#else
#  include "hash.e"		/* Use our own implementation */
#endif

#include <ctype.h>
#include <err.h>
#include <sysexits.h>
#include <stdbool.h>
#include "heap.e"
#include "types.e"


#define LINESIZE 32768
#define INCR 25
/* Warning: arbitrary limit! */
#define HASHSIZE 4096				/* Size of hash table */

static string prog;				/* argv[0] */
static string sortorder = NULL;			/* Default is unsorted */
static string separator = "; ";			/* Separates authors */
static int et_al_limit = 3;			/* Max # of authors to print */
static string et_al = "et al.";			/* String if more authors */


/* escape -- write s to stdout, replacing <, >, & and " by entities.
 *
 * If s is not empty, its final character is stored in *last; for an
 * empty s, *last is left untouched.
 */
static void escape(const string s, unsigned char *last)
{
  const char *p;

  for (p = s; *p != '\0'; p++) {
    if (*p == '<') fputs("&lt;", stdout);
    else if (*p == '>') fputs("&gt;", stdout);
    else if (*p == '&') fputs("&amp;", stdout);
    else if (*p == '"') fputs("&quot;", stdout);
    else putchar(*p);
  }

  /* p now points at the terminator; remember the character before it */
  if (p != s) *last = p[-1];
}


/* field_text -- return the text of a field line, i.e., whatever follows
 * the first three characters (normally "%X "). A line shorter than that
 * yields a pointer to its terminator, never a pointer beyond it. */
static string field_text(const string line)
{
  size_t len = strlen(line);

  return line + (len < 3 ? len : 3);
}


/* put_field -- print field `field' of the entry with label `key'
 *
 * If field is '%', '{' or '}', that character is printed literally and
 * *last is reset to '\0'. Otherwise the entry for key is looked up in
 * the hash table and all lines whose second character equals field are
 * printed (XML-escaped), separated by `separator'. If there are more
 * such lines than `et_al_limit', only the first one is printed,
 * followed by the separator and `et_al'.
 *
 * Only the fields A and E may occur more than once in an entry. A
 * missing entry, a missing field or a duplicate field is reported on
 * stderr and nothing is printed.
 *
 * *last is set to the last character that was printed for the field,
 * so that the caller can suppress a doubled '.'.
 */
static void put_field(const string key, unsigned char field, unsigned char *last)
{
  ENTRY *e, e1 = {key, NULL};
  string *lines;
  size_t len;
  int i, j, nrfields;

  /* ToDo: for fields other than %A and %E use only the last occurrence */
  /* ToDo: interpret and pretty-print dates in a consistent manner */

  if (field == '%' || field == '{' || field == '}') { /* Literal */
    putchar(field);
    *last = '\0';
    return;
  }

  /* Find the entry for key; an entry without data counts as missing */
  if (! (e = hsearch(e1, FIND)) || ! e->data) {
    fprintf(stderr, "%s: entry for key %s not found\n", prog, key);
    return;
  }

  /* Count how many occurrences of %field there are in the entry */
  lines = (string*)e->data;			/* Type cast */
  for (i = 0, nrfields = 0; lines[i]; i++)
    if (lines[i][1] == field) nrfields++;

  /* Check that there is indeed a field */
  if (nrfields == 0) {
    fprintf(stderr, "%s: entry %s has no field %%%c\n", prog, key, field);
    return;
  }

  /* Check that there are no duplicate fields, other than for A and E */
  if (nrfields != 1 && ! (field == 'A' || field == 'E')) {
    fprintf(stderr, "%s: entry %s has duplicate field %%%c\n",
	    prog, key, field);
    return;
  }

  /* Now print the field(s) */
  if (nrfields > et_al_limit) {			/* Print only the first */
    for (i = 0; lines[i][1] != field; i++);	/* nrfields > 0: it exists */
    escape(field_text(lines[i]), last);		/* Print with entities */
    printf("%s%s", separator, et_al);

    /* Record the last character actually printed. Both et_al and
       separator come from the command line and may be empty, in which
       case *last keeps the value that escape() stored. */
    if ((len = strlen(et_al)) != 0) *last = et_al[len - 1];
    else if ((len = strlen(separator)) != 0) *last = separator[len - 1];
  } else {					/* Print all fields */
    for (i = 0, j = 0; lines[i]; i++) {
      if (lines[i][1] == field) {		/* Found it */
	if (j != 0) printf("%s", separator);	/* Multiple fields */
	escape(field_text(lines[i]), last);	/* Print with entities */
	j++;
      }
    }
  }
}


/* get_field -- return the first line of key's entry that holds field f
 *
 * The entry for key must exist in the hash table and must have data
 * (both are asserted). Returns the complete line, including its "%f "
 * prefix, or NULL if no line of the entry has f as second character.
 */
static string get_field(const string key, const unsigned char f)
{
  ENTRY probe = {key, NULL};
  ENTRY *found;
  string *p;

  /* Look up the entry */
  found = hsearch(probe, FIND);
  assert(found != NULL);
  assert(found->data != NULL);

  /* Walk the NULL-terminated array of lines until one matches */
  p = (string*)found->data;
  while (*p != NULL && (*p)[1] != f) p++;

  /* A matching line must look like "%f ..." */
  assert(*p == NULL || ((*p)[0] == '%' && (*p)[2] == ' '));
  return *p;
}


/* compare_keys -- return the relative sort order for two keys: -1, 0, 1 */
static int compare_keys(const void *aptr, const void *bptr)
{
  ENTRY e, *ae, *be;
  int c, i;
  string af, bf, a = *(string*)aptr, b = *(string*)bptr;

  /* Get the entry for key a */
  e.key = a;
  ae = hsearch(e, FIND);
  assert(ae != NULL);

  /* Get the entry for key b */
  e.key = b;
  be = hsearch(e, FIND);
  assert(be != NULL);

  /* Loop over sortorder, stop as soon as entries a and b are unequal */
  for (i = 0, c = 0; c == 0 && sortorder[i]; i++) {
    af = get_field(a, sortorder[i]);
    bf = get_field(b, sortorder[i]);
    c = strcmp(af ? af : (string)"", bf ? bf : (string)"");
  }

  return c;
}


/* sort_keys -- sort the n keys in place, using compare_keys
 *
 * A sort order must have been established (sortorder is asserted to
 * be non-NULL).
 */
static void sort_keys(string *keys, const int n)
{
  const size_t width = sizeof(keys[0]);

  assert(sortorder != NULL);
  qsort(keys, n, width, compare_keys);
}


/* conditional -- conditionally copy a %{...%} segment
 *
 * pattern points at the '{' of a segment "{x:...%}" or "{!x:...%}"
 * (the '%' that precedes the '{' has already been consumed by the
 * caller). The segment is copied to stdout if the entry for key has a
 * field x, or, in the '!' form, if it has no field x; otherwise the
 * segment is skipped.
 *
 * While copying: "%{" starts a nested segment, which is handled
 * recursively; "%}" ends the segment; "%%" prints a '%'; any other
 * "%c" prints field c of the entry via put_field(). All other
 * characters are printed as they are, except that a '.' is dropped
 * if *last is '.', i.e., if the field just printed ended in a '.'.
 *
 * *last is reset to '\0' whenever a character is printed or dropped
 * here; put_field() updates it for fields.
 *
 * Returns the index, relative to pattern, of the '}' that closes the
 * segment. Exits with EX_DATAERR if the ':' after the field letter
 * is missing.
 *
 * Neither loop tests for the end of the string: copy() verifies that
 * "%{" and "%}" balance before it calls this function.
 */
static int conditional(const string pattern, const string key,
		       unsigned char *last)
{
  bool on;
  int level, i = 1;

  /* Pattern starts with '{' */
  assert(pattern[0] == '{' && pattern[1] != '\0');

  /* Check the condition. After this, i is the index of the field letter */
  if (pattern[i] == '!') on = !get_field(key, pattern[++i]);
  else on = get_field(key, pattern[i]) != NULL;

  if (pattern[i+1] != ':') errx(EX_DATAERR, "missing ':' in pattern");

  /* Skip or copy until matching '%}' */
  if (! on) {					/* Skip until matching '}' */
    /* i += 2 steps over the field letter and the ':'. Count nested
       "%{" and "%}" until the level drops back to 0 */
    for (i += 2, level = 1; level != 0; i++)
      if (pattern[i] == '%') {
	if (pattern[++i] == '{') level++;
	else if (pattern[i] == '}') level--;
      }
    /* The loop increment moved i one past the final '}' */
    i--;					/* i points to '}' */
  } else {					/* Recursively copy segment */
    for (i += 2; true; i++)
      if (pattern[i] == '%') {
	/* The recursive call returns the offset of the nested '}', so
	   adding it to i leaves i at that '}' */
	if (pattern[++i] == '{') i += conditional(pattern + i, key, last);
	else if (pattern[i] == '}') break;
	else if (pattern[i] == '%') {putchar('%'); *last = '\0';}
	else put_field(key, pattern[i], last);
      } else if (*last != '.' || pattern[i] != '.') {
	putchar(pattern[i]);
	*last = '\0';
      } else {
	/* The previous field ended in '.' and the pattern adds one */
	*last = '\0';				/* Don't print this '.' */
      }
  }
  
  return i;					/* Points at '}' */
}


/* copy -- copy pattern to stdout, expanding the entry template per key
 *
 * The pattern consists of a preamble, an entry template "%{...%}" and
 * a postamble.
 *
 * The preamble (everything before the first "%{") is copied to
 * stdout, except that each "%X" with X an uppercase letter is not
 * printed but appended to the global sortorder. Any other "%c" is
 * copied unchanged. If the preamble defined a sort order, the n keys
 * are sorted in place before the template is expanded.
 *
 * The template is expanded once for every key, by conditional(). The
 * postamble (everything after the matching "%}") is copied verbatim.
 *
 * If the pattern contains no "%{", a warning is printed and only the
 * preamble is output. If the "%{" has no matching "%}", the program
 * exits with EX_DATAERR.
 *
 * sortorder must still be NULL on entry (asserted).
 */
static void copy(const string pattern, string *keys, const int n)
{
  int j, start, end, level, slen = 0;
  unsigned char last = '\0';			/* Last char of field */

  assert(sortorder == NULL);

  /* ToDo: Find a way to declare the separator in the source. Maybe {&:...} */

  /* Find first '%{'. Also look for sort order */
  for (start = 0; pattern[start]; start++) {
    if (pattern[start] != '%') {		/* Normal character */
      putchar(pattern[start]);
    } else if (pattern[start+1] == '\0') {	/* '%' is the last char */
      /* Print it and stop at the terminator. Looking at the character
	 after the '%' and then incrementing once more would move past
	 the end of the string. */
      putchar('%');
      start++;
      break;
    } else if (pattern[++start] == '{') {	/* Start of template */
      break;
    } else if ('A' <= pattern[start] && pattern[start] <= 'Z') {
      renewarray(sortorder, slen + 2);		/* Sort order */
      sortorder[slen] = pattern[start];
      sortorder[++slen] = '\0';
    } else {
      putchar('%');				/* Not special */
      putchar(pattern[start]);
    }
  }
  if (!pattern[start]) {
    fprintf(stderr, "%s: warning: no '%%{' in input file\n", prog);
    return;					/* Nothing more to copy */
  }

  /* Sort the keys if there was a sort order */
  if (sortorder) sort_keys(keys, n);

  /* Start now points to '{'. Find matching '%}' */
  for (end = start + 1, level = 1; pattern[end] && level != 0; end++) {
    if (pattern[end] == '%') {
      if (pattern[end+1] == '\0') break;	/* Dangling '%': unbalanced */
      end++;
      if (pattern[end] == '}') level--;
      else if (pattern[end] == '{') level++;
    }
  }
  if (level != 0) errx(EX_DATAERR, "unbalanced %%{..%%} in pattern");

  /* End now points just after '}'. Loop over keys */
  for (j = 0; j < n; j++)
    conditional(pattern + start, keys[j], &last);

  /* Copy postamble */
  printf("%s", pattern + end);
}


/* in_list -- return true if one of the first n strings of list equals s
 *
 * Strings are compared with strcmp(). With n <= 0 the list is not
 * accessed and the result is false.
 */
static bool in_list(const string s, const string *list, const int n)
{
  int k;

  for (k = 0; k < n; k++)
    if (strcmp(s, list[k]) == 0) return true;

  return false;
}


/* read_keys -- read the list of keys from file f
 *
 * Every line of f, with trailing whitespace removed, is a key. Keys
 * that were already seen are skipped, so each key occurs only once
 * in the result, in order of first occurrence.
 *
 * Returns a newly allocated array of newly allocated strings (NULL if
 * there were no lines) and stores the number of keys in *number.
 * Exits with EX_IOERR if reading f fails.
 */
static string *read_keys(FILE *f, int *number)
{
  int n = 0;
  size_t len;
  char line[LINESIZE];
  string *keys = NULL;

  clearerr(f);
  while (fgets(line, sizeof(line), f)) {

    /* Remove trailing \n and other whitespace. The cast is needed
       because isspace() is undefined for negative char values. */
    for (len = strlen(line);
	 len > 0 && isspace((unsigned char)line[len-1]); len--) ;
    line[len] = '\0';

    /* ToDo: linear search fast enough? Books don't have 1000's of refs... */
    if (! in_list(line, keys, n)) {
      renewarray(keys, INCR * ((n + 1)/INCR + 1));
      keys[n++] = newstring(line);
    }
  }

  /* ferror() returns a flag, not an error number, so it must not be
     passed to strerror(). err() reports errno, which still holds the
     cause of the failed read. */
  if (ferror(f)) err(EX_IOERR, "reading list of keys");

  *number = n;
  return keys;
}


/* check_and_store_entry -- finish an entry and store it if it is needed
 *
 * lines holds the n field lines of one entry. The array is resized to
 * make room for a terminating NULL, which is stored at index n. If key
 * is not NULL and occurs in the hash table, the array becomes the data
 * of that hash table entry; otherwise nothing is stored.
 */
static void check_and_store_entry(const string key, string *lines, int n)
{
  ENTRY probe, *slot;

  renewarray(lines, INCR * ((n + 1)/INCR + 1));
  lines[n] = NULL;				/* Mark end of entry */

  if (! key) return;				/* Entry has no key at all */

  probe.key = key;
  slot = hsearch(probe, FIND);			/* Do we need this entry? */
  if (slot) slot->data = (char*)lines;		/* Replace its data field */
}


/* read_entries -- read the relevant entries from the refer database
 *
 * First, all n keys are entered into the hash table with NULL data.
 * Then f is read line by line. Consecutive lines that start with '%'
 * form one entry; any other line ends the current entry. Trailing
 * whitespace is removed from each field line. The key of an entry is
 * the text after "%L ". A complete entry is handed to
 * check_and_store_entry(), which attaches it to the hash table if its
 * key is one of the requested keys.
 *
 * Exits with EX_OSERR if a key cannot be entered into the hash table,
 * with EX_IOERR if reading f fails and with EX_DATAERR if, after
 * reading, one of the keys has no entry.
 */
static void read_entries(FILE *f, const string *keys, const int n)
{
  char line[LINESIZE];
  string *lines = NULL;				/* Lines of current entry */
  string key = NULL;				/* Key of current entry */
  ENTRY e, *e1;
  size_t len;
  int i;

  /* First enter all keys into the hash table without any data */
  for (i = 0; i < n; i++) {
    e.key = newstring(keys[i]);
    e.data = NULL;
    if (! hsearch(e, ENTER)) err(EX_OSERR, NULL);
  }

  /* Now read entries from the database */
  clearerr(f);
  i = 0;					/* # of lines in entry */
  while (fgets(line, sizeof(line), f)) {

    if (line[0] != '%') {			/* Separator line */
      if (i != 0) {				/* We were in an entry */
	check_and_store_entry(key, lines, i);
	i = 0;					/* Reset */
	key = NULL;				/* Reset */
	lines = NULL;				/* Reset */
      }
    } else {					/* This line is a field */
      /* Remove trailing spaces. The cast is needed because isspace()
	 is undefined for negative char values. */
      for (len = strlen(line);
	   len > 0 && isspace((unsigned char)line[len-1]); len--) ;
      line[len] = '\0';
      renewarray(lines, INCR * ((i + 1)/INCR + 1));
      lines[i] = newstring(line);
      if (strncmp(lines[i], "%L ", 3) == 0) key = lines[i] + 3;
      i++;
    }
  }

  /* ferror() returns a flag, not an error number, so it must not be
     passed to strerror(). err() reports errno, which still holds the
     cause of the failed read. */
  if (ferror(f)) err(EX_IOERR, "reading database");

  /* Check if last entry was already stored */
  if (i != 0)					/* We were still in an entry */
    check_and_store_entry(key, lines, i);

  /* Check that we found all keys */
  e.data = NULL;
  for (i = 0; i < n; i++) {
    e.key = keys[i];
    e1 = hsearch(e, FIND);
    assert(e1);
    if (! e1->data) errx(EX_DATAERR, "entry for \"%s\" not found", keys[i]);
  }

}


/* read_pattern -- read the input file into memory
 *
 * Reads f until end of file, in chunks of LINESIZE bytes, and returns
 * the contents as a newly allocated, NUL-terminated string. Exits
 * with EX_IOERR if a read error occurs.
 */
static string read_pattern(FILE *f)
{
  string p = NULL;
  size_t n, len = 0;

  /* Stop on a read error as well as on end of file: after an error,
     feof() stays false, so testing only feof() would loop forever. */
  do {
    renewarray(p, len + LINESIZE + 1);		/* +1 for the final '\0' */
    n = fread(p + len, sizeof(*p), LINESIZE, f);
    len += n;
  } while (! feof(f) && ! ferror(f));

  if (ferror(f)) err(EX_IOERR, "reading pattern");

  p[len] = '\0';
  return p;
}


/* usage -- print the version and a summary of the command line on
 * stderr, then exit with status EX_USAGE */
static void usage(void)
{
  fprintf(stderr,
	  "Version %s\n"
	  "Usage: %s [-a auxfile] [-s sep] [-n maxauthors] [-r moreauthors] "
	  "bibfile [inputfile]\n",
	  VERSION, prog);
  exit(EX_USAGE);
}


/* main - main body
 *
 * Command line: [-a auxfile] [-s sep] [-n maxauthors] [-r moreauthors]
 * bibfile [inputfile]
 *
 * -a sets the file with the list of keys, -s the separator printed
 * between multiple authors/editors, -n the maximum number of
 * authors/editors that is printed in full and -r the text that
 * replaces the remaining ones. Without -a, the name of the auxfile is
 * derived from inputfile by replacing its extension (if any) with
 * ".aux"; if there is no inputfile either, it is "aux.aux".
 *
 * Reads the pattern (from inputfile or stdin), the keys and the
 * database entries for those keys, in that order, and writes the
 * expanded pattern to stdout.
 */
int main(int argc, char *argv[])
{
  string auxfile = NULL, pattern, inputfile = NULL, dbfile;
  string dot, slash, end;
  string *keys = NULL;
  FILE *f, *db, *aux;
  long limit;
  int c, n;

  /* Parse command line */
  prog = argv[0];
  while ((c = getopt(argc, argv, "a:s:n:r:")) != -1) {
    switch (c) {
    case 'a': auxfile = optarg; break;
    case 's': separator = optarg; break;
    case 'n':
      /* Reject arguments that are not a number or do not fit in an
	 int, rather than silently treating them as 0 */
      limit = strtol(optarg, &end, 10);
      if (end == optarg || *end != '\0' || limit != (int)limit) usage();
      et_al_limit = (int)limit;
      break;
    case 'r': et_al = optarg; break;
    default: usage();
    }
  }
  if (optind == argc || argc > optind + 2) usage();

  /* First argument is refer database */
  dbfile = argv[optind++];

  /* Optional second argument is input file */
  if (optind != argc) inputfile = argv[optind];

  /* If we don't have an explicit auxfile yet, derive its name */
  if (! auxfile) {
    if (! inputfile) {
      auxfile = "aux.aux";
    } else {
      newarray(auxfile, strlen(inputfile) + 5);	/* Room for ".aux" + '\0' */
      strcpy(auxfile, inputfile);
      /* Only a dot in the last path component starts an extension. A
	 dot in a directory name (as in "./file") must be left alone. */
      dot = strrchr(auxfile, '.');
      slash = strrchr(auxfile, '/');
      if (dot && (! slash || dot > slash)) *dot = '\0';
      strcat(auxfile, ".aux");
    }
  }

  /* Create a hash table */
  if (! hcreate(HASHSIZE)) err(EX_OSERR, "creating hash table");

  /* Read the pattern *before* the auxfile, so that we have a chance
     that the pipeline
     "hxcite -a aux bibfile file | hxmkbib -a aux bibfile"
     works. I.e., that hxcite finished writing the auxfile before we
     start reading it. */

  /* Read pattern into memory */
  if (! (f = inputfile ? fopen(inputfile, "r") : stdin))
    err(EX_IOERR, "%s", inputfile);
  pattern = read_pattern(f);
  /* inputfile is NULL when the pattern came from stdin */
  if (fclose(f) != 0) err(EX_IOERR, "%s", inputfile ? inputfile : "stdin");

  /* Read keys from aux file */
  if (! (aux = fopen(auxfile, "r"))) err(EX_IOERR, "%s", auxfile);
  keys = read_keys(aux, &n);
  if (fclose(aux) != 0) err(EX_IOERR, "%s", auxfile);

  /* Read the entries we need from the database */
  if (! (db = fopen(dbfile, "r"))) err(EX_IOERR, "%s", dbfile);
  read_entries(db, keys, n);
  if (fclose(db) != 0) err(EX_IOERR, "%s", dbfile);

  /* Copy and expand the pattern */
  copy(pattern, keys, n);

  return 0;
}