/*
* Insert an index between "<!--begin-index-->" and "<!--end-index-->",
* or replacing the comment "<!--index-->"
*
* The index links to elements with ID attributes as well as with
* empty <A NAME> elements.
*
* Any <A> tags with a class of "bctarget" are not copied, but
* regenerated. They are assumed to be backwards-compatible versions
* of ID attributes on their parent elements. But if the option -t or
* -x is given, those <A> elements are removed.
*
* There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
*
* Index terms are elements with a class of "index", "index-inst" or
* "index-def", as well as all <dfn> elements. The contents of the
* element is the index term, unless the element has a title
* attribute. The title attribute can contain "|" and "!!":
*
* "term"
* "term1|term2|term3|..."
* "term!!subterm!!subsubterm!!..."
* "term1!!subterm1|term2!!subterm2|..."
* etc.
*
* For backward compatibility with an earlier Perl program, "::" is
* accepted as an alternative for "!!", but it is better not to use
* both separators in the same project, since the sorting may be
* adversely affected.
*
* Class "index-def" results in a bold entry in the index, "index" in
* a normal one. "index-inst" is an alias for "index", provided for
* backward compatibility.
*
* To do: an option to split the index at each new first letter.
*
* Copyright © 1994-2005 World Wide Web Consortium
* See http://www.w3.org/Consortium/Legal/copyright-software
*
* Author: Bert Bos <bert@w3.org>
* Created: 11 Apr 2000
* Version: $Id: hxindex.c,v 1.25 2023/01/23 21:19:41 bbos Exp $
*
**/
#include "config.h"
#include <assert.h>
#include <locale.h>
#include <wchar.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <iconv.h>
#include <unistd.h>
#include <err.h>
#include <sysexits.h>
#include <stdbool.h>
#if STDC_HEADERS
# include <string.h>
#else
# ifndef HAVE_STRCHR
# define strchr index
# define strrchr rindex
# endif
# ifndef HAVE_STRSTR
# include "strstr.e"
# endif
#endif
#ifdef HAVE_ERRNO_H
# include <errno.h>
#else
extern int errno;
char *strerror(int errnum);
int strerror_r(int errnum, char *buf, size_t n);
#endif
#ifdef HAVE_SEARCH_H
# include <search.h>
#else
# include "search-freebsd.h"
#endif
#include "export.h"
#include "types.e"
#include "heap.e"
#include "tree.e"
#include "html.e"
#include "scan.e"
#include "dict.e"
#include "openurl.e"
#include "genid.e"
#include "class.e"
#undef USE_DATA_ATTRIBUTE /* Data attributes are a proposal in HTML5 */
#define BEGIN_INDEX "begin-index" /* <!--begin-index--> */
#define END_INDEX "end-index" /* <!--end-index--> */
#define INDEX "index" /* <!--index--> */
#define INDEX_INST "index-inst" /* class="index-inst" */
#define INDEX_DEF "index-def" /* class="index-def" */
#define TARGET "bctarget" /* CLASS="...bctarget..." */
#define MAXSUBS 20 /* Max. depth of subterms */
#define SECNO "secno" /* Class of elements that define section # */
#define NO_NUM "no-num" /* Class of elements without a section # */
/* Indexterm -- one index entry: a term with its subterms and link target */
typedef struct _indexterm {
string url; /* Target of the link (base + "#" + ID) */
int importance; /* 1 (low) or 2 (high) */
string secno; /* For option -n */
string sectitle; /* For option -N */
string doctitle; /* Title of the document containing the term */
string *terms; /* Array of subterms */
wchar_t **sortkeys; /* Array of normalized subterms */
int nrkeys; /* Length of terms and sortkeys arrays */
} *Indexterm;
/* File-scope state: the document tree, the command line options and the
 * variables through which the twalk() callbacks communicate */
static Tree tree; /* Document tree built by the parser callbacks */
static bool xml = false; /* Use <empty /> convention */
static string base = NULL; /* (Rel.) URL of output file */
static string indexdb = NULL; /* Persistent store of terms */
static string* userclassnames = NULL; /* Extra classes marking terms (-c) */
static FILE *globalfile; /* Must be global for twalk */
static Indexterm globalprevious; /* Must be global for twalk */
static string globalurlprevious;/* Must be global for twalk */
static bool bctarget = true; /* Add <A name=> after IDs */
static bool use_secno = false; /* Anchor text is section # instead of "#" */
static bool use_sectitle = false; /* Anchor text is section title, not # */
static bool final = false; /* Remove used attributes from document */
static bool trim_punct = true; /* Remove trailing punctuation from terms */
static string section_name = NULL; /* Term meaning "section %s" */
static string unknown_name = NULL; /* Term meaning "without number" */
static string* exclude_elts = NULL; /* Don't index these elements */
static string* only_elts = NULL; /* Only index these elements */
/* handle_error -- parser callback: report a parse error on stderr */
static void handle_error(void *clientdata, const string s, int lineno)
{
  (void) clientdata;                    /* Not used */
  fprintf(stderr, "%d: %s\n", lineno, s);
}
/* start -- parser callback, called before the first event: create the tree.
 * No client data is needed, so NULL is returned. */
static void *start(void)
{
  tree = create();

  return NULL;
}
/* end -- parser callback, called after the last event: nothing to do */
static void end(void *clientdata)
{
  (void) clientdata;                    /* Not used */
}
/* handle_comment -- parser callback: add a parsed comment to the tree */
static void handle_comment(void *clientdata, string commenttext)
{
  (void) clientdata;                    /* Not used */
  tree = append_comment(tree, commenttext);
}
/* handle_text -- parser callback: add a parsed text chunk to the tree */
static void handle_text(void *clientdata, string text)
{
  (void) clientdata;                    /* Not used */
  tree = append_text(tree, text);
}
/* handle_decl -- parser callback: add a parsed declaration to the tree */
static void handle_decl(void *clientdata, string gi, string fpi, string url)
{
  (void) clientdata;                    /* Not used */
  tree = append_declaration(tree, gi, fpi, url);
}
/* handle_pi -- parser callback: add a processing instruction to the tree */
static void handle_pi(void *clientdata, string pi_text)
{
  (void) clientdata;                    /* Not used */
  tree = append_procins(tree, pi_text);
}
/* handle_starttag -- parser callback: open an element in the tree */
static void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  conststring id;

  (void) clientdata;                    /* Not used */
  tree = html_push(tree, name, attribs);

  /* Remember an existing ID, so it isn't accidentally generated again */
  id = pairlist_get(attribs, "id");
  if (id != NULL) storeID(id);
}
/* handle_emptytag -- parser callback: add an empty element to the tree */
static void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
  conststring existing_id;

  (void) clientdata;                    /* Not used */
  tree = html_push(tree, name, attribs);

  /* Remember an existing ID, so it isn't accidentally generated again */
  existing_id = pairlist_get(attribs, "id");
  if (existing_id != NULL) storeID(existing_id);
}
/* handle_endtag -- parser callback: close an element (name may be "") */
static void handle_endtag(void *clientdata, string name)
{
  (void) clientdata;                    /* Not used */
  tree = html_pop(tree, name);
}
/* trim -- remove leading and trailing white space, collapse white space
 *
 * Works in place on s; a NULL s is ignored. Every run of white space
 * inside the string is replaced by a single space character.
 */
static void trim(string s)
{
  size_t from, to;

  if (!s) return;

  /* The casts matter: the text is UTF-8 and plain char may be signed */
  for (from = 0; isspace((unsigned char)s[from]); from++)
    ;                                   /* Skip leading white space */

  /* The write index never overtakes the read index, so no copy is needed */
  for (to = 0; s[from]; from++) {
    if (!isspace((unsigned char)s[from])) s[to++] = s[from];
    else if (to > 0 && s[to-1] != ' ') s[to++] = ' ';
  }

  if (to > 0 && s[to-1] == ' ') to--;   /* Drop the trailing space */
  s[to] = '\0';
}
/* parse_subterms -- parse s to create terms & sortkeys array in an Indexterm */
static void parse_subterms(const Indexterm term, const conststring s)
{
enum {TEXT, TAG, DQUOTE, SQUOTE} state;
string h, k, p, q;
iconv_t cd;
size_t len, len2;
int i;
/* Create the terms array and count the number of subterms */
h = newstring(s);
trim(h);
term->nrkeys = 1;
newarray(term->terms, 1);
term->terms[0] = h;
while ((k = strstr(h, "!!")) || (k = strstr(h, "::"))) {
h = k + 2;
*k = '\0';
renewarray(term->terms, term->nrkeys + 1);
trim(h); /* Remove leading & trailing space */
term->terms[term->nrkeys] = h; /* All terms point into h */
term->nrkeys++;
}
/* Create the sortkeys array by normalizing each term */
newarray(term->sortkeys, term->nrkeys);
for (i = 0; i < term->nrkeys; i++) {
/* First remove mark-up and expand the standard XML entities */
h = newstring(term->terms[i]);
for (state = TEXT, p = q = h; *p; p++) {
switch (state) {
case TEXT:
if (*p == '<') state = TAG;
else if (hasprefix(p, "<")) {*(q++) = '<'; p += 3;}
else if (hasprefix(p, "<")) {*(q++) = '<'; p += 4;}
else if (hasprefix(p, "<")) {*(q++) = '<'; p += 5;}
else if (hasprefix(p, "<")) {*(q++) = '<'; p += 5;}
else if (hasprefix(p, ">")) {*(q++) = '>'; p += 3;}
else if (hasprefix(p, ">")) {*(q++) = '>'; p += 4;}
else if (hasprefix(p, ">")) {*(q++) = '>'; p += 5;}
else if (hasprefix(p, ">")) {*(q++) = '>'; p += 5;}
else if (hasprefix(p, """)) {*(q++) = '"'; p += 5;}
else if (hasprefix(p, """)) {*(q++) = '"'; p += 4;}
else if (hasprefix(p, """)) {*(q++) = '"'; p += 5;}
else if (hasprefix(p, "&")) {*(q++) = '&'; p += 4;}
else if (hasprefix(p, "&")) {*(q++) = '&'; p += 4;}
else if (hasprefix(p, "&")) {*(q++) = '&'; p += 5;}
else *(q++) = tolower(*p);
break;
case TAG:
if (*p == '>') state = TEXT;
else if (*p == '"') state = DQUOTE;
else if (*p == '\'') state = SQUOTE;
break;
case DQUOTE:
if (*p == '"') state = TAG;
break;
case SQUOTE:
if (*p == '\'') state = TAG;
break;
default:
assert(!"Cannot happen!");
}
}
*q = '\0';
if (trim_punct) {
/* Remove some trailing white space and punctuation */
for (q--; q != h && strspn(q, " \r\n\t\f\v,:;!?"); q--) *q = '\0';
/* Remove final '.' only if it is the only '.' in the term */
if ((q = strrchr(h, '.')) && !*(q+1) && q == strchr(h, '.')) *q = '\0';
}
/* Then convert from UTF-8 to wchar_t */
cd = iconv_open("wchar_t", "UTF-8");
if (cd == (iconv_t)(-1)) err(EX_OSERR, NULL);
len = strlen(h) + 1;
newarray(term->sortkeys[i], len); /* Large enough */
p = (string) term->sortkeys[i];
len2 = len * sizeof(term->sortkeys[i][0]);
if (iconv(cd, &h, &len, &p, &len2) == (size_t)(-1)) err(EX_OSERR, NULL);
/* *p = L'\0'; */
if (iconv_close(cd) == -1) err(EX_OSERR, NULL);
}
}
/* folding_cmp -- compare two arrays of sort keys
 *
 * The keys are compared pairwise with wcscoll(); the first pair that
 * differs decides. If one array is a prefix of the other, the shorter
 * one sorts first (-1 if that is a, 1 if it is b). Returns 0 if the
 * arrays are equal.
 */
static int folding_cmp(wchar_t **a, const int alen, wchar_t **b, const int blen)
{
  const int shared = alen < blen ? alen : blen;
  int k, order;

  assert(a && alen >= 0);
  assert(b && blen >= 0);

  for (k = 0; k < shared; k++) {
    assert(a[k] && b[k]);
    order = wcscoll(a[k], b[k]);
    if (order != 0) return order;
  }

  /* All shared keys are equal: the length decides */
  if (alen == blen) return 0;
  return alen < blen ? -1 : 1;
}
/* indent -- print newline and n times 2 spaces
 *
 * n is the nesting level; values <= 0 print only the newline.
 */
static void indent(int n)
{
  putchar('\n');
  for (; n > 0; n--) fputs("  ", stdout);       /* Two spaces per level */
}
/* print_escaped -- print s escaped for use in an attribute
 *
 * Only the double quote is replaced (by &quot;); all other characters
 * are written to stdout unchanged.
 */
static void print_escaped(const conststring s)
{
  conststring h;

  for (h = s; *h; h++) {
    if (*h == '"') fputs("&quot;", stdout);
    else putchar(*h);
  }
}
/* print_title -- print a TITLE attribute */
static void print_title(const Indexterm term)
{
enum {TEXT, TAG, DQUOTE, SQUOTE} state;
string h;
assert(use_secno);
fputs(" title=\"", stdout);
if (base[0]) { /* Only add document titles if needed */
for (state = TEXT, h = term->doctitle; *h; h++) {
switch (state) {
case TEXT:
if (*h == '<') state = TAG;
else if (*h == '"') fputs(""", stdout);
else putchar(*h);
break;
case TAG:
if (*h == '>') state = TEXT;
else if (*h == '"') state = DQUOTE;
else if (*h == '\'') state = SQUOTE;
break;
case DQUOTE:
if (*h == '"') state = TAG;
break;
case SQUOTE:
if (*h == '\'') state = TAG;
break;
default:
assert(!"Cannot happen!");
}
}
fputs(", ", stdout);
}
for (h = section_name; *h; h++)
switch (*h) {
case '"': fputs(""", stdout); break;
case '>': fputs(">", stdout); break;
case '<': fputs("<", stdout); break;
case '%':
if (*(h+1) == '%') {putchar('%'); h++;}
else if (*(h+1) != 's') putchar('%');
else if (term->secno) {print_escaped(term->secno); h++;}
else {print_escaped(unknown_name); h++;}
break;
default: putchar(*h);
}
putchar('"');
}
/* write_index_item -- write one item in the list of index terms
 *
 * Action routine for twalk(). term1 points to an Indexterm stored in
 * the tree. Only the postorder and leaf visits produce output. The
 * item is written relative to the previously written term, which is
 * remembered in globalprevious: subterms that are equal to those of
 * the previous term are not repeated, only the new subterms and a link
 * are written. */
static void write_index_item(const void *term1, const VISIT which,
const int depth)
{
Indexterm term = *(Indexterm*)term1;
int i, j;
/* Ignore the other visits of an internal node */
if (which != postorder && which != leaf) return;
/* Count how many subterms are equal to the previous entry */
i = 0;
while (i < min(term->nrkeys, globalprevious->nrkeys) &&
!folding_cmp(term->sortkeys + i, 1, globalprevious->sortkeys + i, 1))
i++;
/* Close lists as needed */
for (j = globalprevious->nrkeys - 1; j > i; j--) {
indent(j);
printf("</ul>");
}
/* Open a list if needed (previous term is a proper prefix of this one) */
if (term->nrkeys > globalprevious->nrkeys && globalprevious->nrkeys == i) {
indent(i);
printf("<ul>");
}
/* Print new subterms, if any; all but the last open a sublist */
for (j = i; j < term->nrkeys; j++) {
indent(j);
printf("<li>%s", term->terms[j]);
if (j != term->nrkeys - 1) {
indent(j + 1);
printf("<ul>");
}
}
/* Print a link */
printf(", ");
printf("<a href=\"");
print_escaped(term->url);
printf("\"");
if (use_secno) print_title(term);
/* Importance 2 gives a bold link */
if (term->importance == 2) printf("><strong>"); else printf(">");
/* Anchor text: section title (or document title), "#" or section number */
if (use_sectitle)
printf("%s", term->sectitle ? term->sectitle : term->doctitle);
else if (!use_secno) putchar('#');
else if (term->secno) print_escaped(term->secno);
else print_escaped(unknown_name);
if (term->importance == 2) printf("</strong></a>"); else printf("</a>");
/* Remember this term */
globalprevious = term;
globalurlprevious = term->url;
}
/* mkindex -- write out an index
 *
 * Writes the terms in the tree terms as nested <ul> lists to stdout.
 * The walker compares each term with the previous one (globalprevious),
 * so a sentinel term is created to compare the first real term with.
 */
static void mkindex(Indexterm terms)
{
  Indexterm sentinel;
  int i;

  printf("<ul class=\"indexlist\">");

  /* Initialize globalprevious to a term with an unlikely sortkey */
  new(sentinel);
  sentinel->nrkeys = 1;
  newarray(sentinel->sortkeys, sentinel->nrkeys);
  newarray(sentinel->sortkeys[0], 15);
  wcscpy(sentinel->sortkeys[0], L"zzzzzzzzzzzzzz");
  globalprevious = sentinel;

  twalk(terms, write_index_item);

  /* Close all open lists */
  for (i = 0; i < globalprevious->nrkeys; i++) printf("\n</ul>");

  /* The sentinel is not part of the tree, so it must be released here */
  if (globalprevious == sentinel) globalprevious = NULL;
  dispose(sentinel->sortkeys[0]);
  dispose(sentinel->sortkeys);
  dispose(sentinel);
}
/* expand -- write the tree, add <A NAME> if needed and replace <!--index-->
 *
 * Writes the children of t to stdout. A comment <!--index--> or
 * <!--begin-index--> is replaced by a newly generated index. After a
 * <!--begin-index-->, *write is set to false until <!--end-index-->, so
 * that the old index is not copied. <A> elements with class TARGET are
 * dropped (their contents are kept) and, unless disabled by bctarget or
 * xml, regenerated after the start tag of each mixed-content element
 * that has an ID.
 */
static void expand(Tree t, bool *write, Indexterm terms)
{
  conststring val;
  Tree h;
  pairlist a;
  string s;
  bool do_tag;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
    case Text:
      if (*write) printf("%s", h->text);
      break;
    case Comment:
      s = newstring(h->text);
      trim(s);
      if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
        if (!final) printf("<!--%s-->\n", BEGIN_INDEX);
        mkindex(terms);
        if (!final) printf("<!--%s-->", END_INDEX);
        if (eq(s, BEGIN_INDEX)) *write = false; /* Skip old index */
      } else if (eq(s, END_INDEX)) {
        *write = true;
      } else if (*write) {
        /* Like text and elements, comments inside an old index are skipped */
        printf("<!--%s-->", h->text);
      }
      dispose(s);
      break;
    case Declaration:
      printf("<!DOCTYPE %s", h->name);
      if (h->text) printf(" PUBLIC \"%s\"", h->text);
      if (h->url) printf(" %s\"%s\"", h->text ? "" : "SYSTEM ", h->url);
      printf(">");
      break;
    case Procins:
      if (*write) printf("<?%s>", h->text);
      break;
    case Element:
      if (*write) {
        /* If an <a> was inserted by index itself, remove it */
        do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
        if (do_tag) {
          printf("<%s", h->name);
          for (a = h->attribs; a != NULL; a = a->next) {
            printf(" %s", a->name);
            if (a->value != NULL) printf("=\"%s\"", a->value);
          }
          assert(! is_empty(h->name) || h->children == NULL);
          fputs(xml && is_empty(h->name) ? " />" : ">", stdout);
          /* Insert an <A NAME> if element has an ID and is not <A> */
          if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
              && !eq(h->name, "a") && ! xml)
            printf("<a class=\"%s\" name=\"%s\"></a>", TARGET, val);
        }
        expand(h, write, terms);
        if (do_tag && ! is_empty(h->name)) printf("</%s>", h->name);
      }
      break;
    case Root:
      assert(! "Cannot happen");
      break;
    default:
      assert(! "Cannot happen");
    }
  }
}
/* termcmp -- comparison routine for Indexterms */
static int termcmp(const void *a1, const void *b1)
{
Indexterm a = (Indexterm)a1, b = (Indexterm)b1;
int r;
assert(a);
assert(b);
assert(a->sortkeys);
assert(b->sortkeys);
assert(a->nrkeys > 0);
assert(b->nrkeys > 0);
r = folding_cmp(a->sortkeys, a->nrkeys, b->sortkeys, b->nrkeys);
if (r != 0) return r;
return strcmp(a->url, b->url); /* Terms are equal, compare URL instead */
}
/* copy_contents -- recursively expand contents of element t into a string
 *
 * Appends the contents of t to *s, which may be NULL on entry (and
 * remains NULL if there is nothing to copy). In text, every white space
 * character is replaced by a space. Only a fixed set of inline elements
 * keep their tags; of other elements only the contents are copied.
 * Comments, declarations and processing instructions are skipped.
 */
static void copy_contents(Tree t, string *s)
{
  static const char *const retained[] = {
    "span", "code", "tt", "acronym", "abbr", "bdo", "kbd", "samp",
    "sub", "sup", "var"
  };
  Tree h;
  size_t i, k;
  pairlist a;
  string p;
  bool keep_tag;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
    case Text:
      i = *s ? strlen(*s) : 0;
      renewarray(*s, i + strlen(h->text) + 1);
      /* Copy, but transform all whitespace to spaces. The cast matters:
       * the text is UTF-8 and plain char may be signed. */
      for (p = h->text; *p; p++, i++)
        (*s)[i] = isspace((unsigned char)*p) ? ' ' : *p;
      (*s)[i] = '\0';
      break;
    case Comment: break;
    case Declaration: break;
    case Procins: break;
    case Element:
      /* Only certain tags are retained */
      keep_tag = false;
      for (k = 0; !keep_tag && k < sizeof(retained)/sizeof(retained[0]); k++)
        keep_tag = eq(h->name, retained[k]);

      if (keep_tag) {
        strapp(s, "<", h->name, NULL);
        for (a = h->attribs; a != NULL; a = a->next) {
          if (! a->value) strapp(s, " ", a->name, NULL);
          else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
        }
        assert(! is_empty(h->name) || h->children == NULL);
        if (is_empty(h->name)) {
          strapp(s, xml ? " />" : ">", NULL);
        } else {
          strapp(s, ">", NULL);
          copy_contents(h, s);
          strapp(s, "</", h->name, ">", NULL);
        }
      } else {                          /* Ignore tag, copy contents */
        copy_contents(h, s);
      }
      break;
    case Root: assert(! "Cannot happen"); break;
    default: assert(! "Cannot happen");
    }
  }
}
/* index_add_term -- create an Indexterm for the term text spec, that
 * links to the element with the given id, and insert it in terms */
static void index_add_term(Indexterm *terms, conststring spec, conststring id,
                           int importance, conststring secno,
                           conststring sectitle, conststring doctitle)
{
  Indexterm term;

  new(term);
  term->importance = importance;
  term->secno = secno ? newstring(secno) : NULL;
  term->sectitle = sectitle ? newstring(sectitle) : NULL;
  term->doctitle = newstring(doctitle);
  term->url = NULL;
  strapp(&term->url, base, "#", id, NULL);
  parse_subterms(term, spec);
  if (! tsearch(term, (void**)terms, termcmp))
    errx(1, "Out of memory while parsing term %s", spec);
}

/* copy_to_index -- copy the contents of element t to the index db
 *
 * If t has a title attribute (or, when USE_DATA_ATTRIBUTE is defined, a
 * data-index attribute), each part of it between "|" becomes an index
 * term. Otherwise the contents of t, if not empty, is the term. All
 * terms link to the ID of t. With option "final", the attribute that
 * was used is removed from t.
 */
static void copy_to_index(Tree t, Indexterm *terms, int importance,
                          conststring secno, conststring sectitle,
                          conststring doctitle)
{
  conststring id, title;
  string h;
  int i, n;

  id = get_attrib(t, "id");
#ifdef USE_DATA_ATTRIBUTE
  if (! (title = get_attrib(t, "data-index")))
#endif
    title = get_attrib(t, "title");

  /* Get term either from title attribute or contents */
  if (title) {
    i = 0;
    while (title[i]) {
      n = strcspn(title + i, "|");      /* Find | or \0 */
      h = newnstring(title + i, n);
      index_add_term(terms, h, id, importance, secno, sectitle, doctitle);
      dispose(h);                       /* parse_subterms made its own copy */
      i += n;
      if (title[i]) i++;                /* Skip '|' */
    }
    if (final) {                        /* Remove used attribute */
#ifdef USE_DATA_ATTRIBUTE
      if (!delete_attrib(t, "data-index"))
#endif
        delete_attrib(t, "title");
    }
  } else {                              /* Recursively copy contents */
    h = NULL;
    copy_contents(t, &h);
    if (h) {                            /* Non-empty contents */
      index_add_term(terms, h, id, importance, secno, sectitle, doctitle);
      dispose(h);                       /* parse_subterms made its own copy */
    }
  }
}
/* in_list -- check if word occurs in the NULL-terminated array list */
static bool in_list(const string word, const string *list)
{
  const string *entry;

  for (entry = list; *entry; entry++) {
    if (eq(word, *entry)) return true;
  }
  return false;
}
/* collect -- collect index terms, add IDs where needed
 *
 * Walks the children of t in document order. While doing so it keeps
 * track of the document title (*doctitle), the current section number
 * (*secno) and the current section title (*sectitle). Each element that
 * is an index term gets an ID, if it doesn't have one yet, and is added
 * to terms; other elements are searched recursively.
 */
static void collect(Tree t, Indexterm *terms, string *secno,
                    string *sectitle, string *doctitle)
{
  int importance;
  Tree h;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
    case Text: case Comment: case Declaration: case Procins: break;
    case Element:
      if (eq(h->name, "title")) {
        dispose(*doctitle);
        copy_contents(h, doctitle);
        /* An empty <title> copies nothing; keep the title a valid string */
        if (! *doctitle) *doctitle = newstring("");
      }
      if (has_class(h->attribs, SECNO)) {
        dispose(*secno);
        copy_contents(h, secno);
        trim(*secno);
      } else if (has_class(h->attribs, NO_NUM)) {
        dispose(*secno);
        *secno = newstring(unknown_name);
      }
      if (eq(h->name, "h1") || eq(h->name, "h2") || eq(h->name, "h3") ||
          eq(h->name, "h4") || eq(h->name, "h5") || eq(h->name, "h6")) {
        dispose(*sectitle);
        copy_contents(h, sectitle);
        trim(*sectitle);
      }

      /* Is this element an index term? 0 = no, 1 = normal, 2 = definition */
      if (eq(h->name, "dfn")) importance = 2;
      else if (exclude_elts && in_list(h->name, exclude_elts)) importance = 0;
      else if (only_elts && !in_list(h->name, only_elts)) importance = 0;
      else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
        importance = 1;
      else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
        importance = 1;
      else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
      else importance = 0;

      if (importance != 0) {
        /* Give it an ID, if it doesn't have one */
        if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
        copy_to_index(h, terms, importance, *secno, *sectitle, *doctitle);
      } else {
        collect(h, terms, secno, sectitle, doctitle);
      }
      break;
    case Root: assert(! "Cannot happen"); break;
    default: assert(! "Cannot happen");
    }
  }
}
/* load_index -- read persistent term db from file
 *
 * Every line of the file holds six fields, separated by tabs: the term
 * (subterms separated by "!!"), the importance ('1' or '2'), the URL,
 * the section number, the section title and the document title. Each
 * line is added as an Indexterm to terms. If the file cannot be opened,
 * nothing is read. A line with a different syntax is a fatal error.
 */
static void load_index(const string indexdb, Indexterm *terms)
{
  enum {NFIELDS = 6};
  FILE *f;
  size_t start[NFIELDS], len[NFIELDS], pos;
  char *line = NULL;
  size_t linesize = 0;
  Indexterm term;
  string h;
  bool last;
  int k;

  if (! (f = fopen(indexdb, "r"))) return; /* Assume file not found... */

  while (getline(&line, &linesize, f) != -1) {

    /* Find the fields: all but the last end in a tab, the last in '\n' */
    for (pos = 0, k = 0; k < NFIELDS; k++) {
      last = k == NFIELDS - 1;
      start[k] = pos;
      len[k] = strcspn(line + pos, last ? "\t\n" : "\t");
      pos += len[k];
      if (line[pos] != (last ? '\n' : '\t'))
        errx(1, "Illegal syntax in %s", indexdb);
      pos++;                            /* Skip the separator */
    }

    new(term);
    switch (line[start[1]]) {
    case '1': term->importance = 1; break;
    case '2': term->importance = 2; break;
    default: errx(1, "Error in %s (column 2 must be '1' or '2')", indexdb);
    }
    term->url = newnstring(line + start[2], len[2]);
    term->secno = newnstring(line + start[3], len[3]);
    term->sectitle = newnstring(line + start[4], len[4]);
    term->doctitle = newnstring(line + start[5], len[5]);

    h = newnstring(line + start[0], len[0]);
    parse_subterms(term, h);
    dispose(h);                         /* parse_subterms made its own copy */

    if (! tsearch(term, (void**)terms, termcmp))
      errx(1, "Out of memory while loading %s", indexdb);
  }

  fclose(f);
  free(line);
}
/* save_a_term -- write one term to globalfile */
static void save_a_term(const void *term1, const VISIT which, const int dp)
{
Indexterm term = *(Indexterm*)term1;
int i;
if (which == endorder || which == leaf) {
for (i = 0; i < term->nrkeys; i++) {
if (i > 0) fprintf(globalfile, "!!");
fprintf(globalfile, "%s", term->terms[i]);
}
fprintf(globalfile, "\t%d\t%s\t%s\t%s\t%s\n", term->importance, term->url,
term->secno ? term->secno : (use_secno ? unknown_name : "#"),
term->sectitle ? term->sectitle : term->doctitle,
term->doctitle);
}
}
/* save_index -- write terms to file
 *
 * (Re)creates the file indexdb and writes all terms to it. It is a
 * fatal error if the file cannot be created or not completely written.
 */
static void save_index(const string indexdb, Indexterm terms)
{
  bool write_failed;

  if (! (globalfile = fopen(indexdb, "w")))
    errx(1, "%s: %s", indexdb, strerror(errno));

  twalk(terms, save_a_term);

  /* Buffered output may fail as late as the final flush in fclose() */
  write_failed = ferror(globalfile) != 0;
  if (fclose(globalfile) != 0 || write_failed)
    errx(1, "%s: error while writing index", indexdb);
}
/* usage -- print usage message and exit
 *
 * name is the name of the program. The message lists all options that
 * main() accepts. Exits with status 1.
 */
static void usage(string name)
{
  errx(1, "Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-N] [-f] [-r] [-c userclass] [-s template] [-u phrase] [-O elements] [-X elements] [html-file]",
       VERSION, name);
}
/* tokenize -- split string s into tokens at each comma, return an array
 *
 * s must not be empty. Returns a newly allocated array of newly
 * allocated strings; the last entry of the array is NULL. A comma at
 * the end of s does not add an empty token.
 */
static string * tokenize(string s)
{
  string *t = NULL;
  size_t len;
  int n = 0;

  assert(s && s[0]);

  while (*s) {
    len = strcspn(s, ",");
    renewarray(t, n + 1);
    t[n++] = newnstring(s, len);
    s += len;
    /* Only skip a comma: stepping over the final '\0' would make the
     * loop read past the end of the string */
    if (*s == ',') s++;
  }

  renewarray(t, n + 1);                 /* Make final item NULL */
  t[n] = NULL;
  return t;
}
/* main -- parse the options, read the document, write it with an index
 *
 * Reads the document from the file or URL given as argument, or from
 * stdin, writes it to stdout with IDs added and the index inserted, and
 * updates the index db if one was given with -i.
 */
int main(int argc, char *argv[])
{
  bool write = true;
  Indexterm termtree = NULL;            /* Sorted tree of terms */
  string secno, doctitle, sectitle;
  int c, status = 200;

  /* Bind the parser callback routines to our handlers */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);
  yyin = NULL;

  /* Note the ':' after 'c': option -c requires an argument */
  while ((c = getopt(argc, argv, "txb:i:c:nNfrs:u:O:X:")) != -1)
    switch (c) {
    case 't': bctarget = false; break;  /* Don't write <a name> after each ID */
    case 'x': xml = true; break;        /* Output as XML */
    case 'b': base = newstring(optarg); break; /* Set base of URL */
    case 'i': indexdb = newstring(optarg); break; /* Set name of index db */
    case 'c': userclassnames = tokenize(optarg); break; /* Set class names */
    case 'n': use_secno = true; use_sectitle = false; break;
    case 'N': use_sectitle = true; use_secno = false; break;
    case 'f': final = true; break;      /* "Final": remove used attributes */
    case 'r': trim_punct = false; break; /* Do not remove trailing punctuation */
    case 's': section_name = newstring(optarg); break;
    case 'u': unknown_name = newstring(optarg); break;
    case 'O': only_elts = tokenize(optarg); break; /* Index only these elements */
    case 'X': exclude_elts = tokenize(optarg); break; /* Don't index these */
    default: usage(argv[0]);
    }

  if (optind == argc) yyin = stdin;
  else if (argc > optind + 1) usage(argv[0]);
  else if (eq(argv[optind], "-")) yyin = stdin;
  else yyin = fopenurl(argv[optind], "r", &status);
  if (yyin == NULL) err(EX_IOERR, "%s", argv[optind]);
  if (status != 200) errx(1, "%s : %s", argv[optind], http_strerror(status));

  /* Defaults for options that were not given */
  if (!base) base = newstring("");
  if (!section_name) section_name = newstring("section %s");
  if (!unknown_name) unknown_name = newstring("??");

  /* Apply user's locale */
  setlocale(LC_ALL, "");

  /* Read the index DB into memory */
  if (indexdb) load_index(indexdb, &termtree);

  /* Parse, build tree, collect existing IDs */
  if (yyparse() != 0) exit(3);

  /* Scan for index terms, add them to the tree, add IDs where needed */
  secno = NULL;
  sectitle = NULL;
  doctitle = newstring("");
  collect(get_root(tree), &termtree, &secno, &sectitle, &doctitle);

  /* Write out the document, adding <A NAME> and replacing <!--index--> */
  expand(get_root(tree), &write, termtree);

  /* Store terms to file */
  if (indexdb) save_index(indexdb, termtree);

  fclose(yyin);
  return 0;
}