; "prec.scm", dynamically extensible parser/tokenizer -*-scheme-*-
; Copyright 1989, 1990, 1991, 1992, 1993, 1995, 1997 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1. Any copy made of this software must include this copyright notice
;in full.
;
;2. I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3. In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.
; This file implements:
; * a Pratt style parser.
; * a tokenizer which congeals tokens according to assigned classes of
; constituent characters.
;
; This module is a significant improvement because grammar can be
; changed dynamically from rulesets which don't need compilation.
; Theoretically, all possibilities of bad input are handled and return
; as much structure as was parsed when the error occured; The symbol
; `?' is substituted for missing input.
; References for the parser are:
; Pratt, V. R.
; Top Down Operator Precendence.
; SIGACT/SIGPLAN
; Symposium on Principles of Programming Languages,
; Boston, 1973, 41-51
; WORKING PAPER 121
; CGOL - an Alternative External Representation For LISP users
; Vaughan R. Pratt
; MIT Artificial Intelligence Lab.
; March 1976
; Mathlab Group,
; MACSYMA Reference Manual, Version Ten,
; Laboratory for Computer Science, MIT, 1983
(require 'string-search)
(require 'string-port)
(require 'delay)
(require 'multiarg-apply)
;@
(define *syn-defs* #f)
(define (tok:peek-char dyn) (peek-char (cadr dyn)))
(define (tok:read-char dyn)
(let ((c (read-char (cadr dyn))))
(set-car! (cddddr dyn)
(if (or (eqv? c #\newline) (eof-object? c))
0
(+ 1 (car (cddddr dyn)))))
c))
(define (prec:warn dyn . msgs)
(do ((j (+ -1 (car (cddddr dyn))) (+ -8 j)))
((> 8 j)
(do ((i j (+ -1 i)))
((>= 0 i)
(display "^ ")
(newline)
(for-each (lambda (x) (write x) (display #\space)) msgs)
(newline))
(display #\space)))
(display slib:tab)))
;; Structure of lexical records.
(define tok:make-rec cons)
(define tok:cc car)
(define tok:sfp cdr)
(define (tok:lookup alist char)
(if (eof-object? char)
#f
(let ((pair (assv char alist)))
(and pair (cdr pair)))))
;@
(define (tok:char-group group chars chars-proc)
(map (lambda (token)
;;; (let ((oldlexrec (tok:lookup *syn-defs* token)))
;;; (cond ((or (not oldlexrec) (eqv? (tok:cc oldlexrec) group)))
;;; (else (math:warn 'cc-of token 'redefined-to- group))))
(cons token (tok:make-rec group chars-proc)))
(cond ((string? chars) (string->list chars))
((char? chars) (list chars))
(else chars))))
(define (tokenize dyn)
(let* ((char (tok:read-char dyn))
(rec (tok:lookup (car dyn) char))
(proc (and rec (tok:cc rec)))
(clist (list char)))
(cond
((not proc) char)
((procedure? proc)
(do ((cl clist (begin (set-cdr! cl (list (tok:read-char dyn))) (cdr cl))))
((proc (tok:peek-char dyn))
((or (tok:sfp rec) (lambda (dyn l) (list->string l)))
dyn
clist))))
((eqv? 0 proc) (tokenize dyn))
(else
(do ((cl clist (begin (set-cdr! cl (list (tok:read-char dyn))) (cdr cl))))
((not (let* ((prec (tok:lookup (car dyn) (tok:peek-char dyn)))
(cclass (and prec (tok:cc prec))))
(or (eqv? cclass proc)
(eqv? cclass (+ -1 proc)))))
((tok:sfp rec) dyn clist)))))))
;;; PREC:NUD is the null denotation (function and arguments to call when no
;;; unclaimed tokens).
;;; PREC:LED is the left denotation (function and arguments to call when
;;; unclaimed token is on left).
;;; PREC:LBP is the left binding power of this LED. It is the first
;;; argument position of PREC:LED
(define (prec:nudf alist self)
(let ((pair (assoc (cons 'nud self) alist)))
(and pair (cdr pair))))
(define (prec:ledf alist self)
(let ((pair (assoc (cons 'led self) alist)))
(and pair (cdr pair))))
(define (prec:lbp alist self)
(let ((pair (assoc (cons 'led self) alist)))
(and pair (cadr pair))))
(define (prec:call-or-list proc . args)
(prec:apply-or-cons proc args))
(define (prec:apply-or-cons proc args)
(if (procedure? proc) (apply proc args) (cons (or proc '?) args)))
;;; PREC:SYMBOLFY and PREC:DE-SYMBOLFY are not exact inverses.
(define (prec:symbolfy obj)
(cond ((symbol? obj) obj)
((string? obj) (string->symbol obj))
((char? obj) (string->symbol (string obj)))
(else obj)))
(define (prec:de-symbolfy obj)
(cond ((symbol? obj) (symbol->string obj))
(else obj)))
;;;Calls to set up tables.
;@
(define (prec:define-grammar . synlsts)
(set! *syn-defs* (append (apply append synlsts) *syn-defs*)))
;@
(define (prec:make-led toks . args)
(map (lambda (tok)
(cons (cons 'led (prec:de-symbolfy tok))
args))
(if (pair? toks) toks (list toks))))
(define (prec:make-nud toks . args)
(map (lambda (tok)
(cons (cons 'nud (prec:de-symbolfy tok))
args))
(if (pair? toks) toks (list toks))))
;;; Produce dynamically augmented grammars.
(define prec:process-binds append)
;;(define (prec:replace-rules) some-sort-of-magic-cookie)
;;; Here are the procedures to define high-level grammar, along with
;;; utility functions called during parsing. The utility functions
;;; (prec:parse-*) could be incorportated into the defining commands,
;;; but tracing these functions is useful for debugging.
;@
(define (prec:delim tk)
(prec:make-led tk 0 #f))
;@
(define (prec:nofix tk sop . binds)
(prec:make-nud tk prec:parse-nofix sop (apply append binds)))
(define (prec:parse-nofix dyn self sop binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:call-or-list (or sop (prec:symbolfy self)))))
;@
(define (prec:prefix tk sop bp . binds)
(prec:make-nud tk prec:parse-prefix sop bp (apply append binds)))
(define (prec:parse-prefix dyn self sop bp binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:call-or-list (or sop (prec:symbolfy self)) (prec:parse1 dyn bp))))
;@
(define (prec:infix tk sop lbp bp . binds)
(prec:make-led tk lbp prec:parse-infix sop bp (apply append binds)))
(define (prec:parse-infix dyn left self lbp sop bp binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:call-or-list (or sop (prec:symbolfy self)) left (prec:parse1 dyn bp))))
;@
(define (prec:nary tk sop bp)
(prec:make-led tk bp prec:parse-nary sop bp))
(define (prec:parse-nary dyn left self lbp sop bp)
(prec:apply-or-cons (or sop (prec:symbolfy self))
(cons left (prec:parse-list dyn self bp))))
;@
(define (prec:postfix tk sop lbp . binds)
(prec:make-led tk lbp prec:parse-postfix sop (apply append binds)))
(define (prec:parse-postfix dyn left self lbp sop binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:call-or-list (or sop (prec:symbolfy self)) left)))
;@
(define (prec:prestfix tk sop bp . binds)
(prec:make-nud tk prec:parse-rest sop bp (apply append binds)))
(define (prec:parse-rest dyn self sop bp binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:apply-or-cons (or sop (prec:symbolfy self)) (prec:parse-list dyn #f bp))))
;@
(define (prec:commentfix tk stp match . binds)
(append
(prec:make-nud tk prec:parse-nudcomment stp match (apply append binds))
(prec:make-led tk 220 prec:parse-ledcomment stp match (apply append binds))))
(define (prec:parse-nudcomment dyn self stp match binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(tok:read-through-comment dyn stp match)
(prec:advance dyn)
(cond ((prec:delim? dyn (force (caddr dyn))) #f)
(else (prec:parse1 dyn (cadddr dyn))))))
(define (prec:parse-ledcomment dyn left lbp self stp match binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(tok:read-through-comment dyn stp match)
(prec:advance dyn)
left))
(define (tok:read-through-comment dyn stp match)
(set! match (if (char? match)
(string match)
(prec:de-symbolfy match)))
(cond ((procedure? stp)
(let* ((len #f)
(str (call-with-output-string
(lambda (sp)
(set! len (find-string-from-port?
match (cadr dyn)
(lambda (c) (display c sp) #f)))))))
(stp (and len (substring str 0 (- len (string-length match)))))))
(else (find-string-from-port? match (cadr dyn)))))
;@
(define (prec:matchfix tk sop sep match . binds)
(define sep-lbp 0)
(prec:make-nud tk prec:parse-matchfix
sop sep-lbp sep match
(apply append (prec:delim match) binds)))
(define (prec:parse-matchfix dyn self sop sep-lbp sep match binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(cond (sop (prec:apply-or-cons sop (prec:parse-delimited dyn sep sep-lbp match)))
((equal? (force (caddr dyn)) match)
(prec:warn dyn 'expression-missing)
(prec:advance dyn)
'?)
(else (let ((ans (prec:parse1 dyn 0))) ;just parenthesized expression
(cond ((equal? (force (caddr dyn)) match)
(prec:advance dyn))
((prec:delim? dyn (force (caddr dyn)))
(prec:warn dyn 'mismatched-delimiter (force (caddr dyn))
'not match)
(prec:advance dyn))
(else (prec:warn dyn 'delimiter-expected--ignoring-rest
(force (caddr dyn)) 'expected match
'or-delimiter)
(do () ((prec:delim? dyn (force (caddr dyn))))
(prec:parse1 dyn 0))))
ans)))))
;@
(define (prec:inmatchfix tk sop sep match lbp . binds)
(define sep-lbp 0)
(prec:make-led tk lbp prec:parse-inmatchfix
sop sep-lbp sep match
(apply append (prec:delim match) binds)))
(define (prec:parse-inmatchfix dyn left self lbp sop sep-lbp sep match binds)
(let ((dyn (cons (prec:process-binds binds (car dyn)) (cdr dyn))))
(prec:apply-or-cons sop (cons left (prec:parse-delimited dyn sep sep-lbp match)))))
;;;; Here is the code which actually parses.
(define (prec:advance dyn)
(set-car! (cddr dyn) (delay (tokenize dyn))))
(define (prec:advance-return-last dyn)
(let ((last (and (caddr dyn) (force (caddr dyn)))))
(prec:advance dyn)
last))
(define (prec:nudcall dyn self)
(let ((pob (prec:nudf (car dyn) self)))
(cond
(pob (let ((proc (car pob)))
(cond ((procedure? proc) (apply proc dyn self (cdr pob)))
(proc (cons proc (cdr pob)))
(else '?))))
((char? self) (prec:warn dyn 'extra-separator)
(prec:advance dyn)
(prec:nudcall dyn (force (caddr dyn))))
((string? self) (string->symbol self))
(else self))))
(define (prec:ledcall dyn left self)
(let* ((pob (prec:ledf (car dyn) self)))
(apply (cadr pob) dyn left self (cdr pob))))
;;; PREC:PARSE1 is the heart.
(define (prec:parse1 dyn bp)
(do ((left (prec:nudcall dyn (prec:advance-return-last dyn))
(prec:ledcall dyn left (prec:advance-return-last dyn))))
((or (>= bp 200) ;to avoid unneccesary lookahead
(>= bp (or (prec:lbp (car dyn) (force (caddr dyn))) 0))
(not left))
left)))
(define (prec:delim? dyn token)
(or (eof-object? token) (<= (or (prec:lbp (car dyn) token) 220) 0)))
(define (prec:parse-list dyn sep bp)
(cond ((prec:delim? dyn (force (caddr dyn)))
(prec:warn dyn 'expression-missing)
'(?))
(else
(let ((f (prec:parse1 dyn bp)))
(cons f (cond ((equal? (force (caddr dyn)) sep)
(prec:advance dyn)
(cond ((equal? (force (caddr dyn)) sep)
(prec:warn dyn 'expression-missing)
(prec:advance dyn)
(cons '? (prec:parse-list dyn sep bp)))
((prec:delim? dyn (force (caddr dyn)))
(prec:warn dyn 'expression-missing)
'(?))
(else (prec:parse-list dyn sep bp))))
((prec:delim? dyn (force (caddr dyn))) '())
((not sep) (prec:parse-list dyn sep bp))
((prec:delim? dyn sep) (prec:warn dyn 'separator-missing)
(prec:parse-list dyn sep bp))
(else '())))))))
(define (prec:parse-delimited dyn sep bp delim)
(cond ((equal? (force (caddr dyn)) sep)
(prec:warn dyn 'expression-missing)
(prec:advance dyn)
(cons '? (prec:parse-delimited dyn sep bp delim)))
((prec:delim? dyn (force (caddr dyn)))
(if (not (equal? (force (caddr dyn)) delim))
(prec:warn dyn 'mismatched-delimiter (force (caddr dyn))
'expected delim))
(if (not sep) (prec:warn dyn 'expression-missing))
(prec:advance dyn)
(if sep '() '(?)))
(else (let ((ans (prec:parse-list dyn sep bp)))
(cond ((equal? (force (caddr dyn)) delim))
((prec:delim? dyn (force (caddr dyn)))
(prec:warn dyn 'mismatched-delimiter (force (caddr dyn))
'expecting delim))
(else (prec:warn dyn 'delimiter-expected--ignoring-rest
(force (caddr dyn)) '...)
(do () ((prec:delim? dyn (force (caddr dyn))))
(prec:parse1 dyn bp))))
(prec:advance dyn)
ans))))
;@
(define (prec:parse grammar delim column . ports)
(define port (if (null? ports) (current-input-port) (car ports)))
(set! delim (prec:de-symbolfy delim))
(let ((dyn (list (append (prec:delim delim) grammar)
port
#f
0
column)))
(prec:advance dyn) ; setup prec:token with first token
(cond ((eof-object? (force (caddr dyn))) (force (caddr dyn)))
((equal? (force (caddr dyn)) delim) #f)
(else
(let ((ans (prec:parse1 dyn 0)))
(cond ((eof-object? (force (caddr dyn))))
((equal? (force (caddr dyn)) delim))
(else (prec:warn dyn
'delimiter-expected--ignoring-rest
(force (caddr dyn)) 'not delim)
(do () ((or (equal? (force (caddr dyn)) delim)
(eof-object? (force (caddr dyn)))))
(prec:advance dyn))))
ans)))))
;@
(define tok:decimal-digits "0123456789")
(define tok:upper-case "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define tok:lower-case "abcdefghijklmnopqrstuvwxyz")
(define tok:whitespaces
(do ((i (+ -1 (min 256 char-code-limit)) (+ -1 i))
(ws "" (if (char-whitespace? (integer->char i))
(string-append ws (string (integer->char i)))
ws)))
((negative? i) ws)))
;;;;The parse tables.
;;; Definitions accumulate in top-level variable *SYN-DEFS*.
(set! *syn-defs* '()) ;Make sure *SYN-DEFS* is empty.
;;; Ignore Whitespace characters.
(prec:define-grammar (tok:char-group 0 tok:whitespaces #f))
;;; On MS-DOS systems, <ctrl>-Z (26) needs to be ignored in order to
;;; avoid problems at end of files.
(case (software-type)
((ms-dos)
(if (not (char-whitespace? (integer->char 26)))
(prec:define-grammar (tok:char-group 0 (integer->char 26) #f))
)))
;;;@ Save these convenient definitions.
(define *syn-ignore-whitespace* *syn-defs*)
(set! *syn-defs* '())
;;(begin (trace-all "prec.scm") (set! *qp-width* 333))
;;(pretty-print (grammar-read-tab (get-grammar 'standard)))
;;(prec:trace)