Codebase list slib / cme/main html4each.scm
cme/main

Tree @cme/main (Download .tar.gz)

html4each.scm @cme/mainraw · history · blame

;;;; HTML scan calls procedures for word, tag, whitespac, and newline.
;;; Copyright 2002 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1.  Any copy made of this software must include this copyright notice
;in full.
;
;2.  I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3.  In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.

(require 'line-i/o)
(require 'string-port)
(require 'scanf)
(require-if 'compiling 'string-case)

;;@code{(require 'html-for-each)}
;;@ftindex html-for-each

;;@body
;;@1 is an input port or a string naming an existing file containing
;;HTML text.
;;@2 is a procedure of one argument or #f.
;;@3 is a procedure of one argument or #f.
;;@4 is a procedure of one argument or #f.
;;@5 is a procedure of no arguments or #f.
;;
;;@0 opens and reads characters from port @1 or the file named by
;;string @1.  Sequential groups of characters are assembled into
;;strings which are either
;;
;;@itemize @bullet
;;@item
;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments);
;;@item
;;end-of-line;
;;@item
;;whitespace; or
;;@item
;;none of the above (words).
;;@end itemize
;;
;;Procedures are called according to these distinctions in order of
;;the string's occurrence in @1.
;;
;;@5 is called with no arguments for end-of-line @emph{not within a
;;markup or comment}.
;;
;;@4 is called with strings of non-newline whitespace.
;;
;;@3 is called with hypertext markup strings (including @samp{<} and
;;@samp{>}).
;;
;;@2 is called with the remaining strings.
;;
;;@0 returns an unspecified value.
(define (html-for-each file word-proc markup-proc white-proc newline-proc)
  (define nl (string #\newline))
  (define (string-index str . chrs)
    (define len (string-length str))
    (do ((pos 0 (+ 1 pos)))
	((or (>= pos len) (memv (string-ref str pos) chrs))
	 (and (< pos len) pos))))
  (define (proc-words line edx)
    (let loop ((idx 0))
      (define ldx idx)
      (do ((idx idx (+ 1 idx)))
	  ((or (>= idx edx)
	       (not (char-whitespace? (string-ref line idx))))
	   (do ((jdx idx (+ 1 jdx)))
	       ((or (>= jdx edx)
		    (char-whitespace? (string-ref line jdx)))
		(and white-proc (not (= ldx idx))
		     (white-proc (substring line ldx idx)))
		(and word-proc (not (= idx jdx))
		     (word-proc (substring line idx jdx)))
		(if (< jdx edx) (loop jdx))))))))
  ((if (input-port? file) call-with-open-ports call-with-input-file)
   file
   (lambda (iport)
     (do ((line (read-line iport) (read-line iport)))
	 ((eof-object? line))
       (do ((idx (string-index line #\<) (string-index line #\<)))
	   ((not idx) (proc-words line (string-length line)))
					; seen '<'
	 (proc-words line idx)
	 (let ((trm (if (and (<= (+ 4 idx) (string-length line))
			     (string=? "<!--" (substring line idx (+ 4 idx))))
			"-->" #\>)))
	   (let loop ((lne (substring line idx (string-length line)))
		      (tag "")
		      (quot #f))
	     (define edx (or (eof-object? lne)
			     (if quot
				 (string-index lne quot)
				 (if (char? trm)
				     (string-index lne #\" #\' #\>)
				     (string-index lne #\>)))))
	     (cond
	      ((not edx)		; still inside tag
	       ;;(print quot trm 'within-tag lne)
	       (loop (read-line iport)
		     (and markup-proc (string-append tag lne nl))
		     quot))
	      ((eqv? #t edx)		; EOF
	       ;;(print quot trm 'eof lne)
	       (slib:error 'unterminated 'HTML 'entity file)
	       (and markup-proc (markup-proc tag)))
	      ((eqv? quot (string-ref lne edx))	; end of quoted string
	       ;;(print quot trm 'end-quote lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     #f))
	      ((not (eqv? #\> (string-ref lne edx))) ; start of quoted
	       ;;(print quot trm 'start-quote lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     (string-ref lne (+ -1 edx))))
	      ((or (and (string? trm)	; found matching '>' or '-->'
			(<= 2 edx)
			(equal? trm (substring lne (+ -2 edx) (+ 1 edx))))
		   (eqv? (string-ref lne edx) trm))
	       ;;(print quot trm 'end-> lne)
	       (set! edx (+ 1 edx))
	       (and markup-proc
		    (markup-proc (string-append tag (substring lne 0 edx))))
					; process words after '>'
	       (set! line (substring lne edx (string-length lne))))
	      (else
	       ;;(print quot trm 'within-comment lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     #f))))))
       (and newline-proc (newline-proc))))))

;;@args file limit
;;@args file
;;@1 is an input port or a string naming an existing file containing
;;HTML text.  If supplied, @2 must be an integer.  @2 defaults to
;;1000.
;;
;;@0 opens and reads HTML from port @1 or the file named by string @1,
;;until reaching the (mandatory) @samp{TITLE} field.  @0 returns the
;;title string with adjacent whitespaces collapsed to one space.  @0
;;returns #f if the title field is empty, absent, if the first
;;character read from @1 is not @samp{#\<}, or if the end of title is
;;not found within the first (approximately) @2 words.
(define (html:read-title file . limit)
  (set! limit (if (null? limit) 1000 (* 2 (car limit))))
  ((if (input-port? file) call-with-open-ports call-with-input-file)
   file
   (lambda (port)
     (and (eqv? #\< (peek-char port))
	  (call-with-current-continuation
	   (lambda (return)
	     (define (cnt . args)
	       (if (negative? limit)
		   (return #f)
		   (set! limit (+ -1 limit))))
	     (define capturing? #f)
	     (define text '())
	     (html-for-each
	      port
	      (lambda (str)
		(cnt)
		(if capturing? (set! text (cons " " (cons str text)))))
	      (lambda (str)
		(cnt)
		(cond ((prefix-ci? "<title" str)
		       (set! capturing? #t))
		      ((prefix-ci? "</title" str)
		       (return (and (not (null? text))
				    (apply string-append
					   (reverse (cdr text))))))
		      ((or (prefix-ci? "</head" str)
			   (prefix-ci? "<body" str))
		       (return #f))))
	      cnt
	      cnt)
	     #f))))))

(define (prefix-ci? pre str)
  (define prelen (string-length pre))
  (and (< prelen (string-length str))
       (string-ci=? pre (substring str 0 prelen))))

;;@body
;;@1 is a hypertext markup string.
;;
;;If @1 is a (hypertext) comment or DTD, then @0 returns #f.
;;Otherwise @0 returns the hypertext element string consed onto an
;;association list of the attribute name-symbols and values.  If the
;;tag ends with "/>", then "/" is appended to the hypertext element
;;string.  The name-symbols are created by @code{string-ci->symbol}.
;;Each value is a string; or #t if the name had no value
;;assigned within the markup.
(define (htm-fields htm)
  (require 'string-case)
  (and
   (not (and (> (string-length htm) 3) (equal? "<!" (substring htm 0 2))))
   (call-with-input-string htm
     (lambda (port)
       (define element #f)
       (define fields '())
       (cond ((not (eqv? 1 (fscanf port "<%s" element)))
	      (slib:error 'htm-fields 'strange htm)))
       (let loop ((chr (peek-char port)))
	 (define name #f)
	 (define junk #f)
	 (define value #t)
	 (cond
	  ((eof-object? chr)
	   (cond ((and element
		       (eqv? (string-ref element
					 (+ -1 (string-length element)))
			     #\>))
		  (cons (substring element 0 (+ -1 (string-length element)))
			fields))
		 (else
		  (slib:warn 'htm-fields 'missing '> htm)
		  (if element
		      (cons element (reverse fields))
		      (reverse fields)))))
	  ((eqv? #\> chr) (cons element (reverse fields)))
	  ((eqv? #\/ chr)
	   (set! element (string-append element (string (read-char port))))
	   (loop (peek-char port)))
	  ((char-whitespace? chr) (read-char port) (loop (peek-char port)))
	  ((case (fscanf port "%[-a-zA-Z0-9:] %[=] %[-.a-zA-Z0-9]"
			 name junk value)
	     ((3 1) #t)
	     ((2)
	      (case (peek-char port)
		((#\") (cond ((eqv? 1 (fscanf port "\"%[^\"]\"" value)))
			     ((eqv? #\" (peek-char port))
			      (read-char port)
			      (set! value ""))
			     (else #f)))
		((#\') (cond ((eqv? 1 (fscanf port "'%[^']'" value)))
			     ((eqv? #\' (peek-char port))
			      (read-char port)
			      (set! value ""))
			     (else #f)))
		(else #f)))
	     (else #f))
	   (set! fields (cons (cons (string-ci->symbol name) value)
                              fields))
	   (loop (peek-char port)))
	  (else (slib:warn 'htm-fields 'bad 'field htm)
		(reverse fields))))))))