Codebase list slib / cme/main strsrch.scm
cme/main

Tree @cme/main (Download .tar.gz)

strsrch.scm @cme/mainraw · history · blame

;;; "strsrch.scm" Search for string from port.
; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org)
; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu)
; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu)
; 2013-01 A. Jaffer replaced the skip-vector with an alist

; This code is in the public domain.

(require 'multiarg-apply)		; used in string-subst
(require 'alist)

;;;@ Return the index of the first occurence of chr in str, or #f
(define (string-index str chr)
  (define len (string-length str))
  (do ((pos 0 (+ 1 pos)))
      ((or (>= pos len) (char=? chr (string-ref str pos)))
       (and (< pos len) pos))))
;@
(define (string-index-ci str chr)
  (define len (string-length str))
  (do ((pos 0 (+ 1 pos)))
      ((or (>= pos len) (char-ci=? chr (string-ref str pos)))
       (and (< pos len) pos))))
;@
(define (string-reverse-index str chr)
  (do ((pos (+ -1 (string-length str)) (+ -1 pos)))
      ((or (negative? pos) (char=? (string-ref str pos) chr))
       (and (not (negative? pos)) pos))))
;@
(define (string-reverse-index-ci str chr)
  (do ((pos (+ -1 (string-length str)) (+ -1 pos)))
      ((or (negative? pos) (char-ci=? (string-ref str pos) chr))
       (and (not (negative? pos)) pos))))
;@
(define (substring? pat str)
  (define patlen (string-length pat))
  (define strlen (string-length str))
  (cond ((zero? patlen) 0)		; trivial match
	((>= patlen strlen) (and (= patlen strlen) (string=? pat str) 0))
	;; use faster string-index to match a single-character pattern
	((= 1 patlen) (string-index str (string-ref pat 0)))
	((or (<= strlen (* 2 patlen))
	     (<= patlen 2))
	 (subloop pat patlen str strlen char=?))
	(else
	 ;; compute skip values for search pattern characters
	 ;; for all c not in pat, skip[c] = patlen + 1
	 ;; for c in pat, skip[c] is distance of rightmost occurrence
	 ;;  of c from end of str
	 (let ((skip '()))
	   (define setprop (alist-associator char=?))
	   (do ((i 0 (+ i 1)))
	       ((= i patlen))
	     (set! skip (setprop skip (string-ref pat i) (- patlen i))))
	   (subskip skip pat patlen str strlen char=?)))))
;@
(define (substring-ci? pat str)
  (define patlen (string-length pat))
  (define strlen (string-length str))
  (cond ((zero? patlen) 0)		; trivial match
	((>= patlen strlen) (and (= patlen strlen) (string-ci=? pat str) 0))
	((= 1 patlen) (string-index-ci str (string-ref pat 0)))
	((or (<= strlen (* 2 patlen))
	     (<= patlen 2))
	 (subloop pat patlen str strlen char-ci=?))
	(else
	 (let ((skip '()))
	   (define setprop (alist-associator char-ci=?))
	   (do ((i 0 (+ i 1)))
	       ((= i patlen))
	     (set! skip (setprop skip (string-ref pat i) (- patlen i))))
	   (subskip skip pat patlen str strlen char-ci=?)))))

(define (subskip skip pat patlen str strlen char=)
  (define getprop (alist-inquirer char=?))
  (do ((k patlen (if (< k strlen)
		     (+ k (or (getprop skip (string-ref str k)) (+ patlen 1)))
		     (+ strlen 1))))
      ((or (> k strlen)
	   (do ((i 0 (+ i 1))
		(j (- k patlen) (+ j 1)))
	       ((or (= i patlen)
		    (not (char= (string-ref pat i) (string-ref str j))))
		(= i patlen))))
       (and (<= k strlen) (- k patlen)))))

;;; Assumes that PATLEN > 1
(define (subloop pat patlen str strlen char=)
  (define span (- strlen patlen))
  (define c1 (string-ref pat 0))
  (define c2 (string-ref pat 1))
  (let outer ((pos 0))
    (cond
     ((> pos span) #f)		; nothing was found thru the whole str
     ((not (char= c1 (string-ref str pos)))
      (outer (+ 1 pos)))	; keep looking for the right beginning
     ((not (char= c2 (string-ref str (+ 1 pos))))
      (outer (+ 1 pos)))	 ; could've done pos+2 if c1 == c2....
     (else			  ; two char matched: high probability
					; the rest will match too
      (let inner ((pdx 2) (sdx (+ 2 pos)))
	(if (>= pdx patlen) pos	; the whole pat matched
	    (if (char= (string-ref pat pdx)
		       (string-ref str sdx))
		(inner (+ 1 pdx) (+ 1 sdx))
		;; mismatch after partial match
		(outer (+ 1 pos)))))))))
;@
(define (find-string-from-port? str <input-port> . max-no-char)
  (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
  (letrec
      ((no-chars-read 0)
       (peeked? #f)
       (my-peek-char			; Return a peeked char or #f
	(lambda () (and (or (not (number? max-no-char))
			    (< no-chars-read max-no-char))
			(let ((c (peek-char <input-port>)))
			  (cond (peeked? c)
				((eof-object? c) #f)
				((procedure? max-no-char)
				 (set! peeked? #t)
				 (if (max-no-char c) #f c))
				((eqv? max-no-char c) #f)
				(else c))))))
       (next-char (lambda () (set! peeked? #f) (read-char <input-port>)
			  (set! no-chars-read  (+ 1 no-chars-read))))
       (match-1st-char			; of the string str
	(lambda ()
	  (let ((c (my-peek-char)))
	    (and c
		 (begin (next-char)
			(if (char=? c (string-ref str 0))
			    (match-other-chars 1)
			    (match-1st-char)))))))
       ;; There has been a partial match, up to the point pos-to-match
       ;; (for example, str[0] has been found in the stream)
       ;; Now look to see if str[pos-to-match] for would be found, too
       (match-other-chars
	(lambda (pos-to-match)
	  (if (>= pos-to-match (string-length str))
	      no-chars-read	       ; the entire string has matched
	      (let ((c (my-peek-char)))
		(and c
		     (if (not (char=? c (string-ref str pos-to-match)))
			 (backtrack 1 pos-to-match)
			 (begin (next-char)
				(match-other-chars (+ 1 pos-to-match)))))))))

       ;; There had been a partial match, but then a wrong char showed up.
       ;; Before discarding previously read (and matched) characters, we check
       ;; to see if there was some smaller partial match. Note, characters read
       ;; so far (which matter) are those of str[0..matched-substr-len - 1]
       ;; In other words, we will check to see if there is such i>0 that
       ;; substr(str,0,j) = substr(str,i,matched-substr-len)
       ;; where j=matched-substr-len - i
       (backtrack
	(lambda (i matched-substr-len)
	  (let ((j (- matched-substr-len i)))
	    (if (<= j 0)
		;; backed off completely to the begining of str
		(match-1st-char)
		(let loop ((k 0))
		  (if (>= k j)
		      (match-other-chars j) ; there was indeed a shorter match
		      (if (char=? (string-ref str k)
				  (string-ref str (+ i k)))
			  (loop (+ 1 k))
			  (backtrack (+ 1 i) matched-substr-len))))))))
       )
    (match-1st-char)))
;@
(define (string-subst text old new . rest)
  (define sub
    (lambda (text)
      (set! text
	    (cond ((equal? "" text) text)
		  ((substring? old text)
		   => (lambda (idx)
			(string-append
			 (substring text 0 idx)
			 new
			 (sub (substring
			       text (+ idx (string-length old))
			       (string-length text))))))
		  (else text)))
      (if (null? rest)
	  text
	  (apply string-subst text rest))))
  (sub text))
;@
(define (count-newlines str)
  (do ((idx (+ -1 (string-length str)) (+ -1 idx))
       (cnt 0 (+ (if (eqv? #\newline (string-ref str idx)) 1 0) cnt)))
      ((<= idx 0) cnt)))