;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.19 2009/09/17 19:17:30 edi Exp $
;;; An optimized representation of sets of characters.
;;; Copyright (c) 2008-2009, Dr. Edmund Weitz. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;; * Redistributions of source code must retain the above copyright
;;; notice, this list of conditions and the following disclaimer.
;;; * Redistributions in binary form must reproduce the above
;;; copyright notice, this list of conditions and the following
;;; disclaimer in the documentation and/or other materials
;;; provided with the distribution.
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package :cl-ppcre)
(defstruct (charmap (:constructor make-charmap%))
;; a bit vector mapping char codes to "booleans" (1 for set members,
;; 0 for others)
(vector #*0 :type simple-bit-vector)
;; the smallest character code of all characters in the set
(start 0 :type fixnum)
;; the upper (exclusive) bound of all character codes in the set
(end 0 :type fixnum)
;; the number of characters in the set, or NIL if this is unknown
(count nil :type (or fixnum null))
;; whether the charmap actually represents the complement of the set
(complementp nil :type boolean))
;; seems to be necessary for some Lisps like ClozureCL
(defmethod make-load-form ((map charmap) &optional environment)
(make-load-form-saving-slots map :environment environment))
(declaim (inline in-charmap-p))
(defun in-charmap-p (char charmap)
"Tests whether the character CHAR belongs to the set represented by CHARMAP."
(declare #.*standard-optimize-settings*)
(declare (character char) (charmap charmap))
(let* ((char-code (char-code char))
(char-in-vector-p
(let ((charmap-start (charmap-start charmap)))
(declare (fixnum charmap-start))
(and (<= charmap-start char-code)
(< char-code (the fixnum (charmap-end charmap)))
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap))
(- char-code charmap-start)))))))
(cond ((charmap-complementp charmap) (not char-in-vector-p))
(t char-in-vector-p))))
(defun charmap-contents (charmap)
"Returns a list of all characters belonging to a character map.
Only works for non-complement charmaps."
(declare #.*standard-optimize-settings*)
(declare (charmap charmap))
(and (not (charmap-complementp charmap))
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap)
for i across (the simple-bit-vector (charmap-vector charmap))
when (= i 1)
collect (code-char code))))
(defun make-charmap (start end test-function &optional complementp)
"Creates and returns a charmap representing all characters with
character codes in the interval [start end) that satisfy
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the
value of the optional argument, but this argument doesn't have an
effect on how TEST-FUNCTION is used."
(declare #.*standard-optimize-settings*)
(declare (fixnum start end))
(let ((vector (make-array (- end start) :element-type 'bit))
(count 0))
(declare (fixnum count))
(loop for code from start below end
for char = (code-char code)
for index from 0
when char do
(incf count)
(setf (sbit vector index) (if (funcall test-function char) 1 0)))
(make-charmap% :vector vector
:start start
:end end
;; we don't know for sure if COMPLEMENTP is true as
;; there isn't a necessary a character for each
;; integer below *REGEX-CHAR-CODE-LIMIT*
:count (and (not complementp) count)
;; make sure it's boolean
:complementp (not (not complementp)))))
(defun create-charmap-from-test-function (test-function start end)
"Creates and returns a charmap representing all characters with
character codes between START and END which satisfy TEST-FUNCTION.
Tries to find the smallest interval which is necessary to represent
the character set and uses the complement representation if that
helps."
(declare #.*standard-optimize-settings*)
(let (start-in end-in start-out end-out)
;; determine the smallest intervals containing the set and its
;; complement, [start-in, end-in) and [start-out, end-out) - first
;; the lower bound
(loop for code from start below end
for char = (code-char code)
until (and start-in start-out)
when (and char
(not start-in)
(funcall test-function char))
do (setq start-in code)
when (and char
(not start-out)
(not (funcall test-function char)))
do (setq start-out code))
(unless start-in
;; no character satisfied the test, so return a "pseudo" charmap
;; where IN-CHARMAP-P is always false
(return-from create-charmap-from-test-function
(make-charmap% :count 0)))
(unless start-out
;; no character failed the test, so return a "pseudo" charmap
;; where IN-CHARMAP-P is always true
(return-from create-charmap-from-test-function
(make-charmap% :complementp t)))
;; now determine upper bound
(loop for code from (1- end) downto start
for char = (code-char code)
until (and end-in end-out)
when (and char
(not end-in)
(funcall test-function char))
do (setq end-in (1+ code))
when (and char
(not end-out)
(not (funcall test-function char)))
do (setq end-out (1+ code)))
;; use the smaller interval
(cond ((<= (- end-in start-in) (- end-out start-out))
(make-charmap start-in end-in test-function))
(t (make-charmap start-out end-out (complement* test-function) t)))))