# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
# Event-driven tokenizer for SGML/HTML text.
#
# Input is pushed in with #feed (any number of times) and flushed with #close.
# As markup is recognised the parser calls handler methods on itself:
#
# * handle_data(text)          -- character data
# * handle_comment(text)       -- body of a <!-- ... --> comment
# * handle_special(text)       -- body of a <! ... > declaration
# * start_TAG(attrs) / end_TAG -- container elements (tracked on a stack)
# * do_TAG(attrs)              -- elements without an end tag
# * unknown_starttag, unknown_endtag, unknown_charref, unknown_entityref
#
# Every handler defined here is a no-op; subclasses override the ones they
# need and add start_/end_/do_ methods for the tags they understand.
class SGMLParser
  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')
  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/
  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Replacement text for the entity names resolved by #handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, #report_unbalanced prints diagnostics to stdout.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Discards buffered input and the open-element stack, and clears the
  # literal / no-more-tags modes.
  def reset
    # dup: the buffer is appended to in place by #feed, so it must never be a
    # frozen (or "chilled") string literal.
    @rawdata = ''.dup
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True when an element named +gi+ is currently open on the stack.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on treat all remaining input as plain character data.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Enter literal mode: start tags, comments and declarations are passed
  # through as data until the next end tag is parsed. Arguments are ignored.
  def setliteral(*args)
    @literal = true
  end

  # Appends +data+ to the buffer and parses as much of it as is complete.
  # A construct that may continue in a later chunk stays buffered.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parses whatever is still buffered, emitting any unfinished tail as data.
  def close
    goahead(true)
  end

  # Main scanning loop over the buffered text.
  #
  # _end:: true when no more input will arrive, so an incomplete trailing
  #        construct is flushed through handle_data instead of being kept.
  #
  # The unconsumed remainder becomes the new buffer.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n - 1)])
        i = n
        break
      end

      # Everything up to the next '&' or '<' is plain data.
      j = rawdata.index(Interesting, i) || n
      handle_data(rawdata[i..(j - 1)]) if i < j
      i = j
      break if i == n

      if rawdata[i] == ?<
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k          # tag not closed yet: wait for more input
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false        # any end tag terminates literal mode
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_comment(i)    # returns a length, not a position
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)    # returns a length, not a position
          break unless k
          i += k
          next
        end
      elsif rawdata[i] == ?&
        if rawdata.index(Charref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_charref(found[1])
          # The pattern swallows one terminating character; give it back
          # unless it was the ';' that belongs to the reference.
          i -= 1 unless rawdata[i - 1] == ?;
          next
        end
        if rawdata.index(Entityref, i) == i
          found = Regexp.last_match
          i += found[0].length
          handle_entityref(found[1])
          i -= 1 unless rawdata[i - 1] == ?;
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end

      # We get here only if incomplete matches but nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + Regexp.last_match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j - 1)])
      i = j
    end

    if _end && i < n
      handle_data(@rawdata[i..(n - 1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at index +i+ and reports its body through
  # handle_comment.
  #
  # Returns the number of characters consumed, or nil when the closing
  # "-->" has not been buffered yet. Raises RuntimeError when the text at
  # +i+ does not start with "<!--".
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    close_at = rawdata.index(Commentclose, i)
    return nil unless close_at
    close_length = Regexp.last_match[0].length
    handle_comment(rawdata[(i + 4)..(close_at - 1)])
    close_at + close_length - i
  end

  # Parses the start tag at index +i+, collects its attributes as
  # [name, value] pairs (names lower-cased, surrounding quotes removed,
  # value '' when none is given) and dispatches through finish_starttag.
  #
  # Returns the index just past the tag, or nil when no closing bracket has
  # been buffered yet. Raises RuntimeError when no tag name is found.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i + 1] == ?>
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      unless rawdata.index(Tagfind, i + 1)
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      name = Regexp.last_match[0]
      k = i + 1 + name.length
      tag = name.downcase
      @lasttag = tag
    end
    while k < j
      # The attribute must begin exactly at k. An unanchored search would
      # pick up text that follows the tag (e.g. "<br/>hello") as attributes.
      break unless rawdata.index(Attrfind, k) == k
      found = Regexp.last_match
      attrname, rest, attrvalue = found[1], found[2], found[3]
      if !rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
            (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k += found[0].length
    end
    j += 1 if rawdata[j] == ?>
    finish_starttag(tag, attrs)
    return j
  end

  # Parses the end tag at index +i+ and dispatches through finish_endtag.
  #
  # Returns the index just past the tag, or nil when no closing bracket has
  # been buffered yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = rawdata[(i + 2)..(j - 1)].strip.downcase
    j += 1 if rawdata[j] == ?>
    finish_endtag(tag)
    return j
  end

  # Dispatches a start tag to the matching handler.
  #
  # Returns 1 when start_TAG exists (the tag is pushed on the stack),
  # 0 when do_TAG exists, and -1 after falling back to unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    end

    method = 'do_' + tag
    if respond_to?(method)
      handle_starttag(tag, method, attrs)
      return 0
    end

    unknown_starttag(tag, attrs)
    return -1
  end

  # Handles an end tag by closing every element from the top of the stack
  # down to the innermost open element named +tag+. An empty +tag+ ("</>")
  # closes only the most recently opened element.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        unknown_endtag(tag) unless respond_to?('end_' + tag)
        return
      end
      # rindex, not index: with nested elements of the same name only the
      # innermost one (and whatever is open inside it) must be closed.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parses the "<! ... >" declaration at index +i+ and reports the text
  # between the brackets through handle_special.
  #
  # Returns the number of characters consumed, or nil when no closing
  # bracket has been buffered yet.
  def parse_special(i)
    rawdata = @rawdata
    close_at = rawdata.index(Endbracket, i + 1)
    return nil unless close_at
    close_length = Regexp.last_match[0].length
    handle_special(rawdata[(i + 1)..(close_at - 1)])
    return close_at - i + close_length
  end

  # Invokes the start_TAG / do_TAG handler named +method+ with +attrs+.
  def handle_starttag(tag, method, attrs)
    send(method, attrs)
  end

  # Invokes the end_TAG handler named +method+.
  def handle_endtag(tag, method)
    send(method)
  end

  # Prints a diagnostic about an unbalanced end tag when verbose mode is on.
  # Not invoked by this class itself; available to subclasses.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handles a numeric character reference; +name+ is the digit string.
  # Values 0..255 are emitted as a one-character string through
  # handle_data, anything else goes to unknown_charref.
  def handle_charref(name)
    # Explicit base 10: a bare Integer(name) treats a leading zero as an
    # octal prefix ("010" => 8) and rejects "09" outright.
    n = Integer(name, 10)
    unless 0 <= n && n <= 255
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles a named entity reference: names found in Entitydefs are emitted
  # as data, all others go to unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  # Hook: character data. No-op by default.
  def handle_data(data)
  end

  # Hook: comment body. No-op by default.
  def handle_comment(data)
  end

  # Hook: body of a "<! ... >" declaration. No-op by default.
  def handle_special(data)
  end

  # Hook: start tag without a start_TAG or do_TAG handler. No-op by default.
  def unknown_starttag(tag, attrs)
  end

  # Hook: end tag without a matching open element or handler. No-op by default.
  def unknown_endtag(tag)
  end

  # Hook: character reference outside 0..255. No-op by default.
  def unknown_charref(ref)
  end

  # Hook: entity name missing from Entitydefs. No-op by default.
  def unknown_entityref(ref)
  end
end