Codebase list feed2imap / 6770b06 lib / feed2imap / sgml-parser.rb
6770b06

Tree @6770b06 (Download .tar.gz)

sgml-parser.rb @6770b06raw · history · blame

# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser

class SGMLParser

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
  Charref = /&#([0-9]+)[^0-9]/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  def reset
    @rawdata = ''
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  def has_context(gi)
    @stack.include? gi
  end

  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  def setliteral(*args)
    @literal = true
  end

  def feed(data)
    @rawdata << data
    goahead(false)
  end

  def close
    goahead(true)
  end

  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n-1)])
        i = n
        break
      end
      j = rawdata.index(Interesting, i)
      j = n unless j
      if i < j
        handle_data(rawdata[i..(j-1)])
      end
      i = j
      break if (i == n)
      if rawdata[i] == ?< #
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i,1])
            i += 1
            next
          end
          k = parse_comment(i)
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)
          break unless k
          i += k
          next
        end
      elsif rawdata[i] == ?& #
        if rawdata.index(Charref, i) == i
          i += $&.length
          handle_charref($1)
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
        if rawdata.index(Entityref, i) == i
          i += $&.length
          handle_entityref($1)
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + $&.length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j-1)])
      i = j
    end
    # end while
    if _end and i < n
      handle_data(@rawdata[i..(n-1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = $&.length
    j = match
    handle_comment(rawdata[i+4..(j-1)])
    j = match + matched_length
    return j-i
  end

  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i+1] == ?> #
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      match = rawdata.index(Tagfind, i + 1)
      unless match
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      k = i + 1 + ($&.length)
      tag = $&.downcase
      @lasttag = tag
    end
    while k < j
      break unless rawdata.index(Attrfind, k)
      matched_length = $&.length
      attrname, rest, attrvalue = $1, $2, $3
      if not rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
          (attrvalue[0] == ?" && attrvalue[-1,1] == ?")
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j] == ?> #
      j += 1
    end
    finish_starttag(tag, attrs)
    return j
  end

  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = (rawdata[i+2..j-1].strip).downcase
    if rawdata[j] == ?> #
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if self.respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    else
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      else
        unknown_starttag(tag, attrs)
        return -1
      end
    end
  end

  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        method = 'end_' + tag
        unless self.respond_to?(method)
          unknown_endtag(tag)
        end
        return
      end
      found = @stack.index(tag) #or @stack.length
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i+1)
    return nil unless match
    matched_length = $&.length
    handle_special(rawdata[i+1..(match-1)])
    return match - i + matched_length
  end

  def handle_starttag(tag, method, attrs)
    self.send(method, attrs)
  end

  def handle_endtag(tag, method)
    self.send(method)
  end

  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', self.stack, "\n"
    end
  end

  def handle_charref(name)
    n = Integer(name)
    if !(0 <= n && n <= 255)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end
  def unknown_endtag(tag)
  end
  def unknown_charref(ref)
  end
  def unknown_entityref(ref)
  end

end