Parent

Files

HTML::SGMLParser

A parser for SGML that uses the derived class as a static DTD.

Constants

Attrfind

Assaf: "/" is no longer an allowed character in attribute values

Charref
Commentclose
Commentopen
Endbracket

Assaf: fixed to allow tag to close itself (XHTML)

Endtagopen
Entitydefs
Entityref
Incomplete
Interesting

Regular expressions used for parsing:

Special
Starttagopen
Tagfind

Public Class Methods

new(verbose=false) click to toggle source
# File lib/html/htmlparser.rb, line 34
# Creates a parser and puts it into its initial state via reset.
#
# verbose:: stored in @verbose; report_unbalanced prints diagnostics only
#           when it is truthy.
def initialize(verbose = false)
  @verbose = verbose
  reset
end

Public Instance Methods

close() click to toggle source
# File lib/html/htmlparser.rb, line 65
# Signals the end of the input: runs goahead in "end" mode, so any text
# still buffered is passed to handle_data instead of being kept back.
#
# Returns whatever goahead returns.
def close
  at_end_of_input = true
  goahead(at_end_of_input)
end
feed(data) click to toggle source
# File lib/html/htmlparser.rb, line 60
# Appends +data+ to the input buffer (@rawdata, modified in place) and
# processes as much of the buffer as is currently complete.
#
# Returns whatever goahead returns.
def feed(data)
  @rawdata.concat(data)
  goahead(false)
end
finish_endtag(tag) click to toggle source
# File lib/html/htmlparser.rb, line 255
# Closes the element named +tag+, together with every element opened after
# it that is still on @stack.
#
# tag:: lower-cased element name; the empty string (from "</>") means
#       "whatever element is on top of the stack".
#
# * Empty tag and empty stack: unknown_endtag is called, nothing else.
# * Tag not on the stack: unknown_endtag is called unless the object
#   responds to "end_<tag>"; the stack is left untouched.
# * Otherwise elements are popped from the top down to, and including, the
#   innermost open element named +tag+.  For each one, handle_endtag is
#   called when an "end_<name>" method exists, unknown_endtag otherwise.
#
# Always returns nil.
def finish_endtag(tag)
  if tag == ''
    if @stack.empty?
      unknown_endtag(tag)
      return
    end
    found = @stack.length - 1
  else
    unless @stack.include?(tag)
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
    # rindex, not index: with nested elements of the same name the end tag
    # belongs to the innermost one, so only that one (and anything opened
    # inside it) may be closed.
    found = @stack.rindex(tag)
  end

  while @stack.length > found
    open_tag = @stack[-1]
    handler = 'end_' + open_tag
    if respond_to?(handler)
      handle_endtag(open_tag, handler)
    else
      unknown_endtag(open_tag)
    end
    # Popped only after the handler ran, so the handler still sees the
    # element on the stack.
    @stack.pop
  end
end
finish_starttag(tag, attrs) click to toggle source
# File lib/html/htmlparser.rb, line 237
# Dispatches a parsed start tag to the matching handler method.
#
# tag::   element name
# attrs:: attribute list, passed through to the handler
#
# Returns
# *  1 when a "start_<tag>" method exists: the tag is pushed onto @stack
#    and handle_starttag is called;
# *  0 when only a "do_<tag>" method exists: handle_starttag is called
#    and the stack is not touched;
# * -1 when neither exists: unknown_starttag is called.
def finish_starttag(tag, attrs)
  start_handler = 'start_' + tag
  if respond_to?(start_handler)
    @stack << tag
    handle_starttag(tag, start_handler, attrs)
    return 1
  end

  do_handler = 'do_' + tag
  if respond_to?(do_handler)
    handle_starttag(tag, do_handler, attrs)
    return 0
  end

  unknown_starttag(tag, attrs)
  -1
end
goahead(_end) click to toggle source
# File lib/html/htmlparser.rb, line 69
# Scans the buffered input (@rawdata) and dispatches it to the handlers.
#
# Plain text goes to handle_data; start tags, end tags, comments and
# "special" constructs go to parse_starttag, parse_endtag, parse_comment
# and parse_special; character and entity references go to handle_charref
# and handle_entityref.  Input that cannot be processed yet (for example an
# unterminated tag) stays in @rawdata for the next call.
#
# _end:: when true, whatever is left unprocessed is flushed through
#        handle_data instead of being kept.
#
# The patterns used here (Interesting, Starttagopen, Endtagopen,
# Commentopen, Special, Charref, Entityref, Incomplete) are constants
# defined outside this method.
#
# Raises RuntimeError when Interesting stops at a character that is neither
# "<" nor "&".  Returns the new value of @rawdata.
def goahead(_end)
  rawdata = @rawdata
  i = 0
  n = rawdata.length
  while i < n
    if @nomoretags
      # After setnomoretags everything is plain data.
      handle_data(rawdata[i..(n - 1)])
      i = n
      break
    end

    # Everything up to the next interesting character is plain data.
    j = rawdata.index(Interesting, i)
    j = n unless j
    handle_data(rawdata[i..(j - 1)]) if i < j
    i = j
    break if i == n

    # rawdata[i, 1] yields a one-character String on every Ruby version
    # (rawdata[i] was an Integer before Ruby 1.9).
    current = rawdata[i, 1]
    if current == '<'
      if rawdata.index(Starttagopen, i) == i
        if @literal
          # In literal mode markup is passed through as data, one character
          # at a time; only an end tag (below) leaves literal mode.
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_starttag(i)
        break unless k
        i = k # parse_starttag returns an absolute position
        next
      end
      if rawdata.index(Endtagopen, i) == i
        k = parse_endtag(i)
        break unless k
        i = k # parse_endtag returns an absolute position
        @literal = false
        next
      end
      if rawdata.index(Commentopen, i) == i
        if @literal
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_comment(i)
        break unless k
        i += k # parse_comment returns the number of characters consumed
        next
      end
      if rawdata.index(Special, i) == i
        if @literal
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_special(i)
        break unless k
        i += k # parse_special returns the number of characters consumed
        next
      end
    elsif current == '&'
      if rawdata.index(Charref, i) == i
        i += Regexp.last_match(0).length
        handle_charref(Regexp.last_match(1))
        # The last matched character is consumed only when it is ";";
        # any other character is handed back to be scanned again.
        i -= 1 unless rawdata[i - 1, 1] == ';'
        next
      end
      if rawdata.index(Entityref, i) == i
        i += Regexp.last_match(0).length
        handle_entityref(Regexp.last_match(1))
        i -= 1 unless rawdata[i - 1, 1] == ';'
        next
      end
    else
      raise RuntimeError, 'neither < nor & ??'
    end

    # We get here only if incomplete matches but nothing else.
    match = rawdata.index(Incomplete, i)
    unless match == i
      handle_data(rawdata[i, 1])
      i += 1
      next
    end
    j = match + Regexp.last_match(0).length
    break if j == n # Really incomplete: wait for more input

    handle_data(rawdata[i..(j - 1)])
    i = j
  end

  if _end && i < n
    handle_data(@rawdata[i..(n - 1)])
    i = n
  end
  @rawdata = rawdata[i..-1]
end
handle_charref(name) click to toggle source
# File lib/html/htmlparser.rb, line 308
# Handles a numeric character reference such as "&#65;".
#
# name:: the reference without "&#" and ";" (goahead passes the first
#        capture of Charref)
#
# The number is read as decimal.  Without the explicit base,
# Integer("065") would be read as octal (53), and "08"/"09" would be
# rejected as invalid octal numbers.
#
# Values from 0 to 255 are converted with Integer#chr and passed to
# handle_data; anything else, including text that is not a number, is
# passed to unknown_charref.
def handle_charref(name)
  code = begin
    # A base may only be given for strings; other values keep the old path.
    name.is_a?(String) ? Integer(name, 10) : Integer(name)
  rescue StandardError
    -1
  end

  unless code.between?(0, 255)
    unknown_charref(name)
    return
  end
  handle_data(code.chr)
end
handle_comment(data) click to toggle source
# File lib/html/htmlparser.rb, line 330
# Hook for comments.  parse_comment calls it with the text between
# "<!--" and the comment terminator.  This default implementation ignores
# the text and returns nil.
def handle_comment(data)
  nil
end
handle_data(data) click to toggle source
# File lib/html/htmlparser.rb, line 327
# Hook for character data.  goahead, handle_charref and handle_entityref
# call it with the text to deliver.  This default implementation ignores
# the text and returns nil.
def handle_data(data)
  nil
end
handle_endtag(tag, method) click to toggle source
# File lib/html/htmlparser.rb, line 297
# Invokes the end-tag handler for an element.
#
# tag::    element name (not used here)
# method:: name of the handler to call, e.g. "end_p"; called without
#          arguments
#
# Returns the handler's result.
def handle_endtag(tag, method)
  send(method)
end
handle_entityref(name) click to toggle source
# File lib/html/htmlparser.rb, line 317
# Handles a named entity reference such as "&amp;".
#
# name:: entity name without "&" and ";"
#
# When Entitydefs contains +name+, the mapped value is passed to
# handle_data and that call's result is returned.  Otherwise
# unknown_entityref is called and nil is returned.
def handle_entityref(name)
  unless Entitydefs.include?(name)
    unknown_entityref(name)
    return
  end
  handle_data(Entitydefs[name])
end
handle_special(data) click to toggle source
# File lib/html/htmlparser.rb, line 333
# Hook for "special" constructs.  parse_special calls it with the text
# between the opening "<" and the closing bracket.  This default
# implementation ignores the text and returns nil.
def handle_special(data)
  nil
end
handle_starttag(tag, method, attrs) click to toggle source
# File lib/html/htmlparser.rb, line 293
# Invokes the start-tag handler for an element.
#
# tag::    element name (not used here)
# method:: name of the handler to call, e.g. "start_a" or "do_br"
# attrs::  attribute list, passed to the handler as its only argument
#
# Returns the handler's result.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end
has_context(gi) click to toggle source
# File lib/html/htmlparser.rb, line 47
# Tells whether an element named +gi+ is currently open, i.e. whether
# +gi+ is on @stack.  Returns true or false.
def has_context(gi)
  @stack.include?(gi)
end
parse_comment(i) click to toggle source
# File lib/html/htmlparser.rb, line 164
# Parses the comment that starts at position +i+ of @rawdata.
#
# The text between "<!--" and the first match of Commentclose (searched
# from +i+) is passed to handle_comment.
#
# Returns the number of characters the comment occupies, counted from +i+
# up to and including the terminator, or nil when no terminator is in the
# buffer yet.  Raises RuntimeError when the buffer does not have "<!--" at
# +i+.
def parse_comment(i)
  buffer = @rawdata
  raise RuntimeError, 'unexpected call to handle_comment' unless buffer[i, 4] == '<!--'

  close_at = buffer.index(Commentclose, i)
  return nil if close_at.nil?

  # Taken before handle_comment runs, while the match data is still ours.
  close_length = Regexp.last_match(0).length
  handle_comment(buffer[(i + 4)..(close_at - 1)])
  close_at + close_length - i
end
parse_endtag(i) click to toggle source
# File lib/html/htmlparser.rb, line 225
# Parses the end tag that starts at position +i+ of @rawdata ("</name>").
#
# The name is the text between "</" and the first match of Endbracket
# (searched from i + 1), stripped of whitespace and lower-cased; it is
# passed to finish_endtag.
#
# Returns the position at which scanning continues: just past a closing
# ">", or at the bracket itself when the tag was cut short by another
# bracket.  Returns nil when no bracket is in the buffer yet.
def parse_endtag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j

  # to_s guards against a nil slice when the range starts past the end.
  tag = rawdata[(i + 2)..(j - 1)].to_s.strip.downcase
  # rawdata[j, 1] is a one-character String on every Ruby version.
  j += 1 if rawdata[j, 1] == '>'
  finish_endtag(tag)
  j
end
parse_special(i) click to toggle source
# File lib/html/htmlparser.rb, line 284
# Parses the "special" construct that starts at position +i+ of @rawdata.
#
# The text between the opening "<" and the first match of Endbracket
# (searched from i + 1) is passed to handle_special.
#
# Returns the number of characters consumed, counted from +i+ through the
# end of the Endbracket match, or nil when no bracket is in the buffer yet.
def parse_special(i)
  buffer = @rawdata
  close_at = buffer.index(Endbracket, i + 1)
  return nil if close_at.nil?

  # Taken before handle_special runs, while the match data is still ours.
  bracket_length = Regexp.last_match(0).length
  handle_special(buffer[(i + 1)..(close_at - 1)])
  close_at - i + bracket_length
end
parse_starttag(i) click to toggle source
# File lib/html/htmlparser.rb, line 178
# Parses the start tag that begins at position +i+ of @rawdata.
#
# The element name is lower-cased and remembered in @lasttag; "<>" is the
# SGML shorthand for "the last start tag seen again".  Attributes are
# collected as [name, value] pairs: names are lower-cased, a value-less
# attribute gets the empty string, and surrounding single or double quotes
# are removed from values.
#
# The tag is passed to finish_starttag; a self-closing tag ("/>") is
# followed immediately by finish_endtag.
#
# Returns the position at which scanning continues, or nil when the tag is
# not terminated in the buffer yet.  Raises RuntimeError when no tag name
# can be found.
#
# Endbracket, Tagfind and Attrfind are constants defined outside this
# method.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j

  attrs = []
  # rawdata[x, 1] is a one-character String on every Ruby version.
  if rawdata[i + 1, 1] == '>'
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    match = rawdata.index(Tagfind, i + 1)
    raise RuntimeError, 'unexpected call to parse_starttag' unless match
    name = Regexp.last_match(0)
    k = i + 1 + name.length
    tag = name.downcase
    @lasttag = tag
  end

  while k < j
    # Assaf: fixed to allow tag to close itself (XHTML)
    idx = rawdata.index(Attrfind, k)
    break unless idx && idx < j
    matched_length = Regexp.last_match(0).length
    attrname = Regexp.last_match(1)
    rest = Regexp.last_match(2)
    attrvalue = Regexp.last_match(3)
    if !rest
      attrvalue = '' # was: = attrname
    # Assaf: fixed to handle double quoted attribute values properly
    elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
          (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
      attrvalue = attrvalue[1..-2]
    end
    attrs << [attrname.downcase, attrvalue]
    # Continue after the match itself.  The match may start later than k
    # when unparsable text precedes it; advancing k by the match length
    # alone would then rescan part of the attribute just read.
    k = idx + matched_length
  end

  # Assaf: fixed to allow tag to close itself (XHTML)
  if rawdata[j, 2] == '/>'
    j += 2
    finish_starttag(tag, attrs)
    finish_endtag(tag)
  else
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
  end
  j
end
report_unbalanced(tag) click to toggle source
# File lib/html/htmlparser.rb, line 301
# Prints a diagnostic about an end tag that has no matching open element.
# Nothing is printed unless @verbose is set.
#
# tag:: name of the unbalanced end tag
#
# The open-element stack is read from @stack directly: no +stack+ reader
# method is visible for this class, so calling self.stack would raise
# NoMethodError.
#
# Returns nil.
def report_unbalanced(tag)
  return unless @verbose

  print '*** Unbalanced </' + tag + '>', "\n"
  print '*** Stack:', @stack, "\n"
end
reset() click to toggle source
# File lib/html/htmlparser.rb, line 39
# Puts the parser back into its initial state: empty input buffer, empty
# element stack, placeholder last tag, and both the "no more tags" and
# "literal" modes switched off.
def reset
  @rawdata    = ''
  @stack      = []
  @lasttag    = '???'
  @nomoretags = false
  @literal    = false
end
setliteral(*args) click to toggle source
# File lib/html/htmlparser.rb, line 56
# Switches literal mode on.  While @literal is set, goahead passes start
# tags, comments and special constructs through as data; it clears the
# flag again after parsing an end tag.
#
# Any arguments are accepted and ignored.  Returns true.
def setliteral(*args)
  @literal = true
end
setnomoretags() click to toggle source
# File lib/html/htmlparser.rb, line 51
# Switches tag recognition off: sets both @nomoretags and @literal.  With
# @nomoretags set, goahead passes all remaining input to handle_data.
#
# Returns true.
def setnomoretags
  @nomoretags = @literal = true
end
unknown_charref(ref) click to toggle source
# File lib/html/htmlparser.rb, line 340
# Hook for numeric character references that handle_charref cannot
# convert (not a number, or outside 0..255).  This default implementation
# ignores the reference and returns nil.
def unknown_charref(ref)
  nil
end
unknown_endtag(tag) click to toggle source
# File lib/html/htmlparser.rb, line 338
# Hook for end tags without an "end_<tag>" handler; called by
# finish_endtag.  This default implementation ignores the tag and returns
# nil.
def unknown_endtag(tag)
  nil
end
unknown_entityref(ref) click to toggle source
# File lib/html/htmlparser.rb, line 342
# Hook for entity references whose name is not in Entitydefs; called by
# handle_entityref.  This default implementation ignores the reference
# and returns nil.
def unknown_entityref(ref)
  nil
end
unknown_starttag(tag, attrs) click to toggle source
# File lib/html/htmlparser.rb, line 336
# Hook for start tags that have neither a "start_<tag>" nor a "do_<tag>"
# handler; called by finish_starttag with the tag name and its attribute
# list.  This default implementation ignores both and returns nil.
def unknown_starttag(tag, attrs)
  nil
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.