Object
A parser for SGML, using the derived class as static DTD.
Assaf: the "/" character is no longer part of the allowed attribute value
Assaf: fixed to allow a tag to close itself (XHTML)
Regular expressions used for parsing:
# File lib/html/htmlparser.rb, line 60
# Appends +data+ to the buffer of unparsed input and then runs the parser
# over the buffer without treating it as the end of the input.
#
# data:: text to add to @rawdata.
#
# Returns whatever goahead returns.
def feed(data)
  @rawdata.concat(data)
  goahead(false)
end
# File lib/html/htmlparser.rb, line 255
# Processes an end tag by unwinding the stack of open elements.
#
# tag:: lower-cased element name taken from the end tag; the empty string
#       closes only the most recently opened element.
#
# For a named tag, every element opened after the innermost open element of
# that name is closed as well, each through its own end_<name> method (via
# handle_endtag) or through unknown_endtag when no such method exists.
# A tag that is not open at all is reported with unknown_endtag only if there
# is no end_<tag> method; otherwise it is ignored.
def finish_endtag(tag)
  if tag == ''
    found = @stack.length - 1
    if found < 0
      # Nothing is open, so there is nothing to close.
      unknown_endtag(tag)
      return
    end
  else
    unless @stack.include?(tag)
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
    # rindex, not index: with nested elements of the same name the end tag
    # belongs to the innermost one, so outer ones must stay open.
    found = @stack.rindex(tag)
  end

  while @stack.length > found
    open_tag = @stack[-1]
    method = 'end_' + open_tag
    if respond_to?(method)
      handle_endtag(open_tag, method)
    else
      unknown_endtag(open_tag)
    end
    @stack.pop
  end
end
# File lib/html/htmlparser.rb, line 237
# Dispatches a parsed start tag to the handler defined for it.
#
# tag::   lower-cased element name.
# attrs:: list of [name, value] pairs collected from the tag.
#
# Returns 1 when a start_<tag> method exists (the tag is also pushed on the
# open-element stack), 0 when only a do_<tag> method exists (nothing is
# pushed), and -1 when neither exists and unknown_starttag was called.
def finish_starttag(tag, attrs)
  opener = 'start_' + tag
  if respond_to?(opener)
    @stack.push(tag)
    handle_starttag(tag, opener, attrs)
    return 1
  end

  standalone = 'do_' + tag
  if respond_to?(standalone)
    handle_starttag(tag, standalone, attrs)
    return 0
  end

  unknown_starttag(tag, attrs)
  -1
end
# File lib/html/htmlparser.rb, line 69
# Parses as much of the buffered input (@rawdata) as possible, dispatching to
# the parse_* and handle_* methods, and stores the unconsumed remainder back
# in @rawdata.
#
# _end:: when true, anything still unparsed after the loop is passed to
#        handle_data instead of being kept for a later call.
#
# Raises RuntimeError if the Interesting pattern stops on a character that is
# neither '<' nor '&'.
def goahead(_end)
  rawdata = @rawdata
  i = 0
  n = rawdata.length
  while i < n
    if @nomoretags
      # Markup recognition is off: the rest of the buffer is character data.
      handle_data(rawdata[i..(n - 1)])
      i = n
      break
    end

    # Everything up to the next interesting character is character data.
    j = rawdata.index(Interesting, i) || n
    handle_data(rawdata[i..(j - 1)]) if i < j
    i = j
    break if i == n

    # One-character slices compare as strings regardless of whether
    # String#[] with a single index yields a String or an Integer.
    if rawdata[i, 1] == '<'
      if rawdata.index(Starttagopen, i) == i
        if @literal
          # In literal mode a start tag is ordinary text.
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_starttag(i)
        break unless k
        i = k # parse_starttag returns an absolute position
        next
      end
      if rawdata.index(Endtagopen, i) == i
        k = parse_endtag(i)
        break unless k
        i = k # parse_endtag returns an absolute position
        @literal = false
        next
      end
      if rawdata.index(Commentopen, i) == i
        if @literal
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_comment(i)
        break unless k
        i += k # parse_comment returns the number of characters consumed
        next
      end
      if rawdata.index(Special, i) == i
        if @literal
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        k = parse_special(i)
        break unless k
        i += k # parse_special returns the number of characters consumed
        next
      end
    elsif rawdata[i, 1] == '&'
      if rawdata.index(Charref, i) == i
        i += Regexp.last_match(0).length
        handle_charref(Regexp.last_match(1))
        # The match includes one terminating character; only ';' is consumed.
        i -= 1 unless rawdata[i - 1, 1] == ';'
        next
      end
      if rawdata.index(Entityref, i) == i
        i += Regexp.last_match(0).length
        handle_entityref(Regexp.last_match(1))
        # The match includes one terminating character; only ';' is consumed.
        i -= 1 unless rawdata[i - 1, 1] == ';'
        next
      end
    else
      raise RuntimeError, 'neither < nor & ??'
    end

    # We get here only if incomplete matches but nothing else
    match = rawdata.index(Incomplete, i)
    unless match == i
      handle_data(rawdata[i, 1])
      i += 1
      next
    end
    j = match + Regexp.last_match(0).length
    break if j == n # Really incomplete
    handle_data(rawdata[i..(j - 1)])
    i = j
  end

  if _end and i < n
    handle_data(@rawdata[i..(n - 1)])
    i = n
  end
  @rawdata = rawdata[i..-1]
end
# File lib/html/htmlparser.rb, line 308
# Handles a numeric character reference.
#
# name:: the digits of the reference, as captured by the parser.
#
# The digits are always read as a decimal number, so leading zeros do not
# switch the interpretation to octal. Values from 0 to 255 are converted to
# a one-character string and passed to handle_data; anything else, including
# text that is not a decimal integer, is passed to unknown_charref.
def handle_charref(name)
  code =
    begin
      # Explicit base 10: Integer("065") without a base would parse as octal.
      Integer(name.to_s, 10)
    rescue ArgumentError
      -1
    end

  unless code.between?(0, 255)
    unknown_charref(name)
    return
  end
  handle_data(code.chr)
end
# File lib/html/htmlparser.rb, line 330
# Hook that parse_comment calls with the text found between the opening
# "<!--" and the comment terminator. This implementation does nothing and
# returns nil.
#
# data:: the comment text.
def handle_comment(data)
  nil
end
# File lib/html/htmlparser.rb, line 327
# Hook that receives character data: the text runs found by goahead and the
# strings produced by handle_charref and handle_entityref. This
# implementation does nothing and returns nil.
#
# data:: the character data.
def handle_data(data)
  nil
end
# File lib/html/htmlparser.rb, line 297
# Invokes the handler method for an end tag.
#
# tag::    element name (not used by this implementation).
# method:: name of the method to call; it is called without arguments.
#
# Returns the handler's return value.
def handle_endtag(tag, method)
  send(method)
end
# File lib/html/htmlparser.rb, line 317
# Handles a named entity reference.
#
# name:: the entity name.
#
# A name present in Entitydefs is replaced by its entry there, which is
# passed to handle_data (whose result is returned). Any other name goes to
# unknown_entityref and nil is returned.
def handle_entityref(name)
  unless Entitydefs.include?(name)
    unknown_entityref(name)
    return
  end
  handle_data(Entitydefs[name])
end
# File lib/html/htmlparser.rb, line 333
# Hook that parse_special calls with the text found between the opening
# character of the construct and its closing bracket. This implementation
# does nothing and returns nil.
#
# data:: the text of the construct.
def handle_special(data)
  nil
end
# File lib/html/htmlparser.rb, line 293
# Invokes the handler method for a start tag.
#
# tag::    element name (not used by this implementation).
# method:: name of the method to call.
# attrs::  attribute list, passed to the handler as its only argument.
#
# Returns the handler's return value.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end
# File lib/html/htmlparser.rb, line 47
# Tells whether an element is currently open.
#
# gi:: element name to look for.
#
# Returns true when +gi+ is on the open-element stack, false otherwise.
def has_context(gi)
  @stack.include?(gi)
end
# File lib/html/htmlparser.rb, line 164
# Parses the comment that starts at position +i+ of the buffered input and
# passes its text to handle_comment.
#
# i:: index in @rawdata of the "<" that opens the comment.
#
# Returns the number of characters the comment occupies, counted from +i+,
# or nil when no comment terminator has been buffered yet.
#
# Raises RuntimeError when the text at +i+ does not start with "<!--".
def parse_comment(i)
  rawdata = @rawdata
  if rawdata[i, 4] != '<!--'
    # The message names this method so a failure points at the right place.
    raise RuntimeError, 'unexpected call to parse_comment'
  end

  close_at = rawdata.index(Commentclose, i)
  return nil unless close_at

  # Capture the terminator length before calling out to the hook.
  close_length = Regexp.last_match(0).length
  handle_comment(rawdata[(i + 4)..(close_at - 1)])
  close_at + close_length - i
end
# File lib/html/htmlparser.rb, line 225 def parse_endtag(i) rawdata = @rawdata j = rawdata.index(Endbracket, i + 1) return nil unless j tag = (rawdata[i+2..j-1].strip).downcase if rawdata[j] == >> # j += 1 end finish_endtag(tag) return j end
# File lib/html/htmlparser.rb, line 284
# Parses the special construct that starts at position +i+ of the buffered
# input and passes the text after its first character, up to the closing
# bracket, to handle_special.
#
# i:: index in @rawdata of the character that opens the construct.
#
# Returns the number of characters the construct occupies, counted from +i+
# and including the closing bracket, or nil when no closing bracket has been
# buffered yet.
def parse_special(i)
  text = @rawdata
  close_at = text.index(Endbracket, i + 1)
  return nil if close_at.nil?

  close_length = Regexp.last_match(0).length
  handle_special(text[(i + 1)..(close_at - 1)])
  close_at - i + close_length
end
# File lib/html/htmlparser.rb, line 178
# Parses the start tag that begins at position +i+ of the buffered input,
# collects its attributes and hands the result to finish_starttag. A tag
# terminated by "/>" is additionally closed with finish_endtag.
#
# i:: index in @rawdata of the "<" that opens the tag.
#
# Attributes are collected as [lower-cased name, value] pairs. An attribute
# without a value gets the empty string; a value wrapped in matching single
# or double quotes has the quotes removed.
#
# Returns the position at which parsing should continue, or nil when no
# closing bracket has been buffered yet.
#
# Raises RuntimeError when no tag name can be found after +i+.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j

  attrs = []
  if rawdata[i + 1, 1] == '>'
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    match = rawdata.index(Tagfind, i + 1)
    unless match
      raise RuntimeError, 'unexpected call to parse_starttag'
    end
    name = Regexp.last_match(0)
    k = i + 1 + name.length
    tag = name.downcase
    @lasttag = tag
  end

  while k < j
    # Assaf: fixed to allow tag to close itself (XHTML)
    idx = rawdata.index(Attrfind, k)
    break unless idx && idx < j

    found = Regexp.last_match
    attrname, rest, attrvalue = found[1], found[2], found[3]
    if not rest
      attrvalue = '' # was: = attrname
    # Assaf: fixed to handle double quoted attribute values properly
    elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") or
          (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
      attrvalue = attrvalue[1..-2]
    end
    attrs << [attrname.downcase, attrvalue]
    # The search is not anchored at k, so the match may begin later than k.
    # Continue from the end of the match; adding only its length to k would
    # rescan the tail of this attribute and report it again.
    k = idx + found[0].length
  end

  # Assaf: fixed to allow tag to close itself (XHTML)
  if rawdata[j, 2] == '/>'
    j += 2
    finish_starttag(tag, attrs)
    finish_endtag(tag)
  else
    # Step over the bracket only when it is ">"; any other match is left in
    # place to be parsed next.
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
  end
  j
end
# File lib/html/htmlparser.rb, line 301
# Prints a diagnostic about an end tag that has no matching start tag,
# followed by the current stack of open elements. Nothing is printed unless
# @verbose is set.
#
# tag:: name of the unbalanced end tag.
def report_unbalanced(tag)
  if @verbose
    print '*** Unbalanced </' + tag + '>', "\n"
    # Read the instance variable directly, as the other methods do, so this
    # does not depend on a +stack+ reader method being defined.
    print '*** Stack:', @stack, "\n"
  end
end
# File lib/html/htmlparser.rb, line 39
# Puts the parser back into its initial state: no buffered input, no open
# elements, the placeholder '???' as the last tag seen, and both the
# "no more tags" and "literal" modes switched off.
#
# Returns false.
def reset
  @rawdata = ''
  @stack = []
  @lasttag = '???'
  @nomoretags = @literal = false
end
# File lib/html/htmlparser.rb, line 56
# Switches the parser into literal mode by setting @literal. Any arguments
# are accepted and ignored.
#
# Returns true.
def setliteral(*_ignored)
  @literal = true
end
# File lib/html/htmlparser.rb, line 340
# Hook that handle_charref calls for a character reference it cannot turn
# into a character. This implementation does nothing and returns nil.
#
# ref:: the text of the reference as it was passed to handle_charref.
def unknown_charref(ref)
  nil
end
# File lib/html/htmlparser.rb, line 338
# Hook that finish_endtag calls for an end tag that cannot be matched to an
# open element or that has no end_<tag> handler method. This implementation
# does nothing and returns nil.
#
# tag:: the element name.
def unknown_endtag(tag)
  nil
end
Generated with the Darkfish Rdoc Generator 2.