class Nokogiri::HTML::Document

Public Class Methods

new click to toggle source

Create a new document

/*
 * call-seq:
 *  new(uri = nil, external_id = nil, ...)
 *
 * Create a new document.
 *
 * The first two optional arguments, when truthy, are handed to htmlNewDoc as
 * the uri and external_id strings; otherwise NULL is passed in their place.
 * All original arguments are then forwarded to the Ruby-level +initialize+
 * of the wrapped document.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE uri, external_id, rest, rb_doc;
  htmlDocPtr doc;

  /* Every argument is optional; collect them all into +rest+. */
  rb_scan_args(argc, argv, "0*", &rest);
  uri         = rb_ary_entry(rest, (long)0);
  external_id = rb_ary_entry(rest, (long)1);

  /*
   * htmlNewDoc consumes C strings, so use StringValueCStr: it guarantees
   * NUL termination and raises ArgumentError on embedded NUL bytes instead
   * of silently truncating the value.
   */
  doc = htmlNewDoc(
      RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL,
      RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL
  );
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);
  rb_obj_call_init(rb_doc, argc, argv);
  return rb_doc ;
}
parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) { |options| ... } click to toggle source

Parse HTML. string_or_io may be a String, or any object that responds to read and close, such as an IO or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

# File lib/nokogiri/html/document.rb, line 80
# Parse HTML from +string_or_io+ and return the resulting document.
#
# +string_or_io+:: a String, or any object that responds to +read+.
# +url+::          location of the document; when omitted for a readable
#                  object that responds to +path+, its path is used.
# +encoding+::     encoding name; when omitted it is taken from the input's
#                  own encoding (unless that is ASCII-8BIT) or detected via
#                  EncodingReader.
# +options+::      an Integer bit mask or an options object responding to
#                  +to_i+. An Integer is wrapped in
#                  Nokogiri::XML::ParseOptions before being yielded.
#
# If a block is given, the options object is yielded to it so the caller
# can adjust it before parsing starts.
#
# A +nil+ or empty non-IO input yields a fresh empty document (+new+).
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
  # Test against Integer: the Fixnum constant was removed in Ruby 3.2 and
  # referencing it there raises NameError.
  options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
  # Give the options to the user
  yield options if block_given?

  # Trust the input's own encoding tag unless it is plain binary.
  if string_or_io.respond_to?(:encoding)
    unless string_or_io.encoding.name == "ASCII-8BIT"
      encoding ||= string_or_io.encoding.name
    end
  end

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    unless encoding
      # Perform advanced encoding detection that libxml2 does
      # not do.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFoundException => e
        # A retry is required because libxml2 has a problem in
        # that it cannot switch encoding well in the middle of
        # parsing, especially if it has already seen a
        # non-ASCII character when it finds an encoding hint.
        encoding = e.encoding
      end
    end
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? || string_or_io.empty?

  encoding ||= EncodingReader.detect_encoding(string_or_io)

  read_memory(string_or_io, url, encoding, options.to_i)
end
read_io(io, url, encoding, options) click to toggle source

Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse

/*
 * call-seq:
 *  read_io(io, url, encoding, options)
 *
 * Read the HTML document from +io+ with given +url+, +encoding+,
 * and +options+.  See Nokogiri::HTML.parse
 *
 * +url+ and +encoding+ may be nil, in which case NULL is passed to
 * htmlReadIO.  Errors reported through the structured error handler while
 * parsing are stored on the returned document as @errors.  Raises a wrapped
 * syntax error (or RuntimeError when libxml2 recorded no error) if no
 * document could be produced.
 */
static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  /*
   * StringValueCStr guarantees NUL-terminated strings and raises
   * ArgumentError on embedded NUL bytes; htmlReadIO takes C strings.
   */
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  /*
   * Convert before the error handler is installed: NUM2INT can raise, and
   * raising afterwards would leave the global handler pointing at
   * error_list.
   */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
      io_read_callback,
      io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      c_options
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: doc is NULL on this path. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
read_memory(string, url, encoding, options) click to toggle source

Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse

/*
 * call-seq:
 *  read_memory(string, url, encoding, options)
 *
 * Read the HTML document contained in +string+ with given +url+, +encoding+,
 * and +options+.  See Nokogiri::HTML.parse
 *
 * +url+ and +encoding+ may be nil, in which case NULL is passed to
 * htmlReadMemory.  Errors reported through the structured error handler
 * while parsing are stored on the returned document as @errors.  Raises a
 * wrapped syntax error (or RuntimeError when libxml2 recorded no error) if
 * no document could be produced.
 */
static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  /* The buffer is passed with an explicit length, so a plain pointer is fine. */
  const char * c_buffer = StringValuePtr(string);
  /*
   * url and encoding are consumed as C strings: StringValueCStr guarantees
   * NUL termination and raises ArgumentError on embedded NUL bytes.
   */
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  int len               = (int)RSTRING_LEN(string);
  /*
   * Convert before the error handler is installed: NUM2INT can raise, and
   * raising afterwards would leave the global handler pointing at
   * error_list.
   */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, c_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: doc is NULL on this path. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil; /* not reached: both branches above raise */
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

Public Instance Methods

fragment(tags = nil) click to toggle source

Create a Nokogiri::XML::DocumentFragment from tags

# File lib/nokogiri/html/document.rb, line 67
# Build a DocumentFragment owned by this document from +tags+ (may be nil),
# passing this document's root as the third constructor argument.
def fragment(tags = nil)
  context_node = self.root
  DocumentFragment.new(self, tags, context_node)
end
meta_encoding() click to toggle source

Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.

# File lib/nokogiri/html/document.rb, line 7
# Return the charset named in this document's Content-Type meta tag.
#
# Returns nil when there is no such meta tag, when the tag has no
# +content+ attribute, or when the content carries no +charset=+ part
# (previously the last two cases raised NoMethodError on nil).
def meta_encoding
  meta = meta_content_type
  return nil unless meta

  content = meta['content']
  # String#[] with a capture index yields nil instead of raising on no match.
  content && content[/charset\s*=\s*([\w-]+)/i, 1]
end
meta_encoding=(encoding) click to toggle source

Set the meta tag encoding for this document. If there is no meta content tag, the encoding is not set.

# File lib/nokogiri/html/document.rb, line 15
# Rewrite the +content+ attribute of the Content-Type meta tag so that it
# names +encoding+ as the charset. Does nothing when no such tag is found.
def meta_encoding=(encoding)
  tag = meta_content_type
  return tag unless tag

  tag['content'] = "text/html; charset=%s" % encoding
end
serialize(options = {}) click to toggle source

Serialize Node using options. Save options can also be set using a block. See SaveOptions.

These two statements are equivalent:

node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)

or

node.serialize(:encoding => 'UTF-8') do |config|
  config.format.as_xml
end
Calls superclass method Nokogiri::XML::Node#serialize
# File lib/nokogiri/html/document.rb, line 60
# Serialize via the superclass implementation, filling in
# XML::Node::SaveOptions::DEFAULT_HTML as :save_with when the caller did
# not supply a truthy value for it. The given hash is updated in place.
def serialize(options = {})
  unless options[:save_with]
    options[:save_with] = XML::Node::SaveOptions::DEFAULT_HTML
  end
  super
end
title() click to toggle source

Get the title string of this document. Return nil if there is no title tag.

# File lib/nokogiri/html/document.rb, line 30
# Return the inner text of the node found by at('title'), or nil when
# no such node is found.
def title
  node = at('title')
  node && node.inner_text
end
title=(text) click to toggle source

Set the title string of this document. If there is no head element, the title is not set.

# File lib/nokogiri/html/document.rb, line 37
# Replace the children of the title node with a single text node holding
# +text+. When at('title') finds nothing, a new title node is appended to
# the node found by at('head'); if that is missing too, nil is returned and
# nothing changes.
def title=(text)
  node = at('title')
  unless node
    head = at('head')
    return nil unless head

    node = Nokogiri::XML::Node.new('title', self)
    head << node
  end
  node.children = XML::Text.new(text, self)
end
type click to toggle source

The type for this document

/*
 * call-seq:
 *  type
 *
 * The type for this document: the +type+ field of the wrapped document
 * struct, returned as a Ruby Integer.
 */
static VALUE type(VALUE self)
{
  htmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);

  return INT2NUM((long)c_document->type);
}

Private Instance Methods

meta_content_type() click to toggle source
# File lib/nokogiri/html/document.rb, line 20
# Find the first node matched by css('meta[@http-equiv]') whose http-equiv
# attribute is exactly "Content-Type" (case-insensitive). Returns nil when
# none matches.
def meta_content_type
  candidates = css('meta[@http-equiv]')
  candidates.find do |meta|
    meta['http-equiv'] =~ /\AContent-Type\z/i
  end
end