class Nokogiri::XML::Document
Nokogiri::XML::Document wraps an XML document.
Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri::XML::Document.parse for more information on parsing.
For searching a Document, see Nokogiri::XML::Searchable#css and Nokogiri::XML::Searchable#xpath
Constants
- NCNAME_CHAR
- NCNAME_RE
- NCNAME_START_CHAR
See www.w3.org/TR/REC-xml-names/#ns-decl for more details. Note that we are not attempting to handle Unicode characters, partly because libxml2 does not handle Unicode characters in NCNAMEs.
Attributes
The errors found while parsing a document.
- Returns
-
Array<Nokogiri::XML::SyntaxError>
When `true`, reparented elements without a namespace will inherit their new parent's namespace (if one exists). Defaults to `false`.
- Returns
-
Boolean
Example: Default behavior of namespace inheritance
xml = <<~EOF <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> <foo:parent> </foo:parent> </root> EOF doc = Nokogiri::XML(xml) parent = doc.at_xpath("//foo:parent", "foo" => "http://nokogiri.org/default_ns/test/foo") parent.add_child("<child></child>") doc.to_xml # => <?xml version="1.0"?> # <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> # <foo:parent> # <child/> # </foo:parent> # </root>
Example: Setting namespace inheritance to `true`
xml = <<~EOF <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> <foo:parent> </foo:parent> </root> EOF doc = Nokogiri::XML(xml) doc.namespace_inheritance = true parent = doc.at_xpath("//foo:parent", "foo" => "http://nokogiri.org/default_ns/test/foo") parent.add_child("<child></child>") doc.to_xml # => <?xml version="1.0"?> # <root xmlns:foo="http://nokogiri.org/default_ns/test/foo"> # <foo:parent> # <foo:child/> # </foo:parent> # </root>
Since v1.12.4
Public Class Methods
Create a new document with version (defaults to “1.0”)
/*
 * Build a fresh, empty libxml2 document and wrap it in a Ruby object of
 * class +klass+.
 *
 * The first positional argument, when present and non-nil, is used as the XML
 * version string; otherwise "1.0" is used. The complete, untouched argument
 * list (argc/argv) is handed to noko_xml_document_wrap_with_init_args along
 * with the new document.
 */
static VALUE
new (int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_args;
  VALUE rb_version;
  xmlDocPtr c_document;

  /* "0*": no required arguments, everything is collected into rb_args. */
  rb_scan_args(argc, argv, "0*", &rb_args);

  rb_version = rb_ary_entry(rb_args, (long)0);
  if (NIL_P(rb_version)) {
    rb_version = rb_str_new2("1.0");
  }

  c_document = xmlNewDoc((xmlChar *)StringValueCStr(rb_version));

  return noko_xml_document_wrap_with_init_args(klass, c_document, argc, argv);
}
Parse an XML file.
string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO.
url (optional) is the URI where this document is located.
encoding (optional) is the encoding that should be used when processing the document.
options (optional) is a configuration object that sets options during parsing, such as Nokogiri::XML::ParseOptions::RECOVER. See the Nokogiri::XML::ParseOptions for more information.
block (optional) is passed a configuration object on which parse options may be set.
By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs or access the network. See Nokogiri::XML::ParseOptions for a complete list of options; and that module’s DEFAULT_XML constant for what’s set (and not set) by default.
Nokogiri.XML() is a convenience method which will call this method.
# File lib/nokogiri/xml/document.rb, line 48
#
# Parse +string_or_io+ into a document.
#
# An Integer +options+ value is wrapped in a ParseOptions object, which is
# then yielded to the block (if one is given) before parsing starts. An empty
# input raises SyntaxError under strict parsing and otherwise produces a new,
# empty document (with +encoding+ applied when one was supplied).
def parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML)
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  # Fall back to the input's own path when no explicit url was supplied.
  url ||= (string_or_io.path if string_or_io.respond_to?(:path))

  if empty_doc?(string_or_io)
    raise Nokogiri::XML::SyntaxError, "Empty document" if options.strict?

    empty_document = new
    empty_document.encoding = encoding if encoding
    return empty_document
  end

  if string_or_io.respond_to?(:read)
    if string_or_io.is_a?(Pathname)
      # resolve the Pathname to the file and open it as an IO object, see #2110
      string_or_io = string_or_io.expand_path.open
      url ||= string_or_io.path
    end

    doc = read_io(string_or_io, url, encoding, options.to_i)
  else
    # read_memory pukes on empty docs
    doc = read_memory(string_or_io, url, encoding, options.to_i)
  end

  # do xinclude processing
  doc.do_xinclude(options) if options.xinclude?

  doc
end
Create a new document from an IO object
/*
 * Parse a document by pulling bytes from the Ruby object +io+.
 *
 * klass    - Ruby class the resulting document is wrapped as.
 * io       - object handed to the noko_io_read / noko_io_close callbacks.
 * url      - document URL as a String, or nil for none.
 * encoding - encoding name as a String, or nil for none.
 * options  - Integer of libxml2 parser option flags.
 *
 * Errors reported by libxml2 while parsing are collected into an Array that
 * is stored in the returned document's @errors instance variable.
 *
 * Raises the wrapped libxml2 error when parsing yields no document, or a
 * RuntimeError ("Could not parse document") when libxml2 recorded no error.
 */
static VALUE
read_io(VALUE klass,
        VALUE io,
        VALUE url,
        VALUE encoding,
        VALUE options)
{
  const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  /*
   * Convert the options up front: NUM2INT raises on a non-integer or
   * out-of-range value, and that must not happen while the process-wide
   * structured error handler below is still installed.
   */
  int c_options = (int)NUM2INT(options);
  VALUE error_list = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = xmlReadIO(
          (xmlInputReadCallback)noko_io_read,
          (xmlInputCloseCallback)noko_io_close,
          (void *)io,
          c_url,
          c_enc,
          c_options
        );

  xmlSetStructuredErrorFunc(NULL, NULL);

  if (doc == NULL) {
    /* Nothing to free here: there is no document when parsing failed. */
    xmlErrorPtr error = xmlGetLastError();
    if (error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil; /* not reached; both branches above raise */
  }

  document = noko_xml_document_wrap(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
Create a new document from a String
/*
 * Parse a document from the bytes of the Ruby String +string+.
 *
 * klass    - Ruby class the resulting document is wrapped as.
 * string   - String holding the document text.
 * url      - document URL as a String, or nil for none.
 * encoding - encoding name as a String, or nil for none.
 * options  - Integer of libxml2 parser option flags.
 *
 * Errors reported by libxml2 while parsing are collected into an Array that
 * is stored in the returned document's @errors instance variable.
 *
 * Raises the wrapped libxml2 error when parsing yields no document, or a
 * RuntimeError ("Could not parse document") when libxml2 recorded no error.
 */
static VALUE
read_memory(VALUE klass,
            VALUE string,
            VALUE url,
            VALUE encoding,
            VALUE options)
{
  const char *c_buffer = StringValuePtr(string);
  const char *c_url = NIL_P(url) ? NULL : StringValueCStr(url);
  const char *c_enc = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  int len = (int)RSTRING_LEN(string);
  /*
   * Convert the options up front: NUM2INT raises on a non-integer or
   * out-of-range value, and that must not happen while the process-wide
   * structured error handler below is still installed.
   */
  int c_options = (int)NUM2INT(options);
  VALUE error_list = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = xmlReadMemory(c_buffer, len, c_url, c_enc, c_options);

  xmlSetStructuredErrorFunc(NULL, NULL);

  if (doc == NULL) {
    /* Nothing to free here: there is no document when parsing failed. */
    xmlErrorPtr error = xmlGetLastError();
    if (error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil; /* not reached; both branches above raise */
  }

  document = noko_xml_document_wrap(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
⚠ This method is only available when running JRuby.
Create a Document using an existing Java DOM document object.
The returned Document shares the same underlying data structure as the Java object, so changes in one are reflected in the other.
- Parameters
-
`java_document` (Java::OrgW3cDom::Document) (The class `Java::OrgW3cDom::Document` is also accessible as `org.w3c.dom.Document`.)
- Returns
See also #to_java
# File lib/nokogiri/xml/document.rb, line 91
Private Class Methods
# File lib/nokogiri/xml/document.rb, line 83
#
# True-ish when +string_or_io+ has nothing to parse: it is nil, it responds
# to +empty?+ and is empty, or it responds to +eof?+ and is at end of input.
def empty_doc?(string_or_io)
  return true if string_or_io.nil?

  if string_or_io.respond_to?(:empty?)
    emptiness = string_or_io.empty?
    return emptiness if emptiness
  end

  string_or_io.respond_to?(:eof?) && string_or_io.eof?
end
Public Instance Methods
Canonicalize a document and return the results. Takes an optional block that takes two parameters: the obj and that node's parent. The obj will be either a Nokogiri::XML::Node or a Nokogiri::XML::Namespace. The block must return a non-nil, non-false value if the obj passed in should be included in the canonicalized document.
/*
 * Canonicalize the document and return the result as a Ruby String.
 *
 * Optional positional arguments (all default to nil):
 *   mode       - Fixnum canonicalization mode; 0 when omitted.
 *   namespaces - Array of namespace strings passed through to libxml2. Not
 *                accepted when mode is XML_C14N_1_0 or XML_C14N_1_1
 *                (raises RuntimeError).
 *   comments   - truthy to have comments included in the output.
 *
 * When a block is given it is passed to libxml2 (through block_caller) as the
 * visibility callback.
 *
 * Output is written into a StringIO instance via the noko_io_write callback;
 * the StringIO's +string+ is the return value.
 *
 * Every step that can raise a Ruby exception on bad input is performed before
 * the libxml2 output buffer and the namespace pointer array are allocated, so
 * an argument error cannot leak them.
 */
static VALUE
rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self)
{
  VALUE rb_mode;
  VALUE rb_namespaces;
  VALUE rb_comments_p;
  VALUE rb_ns_strings = Qnil;
  int c_mode = 0;
  xmlChar **c_namespaces = NULL;
  xmlDocPtr c_doc;
  xmlOutputBufferPtr c_obuf;
  xmlC14NIsVisibleCallback c_callback_wrapper = NULL;
  void *rb_callback = NULL;
  VALUE rb_cStringIO;
  VALUE rb_io;

  rb_scan_args(argc, argv, "03", &rb_mode, &rb_namespaces, &rb_comments_p);
  if (!NIL_P(rb_mode)) {
    Check_Type(rb_mode, T_FIXNUM);
    c_mode = NUM2INT(rb_mode);
  }
  if (!NIL_P(rb_namespaces)) {
    Check_Type(rb_namespaces, T_ARRAY);
    if (c_mode == XML_C14N_1_0 || c_mode == XML_C14N_1_1) {
      rb_raise(rb_eRuntimeError, "This canonicalizer does not support this operation");
    }

    /*
     * Validate and convert every entry now. StringValueCStr raises for
     * entries that are not string-like or that contain NUL bytes. The
     * (possibly converted) strings are collected in a Ruby array so they stay
     * referenced while libxml2 holds raw pointers into them.
     */
    long ns_len = RARRAY_LEN(rb_namespaces);
    rb_ns_strings = rb_ary_new();
    for (long j = 0; j < ns_len; j++) {
      VALUE entry = rb_ary_entry(rb_namespaces, j);
      (void)StringValueCStr(entry);
      rb_ary_push(rb_ns_strings, entry);
    }
  }

  c_doc = noko_xml_document_unwrap(self);

  rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO"));
  rb_io = rb_class_new_instance(0, 0, rb_cStringIO);

  if (rb_block_given_p()) {
    c_callback_wrapper = block_caller;
    rb_callback = (void *)rb_block_proc();
  }

  /* From here on only unmanaged allocations remain. */
  if (!NIL_P(rb_ns_strings)) {
    long ns_len = RARRAY_LEN(rb_ns_strings);
    /* One extra zeroed slot acts as the NULL terminator of the list. */
    c_namespaces = ruby_xcalloc((size_t)ns_len + 1, sizeof(xmlChar *));
    for (long j = 0; j < ns_len; j++) {
      VALUE entry = rb_ary_entry(rb_ns_strings, j);
      /* Already validated above, so this is not expected to raise. */
      c_namespaces[j] = (xmlChar *)StringValueCStr(entry);
    }
  }

  c_obuf = xmlAllocOutputBuffer(NULL);
  if (c_obuf == NULL) {
    ruby_xfree(c_namespaces);
    rb_raise(rb_eRuntimeError, "Could not allocate output buffer");
  }
  c_obuf->writecallback = (xmlOutputWriteCallback)noko_io_write;
  c_obuf->closecallback = (xmlOutputCloseCallback)noko_io_close;
  c_obuf->context = (void *)rb_io;

  xmlC14NExecute(c_doc, c_callback_wrapper, rb_callback,
                 c_mode,
                 c_namespaces,
                 (int)RTEST(rb_comments_p),
                 c_obuf);

  ruby_xfree(c_namespaces);
  xmlOutputBufferClose(c_obuf);

  /* Keep the converted namespace strings reachable until libxml2 is done. */
  RB_GC_GUARD(rb_ns_strings);

  return rb_funcall(rb_io, rb_intern("string"), 0);
}
Create a new entity named name.
type is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl for more information.
external_id, system_id, and content set the External ID, System ID, and content respectively. All of these parameters are optional.
/*
 * Add an entity declaration to the document via xmlAddDocEntity and return it
 * wrapped as a Nokogiri::XML::EntityDecl node.
 *
 * Arguments: name (required), then optional type, external_id, system_id and
 * content. A nil type means XML_INTERNAL_GENERAL_ENTITY; any other nil
 * argument is passed to libxml2 as NULL.
 *
 * Raises the wrapped libxml2 error when the entity could not be added, or a
 * RuntimeError ("Could not create entity") when libxml2 recorded no error.
 */
static VALUE
create_entity(int argc, VALUE *argv, VALUE self)
{
  VALUE name, type, external_id, system_id, content;
  xmlDocPtr c_document = noko_xml_document_unwrap(self);
  xmlEntityPtr c_entity;

  rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id,
               &content);

  xmlResetLastError();
  c_entity = xmlAddDocEntity(
               c_document,
               (xmlChar *)(NIL_P(name)        ? NULL : StringValueCStr(name)),
               (int)(NIL_P(type) ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
               (xmlChar *)(NIL_P(external_id) ? NULL : StringValueCStr(external_id)),
               (xmlChar *)(NIL_P(system_id)   ? NULL : StringValueCStr(system_id)),
               (xmlChar *)(NIL_P(content)     ? NULL : StringValueCStr(content))
             );

  if (c_entity != NULL) {
    return noko_xml_node_wrap(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
  }

  /* Failure path: prefer libxml2's own error, else a generic one. */
  {
    xmlErrorPtr c_error = xmlGetLastError();
    if (c_error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(c_error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not create entity");
    }
  }

  return Qnil; /* not reached; both branches above raise */
}
Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.
/*
 * Copy this document with xmlCopyDoc.
 *
 * Takes one optional argument, the copy level handed to xmlCopyDoc; it is 1
 * when the argument is omitted. The copy keeps the original's node type, is
 * wrapped in the same Ruby class as +self+, and shares the original's @errors
 * value. Returns nil if xmlCopyDoc yields no document.
 */
static VALUE
duplicate_document(int argc, VALUE *argv, VALUE self)
{
  VALUE rb_level;
  VALUE rb_copy;
  xmlDocPtr c_original;
  xmlDocPtr c_copy;
  int n_given;

  n_given = rb_scan_args(argc, argv, "01", &rb_level);
  if (n_given == 0) {
    rb_level = INT2NUM((long)1);
  }

  c_original = noko_xml_document_unwrap(self);
  c_copy = xmlCopyDoc(c_original, (int)NUM2INT(rb_level));
  if (!c_copy) {
    return Qnil;
  }

  c_copy->type = c_original->type;

  rb_copy = noko_xml_document_wrap(rb_obj_class(self), c_copy);
  rb_iv_set(rb_copy, "@errors", rb_iv_get(self, "@errors"));

  return rb_copy;
}
Get the encoding for this Document
/*
 * Return the document's encoding as a Ruby String, or nil when the underlying
 * document has no encoding set.
 */
static VALUE
encoding(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);

  if (c_document->encoding) {
    return NOKOGIRI_STR_NEW2(c_document->encoding);
  }

  return Qnil;
}
Set the encoding string for this Document
/*
 * Replace the document's encoding string with a copy of +encoding+ and return
 * +encoding+.
 *
 * The new value is converted and duplicated before the old one is released:
 * StringValueCStr raises for values that are not string-like or that contain
 * NUL bytes, and raising after the old string was freed would leave
 * doc->encoding pointing at freed memory.
 */
static VALUE
set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc = noko_xml_document_unwrap(self);
  xmlChar *c_new_encoding = xmlStrdup((xmlChar *)StringValueCStr(encoding));

  if (doc->encoding) {
    xmlFree(DISCARD_CONST_QUAL_XMLCHAR(doc->encoding));
  }

  doc->encoding = c_new_encoding;

  return encoding;
}
Remove all namespaces from all nodes in the document.
This could be useful for developers who either don’t understand namespaces or don’t care about them.
The following example shows a use case, and you can decide for yourself whether this is a good thing or not:
doc = Nokogiri::XML <<-EOXML <root> <car xmlns:part="http://general-motors.com/"> <part:tire>Michelin Model XGV</part:tire> </car> <bicycle xmlns:part="http://schwinn.com/"> <part:tire>I'm a bicycle tire!</part:tire> </bicycle> </root> EOXML doc.xpath("//tire").to_s # => "" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>" doc.remove_namespaces! doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""
For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml.html
/*
 * Strip namespaces from the document by running
 * recursively_remove_namespaces_from_node on the document node itself.
 * Returns +self+.
 */
static VALUE
remove_namespaces_bang(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);
  xmlNodePtr c_start = (xmlNodePtr)c_document;

  recursively_remove_namespaces_from_node(c_start);

  return self;
}
Get the root node for this document.
/*
 * Return the document's root element wrapped as a Ruby node, or nil when the
 * document has no root element.
 */
static VALUE
rb_xml_document_root(VALUE self)
{
  xmlDocPtr c_doc = noko_xml_document_unwrap(self);
  xmlNodePtr c_root_element = xmlDocGetRootElement(c_doc);

  if (c_root_element == NULL) {
    return Qnil;
  }

  return noko_xml_node_wrap(Qnil, c_root_element);
}
Set the root element on this document
/*
 * Replace the document's root element with +rb_new_root+ and return
 * +rb_new_root+.
 *
 * rb_new_root - a Nokogiri::XML::Node, or nil to leave the document without a
 *               root element. A node that belongs to a different document is
 *               deep-copied into this one first.
 *
 * Raises ArgumentError when +rb_new_root+ is neither nil nor a Node, and
 * RuntimeError when copying the node into this document fails.
 *
 * The new root is validated and prepared before the current root is touched,
 * so a raised error leaves the document exactly as it was.
 */
static VALUE
rb_xml_document_root_set(VALUE self, VALUE rb_new_root)
{
  xmlDocPtr c_document;
  xmlNodePtr c_new_root = NULL, c_current_root;

  c_document = noko_xml_document_unwrap(self);

  if (!NIL_P(rb_new_root)) {
    if (!rb_obj_is_kind_of(rb_new_root, cNokogiriXmlNode)) {
      rb_raise(rb_eArgError,
               "expected Nokogiri::XML::Node but received %"PRIsVALUE,
               rb_obj_class(rb_new_root));
    }

    Noko_Node_Get_Struct(rb_new_root, xmlNode, c_new_root);

    /* If the new root's document is not the same as the current document,
     * then we need to dup the node in to this document. */
    if (c_new_root->doc != c_document) {
      c_new_root = xmlDocCopyNode(c_new_root, c_document, 1);
      if (!c_new_root) {
        rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
      }
    }
  }

  /* Nothing below raises: detach the old root and hand it to the document's
   * pin list before installing the new one. */
  c_current_root = xmlDocGetRootElement(c_document);
  if (c_current_root) {
    xmlUnlinkNode(c_current_root);
    noko_xml_document_pin_node(c_current_root);
  }

  xmlDocSetRootElement(c_document, c_new_root);

  return rb_new_root;
}
⚠ This method is only available when running JRuby.
Returns the underlying Java DOM document object for this document.
The returned Java object shares the same underlying data structure as this document, so changes in one are reflected in the other.
- Returns
-
Java::OrgW3cDom::Document (The class `Java::OrgW3cDom::Document` is also accessible as `org.w3c.dom.Document`.)
See also Document.wrap
# File lib/nokogiri/xml/document.rb, line 109
Get the url name for this document.
/*
 * Return the document's URL as a Ruby String, or nil when the underlying
 * document has no URL set.
 */
static VALUE
url(VALUE self)
{
  xmlDocPtr c_document = noko_xml_document_unwrap(self);

  if (!c_document->URL) {
    return Qnil;
  }

  return NOKOGIRI_STR_NEW2(c_document->URL);
}