From 228752c7a6c674daeaa0193a9f2882e2c38d109c Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 17 Aug 2013 18:08:40 -0400 Subject: [PATCH 0001/1759] renamed version of gumbo-parser/ruby --- nokogumbo-import/README.md | 46 ++++++++ nokogumbo-import/Rakefile | 50 +++++++++ nokogumbo-import/ext/extconf.rb | 3 + nokogumbo-import/ext/nokogumbo.c | 131 +++++++++++++++++++++++ nokogumbo-import/lib/nokogumbo.rb | 166 +++++++++++++++++++++++++++++ nokogumbo-import/test-nokogumbo.rb | 48 +++++++++ 6 files changed, 444 insertions(+) create mode 100644 nokogumbo-import/README.md create mode 100644 nokogumbo-import/Rakefile create mode 100644 nokogumbo-import/ext/extconf.rb create mode 100644 nokogumbo-import/ext/nokogumbo.c create mode 100644 nokogumbo-import/lib/nokogumbo.rb create mode 100644 nokogumbo-import/test-nokogumbo.rb diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md new file mode 100644 index 0000000000..35548469d0 --- /dev/null +++ b/nokogumbo-import/README.md @@ -0,0 +1,46 @@ +Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser. +=========== + +At the moment, this is a proof of concept, allowing a Ruby program to invoke +the Gumbo HTML5 parser and access the result as a Nokogiri parsed document. + +Usage: +----- + +```ruby +require 'nokogumbo' +doc = Nokogiri::HTML5(string) +``` + +Notes: +----- + +* The `Nokogumbo.parse` function takes a string and passes it to the +gumbo_parse_with_options method, using the default options. +The resulting Gumbo parse tree is the walked, producing a Nokogiri parse tree. +The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree +is returned. + +* Instead of uppercase element names, lowercase element names are produced. + +* Instead of returning 'unknown' as the element name for unknown tags, the +original tag name is returned verbatim. + +* Nothing meaningful is done with the `GumboDocument` struct, i.e., no +Nokogiri `EntityDecl` is produced. 
+ +Installation: +============ + +* Build and install the +[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library + +* Execute `rake gem` + +* [sudo] gem install pkg/nokogumbo*.gem + +Related efforts: +============ + +* [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding +for the Gumbo HTML5 parser. diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile new file mode 100644 index 0000000000..391dc303b4 --- /dev/null +++ b/nokogumbo-import/Rakefile @@ -0,0 +1,50 @@ +require 'rubygems/package_task' +require 'rake/clean' + +task 'default' => 'test' + +file 'Makefile' => 'ext/extconf.rb' do + Dir.chdir 'ext' do + ruby 'extconf.rb' + end +end + +task 'test' => 'Makefile' do + Dir.chdir 'ext' do + sh 'make -s' + end + ruby 'test-nokogumbo.rb' +end + +CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg') + +MANIFEST = %w( + ext/extconf.rb + ext/nokogumbo.c + lib/nokogumbo.rb + Rakefile + README.md +) + +SPEC = Gem::Specification.new do |gem| + gem.name = 'nokogumbo' + gem.version = '0.1' + gem.email = 'rubys@intertwingly.net' + gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme' + gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' + gem.files = MANIFEST + gem.extensions = 'ext/extconf.rb' + gem.author = 'Sam Ruby' + gem.add_dependency 'nokogiri' + gem.license = 'MIT' + gem.description = %q( + At the moment, this is a proof of concept, allowing a Ruby + program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri + parsed document.).strip.gsub(/\s+/, ' ') +end + +task 'gem' => 'test' +Gem::PackageTask.new(SPEC) do |pkg| + pkg.need_tar = true + pkg.need_zip = true +end diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb new file mode 100644 index 0000000000..f05b937aee --- /dev/null +++ b/nokogumbo-import/ext/extconf.rb @@ -0,0 +1,3 @@ +require 'mkmf' +have_library('gumbo', 'gumbo_parse') +create_makefile('nokogumboc') diff 
--git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c new file mode 100644 index 0000000000..6c654c77fe --- /dev/null +++ b/nokogumbo-import/ext/nokogumbo.c @@ -0,0 +1,131 @@ +#include "ruby.h" +#include "gumbo.h" + +// class constants +static VALUE Nokogiri; +static VALUE HTML; +static VALUE XML; +static VALUE Document; +static VALUE Element; +static VALUE Text; +static VALUE CDATA; +static VALUE Comment; +static VALUE TAGS=0; +static int Unknown=0; + +// interned symbols +static VALUE new; +static VALUE set_attribute; +static VALUE add_child; + +// determine tag name for a given node +static VALUE _name(GumboElement *node) { + if (!TAGS) { + // Deferred initialization of "Unknown" as the GumboParser class is + // defined *after* the Nokogumbo class is. + VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5")); + TAGS = rb_const_get(HTML5, rb_intern("TAGS")); + Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown"))); + } + + if (node->tag != Unknown) { + return rb_ary_entry(TAGS, (long) node->tag); + } else { + // Gumbo doesn't provide unknown tags, so we need to parse it ourselves: + // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state + GumboStringPiece *tag = &node->original_tag; + int length; + for (length = 1; length < tag->length-1; length++) { + if (strchr(" \t\r\n<", *((char*)tag->data+length))) break; + } + return rb_str_new(1+(char *)tag->data, length-1); + } +} + +// Build a Nokogiri Element for a given GumboElement (recursively) +static VALUE _element(VALUE document, GumboElement *node) { + int i; + VALUE element = rb_funcall(Element, new, 2, _name(node), document); + + // add in the attributes + GumboVector* attrs = &node->attributes; + for (i=0; i < attrs->length; i++) { + GumboAttribute *attr = attrs->data[i]; + VALUE name = rb_str_new2(attr->name); + rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value)); + } + + // add in the children + GumboVector* children = &node->children; + for 
(i=0; i < children->length; i++) { + GumboNode* child = children->data[i]; + + VALUE node = 0; + VALUE text; + + switch (child->type) { + case GUMBO_NODE_ELEMENT: + node = _element(document, &child->v.element); + break; + case GUMBO_NODE_WHITESPACE: + case GUMBO_NODE_TEXT: + text = rb_str_new2(child->v.text.text); + node = rb_funcall(Text, new, 2, text, document); + break; + case GUMBO_NODE_CDATA: + text = rb_str_new2(child->v.text.text); + node = rb_funcall(CDATA, new, 2, text, document); + break; + case GUMBO_NODE_COMMENT: + text = rb_str_new2(child->v.text.text); + node = rb_funcall(Comment, new, 2, document, text); + break; + case GUMBO_NODE_DOCUMENT: + break; // should never happen -- ignore + } + + if (node) rb_funcall(element, add_child, 1, node); + } + + return element; +} + +// Parse a string using gumbo_parse into a Nokogiri document +static VALUE t_parse(VALUE self, VALUE string) { + VALUE document = rb_funcall(Document, new, 0); + + GumboOutput *output = gumbo_parse_with_options( + &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string) + ); + VALUE root = _element(document, (GumboElement*)&output->root->v.element); + rb_funcall(document, add_child, 1, root); + gumbo_destroy_output(&kGumboDefaultOptions, output); + + return document; +} + +// Initialize the Nokogumbo class and fetch constants we will use later +void Init_nokogumboc() { + rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri")); + rb_require("nokogiri"); + + // class constants + Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri")); + HTML = rb_const_get(Nokogiri, rb_intern("HTML")); + XML = rb_const_get(Nokogiri, rb_intern("XML")); + Document = rb_const_get(HTML, rb_intern("Document")); + Element = rb_const_get(XML, rb_intern("Element")); + Text = rb_const_get(XML, rb_intern("Text")); + CDATA = rb_const_get(XML, rb_intern("CDATA")); + Comment = rb_const_get(XML, rb_intern("Comment")); + + // interned symbols + new = rb_intern("new"); + set_attribute = 
rb_intern("set_attribute"); + add_child = rb_intern("add_child"); + + // define Nokogumbo class with a singleton parse method + VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject); + rb_define_singleton_method(Gumbo, "parse", t_parse, 1); +} + diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb new file mode 100644 index 0000000000..5ef89dc4d0 --- /dev/null +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -0,0 +1,166 @@ +require 'nokogiri' +require 'nokogumboc' + +module Nokogiri + def self.HTML5(string) + Nokogumbo.parse(string) + end + + module HTML5 + TAGS = [ + 'HTML', + 'HEAD', + 'TITLE', + 'BASE', + 'LINK', + 'META', + 'STYLE', + 'SCRIPT', + 'NOSCRIPT', + 'BODY', + 'SECTION', + 'NAV', + 'ARTICLE', + 'ASIDE', + 'H1', + 'H2', + 'H3', + 'H4', + 'H5', + 'H6', + 'HGROUP', + 'HEADER', + 'FOOTER', + 'ADDRESS', + 'P', + 'HR', + 'PRE', + 'BLOCKQUOTE', + 'OL', + 'UL', + 'LI', + 'DL', + 'DT', + 'DD', + 'FIGURE', + 'FIGCAPTION', + 'DIV', + 'A', + 'EM', + 'STRONG', + 'SMALL', + 'S', + 'CITE', + 'Q', + 'DFN', + 'ABBR', + 'TIME', + 'CODE', + 'VAR', + 'SAMP', + 'KBD', + 'SUB', + 'SUP', + 'I', + 'B', + 'MARK', + 'RUBY', + 'RT', + 'RP', + 'BDI', + 'BDO', + 'SPAN', + 'BR', + 'WBR', + 'INS', + 'DEL', + 'IMAGE', + 'IMG', + 'IFRAME', + 'EMBED', + 'OBJECT', + 'PARAM', + 'VIDEO', + 'AUDIO', + 'SOURCE', + 'TRACK', + 'CANVAS', + 'MAP', + 'AREA', + 'MATH', + 'MI', + 'MO', + 'MN', + 'MS', + 'MTEXT', + 'MGLYPH', + 'MALIGNMARK', + 'ANNOTATION_XML', + 'SVG', + 'FOREIGNOBJECT', + 'DESC', + 'TABLE', + 'CAPTION', + 'COLGROUP', + 'COL', + 'TBODY', + 'THEAD', + 'TFOOT', + 'TR', + 'TD', + 'TH', + 'FORM', + 'FIELDSET', + 'LEGEND', + 'LABEL', + 'INPUT', + 'BUTTON', + 'SELECT', + 'DATALIST', + 'OPTGROUP', + 'OPTION', + 'TEXTAREA', + 'KEYGEN', + 'OUTPUT', + 'PROGRESS', + 'METER', + 'DETAILS', + 'SUMMARY', + 'COMMAND', + 'MENU', + 'APPLET', + 'ACRONYM', + 'BGSOUND', + 'DIR', + 'FRAME', + 'FRAMESET', + 'NOFRAMES', + 'ISINDEX', + 'LISTING', + 'XMP', + 'NEXTID', + 'NOEMBED', 
+ 'PLAINTEXT', + 'RB', + 'STRIKE', + 'BASEFONT', + 'BIG', + 'BLINK', + 'CENTER', + 'FONT', + 'MARQUEE', + 'MULTICOL', + 'NOBR', + 'SPACER', + 'TT', + 'U', + 'UNKNOWN', + ].map(&:downcase).map(&:freeze).freeze + + Unknown = TAGS.length - 1 + + def parse(string) + Nokogumbo.parse(string) + end + end +end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb new file mode 100644 index 0000000000..7bf7863ff8 --- /dev/null +++ b/nokogumbo-import/test-nokogumbo.rb @@ -0,0 +1,48 @@ +$:.push('lib', 'ext') + +require 'nokogumbo' +require 'test/unit' + +class TestNokogumbo < Test::Unit::TestCase + def setup + @buffer = <<-EOF.gsub(/^ /, '') + + + + hello world + + +

hello world

+
+ content +
+ +
+ +
+ + + EOF + @doc = Nokogiri::HTML5(@buffer) + end + + def test_element_text + assert_equal "content", @doc.at('span').text + end + + def test_element_cdata + assert_equal "foobar", @doc.at('textarea').text.strip + end + + def test_attr_value + assert_equal "utf-8", @doc.at('meta')['charset'] + end + + def test_comment + assert_equal " test comment ", @doc.xpath('//comment()').text + end + + def test_unknown_element + assert_equal "main", @doc.at('main').name + end +end From 782b69a40f30b003611deb7899141781833ad35d Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 18 Aug 2013 18:22:32 -0400 Subject: [PATCH 0002/1759] embed a copy of the gumbo parser --- nokogumbo-import/.gitignore | 3 +++ nokogumbo-import/README.md | 5 ++-- nokogumbo-import/Rakefile | 40 +++++++++++++++++++----------- nokogumbo-import/ext/extconf.rb | 2 +- nokogumbo-import/ext/nokogumbo.c | 5 ++-- nokogumbo-import/test-nokogumbo.rb | 2 +- 6 files changed, 34 insertions(+), 23 deletions(-) create mode 100644 nokogumbo-import/.gitignore diff --git a/nokogumbo-import/.gitignore b/nokogumbo-import/.gitignore new file mode 100644 index 0000000000..fbf13e3ba6 --- /dev/null +++ b/nokogumbo-import/.gitignore @@ -0,0 +1,3 @@ +gumbo-parser +pkg +work diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 35548469d0..dc6656c272 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -29,12 +29,11 @@ original tag name is returned verbatim. * Nothing meaningful is done with the `GumboDocument` struct, i.e., no Nokogiri `EntityDecl` is produced. +* The gem itself includes a copy of the Nokogumbo HTML5 parser. 
+ Installation: ============ -* Build and install the -[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library - * Execute `rake gem` * [sudo] gem install pkg/nokogumbo*.gem diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 391dc303b4..edc38d6db2 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -3,44 +3,54 @@ require 'rake/clean' task 'default' => 'test' -file 'Makefile' => 'ext/extconf.rb' do - Dir.chdir 'ext' do +file 'gumbo-parser' do + sh 'git clone https://github.com/google/gumbo-parser.git' +end + +file 'work/extconf.rb' => 'gumbo-parser' do + sh 'mkdir work' + sh 'cp gumbo-parser/src/* work' + sh 'cp ext/* work' +end + +file 'work/Makefile' => 'work/extconf.rb' do + Dir.chdir 'work' do ruby 'extconf.rb' end end -task 'test' => 'Makefile' do - Dir.chdir 'ext' do +task 'test' => 'work/Makefile' do + Dir.chdir 'work' do sh 'make -s' end ruby 'test-nokogumbo.rb' end -CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg') +CLEAN.include 'pkg', 'gumbo-parser', 'work' -MANIFEST = %w( - ext/extconf.rb - ext/nokogumbo.c +MANIFEST = FileList[*%w( + work/*.rb + work/*.c + work/*.h lib/nokogumbo.rb Rakefile README.md -) +)] SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.1' + gem.version = '0.2' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' gem.files = MANIFEST - gem.extensions = 'ext/extconf.rb' + gem.extensions = 'work/extconf.rb' gem.author = 'Sam Ruby' gem.add_dependency 'nokogiri' - gem.license = 'MIT' + gem.license = 'Apache 2.0' gem.description = %q( - At the moment, this is a proof of concept, allowing a Ruby - program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri - parsed document.).strip.gsub(/\s+/, ' ') + Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and + access the result as a 
Nokogiri parsed document.).strip.gsub(/\s+/, ' ') end task 'gem' => 'test' diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb index f05b937aee..3997bd8358 100644 --- a/nokogumbo-import/ext/extconf.rb +++ b/nokogumbo-import/ext/extconf.rb @@ -1,3 +1,3 @@ require 'mkmf' -have_library('gumbo', 'gumbo_parse') +$CFLAGS << " -std=c99" create_makefile('nokogumboc') diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index 6c654c77fe..85d81ee0e2 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -44,12 +44,11 @@ static VALUE _name(GumboElement *node) { // Build a Nokogiri Element for a given GumboElement (recursively) static VALUE _element(VALUE document, GumboElement *node) { - int i; VALUE element = rb_funcall(Element, new, 2, _name(node), document); // add in the attributes GumboVector* attrs = &node->attributes; - for (i=0; i < attrs->length; i++) { + for (int i=0; i < attrs->length; i++) { GumboAttribute *attr = attrs->data[i]; VALUE name = rb_str_new2(attr->name); rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value)); @@ -57,7 +56,7 @@ static VALUE _element(VALUE document, GumboElement *node) { // add in the children GumboVector* children = &node->children; - for (i=0; i < children->length; i++) { + for (int i=0; i < children->length; i++) { GumboNode* child = children->data[i]; VALUE node = 0; diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 7bf7863ff8..205da08bd8 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -1,4 +1,4 @@ -$:.push('lib', 'ext') +$:.push('lib', 'work') require 'nokogumbo' require 'test/unit' From 9cc6689aae9b904eeff8854efcac18e11bd7e8e6 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 18 Aug 2013 18:26:41 -0400 Subject: [PATCH 0003/1759] fix readme link --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index edc38d6db2..d33da451f5 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -41,7 +41,7 @@ SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' gem.version = '0.2' gem.email = 'rubys@intertwingly.net' - gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme' + gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' gem.files = MANIFEST gem.extensions = 'work/extconf.rb' From de361f10aae12f2dfd3992cf85a958a503b37acd Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 18 Aug 2013 22:24:47 -0400 Subject: [PATCH 0004/1759] Add LICENCE, cleanup Rakefile --- nokogumbo-import/LICENSE.txt | 201 +++++++++++++++++++++++++++++++++++ nokogumbo-import/Rakefile | 7 +- 2 files changed, 205 insertions(+), 3 deletions(-) create mode 100644 nokogumbo-import/LICENSE.txt diff --git a/nokogumbo-import/LICENSE.txt b/nokogumbo-import/LICENSE.txt new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/nokogumbo-import/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index edc38d6db2..4f5e790271 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -8,9 +8,9 @@ file 'gumbo-parser' do end file 'work/extconf.rb' => 'gumbo-parser' do - sh 'mkdir work' - sh 'cp gumbo-parser/src/* work' - sh 'cp ext/* work' + mkdir_p 'work' + cp Dir['gumbo-parser/src/*'], 'work' + cp Dir['ext/*'], 'work' end file 'work/Makefile' => 'work/extconf.rb' do @@ -33,6 +33,7 @@ MANIFEST = FileList[*%w( work/*.c work/*.h lib/nokogumbo.rb + LICENSE.txt Rakefile README.md )] From cce2c294737e46293b94e1af943c061e03928021 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 19 Aug 2013 10:47:04 -0400 Subject: [PATCH 0005/1759] push license, readme changes --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 26c0293dab..b9e04c960d 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -40,7 +40,7 @@ MANIFEST = FileList[*%w( SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.2' + gem.version = '0.3' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 13de40392510983dceef2c57b772264fef56d588 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 19 
Aug 2013 13:02:56 -0400 Subject: [PATCH 0006/1759] cleanup readme --- nokogumbo-import/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index dc6656c272..bc5f898ad1 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -1,8 +1,9 @@ Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser. =========== -At the moment, this is a proof of concept, allowing a Ruby program to invoke -the Gumbo HTML5 parser and access the result as a Nokogiri parsed document. +Nokogumbo provides the ability for a Ruby program to invoke the +[Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme) +and to access the result as a Nokogiri parsed document. Usage: ----- @@ -29,7 +30,7 @@ original tag name is returned verbatim. * Nothing meaningful is done with the `GumboDocument` struct, i.e., no Nokogiri `EntityDecl` is produced. -* The gem itself includes a copy of the Nokogumbo HTML5 parser. +* The gem itself includes a copy of the Gumbo HTML5 parser. 
Installation: ============ From a9bd11a63f4d422f09ce1bd3cf60fe0ca176908a Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 19 Aug 2013 13:08:57 -0400 Subject: [PATCH 0007/1759] convert encoding to utf-8 (Ruby 1.9+) --- nokogumbo-import/lib/nokogumbo.rb | 9 +++++++-- nokogumbo-import/test-nokogumbo.rb | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 5ef89dc4d0..5caf600be3 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -3,7 +3,7 @@ module Nokogiri def self.HTML5(string) - Nokogumbo.parse(string) + Nokogiri::HTML5.parse(string) end module HTML5 @@ -159,7 +159,12 @@ module HTML5 Unknown = TAGS.length - 1 - def parse(string) + def self.parse(string) + # convert to UTF-8 (Ruby 1.9+) + if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8 + string = string.encode(Encoding::UTF_8) + end + Nokogumbo.parse(string) end end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 205da08bd8..73f06d1300 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -45,4 +45,12 @@ def test_comment def test_unknown_element assert_equal "main", @doc.at('main').name end + + if ''.respond_to? 
'encoding' + def test_encoding + mac="\xCA".force_encoding('macroman') + assert_equal ' ', + Nokogumbo.parse(mac.encode('utf-8')).at('span').to_xml + end + end end From 0218bc99903edbacce9e7cb77507b0480a635a6d Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 19 Aug 2013 22:36:00 -0400 Subject: [PATCH 0008/1759] load into libxml2 and wrap instead of loading into nokogiri --- nokogumbo-import/Rakefile | 34 +++-- nokogumbo-import/ext/extconf.rb | 6 +- nokogumbo-import/ext/nokogumbo.c | 236 ++++++++++++++++++++++------- nokogumbo-import/lib/nokogumbo.rb | 152 ------------------- nokogumbo-import/test-nokogumbo.rb | 2 +- 5 files changed, 206 insertions(+), 224 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index b9e04c960d..6902060d9e 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -9,8 +9,8 @@ end file 'work/extconf.rb' => 'gumbo-parser' do mkdir_p 'work' - cp Dir['gumbo-parser/src/*'], 'work' - cp Dir['ext/*'], 'work' + cp Dir['gumbo-parser/src/*'], 'work', :preserve => true + cp Dir['ext/*'], 'work', :preserve => true end file 'work/Makefile' => 'work/extconf.rb' do @@ -19,32 +19,29 @@ file 'work/Makefile' => 'work/extconf.rb' do end end -task 'test' => 'work/Makefile' do +file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do + cp 'ext/nokogumbo.c', 'work/nokogumbo.c' +end + +task 'compile' => ['work/Makefile', 'work/nokogumbo.c'] do Dir.chdir 'work' do sh 'make -s' end +end + +task 'test' => 'compile' do ruby 'test-nokogumbo.rb' end CLEAN.include 'pkg', 'gumbo-parser', 'work' -MANIFEST = FileList[*%w( - work/*.rb - work/*.c - work/*.h - lib/nokogumbo.rb - LICENSE.txt - Rakefile - README.md -)] SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.3' + gem.version = '0.4' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' - gem.files = MANIFEST gem.extensions = 'work/extconf.rb' 
gem.author = 'Sam Ruby' gem.add_dependency 'nokogiri' @@ -52,6 +49,15 @@ SPEC = Gem::Specification.new do |gem| gem.description = %q( Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ') + gem.files = FileList[ + 'work/*.rb', + 'work/*.c', + 'work/*.h', + 'lib/nokogumbo.rb', + 'LICENSE.txt', + 'Rakefile', + 'README.md' + ] end task 'gem' => 'test' diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb index 3997bd8358..925968e0cc 100644 --- a/nokogumbo-import/ext/extconf.rb +++ b/nokogumbo-import/ext/extconf.rb @@ -1,3 +1,7 @@ require 'mkmf' -$CFLAGS << " -std=c99" +$CFLAGS = " -std=c99" +pkg_config('libxml-2.0') +nokogiri_lib = Gem.find_files('nokogiri').first +nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri$), 'ext/nokogiri') +find_header('nokogiri.h', nokogiri_ext) create_makefile('nokogumboc') diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index 85d81ee0e2..3b9bd3dbfa 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -1,35 +1,169 @@ #include "ruby.h" #include "gumbo.h" +#include +#include +#include // class constants -static VALUE Nokogiri; -static VALUE HTML; -static VALUE XML; static VALUE Document; -static VALUE Element; -static VALUE Text; -static VALUE CDATA; -static VALUE Comment; -static VALUE TAGS=0; -static int Unknown=0; - -// interned symbols -static VALUE new; -static VALUE set_attribute; -static VALUE add_child; -// determine tag name for a given node -static VALUE _name(GumboElement *node) { - if (!TAGS) { - // Deferred initialization of "Unknown" as the GumboParser class is - // defined *after* the Nokogumbo class is. 
- VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5")); - TAGS = rb_const_get(HTML5, rb_intern("TAGS")); - Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown"))); - } +static const char* TAGS[] = { + "html", + "head", + "title", + "base", + "link", + "meta", + "style", + "script", + "noscript", + "body", + "section", + "nav", + "article", + "aside", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hgroup", + "header", + "footer", + "address", + "p", + "hr", + "pre", + "blockquote", + "ol", + "ul", + "li", + "dl", + "dt", + "dd", + "figure", + "figcaption", + "div", + "a", + "em", + "strong", + "small", + "s", + "cite", + "q", + "dfn", + "abbr", + "time", + "code", + "var", + "samp", + "kbd", + "sub", + "sup", + "i", + "b", + "mark", + "ruby", + "rt", + "rp", + "bdi", + "bdo", + "span", + "br", + "wbr", + "ins", + "del", + "image", + "img", + "iframe", + "embed", + "object", + "param", + "video", + "audio", + "source", + "track", + "canvas", + "map", + "area", + "math", + "mi", + "mo", + "mn", + "ms", + "mtext", + "mglyph", + "malignmark", + "annotation_xml", + "svg", + "foreignobject", + "desc", + "table", + "caption", + "colgroup", + "col", + "tbody", + "thead", + "tfoot", + "tr", + "td", + "th", + "form", + "fieldset", + "legend", + "label", + "input", + "button", + "select", + "datalist", + "optgroup", + "option", + "textarea", + "keygen", + "output", + "progress", + "meter", + "details", + "summary", + "command", + "menu", + "applet", + "acronym", + "bgsound", + "dir", + "frame", + "frameset", + "noframes", + "isindex", + "listing", + "xmp", + "nextid", + "noembed", + "plaintext", + "rb", + "strike", + "basefont", + "big", + "blink", + "center", + "font", + "marquee", + "multicol", + "nobr", + "spacer", + "tt", + "u", + "unknown" +}; + +const static int Unknown=sizeof(TAGS)/sizeof(char*)-1; +// determine tag name for a given node +static xmlNodePtr new_element(GumboElement *node) { + xmlNodePtr element; if (node->tag != Unknown) { - return 
rb_ary_entry(TAGS, (long) node->tag); + element = xmlNewNode(NULL, BAD_CAST TAGS[(int)node->tag]); } else { // Gumbo doesn't provide unknown tags, so we need to parse it ourselves: // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state @@ -38,20 +172,23 @@ static VALUE _name(GumboElement *node) { for (length = 1; length < tag->length-1; length++) { if (strchr(" \t\r\n<", *((char*)tag->data+length))) break; } - return rb_str_new(1+(char *)tag->data, length-1); + char name[length]; + strncpy(name, 1+(char *)tag->data, length-1); + name[length-1] = '\0'; + element = xmlNewNode(NULL, BAD_CAST name); } + return element; } // Build a Nokogiri Element for a given GumboElement (recursively) -static VALUE _element(VALUE document, GumboElement *node) { - VALUE element = rb_funcall(Element, new, 2, _name(node), document); +static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { + xmlNodePtr element = new_element(node); // add in the attributes GumboVector* attrs = &node->attributes; for (int i=0; i < attrs->length; i++) { GumboAttribute *attr = attrs->data[i]; - VALUE name = rb_str_new2(attr->name); - rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value)); + xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value); } // add in the children @@ -59,31 +196,29 @@ static VALUE _element(VALUE document, GumboElement *node) { for (int i=0; i < children->length; i++) { GumboNode* child = children->data[i]; - VALUE node = 0; - VALUE text; + xmlNodePtr node = NULL; switch (child->type) { case GUMBO_NODE_ELEMENT: - node = _element(document, &child->v.element); + node = walk_tree(document, &child->v.element); break; case GUMBO_NODE_WHITESPACE: case GUMBO_NODE_TEXT: - text = rb_str_new2(child->v.text.text); - node = rb_funcall(Text, new, 2, text, document); + node = xmlNewText(BAD_CAST child->v.text.text); break; case GUMBO_NODE_CDATA: - text = rb_str_new2(child->v.text.text); - node = rb_funcall(CDATA, new, 2, text, document); + node = 
xmlNewCDataBlock(document, + BAD_CAST child->v.text.original_text.data, + child->v.text.original_text.length); break; case GUMBO_NODE_COMMENT: - text = rb_str_new2(child->v.text.text); - node = rb_funcall(Comment, new, 2, document, text); + node = xmlNewComment(BAD_CAST child->v.text.text); break; case GUMBO_NODE_DOCUMENT: break; // should never happen -- ignore } - if (node) rb_funcall(element, add_child, 1, node); + if (node) xmlAddChild(element, node); } return element; @@ -91,16 +226,15 @@ static VALUE _element(VALUE document, GumboElement *node) { // Parse a string using gumbo_parse into a Nokogiri document static VALUE t_parse(VALUE self, VALUE string) { - VALUE document = rb_funcall(Document, new, 0); - GumboOutput *output = gumbo_parse_with_options( &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string) ); - VALUE root = _element(document, (GumboElement*)&output->root->v.element); - rb_funcall(document, add_child, 1, root); + xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0"); + xmlNodePtr root = walk_tree(doc, (GumboElement*)&output->root->v.element); + xmlDocSetRootElement(doc, root); gumbo_destroy_output(&kGumboDefaultOptions, output); - return document; + return Nokogiri_wrap_xml_document(Document, doc); } // Initialize the Nokogumbo class and fetch constants we will use later @@ -109,19 +243,9 @@ void Init_nokogumboc() { rb_require("nokogiri"); // class constants - Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri")); - HTML = rb_const_get(Nokogiri, rb_intern("HTML")); - XML = rb_const_get(Nokogiri, rb_intern("XML")); + VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri")); + VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML")); Document = rb_const_get(HTML, rb_intern("Document")); - Element = rb_const_get(XML, rb_intern("Element")); - Text = rb_const_get(XML, rb_intern("Text")); - CDATA = rb_const_get(XML, rb_intern("CDATA")); - Comment = rb_const_get(XML, rb_intern("Comment")); - - // interned symbols - new = rb_intern("new"); - 
set_attribute = rb_intern("set_attribute"); - add_child = rb_intern("add_child"); // define Nokogumbo class with a singleton parse method VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject); diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 5caf600be3..9c1b85a967 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -7,158 +7,6 @@ def self.HTML5(string) end module HTML5 - TAGS = [ - 'HTML', - 'HEAD', - 'TITLE', - 'BASE', - 'LINK', - 'META', - 'STYLE', - 'SCRIPT', - 'NOSCRIPT', - 'BODY', - 'SECTION', - 'NAV', - 'ARTICLE', - 'ASIDE', - 'H1', - 'H2', - 'H3', - 'H4', - 'H5', - 'H6', - 'HGROUP', - 'HEADER', - 'FOOTER', - 'ADDRESS', - 'P', - 'HR', - 'PRE', - 'BLOCKQUOTE', - 'OL', - 'UL', - 'LI', - 'DL', - 'DT', - 'DD', - 'FIGURE', - 'FIGCAPTION', - 'DIV', - 'A', - 'EM', - 'STRONG', - 'SMALL', - 'S', - 'CITE', - 'Q', - 'DFN', - 'ABBR', - 'TIME', - 'CODE', - 'VAR', - 'SAMP', - 'KBD', - 'SUB', - 'SUP', - 'I', - 'B', - 'MARK', - 'RUBY', - 'RT', - 'RP', - 'BDI', - 'BDO', - 'SPAN', - 'BR', - 'WBR', - 'INS', - 'DEL', - 'IMAGE', - 'IMG', - 'IFRAME', - 'EMBED', - 'OBJECT', - 'PARAM', - 'VIDEO', - 'AUDIO', - 'SOURCE', - 'TRACK', - 'CANVAS', - 'MAP', - 'AREA', - 'MATH', - 'MI', - 'MO', - 'MN', - 'MS', - 'MTEXT', - 'MGLYPH', - 'MALIGNMARK', - 'ANNOTATION_XML', - 'SVG', - 'FOREIGNOBJECT', - 'DESC', - 'TABLE', - 'CAPTION', - 'COLGROUP', - 'COL', - 'TBODY', - 'THEAD', - 'TFOOT', - 'TR', - 'TD', - 'TH', - 'FORM', - 'FIELDSET', - 'LEGEND', - 'LABEL', - 'INPUT', - 'BUTTON', - 'SELECT', - 'DATALIST', - 'OPTGROUP', - 'OPTION', - 'TEXTAREA', - 'KEYGEN', - 'OUTPUT', - 'PROGRESS', - 'METER', - 'DETAILS', - 'SUMMARY', - 'COMMAND', - 'MENU', - 'APPLET', - 'ACRONYM', - 'BGSOUND', - 'DIR', - 'FRAME', - 'FRAMESET', - 'NOFRAMES', - 'ISINDEX', - 'LISTING', - 'XMP', - 'NEXTID', - 'NOEMBED', - 'PLAINTEXT', - 'RB', - 'STRIKE', - 'BASEFONT', - 'BIG', - 'BLINK', - 'CENTER', - 'FONT', - 'MARQUEE', - 'MULTICOL', - 'NOBR', - 
'SPACER', - 'TT', - 'U', - 'UNKNOWN', - ].map(&:downcase).map(&:freeze).freeze - - Unknown = TAGS.length - 1 - def self.parse(string) # convert to UTF-8 (Ruby 1.9+) if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8 diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 73f06d1300..fb25749085 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -15,7 +15,7 @@ def setup

hello world

content -
+
From 422e1f6755e4b4daffd6f51a13dfcf601f1b3d28 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 20 Aug 2013 10:23:45 -0400 Subject: [PATCH 0009/1759] cleanup --- nokogumbo-import/Rakefile | 15 ++++++++------- nokogumbo-import/ext/nokogumbo.c | 7 +++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 6902060d9e..1f39e238fa 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -7,8 +7,9 @@ file 'gumbo-parser' do sh 'git clone https://github.com/google/gumbo-parser.git' end -file 'work/extconf.rb' => 'gumbo-parser' do +file 'work/extconf.rb' => ['ext/extconf.rb', 'gumbo-parser'] do mkdir_p 'work' + rm_f 'work/Makefile' cp Dir['gumbo-parser/src/*'], 'work', :preserve => true cp Dir['ext/*'], 'work', :preserve => true end @@ -35,7 +36,6 @@ end CLEAN.include 'pkg', 'gumbo-parser', 'work' - SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' gem.version = '0.4' @@ -50,9 +50,6 @@ SPEC = Gem::Specification.new do |gem| Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ') gem.files = FileList[ - 'work/*.rb', - 'work/*.c', - 'work/*.h', 'lib/nokogumbo.rb', 'LICENSE.txt', 'Rakefile', @@ -60,8 +57,12 @@ SPEC = Gem::Specification.new do |gem| ] end -task 'gem' => 'test' -Gem::PackageTask.new(SPEC) do |pkg| +task 'package_workfiles' => 'work/extconf.rb' do + PKG.package_files += FileList['work/*.rb', 'work/*.c', 'work/*.h'] +end + +task 'gem' => ['test', 'package_workfiles'] +PKG = Gem::PackageTask.new(SPEC) do |pkg| pkg.need_tar = true pkg.need_zip = true end diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index 3b9bd3dbfa..e9e23848e7 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -1,13 +1,12 @@ -#include "ruby.h" -#include "gumbo.h" +#include +#include #include -#include #include // class constants static VALUE 
Document; -static const char* TAGS[] = { +static const char* const TAGS[] = { "html", "head", "title", From 5fd16a84fc4d4d033a8a908127720fec18649961 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 20 Aug 2013 12:47:18 -0400 Subject: [PATCH 0010/1759] "touch" ext files when copying --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 1f39e238fa..38b34ca7df 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -11,7 +11,7 @@ file 'work/extconf.rb' => ['ext/extconf.rb', 'gumbo-parser'] do mkdir_p 'work' rm_f 'work/Makefile' cp Dir['gumbo-parser/src/*'], 'work', :preserve => true - cp Dir['ext/*'], 'work', :preserve => true + cp Dir['ext/*'], 'work' end file 'work/Makefile' => 'work/extconf.rb' do From f84b952711876a7d4d36c593f1913ab904d9df7d Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 20 Aug 2013 12:47:41 -0400 Subject: [PATCH 0011/1759] Add doctype --- nokogumbo-import/README.md | 3 --- nokogumbo-import/ext/nokogumbo.c | 9 ++++++++- nokogumbo-import/test-nokogumbo.rb | 9 +++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index bc5f898ad1..b2f8d85ec8 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -27,9 +27,6 @@ is returned. * Instead of returning 'unknown' as the element name for unknown tags, the original tag name is returned verbatim. -* Nothing meaningful is done with the `GumboDocument` struct, i.e., no -Nokogiri `EntityDecl` is produced. - * The gem itself includes a copy of the Gumbo HTML5 parser. 
Installation: diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index e9e23848e7..c8591a8e48 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -229,8 +229,15 @@ static VALUE t_parse(VALUE self, VALUE string) { &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string) ); xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0"); - xmlNodePtr root = walk_tree(doc, (GumboElement*)&output->root->v.element); + xmlNodePtr root = walk_tree(doc, &output->root->v.element); xmlDocSetRootElement(doc, root); + if (output->document->v.document.has_doctype) { + const char *public = output->document->v.document.public_identifier; + const char *system = output->document->v.document.system_identifier; + xmlCreateIntSubset(doc, BAD_CAST "html", + (strlen(public) ? public : NULL), + (strlen(system) ? system : NULL)); + } gumbo_destroy_output(&kGumboDefaultOptions, output); return Nokogiri_wrap_xml_document(Document, doc); diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index fb25749085..acd701c04f 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -49,8 +49,13 @@ def test_unknown_element if ''.respond_to? 'encoding' def test_encoding mac="\xCA".force_encoding('macroman') - assert_equal ' ', - Nokogumbo.parse(mac.encode('utf-8')).at('span').to_xml + doc = Nokogumbo.parse(mac.encode('utf-8')) + assert_equal ' ', doc.at('span').to_xml end end + + def test_html5_doctype + doc = Nokogumbo.parse("") + assert_match //, doc.to_html + end end From e08d334b08d279009a82d59b655a871fbda9c773 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 20 Aug 2013 16:41:26 -0400 Subject: [PATCH 0012/1759] Cleanup to use gumbo functions when possible. Based on blog comment suggestion by "Craig". 
--- nokogumbo-import/ext/nokogumbo.c | 187 ++----------------------------- 1 file changed, 12 insertions(+), 175 deletions(-) diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index c8591a8e48..9be5ab48c0 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -6,182 +6,20 @@ // class constants static VALUE Document; -static const char* const TAGS[] = { - "html", - "head", - "title", - "base", - "link", - "meta", - "style", - "script", - "noscript", - "body", - "section", - "nav", - "article", - "aside", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "hgroup", - "header", - "footer", - "address", - "p", - "hr", - "pre", - "blockquote", - "ol", - "ul", - "li", - "dl", - "dt", - "dd", - "figure", - "figcaption", - "div", - "a", - "em", - "strong", - "small", - "s", - "cite", - "q", - "dfn", - "abbr", - "time", - "code", - "var", - "samp", - "kbd", - "sub", - "sup", - "i", - "b", - "mark", - "ruby", - "rt", - "rp", - "bdi", - "bdo", - "span", - "br", - "wbr", - "ins", - "del", - "image", - "img", - "iframe", - "embed", - "object", - "param", - "video", - "audio", - "source", - "track", - "canvas", - "map", - "area", - "math", - "mi", - "mo", - "mn", - "ms", - "mtext", - "mglyph", - "malignmark", - "annotation_xml", - "svg", - "foreignobject", - "desc", - "table", - "caption", - "colgroup", - "col", - "tbody", - "thead", - "tfoot", - "tr", - "td", - "th", - "form", - "fieldset", - "legend", - "label", - "input", - "button", - "select", - "datalist", - "optgroup", - "option", - "textarea", - "keygen", - "output", - "progress", - "meter", - "details", - "summary", - "command", - "menu", - "applet", - "acronym", - "bgsound", - "dir", - "frame", - "frameset", - "noframes", - "isindex", - "listing", - "xmp", - "nextid", - "noembed", - "plaintext", - "rb", - "strike", - "basefont", - "big", - "blink", - "center", - "font", - "marquee", - "multicol", - "nobr", - "spacer", - "tt", - "u", - "unknown" -}; - -const 
static int Unknown=sizeof(TAGS)/sizeof(char*)-1; - -// determine tag name for a given node -static xmlNodePtr new_element(GumboElement *node) { +// Build a Nokogiri Element for a given GumboElement (recursively) +static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { + // determine tag name for a given node xmlNodePtr element; - if (node->tag != Unknown) { - element = xmlNewNode(NULL, BAD_CAST TAGS[(int)node->tag]); + if (node->tag != GUMBO_TAG_UNKNOWN) { + element = xmlNewNode(NULL, BAD_CAST gumbo_normalized_tagname(node->tag)); } else { - // Gumbo doesn't provide unknown tags, so we need to parse it ourselves: - // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state - GumboStringPiece *tag = &node->original_tag; - int length; - for (length = 1; length < tag->length-1; length++) { - if (strchr(" \t\r\n<", *((char*)tag->data+length))) break; - } - char name[length]; - strncpy(name, 1+(char *)tag->data, length-1); - name[length-1] = '\0'; + GumboStringPiece tag = node->original_tag; + gumbo_tag_from_original_text(&tag); + char name[tag.length+1]; + strncpy(name, tag.data, tag.length); + name[tag.length] = '\0'; element = xmlNewNode(NULL, BAD_CAST name); } - return element; -} - -// Build a Nokogiri Element for a given GumboElement (recursively) -static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { - xmlNodePtr element = new_element(node); // add in the attributes GumboVector* attrs = &node->attributes; @@ -224,7 +62,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { } // Parse a string using gumbo_parse into a Nokogiri document -static VALUE t_parse(VALUE self, VALUE string) { +static VALUE parse(VALUE self, VALUE string) { GumboOutput *output = gumbo_parse_with_options( &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string) ); @@ -255,6 +93,5 @@ void Init_nokogumboc() { // define Nokogumbo class with a singleton parse method VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject); - 
rb_define_singleton_method(Gumbo, "parse", t_parse, 1); + rb_define_singleton_method(Gumbo, "parse", parse, 1); } - From 7dadd9822f145a458014aeda87edfe3ea14d6a28 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 06:56:24 -0400 Subject: [PATCH 0013/1759] push out another gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 38b34ca7df..2b2dd8aab1 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -38,7 +38,7 @@ CLEAN.include 'pkg', 'gumbo-parser', 'work' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.4' + gem.version = '0.5' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 63b32be8a5e496c8bc4c20d5cf27f85055eb280b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 08:22:47 -0400 Subject: [PATCH 0014/1759] more carefully construct gemfile --- nokogumbo-import/Rakefile | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 2b2dd8aab1..d3f6960837 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -7,23 +7,28 @@ file 'gumbo-parser' do sh 'git clone https://github.com/google/gumbo-parser.git' end -file 'work/extconf.rb' => ['ext/extconf.rb', 'gumbo-parser'] do +task 'sources' => ['work/parser.c', 'work/nokogumbo.c', 'work/extconf.rb'] + +file 'work/parser.c' => ['gumbo-parser'] do mkdir_p 'work' - rm_f 'work/Makefile' cp Dir['gumbo-parser/src/*'], 'work', :preserve => true - cp Dir['ext/*'], 'work' end -file 'work/Makefile' => 'work/extconf.rb' do +file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do + cp 'ext/nokogumbo.c', 'work/nokogumbo.c' +end + +file 'work/extconf.rb' => 'ext/extconf.rb' do + rm_f 'work/Makefile' + cp 'ext/extconf.rb', 
'work/extconf.rb' +end + +file 'work/Makefile' => 'sources' do Dir.chdir 'work' do ruby 'extconf.rb' end end -file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do - cp 'ext/nokogumbo.c', 'work/nokogumbo.c' -end - task 'compile' => ['work/Makefile', 'work/nokogumbo.c'] do Dir.chdir 'work' do sh 'make -s' @@ -52,13 +57,13 @@ SPEC = Gem::Specification.new do |gem| gem.files = FileList[ 'lib/nokogumbo.rb', 'LICENSE.txt', - 'Rakefile', - 'README.md' + 'README.md', ] end -task 'package_workfiles' => 'work/extconf.rb' do - PKG.package_files += FileList['work/*.rb', 'work/*.c', 'work/*.h'] +task 'package_workfiles' => 'sources' do + SPEC.files += FileList['work/*'] + PKG.package_files += FileList['work/*'] end task 'gem' => ['test', 'package_workfiles'] From e2715d8580ebd2123845e42d1c6d1a4ec67d4454 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 08:25:00 -0400 Subject: [PATCH 0015/1759] push out gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index d3f6960837..f21ebdf623 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -43,7 +43,7 @@ CLEAN.include 'pkg', 'gumbo-parser', 'work' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.5' + gem.version = '0.5.1' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 4e9072e5f90b3914283bf2635d6101778b1c26a6 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 08:30:25 -0400 Subject: [PATCH 0016/1759] prune work directory --- nokogumbo-import/Rakefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index f21ebdf623..a3c92662bc 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -43,7 +43,7 @@ CLEAN.include 'pkg', 'gumbo-parser', 'work' SPEC = 
Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.5.1' + gem.version = '0.5.2' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' @@ -62,8 +62,9 @@ SPEC = Gem::Specification.new do |gem| end task 'package_workfiles' => 'sources' do - SPEC.files += FileList['work/*'] - PKG.package_files += FileList['work/*'] + sources = FileList['work/*.c', 'work/*.h', 'work/*.rb'] + SPEC.files += sources + PKG.package_files += sources end task 'gem' => ['test', 'package_workfiles'] From bb7247d32f9c2994436c8056a8b6cafa35327335 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 17:47:20 -0400 Subject: [PATCH 0017/1759] IO (file), url, redirect, https, and encoding --- nokogumbo-import/lib/nokogumbo.rb | 82 +++++++++++++++++++++++++++++- nokogumbo-import/test-nokogumbo.rb | 10 +++- 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 9c1b85a967..9fe3b5bf3b 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -8,12 +8,92 @@ def self.HTML5(string) module HTML5 def self.parse(string) + if string.respond_to? 
:read + string = string.read + end + # convert to UTF-8 (Ruby 1.9+) if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8 - string = string.encode(Encoding::UTF_8) + string = reencode(string) end Nokogumbo.parse(string) end + + def self.get(uri, limit=10) + require 'net/http' + uri = URI(uri) unless URI === uri + + http = Net::HTTP.new(uri.host, uri.port) + if uri.scheme == 'https' + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + end + request = Net::HTTP::Get.new(uri.request_uri) + response = http.request(request) + + case response + when Net::HTTPSuccess + parse(reencode(response.body, response['content-type'])) + when Net::HTTPRedirection + response.value if limit <= 1 + get(response['location'], limit-1) + else + response.value + end + end + + private + + # Charset sniffing is a complex and controversial topic that understandably + # isn't done _by default_ by the Ruby Net::HTTP library. This being said, + # it is a very real problem for consumers of HTML as the default for HTML + # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser + # *only* supports utf-8. + # + # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding + # detection. Following this lead, Nokogiri::HTML5 attempts to do likewise, + # while attempting to more closely follow the HTML5 standard. + # + # http://bugs.ruby-lang.org/issues/2567 + # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding + # + def self.reencode(body, content_type=nil) + return body unless body.respond_to? 
:encoding + + if body.encoding == Encoding::ASCII_8BIT + encoding = nil + + # look for a Byte Order Mark (BOM) + if body[0..1] == "\xFE\xFF" + encoding = 'utf-16be' + elsif body[0..1] == "\xFF\xFE" + encoding = 'utf-16le' + elsif body[0..2] == "\xEF\xBB\xBF" + encoding = 'utf-8' + end + + # look for a charset in a content-encoding header + if content_type + encoding ||= content_type[/charset=(.*?)($|\s|;)/i, 1] + end + + # look for a charset in a meta tag in the first 1024 bytes + if not encoding + data = body[0..1023].gsub(/|\Z)/m, '') + data.scan(//m).each do |meta| + encoding ||= meta[/charset="?(.*?)($|"|\s|>)/im, 1] + end + end + + # if all else fails, default to the official default encoding for HTML + encoding ||= Encoding::ISO_8859_1 + + # change the encoding to match the detected or inferred encoding + body.force_encoding(encoding) + end + + body.encode(Encoding::UTF_8) + end end end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index acd701c04f..d312b29321 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -47,11 +47,17 @@ def test_unknown_element end if ''.respond_to? 
'encoding' - def test_encoding + def test_macroman_encoding mac="\xCA".force_encoding('macroman') - doc = Nokogumbo.parse(mac.encode('utf-8')) + doc = Nokogiri::HTML5(mac) assert_equal ' ', doc.at('span').to_xml end + + def test_iso8859_encoding + iso8859="Se\xF2or".force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(iso8859) + assert_equal 'Seòor', doc.at('span').to_xml + end end def test_html5_doctype From ff6f2251e656745b11cdefc079befca2119e3a31 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 21 Aug 2013 23:32:32 -0400 Subject: [PATCH 0018/1759] Update README to mention new interface --- nokogumbo-import/README.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index b2f8d85ec8..da22a25891 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -13,18 +13,31 @@ require 'nokogumbo' doc = Nokogiri::HTML5(string) ``` +Because HTML is often fetched via the web, a convenience interface is also +provided: + +```ruby +require 'nokogumbo' +doc = Nokogiri::HTML5.get(uri) +``` Notes: ----- -* The `Nokogumbo.parse` function takes a string and passes it to the +* The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. -The resulting Gumbo parse tree is the walked, producing a Nokogiri parse tree. -The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree -is returned. +The resulting Gumbo parse tree is the walked, producing a libxml2 parse tree. +The original Gumbo parse tree is then destroyed, and single Nokogiri Ruby +object is constructed to wrap the libxml2 parse tree. Nokogiri only produces +Ruby objects as necessary, so all scanning is done using the underlying +libxml2 libraries. 
+ +* The `Nokogiri::HTML5.get` function takes care of following redirects, +https, and determining the character encoding of the result, based on the +rules defined in the HTML5 specification for doing so. * Instead of uppercase element names, lowercase element names are produced. -* Instead of returning 'unknown' as the element name for unknown tags, the +* Instead of returning `unknown` as the element name for unknown tags, the original tag name is returned verbatim. * The gem itself includes a copy of the Gumbo HTML5 parser. From 0974fc0efc22ef484a6c7c80656c08fd32849668 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 22 Aug 2013 07:06:44 -0400 Subject: [PATCH 0019/1759] Push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index a3c92662bc..c37b34ba42 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -43,7 +43,7 @@ CLEAN.include 'pkg', 'gumbo-parser', 'work' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.5.2' + gem.version = '0.6' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From dfe8a100d10f65d2cee93993f93befccf6ac8c09 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 23 Aug 2013 09:22:48 -0400 Subject: [PATCH 0020/1759] Reference Nokogiri documentation --- nokogumbo-import/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index da22a25891..260cc87eb8 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -3,7 +3,8 @@ Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser. Nokogumbo provides the ability for a Ruby program to invoke the [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme) -and to access the result as a Nokogiri parsed document. 
+and to access the result as a +[Nokogiri::HTML::Document](http://nokogiri.org/Nokogiri/HTML/Document.html). Usage: ----- From 336decd5976c6505e12e20e1bc35b2bf0adce1be Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 25 Aug 2013 06:19:47 -0400 Subject: [PATCH 0021/1759] link to libxml2 --- nokogumbo-import/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 260cc87eb8..38d27ca473 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -26,10 +26,12 @@ Notes: * The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. -The resulting Gumbo parse tree is the walked, producing a libxml2 parse tree. +The resulting Gumbo parse tree is the walked, producing a +[libxml2](http://xmlsoft.org/html/) +[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc). The original Gumbo parse tree is then destroyed, and single Nokogiri Ruby -object is constructed to wrap the libxml2 parse tree. Nokogiri only produces -Ruby objects as necessary, so all scanning is done using the underlying +object is constructed to wrap the xmlDoc structure. Nokogiri only produces +Ruby objects as necessary, so all searching is done using the underlying libxml2 libraries. 
* The `Nokogiri::HTML5.get` function takes care of following redirects, From a1ebd03669f57bc15b33a7216782d41225ae02ae Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 25 Aug 2013 06:20:52 -0400 Subject: [PATCH 0022/1759] prevent unnecessary recompiles of gumbo-parser --- nokogumbo-import/Rakefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index c37b34ba42..11afd6233a 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -9,16 +9,18 @@ end task 'sources' => ['work/parser.c', 'work/nokogumbo.c', 'work/extconf.rb'] -file 'work/parser.c' => ['gumbo-parser'] do - mkdir_p 'work' - cp Dir['gumbo-parser/src/*'], 'work', :preserve => true +file 'work/parser.c' => 'gumbo-parser' do + mkdir_p 'work' unless File.exist? 'work' + cp Dir['gumbo-parser/src/*'], 'work' end file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do + mkdir_p 'work' unless File.exist? 'work' cp 'ext/nokogumbo.c', 'work/nokogumbo.c' end file 'work/extconf.rb' => 'ext/extconf.rb' do + mkdir_p 'work' unless File.exist? 
'work' rm_f 'work/Makefile' cp 'ext/extconf.rb', 'work/extconf.rb' end From 8b31cb832fe676cacef05ca070fa2b79dad136f6 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 25 Aug 2013 06:21:57 -0400 Subject: [PATCH 0023/1759] test charset --- nokogumbo-import/lib/nokogumbo.rb | 2 +- nokogumbo-import/test-nokogumbo.rb | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 9fe3b5bf3b..858c9f23e1 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -82,7 +82,7 @@ def self.reencode(body, content_type=nil) if not encoding data = body[0..1023].gsub(/|\Z)/m, '') data.scan(//m).each do |meta| - encoding ||= meta[/charset="?(.*?)($|"|\s|>)/im, 1] + encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1] end end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index d312b29321..f4ccdb8dc3 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -54,9 +54,16 @@ def test_macroman_encoding end def test_iso8859_encoding - iso8859="Se\xF2or".force_encoding(Encoding::ASCII_8BIT) + iso8859="Se\xF1or".force_encoding(Encoding::ASCII_8BIT) doc = Nokogiri::HTML5(iso8859) - assert_equal 'Seòor', doc.at('span').to_xml + assert_equal 'Señor', doc.at('span').to_xml + end + + def test_charset_encoding + utf8="Se\xC3\xB1or". 
+ force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(utf8) + assert_equal 'Señor', doc.at('span').to_xml end end From d54dbaa883371d68594cf7848b1c5f31d3261810 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 25 Aug 2013 06:22:10 -0400 Subject: [PATCH 0024/1759] add comments --- nokogumbo-import/lib/nokogumbo.rb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 858c9f23e1..85818f89a3 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -2,11 +2,15 @@ require 'nokogumboc' module Nokogiri + # Parse an HTML document. +string+ contains the document. +string+ + # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. def self.HTML5(string) Nokogiri::HTML5.parse(string) end module HTML5 + # Parse an HTML document. +string+ contains the document. +string+ + # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. def self.parse(string) if string.respond_to? :read string = string.read @@ -20,6 +24,10 @@ def self.parse(string) Nokogumbo.parse(string) end + # Fetch and parse a HTML document from the web, following redirects, + # handling https, and determining the character encoding using HTML5 + # rules. +uri+ may be a +String+ or a +URI+. +limit+ controls the + # number of redirects that will be followed. 
def self.get(uri, limit=10) require 'net/http' uri = URI(uri) unless URI === uri From 35695200413363583cccdd24945fe9e2f61eef78 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 25 Aug 2013 06:23:50 -0400 Subject: [PATCH 0025/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 11afd6233a..adbad0ccdd 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -45,7 +45,7 @@ CLEAN.include 'pkg', 'gumbo-parser', 'work' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.6' + gem.version = '0.7' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From adc0842e45b37f8f736373cb4b80a4d31a7fbb94 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 26 Aug 2013 08:54:59 -0400 Subject: [PATCH 0026/1759] cleanup tests --- nokogumbo-import/test-nokogumbo.rb | 67 ++++++++++++++++++------------ 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index f4ccdb8dc3..97fa6c5835 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -4,46 +4,35 @@ require 'test/unit' class TestNokogumbo < Test::Unit::TestCase - def setup - @buffer = <<-EOF.gsub(/^ /, '') - - - - hello world - - -

hello world

-
- content -
- - - - - - - EOF - @doc = Nokogiri::HTML5(@buffer) - end - def test_element_text - assert_equal "content", @doc.at('span').text + doc = Nokogiri::HTML5(buffer) + assert_equal "content", doc.at('span').text end def test_element_cdata - assert_equal "foobar", @doc.at('textarea').text.strip + doc = Nokogiri::HTML5(buffer) + assert_equal "foobar", doc.at('textarea').text.strip end def test_attr_value - assert_equal "utf-8", @doc.at('meta')['charset'] + doc = Nokogiri::HTML5(buffer) + assert_equal "utf-8", doc.at('meta')['charset'] end def test_comment - assert_equal " test comment ", @doc.xpath('//comment()').text + doc = Nokogiri::HTML5(buffer) + assert_equal " test comment ", doc.xpath('//comment()').text end def test_unknown_element - assert_equal "main", @doc.at('main').name + doc = Nokogiri::HTML5(buffer) + assert_equal "main", doc.at('main').name + end + + def test_IO + require 'stringio' + doc = Nokogiri::HTML5(StringIO.new(buffer)) + assert_equal 'textarea', doc.at('form').element_children.first.name end if ''.respond_to? 'encoding' @@ -71,4 +60,28 @@ def test_html5_doctype doc = Nokogumbo.parse("") assert_match //, doc.to_html end + +private + + def buffer + <<-EOF.gsub(/^ /, '') + + + + hello world + + +

hello world

+
+ content +
+ +
+ +
+ + + EOF + end + end From 9931195982a55dc2e7d33ee65d22becf2feaa342 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 27 Aug 2013 07:50:22 -0400 Subject: [PATCH 0027/1759] Cleanup README, add example --- nokogumbo-import/README.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 38d27ca473..45f4a29247 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -6,7 +6,7 @@ Nokogumbo provides the ability for a Ruby program to invoke the and to access the result as a [Nokogiri::HTML::Document](http://nokogiri.org/Nokogiri/HTML/Document.html). -Usage: +Usage ----- ```ruby @@ -14,14 +14,22 @@ require 'nokogumbo' doc = Nokogiri::HTML5(string) ``` -Because HTML is often fetched via the web, a convenience interface is also -provided: +Because HTML is often fetched via the web, a convenience interface to +HTTP get is also provided: ```ruby require 'nokogumbo' doc = Nokogiri::HTML5.get(uri) ``` -Notes: + +Example Usage +----- +```ruby +require 'nokogumbo' +puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title'] +``` + +Notes ----- * The `Nokogiri::HTML5.parse` function takes a string and passes it to the @@ -45,14 +53,14 @@ original tag name is returned verbatim. * The gem itself includes a copy of the Gumbo HTML5 parser. 
-Installation: +Installation ============ * Execute `rake gem` * [sudo] gem install pkg/nokogumbo*.gem -Related efforts: +Related efforts ============ * [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding From 4aee4877893a0e0822007d376ea3b61f7d20135d Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 27 Aug 2013 09:08:02 -0400 Subject: [PATCH 0028/1759] Protect against bogus encoding --- nokogumbo-import/lib/nokogumbo.rb | 6 +++++- nokogumbo-import/test-nokogumbo.rb | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 85818f89a3..60737e0505 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -98,7 +98,11 @@ def self.reencode(body, content_type=nil) encoding ||= Encoding::ISO_8859_1 # change the encoding to match the detected or inferred encoding - body.force_encoding(encoding) + begin + body.force_encoding(encoding) + rescue ArgumentError + body.force_encoding(Encoding::ISO_8859_1) + end end body.encode(Encoding::UTF_8) diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 97fa6c5835..234e557f3a 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -54,6 +54,13 @@ def test_charset_encoding doc = Nokogiri::HTML5(utf8) assert_equal 'Señor', doc.at('span').to_xml end + + def test_bogus_encoding + bogus="Se\xF1or". 
+ force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(bogus) + assert_equal 'Señor', doc.at('span').to_xml + end end def test_html5_doctype From 0cdac2b5937b79d8cf2375fcbf26146f7db9dfed Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 27 Aug 2013 09:19:58 -0400 Subject: [PATCH 0029/1759] Only embed gumbo-parser if it is not already available --- nokogumbo-import/README.md | 5 +++-- nokogumbo-import/Rakefile | 19 +++++++++++++------ nokogumbo-import/ext/extconf.rb | 13 ++++++++++++- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 45f4a29247..0abc2b4cab 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -22,7 +22,7 @@ require 'nokogumbo' doc = Nokogiri::HTML5.get(uri) ``` -Example Usage +Example ----- ```ruby require 'nokogumbo' @@ -51,7 +51,8 @@ rules defined in the HTML5 specification for doing so. * Instead of returning `unknown` as the element name for unknown tags, the original tag name is returned verbatim. -* The gem itself includes a copy of the Gumbo HTML5 parser. +* If the Gumbo HTML5 parser is not already installed, the source for the +parser will be downloaded and compiled into the Gem itself. Installation ============ diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index adbad0ccdd..bda8c10bc7 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -7,11 +7,16 @@ file 'gumbo-parser' do sh 'git clone https://github.com/google/gumbo-parser.git' end +task 'pull' => 'gumbo-parser' do + Dir.chdir('gumbo-parser') do + sh 'git pull' + end +end + task 'sources' => ['work/parser.c', 'work/nokogumbo.c', 'work/extconf.rb'] -file 'work/parser.c' => 'gumbo-parser' do +file 'work/parser.c' do mkdir_p 'work' unless File.exist? 
'work' - cp Dir['gumbo-parser/src/*'], 'work' end file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do @@ -25,7 +30,7 @@ file 'work/extconf.rb' => 'ext/extconf.rb' do cp 'ext/extconf.rb', 'work/extconf.rb' end -file 'work/Makefile' => 'sources' do +file 'work/Makefile' => ['sources', 'gumbo-parser'] do Dir.chdir 'work' do ruby 'extconf.rb' end @@ -41,11 +46,12 @@ task 'test' => 'compile' do ruby 'test-nokogumbo.rb' end -CLEAN.include 'pkg', 'gumbo-parser', 'work' +CLEAN.include FileList.new('work').existing +CLOBBER.include FileList.new('gumbo-parser').existing SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.7' + gem.version = '0.8' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' @@ -64,7 +70,8 @@ SPEC = Gem::Specification.new do |gem| end task 'package_workfiles' => 'sources' do - sources = FileList['work/*.c', 'work/*.h', 'work/*.rb'] + sources = 'work/nokogumbo.c', 'work/extconf.rb' + sources += FileList['gumbo-parser/src/*'] SPEC.files += sources PKG.package_files += sources end diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb index 925968e0cc..7c8cedacb0 100644 --- a/nokogumbo-import/ext/extconf.rb +++ b/nokogumbo-import/ext/extconf.rb @@ -1,7 +1,18 @@ require 'mkmf' $CFLAGS = " -std=c99" + +# libxml2 libraries from http://www.xmlsoft.org/ pkg_config('libxml-2.0') -nokogiri_lib = Gem.find_files('nokogiri').first + +# nokogiri headers from gem install +nokogiri_lib = Gem.find_files('nokogiri').first or gem 'nokogiri' nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri$), 'ext/nokogiri') find_header('nokogiri.h', nokogiri_ext) + +# add in gumbo-parser source from github if not already installed +unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h' + require 'fileutils' + FileUtils.cp Dir['../gumbo-parser/src/*'], '.' 
+end + create_makefile('nokogumboc') From ba0fab9a80e0cd9beafe2b9870a3f89df17de3b1 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 27 Aug 2013 12:23:46 -0400 Subject: [PATCH 0030/1759] Test on Mac OSX --- nokogumbo-import/Rakefile | 12 ++++-------- nokogumbo-import/ext/extconf.rb | 11 +++++++---- nokogumbo-import/ext/nokogumbo.c | 25 ++++++++++++++----------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index bda8c10bc7..6cf4f1f689 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -13,11 +13,7 @@ task 'pull' => 'gumbo-parser' do end end -task 'sources' => ['work/parser.c', 'work/nokogumbo.c', 'work/extconf.rb'] - -file 'work/parser.c' do - mkdir_p 'work' unless File.exist? 'work' -end +SOURCES = ['work/nokogumbo.c', 'work/extconf.rb'] file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do mkdir_p 'work' unless File.exist? 'work' @@ -30,7 +26,7 @@ file 'work/extconf.rb' => 'ext/extconf.rb' do cp 'ext/extconf.rb', 'work/extconf.rb' end -file 'work/Makefile' => ['sources', 'gumbo-parser'] do +file 'work/Makefile' => SOURCES + ['gumbo-parser'] do Dir.chdir 'work' do ruby 'extconf.rb' end @@ -51,7 +47,7 @@ CLOBBER.include FileList.new('gumbo-parser').existing SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.8' + gem.version = '0.9' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' @@ -69,7 +65,7 @@ SPEC = Gem::Specification.new do |gem| ] end -task 'package_workfiles' => 'sources' do +task 'package_workfiles' => SOURCES do sources = 'work/nokogumbo.c', 'work/extconf.rb' sources += FileList['gumbo-parser/src/*'] SPEC.files += sources diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb index 7c8cedacb0..82f0ed49c5 100644 --- a/nokogumbo-import/ext/extconf.rb +++ b/nokogumbo-import/ext/extconf.rb @@ -4,10 +4,13 @@ # 
libxml2 libraries from http://www.xmlsoft.org/ pkg_config('libxml-2.0') -# nokogiri headers from gem install -nokogiri_lib = Gem.find_files('nokogiri').first or gem 'nokogiri' -nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri$), 'ext/nokogiri') -find_header('nokogiri.h', nokogiri_ext) +# nokogiri configuration from gem install +nokogiri_lib = Gem.find_files('nokogiri').sort.last or gem 'nokogiri' +nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') +unless find_header('nokogiri.h', nokogiri_ext) + require "#{nokogiri_ext}/extconf.rb" + find_header('nokogiri.h', nokogiri_ext) +end # add in gumbo-parser source from github if not already installed unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h' diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/ext/nokogumbo.c index 9be5ab48c0..9fb03fda5a 100644 --- a/nokogumbo-import/ext/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo.c @@ -3,6 +3,8 @@ #include #include +#define CONST_CAST (xmlChar const*) + // class constants static VALUE Document; @@ -11,7 +13,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { // determine tag name for a given node xmlNodePtr element; if (node->tag != GUMBO_TAG_UNKNOWN) { - element = xmlNewNode(NULL, BAD_CAST gumbo_normalized_tagname(node->tag)); + element = xmlNewNode(NULL, CONST_CAST gumbo_normalized_tagname(node->tag)); } else { GumboStringPiece tag = node->original_tag; gumbo_tag_from_original_text(&tag); @@ -25,7 +27,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { GumboVector* attrs = &node->attributes; for (int i=0; i < attrs->length; i++) { GumboAttribute *attr = attrs->data[i]; - xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value); + xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value); } // add in the children @@ -41,15 +43,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { break; case GUMBO_NODE_WHITESPACE: case GUMBO_NODE_TEXT: 
- node = xmlNewText(BAD_CAST child->v.text.text); + node = xmlNewText(CONST_CAST child->v.text.text); break; case GUMBO_NODE_CDATA: node = xmlNewCDataBlock(document, - BAD_CAST child->v.text.original_text.data, - child->v.text.original_text.length); + CONST_CAST child->v.text.original_text.data, + (int) child->v.text.original_text.length); break; case GUMBO_NODE_COMMENT: - node = xmlNewComment(BAD_CAST child->v.text.text); + node = xmlNewComment(CONST_CAST child->v.text.text); break; case GUMBO_NODE_DOCUMENT: break; // should never happen -- ignore @@ -64,17 +66,18 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { // Parse a string using gumbo_parse into a Nokogiri document static VALUE parse(VALUE self, VALUE string) { GumboOutput *output = gumbo_parse_with_options( - &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string) + &kGumboDefaultOptions, RSTRING_PTR(string), + (size_t) RSTRING_LEN(string) ); - xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0"); + xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0"); xmlNodePtr root = walk_tree(doc, &output->root->v.element); xmlDocSetRootElement(doc, root); if (output->document->v.document.has_doctype) { const char *public = output->document->v.document.public_identifier; const char *system = output->document->v.document.system_identifier; - xmlCreateIntSubset(doc, BAD_CAST "html", - (strlen(public) ? public : NULL), - (strlen(system) ? system : NULL)); + xmlCreateIntSubset(doc, CONST_CAST "html", + (strlen(public) ? CONST_CAST public : NULL), + (strlen(system) ? 
CONST_CAST system : NULL)); } gumbo_destroy_output(&kGumboDefaultOptions, output); From 84cf2bc3970de9da1150c793f8f9e899b7c2a679 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 27 Aug 2013 14:49:03 -0400 Subject: [PATCH 0031/1759] extconf cleanup --- nokogumbo-import/ext/extconf.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/ext/extconf.rb index 82f0ed49c5..87d5bb1b3e 100644 --- a/nokogumbo-import/ext/extconf.rb +++ b/nokogumbo-import/ext/extconf.rb @@ -5,11 +5,15 @@ pkg_config('libxml-2.0') # nokogiri configuration from gem install -nokogiri_lib = Gem.find_files('nokogiri').sort.last or gem 'nokogiri' +nokogiri_lib = Gem.find_files('nokogiri'). + sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last +gem 'nokogiri' unless nokogiri_lib nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') + +# if that doesn't work, try workarounds found in Nokogiri's extconf unless find_header('nokogiri.h', nokogiri_ext) require "#{nokogiri_ext}/extconf.rb" - find_header('nokogiri.h', nokogiri_ext) + throw 'nokogiri.h not found' unless find_header('nokogiri.h', nokogiri_ext) end # add in gumbo-parser source from github if not already installed From 0c0067cf8e9eb9bc52c8ca3b2297b64ea4d0ad80 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 29 Aug 2013 10:17:03 -0400 Subject: [PATCH 0032/1759] Switch to rake-compiler (hopefully a stepping stone to windows cross compiles) --- nokogumbo-import/.gitignore | 1 + nokogumbo-import/Gemfile | 9 ++++ nokogumbo-import/README.md | 8 ++-- nokogumbo-import/Rakefile | 64 +++++++++++-------------- nokogumbo-import/{ext => }/extconf.rb | 0 nokogumbo-import/{ext => }/nokogumbo.c | 0 nokogumbo-import/{lib => }/nokogumbo.rb | 0 nokogumbo-import/test-nokogumbo.rb | 2 +- 8 files changed, 44 insertions(+), 40 deletions(-) create mode 100644 nokogumbo-import/Gemfile rename nokogumbo-import/{ext => }/extconf.rb (100%) rename 
nokogumbo-import/{ext => }/nokogumbo.c (100%) rename nokogumbo-import/{lib => }/nokogumbo.rb (100%) diff --git a/nokogumbo-import/.gitignore b/nokogumbo-import/.gitignore index fbf13e3ba6..fd26d2a2c2 100644 --- a/nokogumbo-import/.gitignore +++ b/nokogumbo-import/.gitignore @@ -1,3 +1,4 @@ gumbo-parser pkg work +Gemfile.lock diff --git a/nokogumbo-import/Gemfile b/nokogumbo-import/Gemfile new file mode 100644 index 0000000000..267c33b732 --- /dev/null +++ b/nokogumbo-import/Gemfile @@ -0,0 +1,9 @@ +source 'https://rubygems.org' + +gem 'nokogiri' + +group :development, :test do + gem 'rake' + gem 'rake-compiler' +end + diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 0abc2b4cab..8b96c79250 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -57,9 +57,11 @@ parser will be downloaded and compiled into the Gem itself. Installation ============ -* Execute `rake gem` - -* [sudo] gem install pkg/nokogumbo*.gem + git clone https://github.com/rubys/nokogumbo.git + cd nokogumbo + bundle install + rake gem + gem install pkg/nokogumbo*.gem Related efforts ============ diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 6cf4f1f689..b6db861e8d 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -1,8 +1,24 @@ require 'rubygems/package_task' require 'rake/clean' +require 'rake/extensiontask' task 'default' => 'test' +file 'lib/nokogumbo.rb' do + mkdir_p 'lib' + cp 'nokogumbo.rb', 'lib' +end + +EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] +task 'compile' => EXT + +EXT.each do |ext| + file ext => File.basename(ext) do + mkdir_p File.dirname(ext) + cp File.basename(ext), File.dirname(ext) + end +end + file 'gumbo-parser' do sh 'git clone https://github.com/google/gumbo-parser.git' end @@ -13,45 +29,27 @@ task 'pull' => 'gumbo-parser' do end end -SOURCES = ['work/nokogumbo.c', 'work/extconf.rb'] - -file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do - mkdir_p 'work' unless 
File.exist? 'work' - cp 'ext/nokogumbo.c', 'work/nokogumbo.c' -end - -file 'work/extconf.rb' => 'ext/extconf.rb' do - mkdir_p 'work' unless File.exist? 'work' - rm_f 'work/Makefile' - cp 'ext/extconf.rb', 'work/extconf.rb' -end - -file 'work/Makefile' => SOURCES + ['gumbo-parser'] do - Dir.chdir 'work' do - ruby 'extconf.rb' - end +task 'test' => ['compile', 'lib/nokogumbo.rb'] do + ruby 'test-nokogumbo.rb' end -task 'compile' => ['work/Makefile', 'work/nokogumbo.c'] do - Dir.chdir 'work' do - sh 'make -s' - end -end +CLEAN.include FileList.new('ext', 'lib').existing +CLOBBER.include FileList.new('gumbo-parser', 'Gemfile.lock').existing -task 'test' => 'compile' do - ruby 'test-nokogumbo.rb' +task 'package-ext' => EXT + ['gumbo-parser'] do + sources = EXT + FileList['gumbo-parser/src/*'] + SPEC.files += sources + PKG.package_files += sources end -CLEAN.include FileList.new('work').existing -CLOBBER.include FileList.new('gumbo-parser').existing - +task 'gem' => ['test', 'package-ext'] SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' gem.version = '0.9' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' - gem.extensions = 'work/extconf.rb' + gem.extensions = 'ext/nokogumboc/extconf.rb' gem.author = 'Sam Ruby' gem.add_dependency 'nokogiri' gem.license = 'Apache 2.0' @@ -65,15 +63,9 @@ SPEC = Gem::Specification.new do |gem| ] end -task 'package_workfiles' => SOURCES do - sources = 'work/nokogumbo.c', 'work/extconf.rb' - sources += FileList['gumbo-parser/src/*'] - SPEC.files += sources - PKG.package_files += sources -end - -task 'gem' => ['test', 'package_workfiles'] PKG = Gem::PackageTask.new(SPEC) do |pkg| pkg.need_tar = true pkg.need_zip = true end + +Rake::ExtensionTask.new('nokogumboc') diff --git a/nokogumbo-import/ext/extconf.rb b/nokogumbo-import/extconf.rb similarity index 100% rename from nokogumbo-import/ext/extconf.rb rename to 
nokogumbo-import/extconf.rb diff --git a/nokogumbo-import/ext/nokogumbo.c b/nokogumbo-import/nokogumbo.c similarity index 100% rename from nokogumbo-import/ext/nokogumbo.c rename to nokogumbo-import/nokogumbo.c diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/nokogumbo.rb similarity index 100% rename from nokogumbo-import/lib/nokogumbo.rb rename to nokogumbo-import/nokogumbo.rb diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 234e557f3a..cff42a0427 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -1,4 +1,4 @@ -$:.push('lib', 'work') +$:.push('lib') require 'nokogumbo' require 'test/unit' From 0c415b1924871f9b648d10e7ae7be52651510d91 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 30 Aug 2013 08:00:04 -0400 Subject: [PATCH 0033/1759] Make compile depend on gumbo-parser --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index b6db861e8d..9984198f1d 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -10,7 +10,7 @@ file 'lib/nokogumbo.rb' do end EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] -task 'compile' => EXT +task 'compile' => EXT + ['gumbo-parser'] EXT.each do |ext| file ext => File.basename(ext) do From 8dbd4e58a7f1a52a92a406cd8ce97e01b0ca02d7 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 30 Aug 2013 10:51:45 -0400 Subject: [PATCH 0034/1759] Use original directory for copying --- nokogumbo-import/Rakefile | 2 ++ nokogumbo-import/extconf.rb | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 9984198f1d..0492bd4a34 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -2,6 +2,8 @@ require 'rubygems/package_task' require 'rake/clean' require 'rake/extensiontask' +ENV['RAKEHOME'] = File.dirname(File.expand_path(__FILE__)) 
+ task 'default' => 'test' file 'lib/nokogumbo.rb' do diff --git a/nokogumbo-import/extconf.rb b/nokogumbo-import/extconf.rb index 87d5bb1b3e..8f9e443ec0 100644 --- a/nokogumbo-import/extconf.rb +++ b/nokogumbo-import/extconf.rb @@ -19,7 +19,9 @@ # add in gumbo-parser source from github if not already installed unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h' require 'fileutils' - FileUtils.cp Dir['../gumbo-parser/src/*'], '.' + rakehome = ENV['RAKEHOME'] || File.expand_path('../..') + FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"], + "#{rakehome}/ext/nokogumboc" end create_makefile('nokogumboc') From 2d6480a7f1fdfc6d5cb45a4ee2b90c58615b2d07 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 31 Aug 2013 07:26:30 -0400 Subject: [PATCH 0035/1759] Rough in cross compile; silence clean and clobber --- nokogumbo-import/Rakefile | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 0492bd4a34..536111c2d9 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -35,9 +35,6 @@ task 'test' => ['compile', 'lib/nokogumbo.rb'] do ruby 'test-nokogumbo.rb' end -CLEAN.include FileList.new('ext', 'lib').existing -CLOBBER.include FileList.new('gumbo-parser', 'Gemfile.lock').existing - task 'package-ext' => EXT + ['gumbo-parser'] do sources = EXT + FileList['gumbo-parser/src/*'] SPEC.files += sources @@ -70,4 +67,16 @@ PKG = Gem::PackageTask.new(SPEC) do |pkg| pkg.need_zip = true end -Rake::ExtensionTask.new('nokogumboc') +Rake::ExtensionTask.new('nokogumboc', SPEC) do |ext| + ext.cross_compile = true + ext.cross_platform = ["x86-mingw32"] +end + +CLEAN.include FileList.new('ext', 'lib') +CLOBBER.include FileList.new('pkg', 'gumbo-parser', 'Gemfile.lock') + +# silence cleanup operations +Rake::Task[:clobber_package].clear +CLEAN.existing! +CLOBBER.uniq!.existing! 
+CLOBBER.exclude *Dir['lib/*'] From 0c3ac70685eb2c3e3e642e1af539a031de96e5d9 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 31 Aug 2013 10:06:23 -0400 Subject: [PATCH 0036/1759] reorder to Rake works on Mac OSX --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 536111c2d9..be9e53e6ea 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -78,5 +78,5 @@ CLOBBER.include FileList.new('pkg', 'gumbo-parser', 'Gemfile.lock') # silence cleanup operations Rake::Task[:clobber_package].clear CLEAN.existing! -CLOBBER.uniq!.existing! +CLOBBER.existing!.uniq! CLOBBER.exclude *Dir['lib/*'] From e6783ceac16265d6c9f6e945eace87f6c4ee460b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 31 Aug 2013 10:06:38 -0400 Subject: [PATCH 0037/1759] update gitignore --- nokogumbo-import/.gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/.gitignore b/nokogumbo-import/.gitignore index fd26d2a2c2..b9f412eda9 100644 --- a/nokogumbo-import/.gitignore +++ b/nokogumbo-import/.gitignore @@ -1,4 +1,6 @@ +Gemfile.lock +ext gumbo-parser +lib pkg -work -Gemfile.lock +tmp From 710b3c68824aa4a4a40a26b98410aebcf43b7b71 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 1 Sep 2013 13:44:42 -0400 Subject: [PATCH 0038/1759] build ext and lib structure for cross compile too --- nokogumbo-import/Rakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index be9e53e6ea..d398c4df95 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -12,6 +12,7 @@ file 'lib/nokogumbo.rb' do end EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] +task 'cross' => EXT + ['gumbo-parser'] task 'compile' => EXT + ['gumbo-parser'] EXT.each do |ext| From 571aef9d42251f75b835043d3d5178b017a3f87b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 1 Sep 2013 13:45:04 -0400 
Subject: [PATCH 0039/1759] fall back to building a nokogiri tree if libxml2 and/or nokogiri headers are not found, build nokogiri objects directly instead of building a libxml2 tree and wrapping it. --- nokogumbo-import/README.md | 17 +++--- nokogumbo-import/extconf.rb | 15 +++-- nokogumbo-import/nokogumbo.c | 110 ++++++++++++++++++++++++++++++++--- 3 files changed, 120 insertions(+), 22 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 8b96c79250..531a31502f 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -34,13 +34,16 @@ Notes * The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. -The resulting Gumbo parse tree is the walked, producing a -[libxml2](http://xmlsoft.org/html/) -[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc). -The original Gumbo parse tree is then destroyed, and single Nokogiri Ruby -object is constructed to wrap the xmlDoc structure. Nokogiri only produces -Ruby objects as necessary, so all searching is done using the underlying -libxml2 libraries. +The resulting Gumbo parse tree is the walked. + * If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers + can be found at installation time then an + [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced + and a single Nokogiri Ruby object is constructed to wrap the xmlDoc + structure. Nokogiri only produces Ruby objects as necessary, so all + searching is done using the underlying libxml2 libraries. + * If the necessary headers are not present at installation time, then + Nokogiri Ruby objects are created for each Gumbo node. Other than + memory usage and CPU time, the results should be equivalent. 
* The `Nokogiri::HTML5.get` function takes care of following redirects, https, and determining the character encoding of the result, based on the diff --git a/nokogumbo-import/extconf.rb b/nokogumbo-import/extconf.rb index 8f9e443ec0..9ce2124b2b 100644 --- a/nokogumbo-import/extconf.rb +++ b/nokogumbo-import/extconf.rb @@ -7,13 +7,16 @@ # nokogiri configuration from gem install nokogiri_lib = Gem.find_files('nokogiri'). sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last -gem 'nokogiri' unless nokogiri_lib -nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') +if nokogiri_lib + nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') -# if that doesn't work, try workarounds found in Nokogiri's extconf -unless find_header('nokogiri.h', nokogiri_ext) - require "#{nokogiri_ext}/extconf.rb" - throw 'nokogiri.h not found' unless find_header('nokogiri.h', nokogiri_ext) + # if that doesn't work, try workarounds found in Nokogiri's extconf + unless find_header('nokogiri.h', nokogiri_ext) + require "#{nokogiri_ext}/extconf.rb" + end + + # if found, enable direct calls to Nokogiri (and libxml2) + $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) end # add in gumbo-parser source from github if not already installed diff --git a/nokogumbo-import/nokogumbo.c b/nokogumbo-import/nokogumbo.c index 9fb03fda5a..3bbb8cc02f 100644 --- a/nokogumbo-import/nokogumbo.c +++ b/nokogumbo-import/nokogumbo.c @@ -1,26 +1,101 @@ +// +// nokogumbo.c defines the following: +// +// class Nokogumbo +// def parse(utf8_string) # returns Nokogiri::HTML::Document +// end +// +// Processing starts by calling gumbo_parse_with_options. The resulting +// document tree is then walked: +// +// * if Nokogiri and libxml2 headers are available at compile time, +// (ifdef NGLIB) then a parallel libxml2 tree is constructed, and the +// final document is then wrapped using Nokogiri_wrap_xml_document. 
+// This approach reduces memory and CPU requirements as Ruby objects +// are only built when necessary. +// +// * if the necessary headers are not available at compile time, Nokogiri +// methods are called instead, producing the equivalent functionality. +// + #include #include + +// class constants +static VALUE Document; + +#ifdef NGLIB #include #include +#define NIL NULL #define CONST_CAST (xmlChar const*) +#else +#define NIL 0 +#define CONST_CAST -// class constants -static VALUE Document; +// more class constants +static VALUE Element; +static VALUE Text; +static VALUE CDATA; +static VALUE Comment; + +// interned symbols +static VALUE new; +static VALUE set_attribute; +static VALUE add_child; +static VALUE internal_subset; +static VALUE remove_; +static VALUE create_internal_subset; + +// map libxml2 types to Ruby VALUE +#define xmlNodePtr VALUE +#define xmlDocPtr VALUE + +// redefine libxml2 API as Ruby function calls +#define xmlNewDocNode(doc, ns, name, content) \ + rb_funcall(Element, new, 2, rb_str_new2(name), doc) +#define xmlNewProp(element, name, value) \ + rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value)) +#define xmlNewDocText(doc, text) \ + rb_funcall(Text, new, 2, rb_str_new2(text), doc) +#define xmlNewCDataBlock(doc, content, length) \ + rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc) +#define xmlNewDocComment(doc, text) \ + rb_funcall(Comment, new, 2, doc, rb_str_new2(text)) +#define xmlAddChild(element, node) \ + rb_funcall(element, add_child, 1, node) +#define xmlDocSetRootElement(doc, root) \ + rb_funcall(doc, add_child, 1, root) +#define xmlCreateIntSubset(doc, name, external, system) \ + rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \ + (external ? rb_str_new2(external) : Qnil), \ + (system ? 
rb_str_new2(system) : Qnil)); +#define Nokogiri_wrap_xml_document(klass, doc) \ + doc + +// remove internal subset from newly created documents +static VALUE xmlNewDoc(char* version) { + VALUE doc = rb_funcall(Document, new, 0); + rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0); + return doc; +} +#endif // Build a Nokogiri Element for a given GumboElement (recursively) static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { // determine tag name for a given node xmlNodePtr element; if (node->tag != GUMBO_TAG_UNKNOWN) { - element = xmlNewNode(NULL, CONST_CAST gumbo_normalized_tagname(node->tag)); + element = xmlNewDocNode(document, NIL, + CONST_CAST gumbo_normalized_tagname(node->tag), NIL); } else { GumboStringPiece tag = node->original_tag; gumbo_tag_from_original_text(&tag); char name[tag.length+1]; strncpy(name, tag.data, tag.length); name[tag.length] = '\0'; - element = xmlNewNode(NULL, BAD_CAST name); + element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL); } // add in the attributes @@ -35,7 +110,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { for (int i=0; i < children->length; i++) { GumboNode* child = children->data[i]; - xmlNodePtr node = NULL; + xmlNodePtr node = NIL; switch (child->type) { case GUMBO_NODE_ELEMENT: @@ -43,7 +118,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { break; case GUMBO_NODE_WHITESPACE: case GUMBO_NODE_TEXT: - node = xmlNewText(CONST_CAST child->v.text.text); + node = xmlNewDocText(document, CONST_CAST child->v.text.text); break; case GUMBO_NODE_CDATA: node = xmlNewCDataBlock(document, @@ -51,7 +126,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { (int) child->v.text.original_text.length); break; case GUMBO_NODE_COMMENT: - node = xmlNewComment(CONST_CAST child->v.text.text); + node = xmlNewDocComment(document, CONST_CAST child->v.text.text); break; case GUMBO_NODE_DOCUMENT: break; // should never happen -- ignore @@ 
-76,8 +151,8 @@ static VALUE parse(VALUE self, VALUE string) { const char *public = output->document->v.document.public_identifier; const char *system = output->document->v.document.system_identifier; xmlCreateIntSubset(doc, CONST_CAST "html", - (strlen(public) ? CONST_CAST public : NULL), - (strlen(system) ? CONST_CAST system : NULL)); + (strlen(public) ? CONST_CAST public : NIL), + (strlen(system) ? CONST_CAST system : NIL)); } gumbo_destroy_output(&kGumboDefaultOptions, output); @@ -94,6 +169,23 @@ void Init_nokogumboc() { VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML")); Document = rb_const_get(HTML, rb_intern("Document")); +#ifndef NGLIB + // more class constants + VALUE XML = rb_const_get(Nokogiri, rb_intern("XML")); + Element = rb_const_get(XML, rb_intern("Element")); + Text = rb_const_get(XML, rb_intern("Text")); + CDATA = rb_const_get(XML, rb_intern("CDATA")); + Comment = rb_const_get(XML, rb_intern("Comment")); + + // interned symbols + new = rb_intern("new"); + set_attribute = rb_intern("set_attribute"); + add_child = rb_intern("add_child"); + internal_subset = rb_intern("internal_subset"); + remove_ = rb_intern("remove"); + create_internal_subset = rb_intern("create_internal_subset"); +#endif + // define Nokogumbo class with a singleton parse method VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject); rb_define_singleton_method(Gumbo, "parse", parse, 1); From 51a2d5c414639869211c2d83aa17d23eefc003fe Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 1 Sep 2013 13:52:50 -0400 Subject: [PATCH 0040/1759] add gumbo-parser as a submodule --- nokogumbo-import/.gitignore | 1 - nokogumbo-import/.gitmodules | 3 +++ nokogumbo-import/Rakefile | 27 ++++++--------------------- nokogumbo-import/gumbo-parser | 1 + 4 files changed, 10 insertions(+), 22 deletions(-) create mode 100644 nokogumbo-import/.gitmodules create mode 160000 nokogumbo-import/gumbo-parser diff --git a/nokogumbo-import/.gitignore b/nokogumbo-import/.gitignore index b9f412eda9..1fea33581e 
100644 --- a/nokogumbo-import/.gitignore +++ b/nokogumbo-import/.gitignore @@ -1,6 +1,5 @@ Gemfile.lock ext -gumbo-parser lib pkg tmp diff --git a/nokogumbo-import/.gitmodules b/nokogumbo-import/.gitmodules new file mode 100644 index 0000000000..4fccc478ea --- /dev/null +++ b/nokogumbo-import/.gitmodules @@ -0,0 +1,3 @@ +[submodule "gumbo-parser"] + path = gumbo-parser + url = https://github.com/google/gumbo-parser.git diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index d398c4df95..9b5887219c 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -12,8 +12,8 @@ file 'lib/nokogumbo.rb' do end EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] -task 'cross' => EXT + ['gumbo-parser'] -task 'compile' => EXT + ['gumbo-parser'] +task 'cross' => EXT +task 'compile' => EXT EXT.each do |ext| file ext => File.basename(ext) do @@ -22,27 +22,11 @@ EXT.each do |ext| end end -file 'gumbo-parser' do - sh 'git clone https://github.com/google/gumbo-parser.git' -end - -task 'pull' => 'gumbo-parser' do - Dir.chdir('gumbo-parser') do - sh 'git pull' - end -end - task 'test' => ['compile', 'lib/nokogumbo.rb'] do ruby 'test-nokogumbo.rb' end -task 'package-ext' => EXT + ['gumbo-parser'] do - sources = EXT + FileList['gumbo-parser/src/*'] - SPEC.files += sources - PKG.package_files += sources -end - -task 'gem' => ['test', 'package-ext'] +task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' gem.version = '0.9' @@ -56,10 +40,11 @@ SPEC = Gem::Specification.new do |gem| gem.description = %q( Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ') - gem.files = FileList[ + gem.files = EXT + FileList[ 'lib/nokogumbo.rb', 'LICENSE.txt', 'README.md', + 'gumbo-parser/src/*' ] end @@ -74,7 +59,7 @@ Rake::ExtensionTask.new('nokogumboc', SPEC) do |ext| end CLEAN.include FileList.new('ext', 'lib') -CLOBBER.include 
FileList.new('pkg', 'gumbo-parser', 'Gemfile.lock') +CLOBBER.include FileList.new('pkg', 'Gemfile.lock') # silence cleanup operations Rake::Task[:clobber_package].clear diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser new file mode 160000 index 0000000000..1fbc93bdf9 --- /dev/null +++ b/nokogumbo-import/gumbo-parser @@ -0,0 +1 @@ +Subproject commit 1fbc93bdf97dde6b07b67abb29a71d779f533dd8 From f4c93dfc4379694a0f73b0d53fe34799cf38c579 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 1 Sep 2013 18:24:57 -0400 Subject: [PATCH 0041/1759] conditionally include libxml2 --- nokogumbo-import/extconf.rb | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/nokogumbo-import/extconf.rb b/nokogumbo-import/extconf.rb index 9ce2124b2b..b83ae4ef14 100644 --- a/nokogumbo-import/extconf.rb +++ b/nokogumbo-import/extconf.rb @@ -1,22 +1,24 @@ require 'mkmf' -$CFLAGS = " -std=c99" +$CFLAGS += " -std=c99" -# libxml2 libraries from http://www.xmlsoft.org/ -pkg_config('libxml-2.0') +if have_library('xml2', 'xmlNewDoc') + # libxml2 libraries from http://www.xmlsoft.org/ + pkg_config('libxml-2.0') -# nokogiri configuration from gem install -nokogiri_lib = Gem.find_files('nokogiri'). - sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last -if nokogiri_lib - nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') + # nokogiri configuration from gem install + nokogiri_lib = Gem.find_files('nokogiri'). 
+ sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last + if nokogiri_lib + nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') - # if that doesn't work, try workarounds found in Nokogiri's extconf - unless find_header('nokogiri.h', nokogiri_ext) - require "#{nokogiri_ext}/extconf.rb" - end + # if that doesn't work, try workarounds found in Nokogiri's extconf + unless find_header('nokogiri.h', nokogiri_ext) + require "#{nokogiri_ext}/extconf.rb" + end - # if found, enable direct calls to Nokogiri (and libxml2) - $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) + # if found, enable direct calls to Nokogiri (and libxml2) + $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) + end end # add in gumbo-parser source from github if not already installed From d99a8e1e84d4f750b7cea476953a1f940e4705a7 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 1 Sep 2013 22:19:28 -0400 Subject: [PATCH 0042/1759] mention recursive option --- nokogumbo-import/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 531a31502f..8d10055ca1 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -60,7 +60,7 @@ parser will be downloaded and compiled into the Gem itself. 
Installation ============ - git clone https://github.com/rubys/nokogumbo.git + git clone --recursive https://github.com/rubys/nokogumbo.git cd nokogumbo bundle install rake gem From fa565b6bb4d29c26035e5158a16de53f0ac683ee Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 2 Sep 2013 09:28:37 -0400 Subject: [PATCH 0043/1759] add setup task if gumbo-parser submodule was not extracted, update it --- nokogumbo-import/Rakefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 9b5887219c..683f8de072 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -6,14 +6,21 @@ ENV['RAKEHOME'] = File.dirname(File.expand_path(__FILE__)) task 'default' => 'test' +file 'gumbo-parser/src' do + sh 'git submodule init' + sh 'git submodule update' +end + file 'lib/nokogumbo.rb' do mkdir_p 'lib' cp 'nokogumbo.rb', 'lib' end EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] -task 'cross' => EXT -task 'compile' => EXT + +task 'setup' => EXT + ['lib/nokogumbo.rb', 'gumbo-parser/src'] +task 'cross' => 'setup' +task 'compile' => 'setup' EXT.each do |ext| file ext => File.basename(ext) do @@ -22,7 +29,7 @@ EXT.each do |ext| end end -task 'test' => ['compile', 'lib/nokogumbo.rb'] do +task 'test' => 'compile' do ruby 'test-nokogumbo.rb' end From e8eca786e9bebf1f2fa41d64d8aec05c6f20f84a Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 2 Sep 2013 12:36:32 -0400 Subject: [PATCH 0044/1759] reduce copying --- nokogumbo-import/Rakefile | 11 ++--------- nokogumbo-import/{ => ext/nokogumboc}/extconf.rb | 10 ++++++---- nokogumbo-import/{ => ext/nokogumboc}/nokogumbo.c | 0 nokogumbo-import/{ => lib}/nokogumbo.rb | 0 4 files changed, 8 insertions(+), 13 deletions(-) rename nokogumbo-import/{ => ext/nokogumboc}/extconf.rb (80%) rename nokogumbo-import/{ => ext/nokogumboc}/nokogumbo.c (100%) rename nokogumbo-import/{ => lib}/nokogumbo.rb (100%) diff --git a/nokogumbo-import/Rakefile 
b/nokogumbo-import/Rakefile index 683f8de072..50187604e6 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -18,17 +18,10 @@ end EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] -task 'setup' => EXT + ['lib/nokogumbo.rb', 'gumbo-parser/src'] +task 'setup' => 'gumbo-parser/src' task 'cross' => 'setup' task 'compile' => 'setup' -EXT.each do |ext| - file ext => File.basename(ext) do - mkdir_p File.dirname(ext) - cp File.basename(ext), File.dirname(ext) - end -end - task 'test' => 'compile' do ruby 'test-nokogumbo.rb' end @@ -65,7 +58,7 @@ Rake::ExtensionTask.new('nokogumboc', SPEC) do |ext| ext.cross_platform = ["x86-mingw32"] end -CLEAN.include FileList.new('ext', 'lib') +CLEAN.include FileList.new('ext/nokogumboc/*')-EXT CLOBBER.include FileList.new('pkg', 'Gemfile.lock') # silence cleanup operations diff --git a/nokogumbo-import/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb similarity index 80% rename from nokogumbo-import/extconf.rb rename to nokogumbo-import/ext/nokogumboc/extconf.rb index b83ae4ef14..a97ffb78c3 100644 --- a/nokogumbo-import/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -22,11 +22,13 @@ end # add in gumbo-parser source from github if not already installed -unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h' - require 'fileutils' +unless have_library('gumbo', 'gumbo_parse') rakehome = ENV['RAKEHOME'] || File.expand_path('../..') - FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"], - "#{rakehome}/ext/nokogumboc" + unless File.exist? 
"#{rakehome}/ext/nokogumboc/gumbo.h" + require 'fileutils' + FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"], + "#{rakehome}/ext/nokogumboc" + end end create_makefile('nokogumboc') diff --git a/nokogumbo-import/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c similarity index 100% rename from nokogumbo-import/nokogumbo.c rename to nokogumbo-import/ext/nokogumboc/nokogumbo.c diff --git a/nokogumbo-import/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb similarity index 100% rename from nokogumbo-import/nokogumbo.rb rename to nokogumbo-import/lib/nokogumbo.rb From d12219e95240a3b72e8746d755645b002b4ea530 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 3 Sep 2013 09:55:29 -0400 Subject: [PATCH 0045/1759] Cleanup rakefile, push out a new gem --- nokogumbo-import/Rakefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 50187604e6..cdce28faa6 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -2,34 +2,34 @@ require 'rubygems/package_task' require 'rake/clean' require 'rake/extensiontask' +# home directory - used to find gumbo-parser/src by extconf.rb ENV['RAKEHOME'] = File.dirname(File.expand_path(__FILE__)) +# default to running tests task 'default' => 'test' -file 'gumbo-parser/src' do - sh 'git submodule init' - sh 'git submodule update' -end - -file 'lib/nokogumbo.rb' do - mkdir_p 'lib' - cp 'nokogumbo.rb', 'lib' +task 'test' => 'compile' do + ruby 'test-nokogumbo.rb' end -EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] - +# ensure gumbo-parser submodule is updated task 'setup' => 'gumbo-parser/src' task 'cross' => 'setup' task 'compile' => 'setup' -task 'test' => 'compile' do - ruby 'test-nokogumbo.rb' +file 'gumbo-parser/src' do + sh 'git submodule init' + sh 'git submodule update' end +# list of ext source files to be included in package, excluded from CLEAN +EXT = ['ext/nokogumboc/extconf.rb', 
'ext/nokogumboc/nokogumbo.c'] + +# gem, package, and extension tasks task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.9' + gem.version = '0.10' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' @@ -58,6 +58,7 @@ Rake::ExtensionTask.new('nokogumboc', SPEC) do |ext| ext.cross_platform = ["x86-mingw32"] end +# cleanup CLEAN.include FileList.new('ext/nokogumboc/*')-EXT CLOBBER.include FileList.new('pkg', 'Gemfile.lock') @@ -65,4 +66,3 @@ CLOBBER.include FileList.new('pkg', 'Gemfile.lock') Rake::Task[:clobber_package].clear CLEAN.existing! CLOBBER.existing!.uniq! -CLOBBER.exclude *Dir['lib/*'] From 736bb27992eeec3a594199b4455f59c87823d2fd Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 9 Sep 2013 22:24:32 -0400 Subject: [PATCH 0046/1759] Release 1.0 --- nokogumbo-import/Rakefile | 6 +++++- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index cdce28faa6..05fcad7541 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -22,6 +22,10 @@ file 'gumbo-parser/src' do sh 'git submodule update' end +task 'pull' => 'setup' do + sh 'git submodule foreach git pull origin master' +end + # list of ext source files to be included in package, excluded from CLEAN EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] @@ -29,7 +33,7 @@ EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '0.10' + gem.version = '1.0' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 1fbc93bdf9..7bc917a8e0 160000 --- 
a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 1fbc93bdf97dde6b07b67abb29a71d779f533dd8 +Subproject commit 7bc917a8e057864b2f8ffa118c6cd552c8f62610 From 219ad28cce19316902ace6d61ddec705be5bde6f Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Sep 2013 15:19:06 -0400 Subject: [PATCH 0047/1759] basic auth, http header support --- nokogumbo-import/lib/nokogumbo.rb | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 60737e0505..db05713e43 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -26,18 +26,37 @@ def self.parse(string) # Fetch and parse a HTML document from the web, following redirects, # handling https, and determining the character encoding using HTML5 - # rules. +uri+ may be a +String+ or a +URI+. +limit+ controls the - # number of redirects that will be followed. - def self.get(uri, limit=10) + # rules. +uri+ may be a +String+ or a +URI+. +options+ contains + # http headers and special options. Everything which is not a + # special option is considered a header. Special options include: + # * :follow_limit => number of redirects which are followed + # * :basic_auth => [username, password] + def self.get(uri, options={}) + headers = options.clone + headers = {:follow_limit => headers} if Numeric === headers # deprecated + limit=headers[:follow_limit] ? 
headers.delete(:follow_limit).to_i : 10 + require 'net/http' uri = URI(uri) unless URI === uri http = Net::HTTP.new(uri.host, uri.port) + + # TLS / SSL support if uri.scheme == 'https' http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_NONE end + request = Net::HTTP::Get.new(uri.request_uri) + + # basic authentication + auth = headers.delete(:basic_auth) + auth ||= [uri.user, uri.password] if uri.user and uri.password + request.basic_auth auth.first, auth.last if auth + + # remaining options are treated as headers + headers.each {|key, value| request[key.to_s] = value.to_s} + response = http.request(request) case response @@ -45,7 +64,7 @@ def self.get(uri, limit=10) parse(reencode(response.body, response['content-type'])) when Net::HTTPRedirection response.value if limit <= 1 - get(response['location'], limit-1) + get(response['location'], options.merge(:follow_limit => limit-1)) else response.value end From 3640ecc69132255c42f94c478051f2b014c57787 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Sep 2013 15:36:09 -0400 Subject: [PATCH 0048/1759] return response --- nokogumbo-import/lib/nokogumbo.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index db05713e43..82a9a07e51 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -61,7 +61,10 @@ def self.get(uri, options={}) case response when Net::HTTPSuccess - parse(reencode(response.body, response['content-type'])) + doc = parse(reencode(response.body, response['content-type'])) + doc.instance_variable_set('@response', response) + doc.class.send(:attr_reader, :response) + doc when Net::HTTPRedirection response.value if limit <= 1 get(response['location'], options.merge(:follow_limit => limit-1)) From fc306e730b1720994b41b4ba92ca191063a87bbc Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Sep 2013 15:38:19 -0400 Subject: [PATCH 0049/1759] push out a new gem --- 
nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 05fcad7541..3c4eab54bb 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -33,7 +33,7 @@ EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.0' + gem.version = '1.1' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 7bc917a8e0..e429f28b8d 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 7bc917a8e057864b2f8ffa118c6cd552c8f62610 +Subproject commit e429f28b8db9c58812c73d825f19038784855e23 From bf2d00b70697c91a74d2d683f4ab72442a3282f2 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 17 Sep 2013 12:55:55 -0400 Subject: [PATCH 0050/1759] Remove rake-compiler --- nokogumbo-import/Gemfile | 1 - nokogumbo-import/Rakefile | 32 +++++++++++++--------- nokogumbo-import/ext/nokogumboc/extconf.rb | 1 + nokogumbo-import/test-nokogumbo.rb | 1 + 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/nokogumbo-import/Gemfile b/nokogumbo-import/Gemfile index 267c33b732..40fb9c364a 100644 --- a/nokogumbo-import/Gemfile +++ b/nokogumbo-import/Gemfile @@ -4,6 +4,5 @@ gem 'nokogiri' group :development, :test do gem 'rake' - gem 'rake-compiler' end diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 3c4eab54bb..73b7959ee6 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -1,6 +1,5 @@ require 'rubygems/package_task' require 'rake/clean' -require 'rake/extensiontask' # home directory - used to find gumbo-parser/src by extconf.rb ENV['RAKEHOME'] = 
File.dirname(File.expand_path(__FILE__)) @@ -13,21 +12,33 @@ task 'test' => 'compile' do end # ensure gumbo-parser submodule is updated -task 'setup' => 'gumbo-parser/src' -task 'cross' => 'setup' -task 'compile' => 'setup' +DLEXT = RbConfig::CONFIG['DLEXT'] +EXT = 'ext/nokogumboc' +file "#{EXT}/nokogumboc.#{DLEXT}" => ["#{EXT}/Makefile","#{EXT}/nokogumbo.c"] do + Dir.chdir 'ext/nokogumboc' do + sh 'make' + end +end + +file "#{EXT}/Makefile" => ['gumbo-parser/src', "#{EXT}/extconf.rb"] do + Dir.chdir 'ext/nokogumboc' do + ruby 'extconf.rb' + end +end + +task 'compile' => "#{EXT}/nokogumboc.#{DLEXT}" file 'gumbo-parser/src' do sh 'git submodule init' sh 'git submodule update' end -task 'pull' => 'setup' do +task 'pull' => 'gumbo-parser/src' do sh 'git submodule foreach git pull origin master' end # list of ext source files to be included in package, excluded from CLEAN -EXT = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] +SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] # gem, package, and extension tasks task 'gem' => 'test' @@ -44,7 +55,7 @@ SPEC = Gem::Specification.new do |gem| gem.description = %q( Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ') - gem.files = EXT + FileList[ + gem.files = SOURCES + FileList[ 'lib/nokogumbo.rb', 'LICENSE.txt', 'README.md', @@ -57,13 +68,8 @@ PKG = Gem::PackageTask.new(SPEC) do |pkg| pkg.need_zip = true end -Rake::ExtensionTask.new('nokogumboc', SPEC) do |ext| - ext.cross_compile = true - ext.cross_platform = ["x86-mingw32"] -end - # cleanup -CLEAN.include FileList.new('ext/nokogumboc/*')-EXT +CLEAN.include FileList.new('ext/nokogumboc/*')-SOURCES CLOBBER.include FileList.new('pkg', 'Gemfile.lock') # silence cleanup operations diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index a97ffb78c3..0dda6d0e72 100644 --- 
a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -28,6 +28,7 @@ require 'fileutils' FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"], "#{rakehome}/ext/nokogumboc" + $srcs = $objs = nil end end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index cff42a0427..385f9bbe6e 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -1,4 +1,5 @@ $:.push('lib') +$:.push('ext/nokogumboc') require 'nokogumbo' require 'test/unit' From 70047f472f7b9df2ee4a1a70631f7b3a79feb605 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 17 Sep 2013 19:56:13 -0400 Subject: [PATCH 0051/1759] Ensure tests are run against repository Fixes #1 --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- nokogumbo-import/test-nokogumbo.rb | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 73b7959ee6..c905b39c45 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1' + gem.version = '1.1.1' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index e429f28b8d..f22d35d542 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit e429f28b8db9c58812c73d825f19038784855e23 +Subproject commit f22d35d542680f1d66c9a69727b46f08dbd7b868 diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 385f9bbe6e..d0c2692f30 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -1,5 +1,5 @@ 
-$:.push('lib') -$:.push('ext/nokogumboc') +$:.unshift('lib') +$:.unshift('ext/nokogumboc') require 'nokogumbo' require 'test/unit' From 802fb927391636756aea3e40c3ad2891ae4f0ad5 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 22 Sep 2013 20:53:16 -0400 Subject: [PATCH 0052/1759] fix typo --- nokogumbo-import/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 8d10055ca1..86ba72f6b0 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -34,7 +34,7 @@ Notes * The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. -The resulting Gumbo parse tree is the walked. +The resulting Gumbo parse tree is then walked. * If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers can be found at installation time then an [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced From 80b13e52c73f72e343875635e6785f912c587bd3 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 12 Oct 2013 11:16:04 -0400 Subject: [PATCH 0053/1759] resolve relative redirects --- nokogumbo-import/lib/nokogumbo.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 82a9a07e51..63c6003c72 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -67,7 +67,8 @@ def self.get(uri, options={}) doc when Net::HTTPRedirection response.value if limit <= 1 - get(response['location'], options.merge(:follow_limit => limit-1)) + location = URI.join(uri, response['location']) + get(location, options.merge(:follow_limit => limit-1)) else response.value end From fa66ec64602270f325263f1a1431e55d99b902a8 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 12 Oct 2013 11:18:28 -0400 Subject: [PATCH 0054/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 
nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index c905b39c45..1556eebe8f 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.1' + gem.version = '1.1.2' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index f22d35d542..d90ea2b2d0 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit f22d35d542680f1d66c9a69727b46f08dbd7b868 +Subproject commit d90ea2b2d01b27a7adf0501f644a7782e50362fe From 3a89516b45840b86a523470efe5bd9bdc3c05091 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sat, 16 Nov 2013 17:42:10 -0500 Subject: [PATCH 0055/1759] ensure input is converted to a string fixes https://github.com/rubys/nokogumbo/issues/2 --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/lib/nokogumbo.rb | 2 +- nokogumbo-import/test-nokogumbo.rb | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 1556eebe8f..1fda659507 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.2' + gem.version = '1.1.3' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 
63c6003c72..95e974bb1c 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -21,7 +21,7 @@ def self.parse(string) string = reencode(string) end - Nokogumbo.parse(string) + Nokogumbo.parse(string.to_s) end # Fetch and parse a HTML document from the web, following redirects, diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index d0c2692f30..599dab974d 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -36,6 +36,11 @@ def test_IO assert_equal 'textarea', doc.at('form').element_children.first.name end + def test_nil + doc = Nokogiri::HTML5(nil) + assert_equal 1, doc.search('body').count + end + if ''.respond_to? 'encoding' def test_macroman_encoding mac="\xCA".force_encoding('macroman') From b4ec9f2de99467e4449c98929115669fecffdfaf Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 18 Mar 2014 12:22:29 -0400 Subject: [PATCH 0056/1759] push out a new gem (picking up gumbo-parser fixes) --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 1fda659507..e107b575a4 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.3' + gem.version = '1.1.4' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index d90ea2b2d0..3304d48d4c 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit d90ea2b2d01b27a7adf0501f644a7782e50362fe +Subproject commit 3304d48d4c11c83442de9017ae19e2fffb3d7aac From 
cc1715e9ca9284d957436b7c67830be2891284dc Mon Sep 17 00:00:00 2001 From: Matt Wildig Date: Mon, 7 Apr 2014 23:01:12 +0100 Subject: [PATCH 0057/1759] Track extconf.rb and nokogumbo.c These files shouldn't be ignored. Fix gitignore to track them. --- nokogumbo-import/.gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/.gitignore b/nokogumbo-import/.gitignore index 1fea33581e..d29d0cee57 100644 --- a/nokogumbo-import/.gitignore +++ b/nokogumbo-import/.gitignore @@ -1,5 +1,7 @@ Gemfile.lock -ext +ext/nokogumboc/* +!ext/nokogumboc/extconf.rb +!ext/nokogumboc/nokogumbo.c lib pkg tmp From 95e932764e0c9132fb501bb6cc4cb1989ca2cff2 Mon Sep 17 00:00:00 2001 From: Matt Wildig Date: Mon, 7 Apr 2014 23:02:36 +0100 Subject: [PATCH 0058/1759] Only look for 'lib/nokogiri' files Gem.find_files can return other directories as well as the lib dir when searching for Nokogiri's files, which can cause install to fail. Filter out any non-lib results from the list. --- nokogumbo-import/ext/nokogumboc/extconf.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index 0dda6d0e72..9cbc763a37 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -7,6 +7,7 @@ # nokogiri configuration from gem install nokogiri_lib = Gem.find_files('nokogiri'). + select { |name| name.include? 'lib/nokogiri' }. 
sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last if nokogiri_lib nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') From 7bf8ecf4ceaf311e2db2a61d0ca68ce222869299 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 7 Apr 2014 19:38:26 -0400 Subject: [PATCH 0059/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index e107b575a4..5ecc716024 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.4' + gem.version = '1.1.5' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From b854d43c8a135d7ab121da8a3024b36e33ccf23f Mon Sep 17 00:00:00 2001 From: Kevin Rutten Date: Wed, 4 Jun 2014 17:19:46 -0700 Subject: [PATCH 0060/1759] [Issue1] Fix undefined reference to 'Nokogiri_wrap_xml_document' --- nokogumbo-import/ext/nokogumboc/extconf.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index 9cbc763a37..cd02ecf09a 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -19,6 +19,9 @@ # if found, enable direct calls to Nokogiri (and libxml2) $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) + + # link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document' + $LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so" end end From d551ada56bfbbf71757175a933cd5a9a076c03f8 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 5 Jun 2014 11:26:20 -0400 Subject: [PATCH 0061/1759] update to the latest gumbo parser 
--- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 3304d48d4c..3a61e9ad96 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 3304d48d4c11c83442de9017ae19e2fffb3d7aac +Subproject commit 3a61e9ad963cacfb3246468feab28c5058f621c1 From 2bf0085c25d21b384471d873709d314740f853a0 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 5 Jun 2014 11:29:05 -0400 Subject: [PATCH 0062/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 5ecc716024..10ea9a261d 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.5' + gem.version = '1.1.6' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From f38c7a1ebad909795e467a98ed3573e70c26036e Mon Sep 17 00:00:00 2001 From: Ryan Grove Date: Fri, 6 Jun 2014 10:30:34 -0700 Subject: [PATCH 0063/1759] Revert b854d43c to unbreak nokogumbo on OS X. 
--- nokogumbo-import/ext/nokogumboc/extconf.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index cd02ecf09a..9cbc763a37 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -19,9 +19,6 @@ # if found, enable direct calls to Nokogiri (and libxml2) $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) - - # link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document' - $LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so" end end From 06d4dfbb06efb17b85854084ed31842a908cd8a6 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 6 Jun 2014 13:50:17 -0400 Subject: [PATCH 0064/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 10ea9a261d..22bea12d8b 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.6' + gem.version = '1.1.7' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 7e2389550e70bb7ccaa39e9949286f87c619b548 Mon Sep 17 00:00:00 2001 From: Ryan Grove Date: Fri, 6 Jun 2014 10:59:58 -0700 Subject: [PATCH 0065/1759] Unbreak tests in Ruby 2.1.x. 
--- nokogumbo-import/Gemfile | 1 + nokogumbo-import/test-nokogumbo.rb | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Gemfile b/nokogumbo-import/Gemfile index 40fb9c364a..c0bb570cc8 100644 --- a/nokogumbo-import/Gemfile +++ b/nokogumbo-import/Gemfile @@ -3,6 +3,7 @@ source 'https://rubygems.org' gem 'nokogiri' group :development, :test do + gem 'minitest' gem 'rake' end diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 599dab974d..b8518df3a4 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -1,10 +1,12 @@ $:.unshift('lib') $:.unshift('ext/nokogumboc') +gem 'minitest' + require 'nokogumbo' -require 'test/unit' +require 'minitest/autorun' -class TestNokogumbo < Test::Unit::TestCase +class TestNokogumbo < Minitest::Test def test_element_text doc = Nokogiri::HTML5(buffer) assert_equal "content", doc.at('span').text From aafac95593dd08a44bf3f302b7253795ae4d5e76 Mon Sep 17 00:00:00 2001 From: Ryan Grove Date: Fri, 6 Jun 2014 11:00:22 -0700 Subject: [PATCH 0066/1759] Add a Travis CI config file to enable automated testing. 
--- nokogumbo-import/.travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 nokogumbo-import/.travis.yml diff --git a/nokogumbo-import/.travis.yml b/nokogumbo-import/.travis.yml new file mode 100644 index 0000000000..843a3381ab --- /dev/null +++ b/nokogumbo-import/.travis.yml @@ -0,0 +1,6 @@ +language: ruby +rvm: + - 1.9.3 + - 2.0.0 + - 2.1.2 + - ruby-head From 8e67f5739e1d8cbd658917eba2359fefcf9ac3a4 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 6 Jun 2014 14:47:41 -0400 Subject: [PATCH 0067/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 22bea12d8b..c55931cf8b 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.7' + gem.version = '1.1.8' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 69e989f1a0bd1da0bd4e55431bb4ae816575629b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 6 Jun 2014 16:44:59 -0400 Subject: [PATCH 0068/1759] optimistically add a build status image --- nokogumbo-import/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 86ba72f6b0..db4b61b681 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -6,6 +6,8 @@ Nokogumbo provides the ability for a Ruby program to invoke the and to access the result as a [Nokogiri::HTML::Document](http://nokogiri.org/Nokogiri/HTML/Document.html). 
+[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo) + Usage ----- From 038761aa1823aa4f6e72fae0a1c1d58db416a0da Mon Sep 17 00:00:00 2001 From: Alexandre Bernard Date: Wed, 11 Jun 2014 17:46:50 +0200 Subject: [PATCH 0069/1759] fix compilation on gentoo releases --- nokogumbo-import/ext/nokogumboc/extconf.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index 9cbc763a37..4804460bff 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -1,7 +1,7 @@ require 'mkmf' $CFLAGS += " -std=c99" -if have_library('xml2', 'xmlNewDoc') +if have_library('xml2', 'xmlNewDoc') # libxml2 libraries from http://www.xmlsoft.org/ pkg_config('libxml-2.0') @@ -19,6 +19,11 @@ # if found, enable direct calls to Nokogiri (and libxml2) $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext) + + if File.exists?("/etc/gentoo-release") + # link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document' + $LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so" + end end end From 1b2455176255e81f7e501a7c7bab6369cdf2b2a6 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 11 Jun 2014 12:53:50 -0400 Subject: [PATCH 0070/1759] push out a new release --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index c55931cf8b..2244d3ff30 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.8' + gem.version = '1.1.9' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the 
Gumbo HTML5 parser' From e6b74e53c575abe75bede53efa4f3cec596cb33d Mon Sep 17 00:00:00 2001 From: Joel Low Date: Sat, 12 Jul 2014 08:37:28 +0800 Subject: [PATCH 0071/1759] Use alloca when compiling under Visual C++ because variable length arrays are not currently supported. --- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index 3bbb8cc02f..1b4b22e009 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -92,7 +92,11 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { } else { GumboStringPiece tag = node->original_tag; gumbo_tag_from_original_text(&tag); +#ifdef _MSC_VER + char* name = alloca(tag.length+1); +#else char name[tag.length+1]; +#endif strncpy(name, tag.data, tag.length); name[tag.length] = '\0'; element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL); From d6d543f81009854042ffb35e1137e10915222697 Mon Sep 17 00:00:00 2001 From: Joel Low Date: Sat, 12 Jul 2014 14:42:29 +0800 Subject: [PATCH 0072/1759] Copy Gumbo's strings.h for Visual C++ (Windows) targets.
--- nokogumbo-import/Rakefile | 3 ++- nokogumbo-import/ext/nokogumboc/extconf.rb | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 2244d3ff30..1eacfad69a 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -59,7 +59,8 @@ SPEC = Gem::Specification.new do |gem| 'lib/nokogumbo.rb', 'LICENSE.txt', 'README.md', - 'gumbo-parser/src/*' + 'gumbo-parser/src/*', + 'gumbo-parser/visualc/include/*' ] end diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index 4804460bff..2f52f34bb2 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -34,6 +34,13 @@ require 'fileutils' FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"], "#{rakehome}/ext/nokogumboc" + + case RbConfig::CONFIG['target_os'] + when 'mingw32', /mswin/ + FileUtils.cp Dir["#{rakehome}/gumbo-parser/visualc/include/*"], + "#{rakehome}/ext/nokogumboc" + end + $srcs = $objs = nil end end From b744ab059a74d5f18fbb2cb18902d1ad30e895ac Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 21 Jul 2014 12:03:12 -0400 Subject: [PATCH 0073/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 1eacfad69a..13b765fb85 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.9' + gem.version = '1.1.10' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 8d09a4fe0aeabf28781f6cfe3c041d52af843bec Mon Sep 17 00:00:00 2001 From: Ryan Grove Date: Tue, 2 Sep 2014 15:24:39 -0700 Subject: [PATCH 
0074/1759] Update to the latest Gumbo parser. Mainly to pick up this bug fix: https://github.com/google/gumbo-parser/commit/405ed672 --- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 3a61e9ad96..fa8673bd8d 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 3a61e9ad963cacfb3246468feab28c5058f621c1 +Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 From 787bafb34025aacb3cef0c3593b10e77c5eba00c Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 2 Sep 2014 19:22:06 -0400 Subject: [PATCH 0075/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 13b765fb85..7ecb6f71c2 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.10' + gem.version = '1.1.11' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index fa8673bd8d..3a61e9ad96 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 +Subproject commit 3a61e9ad963cacfb3246468feab28c5058f621c1 From 7ba49c5cb99dd171b0bac8dfe256e08612c169ad Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 2 Sep 2014 19:24:54 -0400 Subject: [PATCH 0076/1759] revert overwrite of gumbo-parser version --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 7ecb6f71c2..03eb306c3c 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.11' + gem.version = '1.1.12' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 3a61e9ad96..fa8673bd8d 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 3a61e9ad963cacfb3246468feab28c5058f621c1 +Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 From 40c9dfa845c9833cee17e972e96da12c90d62222 Mon Sep 17 00:00:00 2001 From: Ryan Grove Date: Thu, 25 Sep 2014 14:28:41 -0700 Subject: [PATCH 0077/1759] Update Gumbo to 0.9.2. 
Several bug fixes and performance improvements: https://github.com/google/gumbo-parser/releases/tag/v0.9.2 --- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index fa8673bd8d..d0a5f3ac7b 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 +Subproject commit d0a5f3ac7b6de017cfb70d1a4fb11d759c819a41 From 005e164193b6503454ed6b9f6975dcba9352472f Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 9 Oct 2014 10:27:54 -0400 Subject: [PATCH 0078/1759] Refine nokogiri_lib to cope with other gems with lib/nokogiri folders Fixes https://github.com/rubys/nokogumbo/pull/13 --- nokogumbo-import/ext/nokogumboc/extconf.rb | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/ext/nokogumboc/extconf.rb b/nokogumbo-import/ext/nokogumboc/extconf.rb index 2f52f34bb2..4d485e2ca7 100644 --- a/nokogumbo-import/ext/nokogumboc/extconf.rb +++ b/nokogumbo-import/ext/nokogumboc/extconf.rb @@ -7,7 +7,7 @@ # nokogiri configuration from gem install nokogiri_lib = Gem.find_files('nokogiri'). - select { |name| name.include? 'lib/nokogiri' }. + select { |name| name.match(%r{gems/nokogiri-([\d.]+)/lib/nokogiri}) }. 
sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last if nokogiri_lib nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri') diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index d0a5f3ac7b..fa8673bd8d 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit d0a5f3ac7b6de017cfb70d1a4fb11d759c819a41 +Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 From 001be59fe44db42523cf78b253723dbe4778e78c Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 9 Oct 2014 10:29:48 -0400 Subject: [PATCH 0079/1759] Grrr, why does git keep on doing this to me --- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index fa8673bd8d..3d62e54d3a 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit fa8673bd8d6367659483b6be6acff90c0aa27166 +Subproject commit 3d62e54d3ae5e9b1f6f73518fc122b1d9bc492ba From 846d7c97c2360801d0c0d3d13c0060af0ae0fd0c Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Sat, 11 Oct 2014 01:02:01 +0200 Subject: [PATCH 0080/1759] Fix broken link to Nokogiri documentation --- nokogumbo-import/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index db4b61b681..69c71f0ff6 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -4,7 +4,7 @@ Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser. Nokogumbo provides the ability for a Ruby program to invoke the [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme) and to access the result as a -[Nokogiri::HTML::Document](http://nokogiri.org/Nokogiri/HTML/Document.html). +[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document). 
[![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo) From df8260c051945a73819f6ae48d6490a92bbbdbc3 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 21 Dec 2014 21:49:14 -0500 Subject: [PATCH 0081/1759] Update to the latest version of Gumbo Fixes #16 --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/gumbo-parser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 03eb306c3c..7450f10f4a 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.12' + gem.version = '1.1.14' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index 3d62e54d3a..d6b952b4bc 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit 3d62e54d3ae5e9b1f6f73518fc122b1d9bc492ba +Subproject commit d6b952b4bcfa15c8f9bc28f56457b969e7f96ffe From 27ea190eb91e1b51f9b309ea2bdd223d1c1d2c5b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 22 Dec 2014 12:09:04 -0500 Subject: [PATCH 0082/1759] Add an experimental fragment method --- nokogumbo-import/README.md | 12 +++++++++ nokogumbo-import/lib/nokogumbo.rb | 39 ++++++++++++++++++++++++++++++ nokogumbo-import/test-nokogumbo.rb | 12 +++++++++ 3 files changed, 63 insertions(+) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 69c71f0ff6..00873aabf7 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -16,6 +16,14 @@ require 'nokogumbo' doc = Nokogiri::HTML5(string) ``` +An experimental _fragment_ method is also provided. 
While not HTML5 +compliant, it may be useful: + +```ruby +require 'nokogumbo' +doc = Nokogiri::HTML5.fragment(string) +``` + Because HTML is often fetched via the web, a convenience interface to HTTP get is also provided: @@ -34,6 +42,10 @@ puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title'] Notes ----- +* The `Nokogiri::HTML5.fragment` function takes a string and parses it + as a HTML5 document. The ``, ``, and `` elements are + removed, and any children that remain are returned as a + `Nokogiri::HTML::DocumentFragment`. * The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. The resulting Gumbo parse tree is then walked. diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 95e974bb1c..fb6e89d27b 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -74,6 +74,45 @@ def self.get(uri, options={}) end end + # while fragment is on the Gumbo TODO list, simulate it by doing + # a full document parse and ignoring the parent , , and + # tags, and collecting up the children of each. + def self.fragment(string) + doc = parse(string) + fragment = Nokogiri::HTML::DocumentFragment.new(doc) + + if doc.children.length != 1 or doc.children.first.name != 'html' + # no HTML? Return document as is + fragment = doc + else + # examine children of HTML element + children = doc.children.first.children + + # head is always first. If present, take children but otherwise + # ignore the head element + if children.length > 0 and doc.children.first.name = 'head' + fragment << children.shift.children + end + + # body may be next, or last. If found, take children but otherwise + # ignore the body element. Also take any remaining elements, taking + # care to preserve order. 
+ if children.length > 0 and doc.children.first.name = 'body' + fragment << children.shift.children + fragment << children + elsif children.length > 0 and doc.children.last.name = 'body' + body = children.pop + fragment << children + fragment << body.children + else + fragment << children + end + end + + # return result + fragment + end + private # Charset sniffing is a complex and controversial topic that understandably diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index b8518df3a4..3c953d65ae 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -76,6 +76,18 @@ def test_html5_doctype assert_match //, doc.to_html end + def test_fragment_head + doc = Nokogiri::HTML5.fragment(buffer[/(.*?)<\/head>/m, 1]) + assert_equal "hello world", doc.xpath('title').text + assert_equal "utf-8", doc.xpath('meta').first['charset'] + end + + def test_fragment_body + doc = Nokogiri::HTML5.fragment(buffer[/(.*?)<\/body>/m, 1]) + assert_equal 'content', doc.xpath('main/span').to_xml + assert_equal " test comment ", doc.xpath('comment()').text + end + private def buffer From f9fda1b48609cc1411fc997a4b76612e134a90ba Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 22 Dec 2014 12:10:54 -0500 Subject: [PATCH 0083/1759] clarify --- nokogumbo-import/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 00873aabf7..d16b9e71b5 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -44,8 +44,8 @@ Notes * The `Nokogiri::HTML5.fragment` function takes a string and parses it as a HTML5 document. The ``, ``, and `` elements are - removed, and any children that remain are returned as a - `Nokogiri::HTML::DocumentFragment`. + removed from this document, and any children of these elements that remain + are returned as a `Nokogiri::HTML::DocumentFragment`. 
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the gumbo_parse_with_options method, using the default options. The resulting Gumbo parse tree is then walked. From 8ec182817dd418aea163ac607c3969e87911dd41 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 22 Dec 2014 16:06:49 -0500 Subject: [PATCH 0084/1759] Check for and deal with attr_namespace --- nokogumbo-import/Rakefile | 2 +- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 41 ++++++++++++++++++++- nokogumbo-import/test-nokogumbo.rb | 11 ++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 7450f10f4a..2a2b7d10a4 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.1.14' + gem.version = '1.2.0' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index 1b4b22e009..4c04c3280e 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -104,10 +104,49 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { // add in the attributes GumboVector* attrs = &node->attributes; + char *name = NULL; + int namelen = 0; + char *ns; for (int i=0; i < attrs->length; i++) { GumboAttribute *attr = attrs->data[i]; - xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value); + + switch (attr->attr_namespace) { + case GUMBO_ATTR_NAMESPACE_XLINK: + ns = "xlink:"; + break; + + case GUMBO_ATTR_NAMESPACE_XML: + ns = "xml:"; + break; + + case GUMBO_ATTR_NAMESPACE_XMLNS: + ns = "xmlns:"; + if (!strcmp(attr->name, "xmlns")) ns = NULL; + break; + + default: + 
ns = NULL; + } + + if (ns) { + if (strlen(ns) + strlen(attr->name) + 1 > namelen) { + free(name); + name = NULL; + } + + if (!name) { + namelen = strlen(ns) + strlen(attr->name) + 1; + name = malloc(namelen); + } + + strcpy(name, ns); + strcat(name, attr->name); + xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value); + } else { + xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value); + } } + if (name) free(name); // add in the children GumboVector* children = &node->children; diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 3c953d65ae..1e70fc6a9b 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -88,6 +88,17 @@ def test_fragment_body assert_equal " test comment ", doc.xpath('comment()').text end + def test_xlink_attribute + source = <<-EOF.gsub(/^ {6}/, '') + + + + EOF + doc = Nokogiri::HTML5.fragment(source) + a = doc.at('a') + assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort + end + private def buffer From e88dc8b0971bb3f98325d1a6b5e7602e3b835db3 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 29 Dec 2014 15:13:13 -0500 Subject: [PATCH 0085/1759] Allow control over http overrides. 
Based on feedback: https://twitter.com/hirojin/status/549653608233709568 https://twitter.com/duckinator/status/549107236849872896 --- nokogumbo-import/lib/nokogumbo.rb | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index fb6e89d27b..36af96703d 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -42,9 +42,15 @@ def self.get(uri, options={}) http = Net::HTTP.new(uri.host, uri.port) # TLS / SSL support - if uri.scheme == 'https' - http.use_ssl = true - http.verify_mode = OpenSSL::SSL::VERIFY_NONE + http.use_ssl = true if uri.scheme == 'https' + + # Pass through Net::HTTP override values, which currently include: + # :ca_file, :ca_path, :cert, :cert_store, :ciphers, + # :close_on_empty_response, :continue_timeout, :key, :open_timeout, + # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl, + # :verify_callback, :verify_depth, :verify_mode + options.each do |key, value| + http.send "#{key}=", value if http.respond_to? "#{key}=" end request = Net::HTTP::Get.new(uri.request_uri) From dbf8ce645f10443c491cc548c9eeb93b4c9a292b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 29 Dec 2014 18:52:56 -0500 Subject: [PATCH 0086/1759] delete http options from headers to be sent --- nokogumbo-import/lib/nokogumbo.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb index 36af96703d..4577118cb8 100644 --- a/nokogumbo-import/lib/nokogumbo.rb +++ b/nokogumbo-import/lib/nokogumbo.rb @@ -50,7 +50,7 @@ def self.get(uri, options={}) # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl, # :verify_callback, :verify_depth, :verify_mode options.each do |key, value| - http.send "#{key}=", value if http.respond_to? "#{key}=" + http.send "#{key}=", headers.delete(key) if http.respond_to? 
"#{key}=" end request = Net::HTTP::Get.new(uri.request_uri) From b591c4e3360cf4cbae5a7da3424457015b93c18f Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 2 Jan 2015 10:21:26 -0500 Subject: [PATCH 0087/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 2a2b7d10a4..3e2f7517e0 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.2.0' + gem.version = '1.3.0' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 2baf4432b843da42a0280da3e5bfb48aa83e6b47 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 4 Jan 2015 02:18:58 -0500 Subject: [PATCH 0088/1759] nokogiri home page changed --- nokogumbo-import/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index d16b9e71b5..fbfb6a03ca 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -36,7 +36,7 @@ Example ----- ```ruby require 'nokogumbo' -puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title'] +puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text ``` Notes From ee52260bb0a49a9a31062d1da2a3e8810acd19b9 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Fri, 23 Jan 2015 14:55:21 -0500 Subject: [PATCH 0089/1759] Add test-nokogumbo.rb to Gem Fixes #18 --- nokogumbo-import/Rakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 3e2f7517e0..d810ddd434 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -60,7 +60,8 @@ SPEC = Gem::Specification.new do 
|gem| 'LICENSE.txt', 'README.md', 'gumbo-parser/src/*', - 'gumbo-parser/visualc/include/*' + 'gumbo-parser/visualc/include/*', + 'test-nokogumbo.rb' ] end From 546f034b4c483d0e8ff789e74ee7ba74db3f284c Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 13:44:38 -0400 Subject: [PATCH 0090/1759] Eliminate libmlx2 warnings "borrowed" from https://github.com/sparklemotion/nokogiri/commit/1696dc8dba759f7b0345decd236502b290df3f6c Fixes https://github.com/rubys/nokogumbo/issues/21 --- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index 4c04c3280e..e4e545bd3b 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -187,6 +187,11 @@ static VALUE parse(VALUE self, VALUE string) { &kGumboDefaultOptions, RSTRING_PTR(string), (size_t) RSTRING_LEN(string) ); + + VALUE error_list = rb_ary_new(); + xmlResetLastError(); + xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); + xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0"); xmlNodePtr root = walk_tree(doc, &output->root->v.element); xmlDocSetRootElement(doc, root); @@ -199,7 +204,11 @@ static VALUE parse(VALUE self, VALUE string) { } gumbo_destroy_output(&kGumboDefaultOptions, output); - return Nokogiri_wrap_xml_document(Document, doc); + xmlSetStructuredErrorFunc(NULL, NULL); + + VALUE wrapped_doc = Nokogiri_wrap_xml_document(Document, doc); + rb_iv_set(wrapped_doc, "@errors", error_list); + return wrapped_doc; } // Initialize the Nokogumbo class and fetch constants we will use later From 9e0323e35a4fe55e3ef2766fa2a56f9efb6cf17a Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 13:48:38 -0400 Subject: [PATCH 0091/1759] pick up latest gumbo parser fixes --- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index d6b952b4bc..e17f706c64 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit d6b952b4bcfa15c8f9bc28f56457b969e7f96ffe +Subproject commit e17f706c645c71fc33dbfa723965e374acddbebe From cad52091d49ec177790200072cc32e7628d18e2a Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 13:50:08 -0400 Subject: [PATCH 0092/1759] push out a new gem --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index d810ddd434..bf449c230e 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.3.0' + gem.version = '1.4.0' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From 1b13efbb8c0afaf70013ed67f736dcfba8898ba2 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 19:15:44 -0400 Subject: [PATCH 0093/1759] Revert "Eliminate libmlx2 warnings" This reverts commit 546f034b4c483d0e8ff789e74ee7ba74db3f284c. 
--- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index e4e545bd3b..4c04c3280e 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -187,11 +187,6 @@ static VALUE parse(VALUE self, VALUE string) { &kGumboDefaultOptions, RSTRING_PTR(string), (size_t) RSTRING_LEN(string) ); - - VALUE error_list = rb_ary_new(); - xmlResetLastError(); - xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); - xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0"); xmlNodePtr root = walk_tree(doc, &output->root->v.element); xmlDocSetRootElement(doc, root); @@ -204,11 +199,7 @@ static VALUE parse(VALUE self, VALUE string) { } gumbo_destroy_output(&kGumboDefaultOptions, output); - xmlSetStructuredErrorFunc(NULL, NULL); - - VALUE wrapped_doc = Nokogiri_wrap_xml_document(Document, doc); - rb_iv_set(wrapped_doc, "@errors", error_list); - return wrapped_doc; + return Nokogiri_wrap_xml_document(Document, doc); } // Initialize the Nokogumbo class and fetch constants we will use later From c0bf661682a21872745c1c34f0113e6b28c66cea Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 19:59:39 -0400 Subject: [PATCH 0094/1759] template support --- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 1 + nokogumbo-import/test-nokogumbo.rb | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index 4c04c3280e..28504a9d95 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -157,6 +157,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { switch (child->type) { case GUMBO_NODE_ELEMENT: + case GUMBO_NODE_TEMPLATE: node = walk_tree(document, &child->v.element); break; case 
GUMBO_NODE_WHITESPACE: diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index 1e70fc6a9b..be5ec9a777 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -99,6 +99,21 @@ def test_xlink_attribute assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort end + def test_template + source = <<-EOF.gsub(/^ {6}/, '') + + EOF + doc = Nokogiri::HTML5.fragment(source) + template = doc.at('template') + assert_equal "productrow", template['id'] + assert_equal "record", template.at('td')['class'] + end + private def buffer From 3236cc4a62c919f685894f039c90346b0eda8c95 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 12 Mar 2015 21:55:52 -0400 Subject: [PATCH 0095/1759] Update to supersede buggy 1.4.0 --- nokogumbo-import/Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index bf449c230e..de23784558 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -44,7 +44,7 @@ SOURCES = ['ext/nokogumboc/extconf.rb', 'ext/nokogumboc/nokogumbo.c'] task 'gem' => 'test' SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' - gem.version = '1.4.0' + gem.version = '1.4.1' gem.email = 'rubys@intertwingly.net' gem.homepage = 'https://github.com/rubys/nokogumbo/#readme' gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser' From af02e21825d0e04ca9825d98712cea88069b914d Mon Sep 17 00:00:00 2001 From: Joel Low Date: Wed, 1 Apr 2015 16:36:55 +0800 Subject: [PATCH 0096/1759] Update gumbo to 0.9.3. 
--- nokogumbo-import/gumbo-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nokogumbo-import/gumbo-parser b/nokogumbo-import/gumbo-parser index e17f706c64..4a63d99fb6 160000 --- a/nokogumbo-import/gumbo-parser +++ b/nokogumbo-import/gumbo-parser @@ -1 +1 @@ -Subproject commit e17f706c645c71fc33dbfa723965e374acddbebe +Subproject commit 4a63d99fb6947b1144266849d06e0155cba8562a From a7abe511e95549bde9f2efb99df7f6bc83a59ec7 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 1 Apr 2015 09:09:05 -0400 Subject: [PATCH 0097/1759] comment out templates support for now --- nokogumbo-import/ext/nokogumboc/nokogumbo.c | 2 +- nokogumbo-import/test-nokogumbo.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nokogumbo-import/ext/nokogumboc/nokogumbo.c b/nokogumbo-import/ext/nokogumboc/nokogumbo.c index 28504a9d95..c20c39253f 100644 --- a/nokogumbo-import/ext/nokogumboc/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumboc/nokogumbo.c @@ -157,7 +157,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) { switch (child->type) { case GUMBO_NODE_ELEMENT: - case GUMBO_NODE_TEMPLATE: +// case GUMBO_NODE_TEMPLATE: /* future */ node = walk_tree(document, &child->v.element); break; case GUMBO_NODE_WHITESPACE: diff --git a/nokogumbo-import/test-nokogumbo.rb b/nokogumbo-import/test-nokogumbo.rb index be5ec9a777..0b8cad9d70 100644 --- a/nokogumbo-import/test-nokogumbo.rb +++ b/nokogumbo-import/test-nokogumbo.rb @@ -99,7 +99,7 @@ def test_xlink_attribute assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort end - def test_template + def x_test_template # future source = <<-EOF.gsub(/^ {6}/, '')