From 0274467fdba450388a8d71edbc603b0ffbfd4de3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 11 Jun 2024 15:11:07 +0900 Subject: [PATCH 01/40] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 3e870822..3af03ec7 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.0" + VERSION = "3.3.1" REVISION = "" Copyright = COPYRIGHT From 6415113201e0ebc334ff26a585ca7fdab418351b Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Tue, 11 Jun 2024 17:38:32 +0900 Subject: [PATCH 02/40] Remove an unused class var `@@namespaces` (#144) `@@namespaces` is defined under `REXML`, but it is never used. At least, `rake test` passes when it is removed. I guess the comment above `@@namespaces` is also false. --- lib/rexml/element.rb | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index bf913a82..2899759d 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: From b5bf109a599ea733663150e99c09eb44046b41dd Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 15:12:32 +0900 Subject: [PATCH 03/40] Add a "malformed comment" check for top-level comments (#145) This check was missing. 
Therefore, `REXML::Document.new("/um, true)[1] ] + md = @source.match(/(.*?)-->/um, true) + if md.nil? + raise REXML::ParseException.new("Unclosed comment", @source) + end + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] elsif @source.match("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..8f143495 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,96 @@ +require "test/unit" +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + def parse(xml) + REXML::Document.new(xml) + end + + class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 
unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + end +end From 3b026f89b66af7a1e24fe394724e81b06b25d552 Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 15:55:32 +0900 Subject: [PATCH 04/40] Improve `Element#attribute` implementation as 6500x faster (#146) `Element#namespaces` is heavy method because this method needs to traverse all ancestors of the element. `Element#attribute` calls `namespaces` redundantly, so it is much slower. This PR reduces `namespaces` calls in `Element#attribute`. Also, this PR removes a redundant `respond_to?` because `namespaces` must return `Hash` in the current implementation. Below is the result of a benchmark for this on my laptop. ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/attribute.yaml ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) attribute_with_ns 425.420 849.271 5.336k 10.629k i/s - 1.000k times in 2.350620s 1.177481s 0.187416s 0.094084s attribute_without_ns 834.750 5.587M 10.656k 2.950M i/s - 1.000k times in 1.197963s 0.000179s 0.093846s 0.000339s Comparison: attribute_with_ns master(YJIT): 10628.8 i/s 3.2.6(YJIT): 5335.7 i/s - 1.99x slower master: 849.3 i/s - 12.52x slower rexml 3.2.6: 425.4 i/s - 24.98x slower attribute_without_ns master: 5586593.2 i/s master(YJIT): 2949854.4 i/s - 1.89x slower 3.2.6(YJIT): 10655.8 i/s - 524.28x slower rexml 3.2.6: 834.8 i/s - 6692.53x slower ``` This result shows that `Element#attribute` is now 6500x faster than the old implementation if 
`namespace` is not supplied. It seems strange that it is slower when YJIT is enabled, but we believe this is a separate issue. Thank you. --------- Co-authored-by: Sutou Kouhei --- benchmark/attribute.yaml | 38 ++++++++++++++++++++++++++++++++++++++ lib/rexml/element.rb | 9 ++------- 2 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 benchmark/attribute.yaml diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 2899759d..a5808d7c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1276,16 +1276,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? 
"#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? From 1e31ffc7c9170255c2a62773ac1e1d90c4991a9d Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Thu, 13 Jun 2024 23:29:59 +0900 Subject: [PATCH 05/40] Fix small typos (#148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I found these typos with using [`typos-cli`](https://github.com/crate-ci/typos). Now, we can obtain no typo reports from the `typos` command with this configuration (`.typos.toml`): ```toml [files] extend-exclude = [ "*.svg", "*.xml", ] [default.extend-words] # Common variable names in this project. arry = "arry" blok = "blok" eles = "eles" # Incomplete words in test data. caf = "caf" # German words in test data. abl = "abl" # NOTE: It is a part of "Ablüfe". alle = "alle" ist = "ist" technik = "technik" ``` Thank you. --------- Co-authored-by: Olle Jonsson --- test/test_document.rb | 2 +- test/test_light.rb | 2 +- test/test_sax.rb | 2 +- test/xpath/test_base.rb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_document.rb b/test/test_document.rb index 7fccbacb..2b0a8a73 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -237,7 +237,7 @@ def test_each_recursive document = REXML::Document.new(xml_source) # Node#each_recursive iterates elements only. - # This does not iterate XML declerations, comments, attributes, CDATA sections, etc. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. 
actual_names = [] document.each_recursive do |element| actual_names << element.attributes["name"] diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_sax.rb b/test/test_sax.rb index c2255bf3..8e905f2e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -140,7 +140,7 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception + def test_doctype_with_missing_name_throws_exception xml = <<~END diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 68b33ab7..1dacd69d 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -651,7 +651,7 @@ def test_comparisons source = "" doc = REXML::Document.new(source) - # NOTE TO SER: check that number() is required + # NOTE: check that number() is required assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size From d906ae2f05351ea68e5860be9b8c6e1de57dee9b Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Fri, 14 Jun 2024 06:00:13 +0900 Subject: [PATCH 06/40] Add a "Malformed comment" check for invalid comments such as `` (#147) `Document.new("")` raises `undefined method '[]' for nil`. This commit fixes it and adds a test for it. 
--- lib/rexml/parsers/baseparser.rb | 5 ++--- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index eae0db8b..272d8a6b 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -406,12 +406,11 @@ def pull_event if md[0][0] == ?- md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ + if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 8f143495..ce6678e8 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -68,6 +68,19 @@ def test_doctype_malformed_comment_end DETAIL end + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + def test_after_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") From f7040112601104d71d3254a0834c4932b1b68f04 Mon Sep 17 00:00:00 2001 From: Hiroya Fujinami Date: Wed, 19 Jun 2024 14:47:34 +0900 Subject: [PATCH 07/40] Reject unclosed DOCTYPE on parsing (#153) Fix #152 --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 10 ++++- lib/rexml/parsers/treeparser.rb | 23 ++++------ test/parse/test_document_type_declaration.rb | 45 ++++++++++++++++++++ 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 272d8a6b..5791ab1d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -216,7 +216,12 @@ def 
pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" @@ -373,6 +378,9 @@ def pull_event @document_status = :after_doctype return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype @source.match(/\s*/um, true) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..0cb6f7cc 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -16,7 +16,6 @@ def add_listener( listener ) def parse tag_stack = [] - in_doctype = false entities = nil begin while true @@ -39,17 +38,15 @@ def parse tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? 
Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +57,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 8faa0b78..3ca0b536 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -53,6 +53,51 @@ def test_no_name end end + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + Date: Sat, 22 Jun 2024 10:42:44 +0900 Subject: [PATCH 08/40] Fix a bug that a large XML can't be parsed (#154) GitHub: fix GH-150 If a parsed XML is larger than `2 ** 31 - 1` bytes, we can't parse it, because `StringScanner`'s position is stored as `int`. We can avoid the restriction by dropping large parsed content. 
Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 2 ++ lib/rexml/source.rb | 7 +++++++ test/parser/test_base_parser.rb | 27 +++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 test/parser/test_base_parser.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 5791ab1d..a003ac29 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -204,6 +204,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 67154832..f12ee172 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -55,6 +55,7 @@ class Source attr_reader :encoding module Private + SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] pre_defined_terms.each do |term| @@ -84,6 +85,12 @@ def buffer @scanner.rest end + def drop_parsed_content + if @scanner.pos > Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + def buffer_encoding=(encoding) @scanner.string.force_encoding(encoding) end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? 
+ parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end From cfa8dd90077000f21f55a6b7e5f041e2b4fd5e04 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 22 Jun 2024 14:21:28 +0900 Subject: [PATCH 09/40] Don't include private_constant-ed module (#155) Included constants are not private. So private constants in private module aren't private. See also: https://github.com/ruby/rexml/pull/154#discussion_r1649469269 --- lib/rexml/parsers/baseparser.rb | 13 ++++++------- lib/rexml/source.rb | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a003ac29..c83e7958 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -134,7 +134,6 @@ module Private ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um end private_constant :Private - include Private def initialize( source ) self.stream = source @@ -302,7 +301,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " Date: Sun, 23 Jun 2024 00:42:36 +0200 Subject: [PATCH 10/40] Add changelog_uri to gemspec (#156) Supported here: https://guides.rubygems.org/specification-reference/#metadata Useful for running https://github.com/MaximeD/gem_updater --- rexml.gemspec | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rexml.gemspec b/rexml.gemspec index 169e49dc..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", From e6e07f27c27a8b0955b61ee43ef73a5c283ad038 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 23 Jun 2024 20:50:25 +0900 Subject: [PATCH 11/40] Reuse of Set.new at prefixes variables (#157) ## Why? 
`Set.new()` instances of the prefixes variable can be reused, reducing initialization costs. ## Result ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.714 17.658 32.898 33.247 i/s - 100.000 times in 5.645176s 5.663160s 3.039707s 3.007755s sax 25.280 25.281 47.483 49.990 i/s - 100.000 times in 3.955694s 3.955534s 2.106006s 2.000389s pull 29.048 29.061 59.944 61.498 i/s - 100.000 times in 3.442599s 3.441014s 1.668222s 1.626060s stream 28.181 28.440 52.340 55.078 i/s - 100.000 times in 3.548546s 3.516169s 1.910599s 1.815599s Comparison: dom after(YJIT): 33.2 i/s before(YJIT): 32.9 i/s - 1.01x slower before: 17.7 i/s - 1.88x slower after: 17.7 i/s - 1.88x slower sax after(YJIT): 50.0 i/s before(YJIT): 47.5 i/s - 1.05x slower after: 25.3 i/s - 1.98x slower before: 25.3 i/s - 1.98x slower pull after(YJIT): 61.5 i/s before(YJIT): 59.9 i/s - 1.03x slower after: 29.1 i/s - 2.12x slower before: 29.0 i/s - 2.12x slower stream after(YJIT): 55.1 i/s before(YJIT): 52.3 i/s - 1.05x slower after: 28.4 i/s - 1.94x slower before: 28.2 i/s - 1.95x slower ``` YJIT=ON : 1.01x - 1.05x faster YJIT=OFF : 0.99x - 1.00x faster --- lib/rexml/parsers/baseparser.rb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c83e7958..2f068e0c 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -138,6 +138,7 @@ module Private def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new end def add_listener( listener ) @@ -253,7 +254,7 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(curr_ns=Set.new) + 
@nsstack.unshift(Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -437,12 +438,12 @@ def pull_event end tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] + @prefixes.clear + @prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + attributes, closed = parse_attributes(@prefixes, curr_ns) # Verify that all of the prefixes have been defined - for prefix in prefixes + for prefix in @prefixes unless @nsstack.find{|k| k.member?(prefix)} raise UndefinedNamespaceException.new(prefix,@source,self) end From a579730f25ec7443796495541ec57c071b91805d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 25 Jun 2024 09:07:11 +0900 Subject: [PATCH 12/40] Optimize BaseParser#unnormalize method (#158) ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.704 18.106 34.215 33.806 i/s - 100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s sax 25.664 25.302 48.429 48.602 i/s - 100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s pull 28.966 29.215 61.710 62.068 i/s - 100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s stream 28.291 28.426 53.860 55.548 i/s - 100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s Comparison: dom before(YJIT): 34.2 i/s after(YJIT): 33.8 i/s - 1.01x slower after: 18.1 i/s - 1.89x slower before: 17.7 i/s - 1.93x slower sax after(YJIT): 48.6 i/s before(YJIT): 48.4 i/s - 1.00x slower before: 25.7 i/s - 1.89x slower after: 25.3 i/s - 1.92x slower pull after(YJIT): 62.1 i/s before(YJIT): 61.7 i/s - 1.01x slower after: 29.2 i/s - 2.12x slower before: 29.0 i/s - 2.14x slower stream after(YJIT): 55.5 i/s 
before(YJIT): 53.9 i/s - 1.03x slower after: 28.4 i/s - 1.95x slower before: 28.3 i/s - 1.96x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 0.98x - 1.02x faster --- lib/rexml/parsers/baseparser.rb | 15 +++++++++++---- test/test_pullparser.rb | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2f068e0c..275372ee 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -132,6 +132,13 @@ module Private GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private @@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil ) unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) else er = DEFAULT_ENTITIES[entity_reference] @@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil ) end end end - 
rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..b6a48c93 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,26 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + case element_name + when 'a' + assert_equal('A', event[1]) + when 'b' + assert_equal('B', event[1]) + end + end + end + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) From 20017eea807e8fa386aa5c79ae779004d8b366dd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jun 2024 11:26:33 +0900 Subject: [PATCH 13/40] Add 3.3.1 entry --- NEWS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/NEWS.md b/NEWS.md index c8e9ecc0..3e406574 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,52 @@ # News +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `` comments. + * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changlog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. + * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. 
+ +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + ## 3.3.0 - 2024-06-11 {#version-3-3-0} ### Improvements From 78b29137bf1ee46e7cf028f52cfa16f6e2578cfd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 25 Jun 2024 11:27:12 +0900 Subject: [PATCH 14/40] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 3af03ec7..573d0a13 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.1" + VERSION = "3.3.2" REVISION = "" Copyright = COPYRIGHT From face9dd1fdde20351316c6c3b8090a65cd490305 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 27 Jun 2024 06:43:12 +0900 Subject: [PATCH 15/40] Optimize BaseParser#unnormalize method to replace "\r\n" with "\n" only when "\r\n" is included (#160) ## Why? See: https://github.com/ruby/rexml/pull/158#issuecomment-2187663068 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.674 17.567 32.759 32.316 i/s - 100.000 times in 5.657973s 5.692371s 3.052595s 3.094448s sax 25.261 25.377 48.889 49.911 i/s - 100.000 times in 3.958626s 3.940640s 2.045460s 2.003575s pull 28.968 29.121 61.584 61.774 i/s - 100.000 times in 3.452132s 3.433967s 1.623789s 1.618809s stream 28.395 28.803 55.289 57.970 i/s - 100.000 times in 3.521761s 3.471812s 1.808673s 1.725029s Comparison: dom before(YJIT): 32.8 i/s after(YJIT): 32.3 i/s - 1.01x slower before: 17.7 i/s - 1.85x slower after: 17.6 i/s - 1.86x slower sax after(YJIT): 49.9 i/s before(YJIT): 48.9 i/s - 1.02x slower after: 25.4 i/s - 1.97x slower before: 25.3 i/s - 1.98x slower pull after(YJIT): 
61.8 i/s before(YJIT): 61.6 i/s - 1.00x slower after: 29.1 i/s - 2.12x slower before: 29.0 i/s - 2.13x slower stream after(YJIT): 58.0 i/s before(YJIT): 55.3 i/s - 1.05x slower after: 28.8 i/s - 2.01x slower before: 28.4 i/s - 2.04x slower ``` - YJIT=ON : 0.98x - 1.05x faster - YJIT=OFF : 0.98x - 1.02x faster --------- Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 +++++- test/test_pullparser.rb | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 275372ee..02759e70 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -511,7 +511,11 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index b6a48c93..073d896d 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,27 @@ def test_character_references end end + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? 
+ event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) From eb45c8dcca962c04e56f46b0040b2c33278ca3f9 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 8 Jul 2024 05:52:19 +0900 Subject: [PATCH 16/40] fix: Extra content at the end of the document (#161) ## Why? XML with additional content at the end of the document is invalid. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc ``` [27] Misc ::= Comment | PI | S ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI ``` [16] PI ::= '' Char*)))? '?>' ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget ``` [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) ``` --- lib/rexml/parsers/baseparser.rb | 9 ++++++ test/parse/test_comment.rb | 12 ++++++++ test/parse/test_element.rb | 34 +++++++++++++++++++++++ test/parse/test_processing_instruction.rb | 12 ++++++++ test/parse/test_text.rb | 25 +++++++++++++++++ test/test_pullparser.rb | 14 +++++----- 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 test/parse/test_text.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 02759e70..900c19cc 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -460,8 +460,12 @@ def pull_event @closed = tag @nsstack.shift else + if @tags.empty? 
and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end @tags.push( tag ) end + @have_root = true return [ :start_element, tag, attributes ] end else @@ -469,6 +473,11 @@ def pull_event if text.chomp!("<") @source.position -= "<".bytesize end + if @tags.empty? and @have_root + unless /\A\s*\z/.match?(text) + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + end + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index ce6678e8..46a07409 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end DETAIL end end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 14d0703a..a65cfa85 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + + events = {} + while parser.has_next? 
+ event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..f1622b71 --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,25 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + end +end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 073d896d..0aca46be 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -63,8 +63,10 @@ def test_entity_replacement end def test_character_references - source = 'AB' + source = 'AB' parser = REXML::Parsers::PullParser.new( source ) + + events = {} element_name = '' while parser.has_next? event = parser.pull @@ -72,14 +74,12 @@ def test_character_references when :start_element element_name = event[0] when :text - case element_name - when 'a' - assert_equal('A', event[1]) - when 'b' - assert_equal('B', event[1]) - end + events[element_name] = event[1] end end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) end def test_text_content_with_line_breaks From ebc3e85bfa2796fb4922c1932760bec8390ff87c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 8 Jul 2024 05:54:06 +0900 Subject: [PATCH 17/40] Add position check for XML declaration (#162) ## Why? XML declaration must be the first item. 
https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '' ``` See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193 --- lib/rexml/parsers/baseparser.rb | 5 ++++- test/parse/test_processing_instruction.rb | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 900c19cc..2a448e13 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -644,7 +644,10 @@ def process_instruction(start_position) @source.position = start_position raise REXML::ParseException.new(message, @source) end - if @document_status.nil? and match_data[1] == "xml" + if match_data[1] == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 40dadd11..13384935 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -39,6 +39,23 @@ def test_garbage_text pi.content, ]) end + + def test_xml_declaration_not_at_document_start + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? 
+ parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + + DETAIL + end end def test_after_root From b2ec329dc1dc7635b224a6d61687c24b1e1db6fd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 10 Jul 2024 09:50:12 +0900 Subject: [PATCH 18/40] test: move an attribute value test to parse/test_element.rb --- test/parse/test_element.rb | 11 +++++++++++ test/test_document.rb | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index a65cfa85..261f25c3 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -120,5 +124,12 @@ def test_after_empty_element_tag_root DETAIL end end + + def test_gt_linear_performance_attribute_value + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + '">') + end + end end end diff --git a/test/test_document.rb b/test/test_document.rb index 2b0a8a73..ec0e8a5a 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- # frozen_string_literal: false -require 'core_assertions' - module REXMLTests class TestDocument < Test::Unit::TestCase - include Test::Unit::CoreAssertions - def test_version_attributes_to_s doc = REXML::Document.new(<<~eoxml) @@ -202,13 +198,6 @@ def test_xml_declaration_standalone assert_equal('no', doc.stand_alone?, bug2539) end - def test_gt_linear_performance_attribute_value - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + '">') - end - end - def 
test_each_recursive xml_source = <<~XML From 5e140edc3051741691e00bf96fa5119b44288a42 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 11 Jul 2024 09:49:56 +0900 Subject: [PATCH 19/40] Stop adding extra new line after XML declaration with pretty format (#164) If the XML file does not end with a newline, a space is added to the end of the first line. ```ruby Failure: test_indent(REXMLTests::TestDocument::WriteTest::ArgumentsTest) /Users/naitoh/ghq/github.com/naitoh/rexml/test/test_document.rb:270:in `test_indent' 267: output = "" 268: indent = 2 269: @document.write(output, indent) => 270: assert_equal(<<-EOX.chomp, output) 271: 272: 273: Hello world! <"\n" + "\n" + " Hello world!\n" + ""> expected but was <" \n" + "\n" + " Hello world!\n" + ""> diff: ? Hello world! ``` This happens because `REXML::Formatters::Pretty#write_document` has logic that depends on the last text node. We should ignore all top-level text nodes with pretty format. --- lib/rexml/formatters/pretty.rb | 2 +- test/test_document.rb | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index a1198b7a..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. 
node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/test/test_document.rb b/test/test_document.rb index ec0e8a5a..9cd77c4e 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -236,7 +236,7 @@ def test_each_recursive class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -246,7 +246,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -269,7 +269,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -309,7 +309,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -329,7 +329,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX From 6d6400cdc03b612c3a3181b9055af87d3d2ddc68 Mon Sep 17 00:00:00 2001 From: Watson Date: Thu, 11 Jul 2024 12:13:44 +0900 Subject: [PATCH 20/40] Add tests for REXML::Text.check (#165) This patch will add missing REXML::Text.check tests. 
This is the tests for the part that is checked using a regular expression: https://github.com/ruby/rexml/blob/b2ec329dc1dc7635b224a6d61687c24b1e1db6fd/lib/rexml/text.rb#L155-L172 --- test/test_text_check.rb | 92 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 test/test_text_check.rb diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..d4076edf --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,92 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check('&:;') + end + + def test_entity_name_start_char_under_score + assert_check('&_;') + end + + def test_entity_name_mix + assert_check('&A.b-0123;') + end + + def test_character_reference_decimal + assert_check('¢') + end + + def test_character_reference_hex + assert_check('􏿿') + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed('<;', '<') + end + + def test_lt_mix + assert_check_failed('ab Date: Thu, 11 Jul 2024 18:44:54 +0900 Subject: [PATCH 21/40] Fix test for Text.check (#166) This patch will fix incorrect string in a case where unicode characters. Because of the use of single quotes, it was simply an ASCII string. 
--- test/test_text_check.rb | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index d4076edf..56d00440 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -20,23 +20,23 @@ def assert_check_failed(string, illegal_part) class TestValid < self def test_entity_name_start_char_colon - assert_check('&:;') + assert_check("&:;") end def test_entity_name_start_char_under_score - assert_check('&_;') + assert_check("&_;") end def test_entity_name_mix - assert_check('&A.b-0123;') + assert_check("&A.b-0123;") end def test_character_reference_decimal - assert_check('¢') + assert_check("¢") end def test_character_reference_hex - assert_check('􏿿') + assert_check("􏿿") end def test_entity_name_non_ascii @@ -52,40 +52,40 @@ def test_normal_string class TestInvalid < self def test_lt - assert_check_failed('<;', '<') + assert_check_failed("<;", "<") end def test_lt_mix - assert_check_failed('ab Date: Thu, 11 Jul 2024 20:52:09 +0900 Subject: [PATCH 22/40] test Text.check: add empty reference case --- test/test_text_check.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 56d00440..08cacbdb 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -59,6 +59,10 @@ def test_lt_mix assert_check_failed("ab Date: Thu, 11 Jul 2024 21:00:43 +0900 Subject: [PATCH 23/40] test Text.check: add garbage at the end in character reference cases --- test/test_text_check.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 08cacbdb..b2eebe92 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -67,6 +67,11 @@ def test_entity_reference_missing_colon assert_check_failed("&", "&") end + def test_character_reference_decimal_garbage_at_the_end + # U+0030 DIGIT ZERO + assert_check_failed("0x;", "&") + end + def 
test_character_reference_decimal_invalid_value # U+0008 BACKSPACE assert_check_failed("", "") @@ -82,6 +87,11 @@ def test_character_reference_format_hex_00x assert_check_failed("�x41;", "�x41;") end + def test_character_reference_hex_garbage_at_the_end + # U+0030 DIGIT ZERO + assert_check_failed("Hx;", "&") + end + def test_character_reference_hex_surrogate_block # U+0D800 SURROGATE PAIR assert_check_failed("�", "�") From 704044056df5bd03ffb60303f42999c8780b0770 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:03:54 +0900 Subject: [PATCH 24/40] test Text.check: use "why" for test name --- test/test_text_check.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index b2eebe92..1ba534fa 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -72,7 +72,7 @@ def test_character_reference_decimal_garbage_at_the_end assert_check_failed("0x;", "&") end - def test_character_reference_decimal_invalid_value + def test_character_reference_decimal_control_character # U+0008 BACKSPACE assert_check_failed("", "") end From ddea83ff7a890b9d341fca1aa031d575aa88d1ac Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:06:08 +0900 Subject: [PATCH 25/40] test Text.check: add a space at the start in character reference cases --- test/test_text_check.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 1ba534fa..a1cc2149 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -72,6 +72,11 @@ def test_character_reference_decimal_garbage_at_the_end assert_check_failed("0x;", "&") end + def test_character_reference_decimal_space_at_the_start + # U+0030 DIGIT ZERO + assert_check_failed("&# 48;", "&") + end + def test_character_reference_decimal_control_character # U+0008 BACKSPACE assert_check_failed("", "") @@ -92,6 +97,11 @@ def test_character_reference_hex_garbage_at_the_end 
assert_check_failed("Hx;", "&") end + def test_character_reference_hex_space_at_the_start + # U+0030 DIGIT ZERO + assert_check_failed("&#x 30;", "&") + end + def test_character_reference_hex_surrogate_block # U+0D800 SURROGATE PAIR assert_check_failed("�", "�") From 20f808478c4b5243adb24cae4fcc357db7116853 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 21:08:26 +0900 Subject: [PATCH 26/40] test Text.check: add entity reference with new line case --- test/test_text_check.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index a1cc2149..11cf65a3 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -111,6 +111,11 @@ def test_entity_name_non_ascii_symbol # U+00BF INVERTED QUESTION MARK assert_check_failed("&\u00BF;", "&") end + + def test_entity_name_new_line + # U+0026 AMPERSAND + assert_check_failed("&\namp\nx;", "&") + end end end end From a5075c151d8e700057d7b3e1fd1db571ac2c4c4c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Fri, 12 Jul 2024 09:33:30 +0900 Subject: [PATCH 27/40] Do not output :text event after the root tag is closed (#167) ## Why? GitHub: fix GH-163 ## Change - sax_test.rb ``` require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' require 'libxml-ruby' require 'nokogiri' xml = < a b c EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "LibXML(SAX)" parser = LibXML::XML::SaxParser.string(xml) parser.callbacks = Listener.new parser.parse puts "" puts "Nokogiri(SAX)" parser = Nokogiri::XML::SAX::Parser.new(Listener.new) parser.parse(xml) puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? 
res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse ``` ## Before (rexml 3.3.1) ``` LibXML(SAX) [:on_start_document] [:on_start_element_ns, "root", {}, nil, nil, {}] [:on_characters, " a b c \n"] [:on_end_element_ns, "root", nil, nil] [:on_comment, " ok comment "] [:on_processing_instruction, "abc", "version=\"1.0\" "] [:on_end_document] Nokogiri(SAX) [:start_document] [:start_element_namespace, "root", [], nil, nil, []] [:characters, " a b c \n"] [:end_element_namespace, "root", nil, nil] [:comment, " ok comment "] [:processing_instruction, "abc", "version=\"1.0\" "] [:end_document] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, " a b c \n"] [:progress, 15] [:end_element, nil, "root", "root"] [:progress, 22] [:characters, "\n"] [:progress, 23] [:comment, " ok comment "] [:progress, 42] [:characters, "\n"] [:progress, 43] [:processing_instruction, "abc", " version=\"1.0\" "] [:progress, 65] [:characters, "\n"] [:progress, 66] [:end_document] REXML(Pull) start_element: ["root", {}] text: [" a b c \n", " a b c \n"] end_element: ["root"] text: ["\n", "\n"] comment: [" ok comment "] text: ["\n", "\n"] processing_instruction: ["abc", " version=\"1.0\" "] text: ["\n", "\n"] REXML(Stream) [:tag_start, "root", {}] [:text, " a b c \n"] [:tag_end, "root"] [:text, "\n"] [:comment, " ok comment "] [:text, "\n"] [:instruction, "abc", " version=\"1.0\" "] [:text, "\n"] ``` ## After(This PR) ``` REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, " a b c \n"] [:progress, 15] [:end_element, nil, "root", "root"] [:progress, 22] [:comment, " ok comment "] [:progress, 42] [:processing_instruction, "abc", " version=\"1.0\" "] [:progress, 65] [:end_document] REXML(Pull) start_element: ["root", {}] text: [" a b c \n", " a b c \n"] end_element: ["root"] comment: [" ok comment "] processing_instruction: ["abc", " 
version=\"1.0\" "] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, " a b c \n"] [:tag_end, "root"] [:comment, " ok comment "] [:instruction, "abc", " version=\"1.0\" "] ``` --- lib/rexml/parsers/baseparser.rb | 1 + test/parse/test_text.rb | 15 +++++++++++++++ test/parser/test_ultra_light.rb | 1 - test/test_core.rb | 2 +- test/test_document.rb | 2 +- 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2a448e13..5cf1af21 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -477,6 +477,7 @@ def pull_event unless /\A\s*\z/.match?(text) raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) end + return pull_event end return [ :text, text ] end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index f1622b71..1acefc40 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -21,5 +21,20 @@ def test_after_root DETAIL end end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? 
+ event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end end end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..b3f576ff 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -17,7 +17,6 @@ def test_entity_declaration [:entitydecl, "name", "value"] ], [:start_element, :parent, "root", {}], - [:text, "\n"], ], parse(<<-INTERNAL_SUBSET)) diff --git a/test/test_core.rb b/test/test_core.rb index 44e2e7ea..e1fba8a7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -826,7 +826,7 @@ def test_deep_clone end def test_whitespace_before_root - a = < diff --git a/test/test_document.rb b/test/test_document.rb index 9cd77c4e..33cf4002 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -435,7 +435,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX From 4ebf21f686654af7254beb3721a5c57990eafc30 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 14 Jul 2024 20:22:00 +0900 Subject: [PATCH 28/40] Fix a bug that SAX2 parser doesn't expand the predefined entities for "characters" (#168) ## Why? SAX2 parser expand user-defined entity references and character references but doesn't expand predefined entity references. ## Change - text_unnormalized.rb ``` require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' xml = < <P>&https://github.com/ruby/rexml/pull/13; <I> <B> Text </B> </I> EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? 
res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse ``` ## Before (master) ``` $ ruby text_unnormalized.rb REXML(DOM) Text REXML(Pull) start_element: ["root", {}] text: ["\n ", "\n "] start_element: ["A", {}] text: ["<P>&https://github.com/ruby/rexml/pull/13; <I> <B> Text </B> </I>", "

\r Text "] end_element: ["A"] text: ["\n", "\n"] end_element: ["root"] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, "\n "] [:tag_start, "A", {}] [:text, "

\r Text "] [:tag_end, "A"] [:text, "\n"] [:tag_end, "root"] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "<P>\r <I> <B> Text </B> </I>"] #<= This [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` ## After(This PR) ``` $ ruby text_unnormalized.rb REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "

\r Text "] [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` --- lib/rexml/parsers/sax2parser.rb | 21 ++------------------- lib/rexml/parsers/streamparser.rb | 4 ++-- test/test_pullparser.rb | 16 ++++++++++++++++ test/test_sax.rb | 11 +++++++++++ 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..36f98c2a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -157,25 +157,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 0aca46be..096e8b7f 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -82,6 +82,22 @@ def test_character_references 
assert_equal("B", events['b']) end + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + def test_text_content_with_line_breaks source = "AB\nC\r\n" parser = REXML::Parsers::PullParser.new( source ) diff --git a/test/test_sax.rb b/test/test_sax.rb index 8e905f2e..5a3f5e4e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) From b8a5f4cd5c8fe29c65d7a00e67170223d9d2b50e Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:48:53 +0900 Subject: [PATCH 29/40] Fix performance issue caused by using repeated `>` characters inside `/um + INSTRUCTION_TERM = "?>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -639,7 +640,7 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true) + match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM) unless match_data message = "Invalid processing instruction node" @source.position = start_position diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5715c352..4c30532a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -117,7 +117,7 @@ def read_until(term) def ensure_buffer end - def match(pattern, cons=false) + def match(pattern, cons=false, term: nil) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -240,7 +240,7 @@ def ensure_buffer # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: # - ">" # - "XXX>" (X is any string excluding '>') - def match( pattern, cons=false ) + def match( pattern, cons=false, term: nil ) while true if cons md = @scanner.scan(pattern) @@ -250,7 +250,7 @@ def match( pattern, cons=false ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read + return nil unless read(term) end md.nil? ? 
nil : @scanner diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 13384935..ac4c2ff0 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseProcessinInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -69,5 +73,12 @@ def test_after_root assert_equal("abc", events[:processing_instruction]) end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ?>') + end + end end end From 0af55fa49d4c9369f90f239a9571edab800ed36e Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:57:39 +0900 Subject: [PATCH 30/40] Fix ReDoS caused by very large character references using repeated 0s (#169) This patch will fix the ReDoS that is caused by a large string of 0s in a character reference (like `�...`). This occurs in Ruby 3.1 or earlier. --- lib/rexml/text.rb | 48 ++++++++++++++++++-------- test/parse/test_character_reference.rb | 17 +++++++++ 2 files changed, 51 insertions(+), 14 deletions(-) create mode 100644 test/parse/test_character_reference.rb diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index b47bad3b..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? 
$5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? 
character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..8ddeccaa --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end From c1b64c174ec2e8ca2174c51332670e3be30c865f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 10:57:50 +0900 Subject: [PATCH 31/40] Fix performance issue caused by using repeated `>` characters inside comments (#171) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. 
--- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_comment.rb | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b117e654..ba205175 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -126,6 +126,7 @@ class BaseParser module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" + COMMENT_TERM = "-->" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -243,7 +244,7 @@ def pull_event return process_instruction(start_position) elsif @source.match("/um, true) + md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM) if md.nil? raise REXML::ParseException.new("Unclosed comment", @source) end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 46a07409..543d9ad8 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -117,5 +121,12 @@ def test_after_root assert_equal(" ok comment ", events[:comment]) end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end end end From 9f1415a2616c77cad44a176eee90e8457b4774b6 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:04:40 +0900 Subject: [PATCH 32/40] Fix performance issue caused by using repeated `>` characters inside `CDATA [ PAYLOAD ]` (#172) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. 
Therefore, the following is used to read ahead to a specific part of the string in advance. --- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_cdata.rb | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 test/parse/test_cdata.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ba205175..e2c0fd80 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -127,6 +127,7 @@ module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" + CDATA_TERM = "]]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -431,7 +432,7 @@ def pull_event return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..9e8fa8b2 --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end From c33ea498102be65082940e8b7d6d31cb2c6e6ee2 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:11:17 +0900 Subject: [PATCH 33/40] Fix performance issue caused by using repeated `>` characters after ` " COMMENT_TERM = "-->" CDATA_TERM = "]]>" + DOCTYPE_TERM = "]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ 
-384,7 +385,7 @@ def pull_event end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true) + elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 3ca0b536..61c3f04d 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -276,6 +280,16 @@ def test_notation_attlist doctype.children.collect(&:class)) end + def test_gt_linear_performance_malformed_entity + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + begin + REXML::Document.new('" * n + ']>') + rescue + end + end + end + private def parse(internal_subset) super(<<-DOCTYPE) From a79ac8b4b42a9efabe33a0be31bd82d33fd50347 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:18:11 +0900 Subject: [PATCH 34/40] Fix performance issue caused by using repeated `>` characters inside `]>` (#174) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. 
--- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_document_type_declaration.rb | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7fe6c4e8..4fcdaba7 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -378,7 +378,7 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) + elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 61c3f04d..3c3371ea 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -290,6 +290,13 @@ def test_gt_linear_performance_malformed_entity end end + def test_gt_linear_performance_comment + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' -->]>') + end + end + private def parse(internal_subset) super(<<-DOCTYPE) From 67efb5951ed09dbb575c375b130a1e469f437d1f Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:26:57 +0900 Subject: [PATCH 35/40] Fix performance issue caused by using repeated `>` characters inside `]>` (#175) A `<` is treated as a string delimiter. In certain cases, if `<` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance. 
--- lib/rexml/parsers/baseparser.rb | 8 ++++++-- test/parse/test_entity_declaration.rb | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 4fcdaba7..e8f1a069 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,11 +124,15 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + # Terminal requires two or more letters. INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" CDATA_TERM = "]]>" DOCTYPE_TERM = "]>" + # Read to the end of DOCTYPE because there is no proper ENTITY termination + ENTITY_TERM = DOCTYPE_TERM + + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -313,7 +317,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, " ]> DETAIL end + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('' * n + '">') + end + end end end From 1cc1d9a74ede52f3d9ce774cafb11c57b3905165 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:27:57 +0900 Subject: [PATCH 36/40] Suppress have_root not initialized warnings on Ruby < 3 --- lib/rexml/parsers/baseparser.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e8f1a069..860be203 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -165,6 +165,7 @@ def add_listener( listener ) def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] From 1f1e6e9b40bf339894e843dfd679c2fb1a5ddbf2 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:35:41 +0900 Subject: [PATCH 
37/40] Fix ReDoS by using repeated space characters inside `]>` (#176) Fix performance by removing unnecessary spaces. This occurs in Ruby 3.1 or earlier. --- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_attlist.rb | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 test/parse/test_attlist.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 860be203..47380f0d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -350,7 +350,7 @@ def pull_event contents = md[0] pairs = {} - values = md[0].scan( ATTDEF_RE ) + values = md[0].strip.scan( ATTDEF_RE ) values.each do |attdef| unless attdef[3] == "#IMPLIED" attdef.compact! diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb new file mode 100644 index 00000000..eee9309c --- /dev/null +++ b/test/parse/test_attlist.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttlist < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(']>') + end + end + end +end From 910e5a2b487cb5a30989884a39f9cad2cc499cfc Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:36:05 +0900 Subject: [PATCH 38/40] Fix performance issue caused by using repeated `>` characters inside `` (#177) A `>` is treated as a string delimiter. In certain cases, if `>` is used in succession, read and match are repeated, which slows down the process. Therefore, the following is used to read ahead to a specific part of the string in advance.
--- lib/rexml/parsers/baseparser.rb | 2 +- test/parse/test_comment.rb | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 47380f0d..5688c773 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -430,7 +430,7 @@ def pull_event #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) + md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 543d9ad8..50c765f5 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -128,5 +128,12 @@ def test_gt_linear_performance REXML::Document.new('') end end + + def test_gt_linear_performance_in_element + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end end end From 0e33d3adfb5069b20622e5ed9393d10b8cc17b40 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:37:45 +0900 Subject: [PATCH 39/40] test: improve linear performance test names Use "test_linear_performance_XXX" style. 
--- test/parse/test_attlist.rb | 2 +- test/parse/test_cdata.rb | 2 +- test/parse/test_character_reference.rb | 2 +- test/parse/test_comment.rb | 4 ++-- test/parse/test_document_type_declaration.rb | 4 ++-- test/parse/test_element.rb | 2 +- test/parse/test_entity_declaration.rb | 2 +- test/parse/test_processing_instruction.rb | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb index eee9309c..c1b4376c 100644 --- a/test/parse/test_attlist.rb +++ b/test/parse/test_attlist.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseAttlist < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new(']>') diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index 9e8fa8b2..b5f1a3bc 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' ]]>') diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index 8ddeccaa..bf8d2190 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -7,7 +7,7 @@ module REXMLTests class TestParseCharacterReference < Test::Unit::TestCase include Test::Unit::CoreAssertions - def test_gt_linear_performance_many_preceding_zeros + def test_linear_performance_many_preceding_zeros seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 50c765f5..b7892232 100644 
--- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -122,14 +122,14 @@ def test_after_root assert_equal(" ok comment ", events[:comment]) end - def test_gt_linear_performance + def test_linear_performance_top_level_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') end end - def test_gt_linear_performance_in_element + def test_linear_performance_in_element_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('') diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 3c3371ea..490a27d4 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -280,7 +280,7 @@ def test_notation_attlist doctype.children.collect(&:class)) end - def test_gt_linear_performance_malformed_entity + def test_linear_performance_percent_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| begin @@ -290,7 +290,7 @@ def test_gt_linear_performance_malformed_entity end end - def test_gt_linear_performance_comment + def test_linear_performance_comment_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' -->]>') diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 261f25c3..2b0746ea 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -125,7 +125,7 @@ def test_after_empty_element_tag_root end end - def test_gt_linear_performance_attribute_value + def test_linear_performance_attribute_value_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + '">') diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index 07529016..7d750b90 100644 --- 
a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -33,7 +33,7 @@ def test_empty DETAIL end - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('' * n + '">') diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index ac4c2ff0..7943cd3c 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -74,7 +74,7 @@ def test_after_root assert_equal("abc", events[:processing_instruction]) end - def test_gt_linear_performance + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| REXML::Document.new('" * n + ' ?>') From 2b285ac0804f2918de642f7ed4646dc6d645a7fc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 11:38:07 +0900 Subject: [PATCH 40/40] Add 3.3.2 entry --- NEWS.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/NEWS.md b/NEWS.md index 3e406574..3b62f6aa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,53 @@ # News +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * Patch by Watson. + + * Added support for raising a parse exception when an XML document has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in the wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed a needless space after the XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped emitting `:text` events after the root element. + * GH-167 + * Patch by NAITOH Jun.
+ +### Fixes + + * Fixed a bug that the SAX2 parser doesn't expand predefined entities for + the `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + ## 3.3.1 - 2024-06-25 {#version-3-3-1} ### Improvements