From 0274467fdba450388a8d71edbc603b0ffbfd4de3 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 15:11:07 +0900
Subject: [PATCH 01/40] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 3e870822..3af03ec7 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.0"
+  VERSION = "3.3.1"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 6415113201e0ebc334ff26a585ca7fdab418351b Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Tue, 11 Jun 2024 17:38:32 +0900
Subject: [PATCH 02/40] Remove an unused class var `@@namespaces` (#144)

`@@namespaces` is defined under `REXML`, but it is never used.
At least, `rake test` passes when it is removed.

I guess the comment above `@@namespaces` is also false.
---
 lib/rexml/element.rb | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index bf913a82..2899759d 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -7,14 +7,6 @@
 require_relative "parseexception"
 
 module REXML
-  # An implementation note about namespaces:
-  # As we parse, when we find namespaces we put them in a hash and assign
-  # them a unique ID.  We then convert the namespace prefix for the node
-  # to the unique ID.  This makes namespace lookup much faster for the
-  # cost of extra memory use.  We save the namespace prefix for the
-  # context node and convert it back when we write it.
-  @@namespaces = {}
-
   # An \REXML::Element object represents an XML element.
   #
   # An element:

From b5bf109a599ea733663150e99c09eb44046b41dd Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 15:12:32 +0900
Subject: [PATCH 03/40] Add a "malformed comment" check for top-level comments
 (#145)

This check was missing. Therefore, `REXML::Document.new("<!--")` raised
the ``undefined method `[]' for nil`` error, for example.

This PR also adds tests for "malformed comment" checks.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
---
 lib/rexml/parsers/baseparser.rb |  9 +++-
 test/parse/test_comment.rb      | 96 +++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_comment.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index eadc78f7..eae0db8b 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -228,7 +228,14 @@ def pull_event
             return process_instruction(start_position)
           elsif @source.match("<!", true)
             if @source.match("--", true)
-              return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
+              md = @source.match(/(.*?)-->/um, true)
+              if md.nil?
+                raise REXML::ParseException.new("Unclosed comment", @source)
+              end
+              if /--|-\z/.match?(md[1])
+                raise REXML::ParseException.new("Malformed comment", @source)
+              end
+              return [ :comment, md[1] ]
             elsif @source.match("DOCTYPE", true)
               base_error_message = "Malformed DOCTYPE"
               unless @source.match(/\s+/um, true)
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
new file mode 100644
index 00000000..8f143495
--- /dev/null
+++ b/test/parse/test_comment.rb
@@ -0,0 +1,96 @@
+require "test/unit"
+require "rexml/document"
+
+module REXMLTests
+  class TestParseComment < Test::Unit::TestCase
+    def parse(xml)
+      REXML::Document.new(xml)
+    end
+
+    class TestInvalid < self
+      def test_toplevel_unclosed_comment
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!--")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Unclosed comment
+          Line: 1
+          Position: 4
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_toplevel_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 11
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_toplevel_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 9
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_doctype_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!DOCTYPE foo [<!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 26
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_doctype_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!DOCTYPE foo [<!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 24
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_after_doctype_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 14
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_after_doctype_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 12
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+    end
+  end
+end

From 3b026f89b66af7a1e24fe394724e81b06b25d552 Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 15:55:32 +0900
Subject: [PATCH 04/40] Improve `Element#attribute` implementation as 6500x
 faster (#146)

`Element#namespaces` is heavy method because this method needs to
traverse all ancestors of the element. `Element#attribute` calls
`namespaces` redundantly, so it is much slower.

This PR reduces `namespaces` calls in `Element#attribute`. Also, this PR
removes a redundant `respond_to?` because `namespaces` must return
`Hash` in the current implementation.

Below is the result of a benchmark for this on my laptop.

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/attribute.yaml
ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
   attribute_with_ns     425.420     849.271       5.336k       10.629k i/s -      1.000k times in 2.350620s 1.177481s 0.187416s 0.094084s
attribute_without_ns     834.750      5.587M      10.656k        2.950M i/s -      1.000k times in 1.197963s 0.000179s 0.093846s 0.000339s

Comparison:
                attribute_with_ns
        master(YJIT):     10628.8 i/s
         3.2.6(YJIT):      5335.7 i/s - 1.99x  slower
              master:       849.3 i/s - 12.52x  slower
         rexml 3.2.6:       425.4 i/s - 24.98x  slower

             attribute_without_ns
              master:   5586593.2 i/s
        master(YJIT):   2949854.4 i/s - 1.89x  slower
         3.2.6(YJIT):     10655.8 i/s - 524.28x  slower
         rexml 3.2.6:       834.8 i/s - 6692.53x  slower

```

This result shows that `Element#attribute` is now 6500x faster than the
old implementation if `namespace` is not supplied.

It seems strange that it is slower when YJIT is enabled, but we believe
this is a separate issue.

Thank you.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
---
 benchmark/attribute.yaml | 38 ++++++++++++++++++++++++++++++++++++++
 lib/rexml/element.rb     |  9 ++-------
 2 files changed, 40 insertions(+), 7 deletions(-)
 create mode 100644 benchmark/attribute.yaml

diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml
new file mode 100644
index 00000000..5dd7fded
--- /dev/null
+++ b/benchmark/attribute.yaml
@@ -0,0 +1,38 @@
+loop_count: 1000
+contexts:
+  - gems:
+      rexml: 3.2.6
+    require: false
+    prelude: require 'rexml'
+  - name: master
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+  - name: 3.2.6(YJIT)
+    gems:
+      rexml: 3.2.6
+    require: false
+    prelude: |
+      require 'rexml'
+      RubyVM::YJIT.enable
+  - name: master(YJIT)
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+      RubyVM::YJIT.enable
+
+prelude: |
+  require 'rexml/document'
+
+  xml_source = "<deepest x:with_ns='foo' without_ns='bar'></deepest>"
+  100.times do
+    xml_source = "<nest>#{xml_source}</nest>"
+  end
+  xml_source = "<root xmlns:x='xyz'>#{xml_source}</root>"
+
+  document = REXML::Document.new(xml_source)
+  deepest_node = document.elements["//deepest"]
+
+benchmark:
+  with_ns:    deepest_node.attribute("with_ns", "xyz")
+  without_ns: deepest_node.attribute("without_ns")
diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index 2899759d..a5808d7c 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -1276,16 +1276,11 @@ def [](name_or_index)
     #   document.root.attribute("x", "a") # => a:x='a:x'
     #
     def attribute( name, namespace=nil )
-      prefix = nil
-      if namespaces.respond_to? :key
-        prefix = namespaces.key(namespace) if namespace
-      else
-        prefix = namespaces.index(namespace) if namespace
-      end
+      prefix = namespaces.key(namespace) if namespace
       prefix = nil if prefix == 'xmlns'
 
       ret_val =
-        attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" )
+        attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name )
 
       return ret_val unless ret_val.nil?
       return nil if prefix.nil?

From 1e31ffc7c9170255c2a62773ac1e1d90c4991a9d Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 23:29:59 +0900
Subject: [PATCH 05/40] Fix small typos (#148)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I found these typos with using
[`typos-cli`](https://github.com/crate-ci/typos).

Now, we can obtain no typo reports from the `typos` command with this
configuration (`.typos.toml`):

```toml
[files]
extend-exclude = [
  "*.svg",
  "*.xml",
]

[default.extend-words]
# Common variable names in this project.
arry = "arry"
blok = "blok"
eles = "eles"
# Incomplete words in test data.
caf = "caf"
# German words in test data.
abl = "abl" # NOTE: It is a part of "Ablüfe".
alle = "alle"
ist = "ist"
technik = "technik"
```

Thank you.

---------

Co-authored-by: Olle Jonsson <olle.jonsson@gmail.com>
---
 test/test_document.rb   | 2 +-
 test/test_light.rb      | 2 +-
 test/test_sax.rb        | 2 +-
 test/xpath/test_base.rb | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_document.rb b/test/test_document.rb
index 7fccbacb..2b0a8a73 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -237,7 +237,7 @@ def test_each_recursive
       document = REXML::Document.new(xml_source)
 
       # Node#each_recursive iterates elements only.
-      # This does not iterate XML declerations, comments, attributes, CDATA sections, etc.
+      # This does not iterate XML declarations, comments, attributes, CDATA sections, etc.
       actual_names = []
       document.each_recursive do |element|
         actual_names << element.attributes["name"]
diff --git a/test/test_light.rb b/test/test_light.rb
index 54b2c52e..c556c978 100644
--- a/test/test_light.rb
+++ b/test/test_light.rb
@@ -62,7 +62,7 @@ def test_access_child_elements
       assert_equal( 'c', a[1].name )
     end
 
-    def test_itterate_over_children
+    def test_iterate_over_children
       foo = make_small_document
       ctr = 0
       foo[0].each { ctr += 1 }
diff --git a/test/test_sax.rb b/test/test_sax.rb
index c2255bf3..8e905f2e 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -140,7 +140,7 @@ def test_simple_doctype_listener
 
     # test doctype with missing name, should throw ParseException
     # submitted by Jeff Barczewseki
-    def test_doctype_with_mising_name_throws_exception
+    def test_doctype_with_missing_name_throws_exception
       xml = <<~END
         <?xml version="1.0"?>
         <!DOCTYPE >
diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb
index 68b33ab7..1dacd69d 100644
--- a/test/xpath/test_base.rb
+++ b/test/xpath/test_base.rb
@@ -651,7 +651,7 @@ def test_comparisons
       source = "<a><b id='1'/><b id='2'/><b id='3'/></a>"
       doc = REXML::Document.new(source)
 
-      # NOTE TO SER: check that number() is required
+      # NOTE: check that number() is required
       assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size
       assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size
       assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size

From d906ae2f05351ea68e5860be9b8c6e1de57dee9b Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Fri, 14 Jun 2024 06:00:13 +0900
Subject: [PATCH 06/40] Add a "Malformed comment" check for invalid comments
 such as `<!-->` (#147)

`Document.new("<a><!-->")` raises `undefined method '[]' for nil`. This
commit fixes it and adds a test for it.
---
 lib/rexml/parsers/baseparser.rb |  5 ++---
 test/parse/test_comment.rb      | 13 +++++++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index eae0db8b..272d8a6b 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -406,12 +406,11 @@ def pull_event
               if md[0][0] == ?-
                 md = @source.match(/--(.*?)-->/um, true)
 
-                case md[1]
-                when /--/, /-\z/
+                if md.nil? || /--|-\z/.match?(md[1])
                   raise REXML::ParseException.new("Malformed comment", @source)
                 end
 
-                return [ :comment, md[1] ] if md
+                return [ :comment, md[1] ]
               else
                 md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
                 return [ :cdata, md[1] ] if md
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 8f143495..ce6678e8 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -68,6 +68,19 @@ def test_doctype_malformed_comment_end
         DETAIL
       end
 
+      def test_after_doctype_malformed_comment_short
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-->")
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 8
+          Last 80 unconsumed characters:
+          -->
+        DETAIL
+      end
+
       def test_after_doctype_malformed_comment_inner
         exception = assert_raise(REXML::ParseException) do
           parse("<a><!-- -- -->")

From f7040112601104d71d3254a0834c4932b1b68f04 Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Wed, 19 Jun 2024 14:47:34 +0900
Subject: [PATCH 07/40] Reject unclosed DOCTYPE on parsing (#153)

Fix #152

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb              | 10 ++++-
 lib/rexml/parsers/treeparser.rb              | 23 ++++------
 test/parse/test_document_type_declaration.rb | 45 ++++++++++++++++++++
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 272d8a6b..5791ab1d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -216,7 +216,12 @@ def pull_event
           x, @closed = @closed, nil
           return [ :end_element, x ]
         end
-        return [ :end_document ] if empty?
+        if empty?
+          if @document_status == :in_doctype
+            raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
+          end
+          return [ :end_document ]
+        end
         return @stack.shift if @stack.size > 0
         #STDERR.puts @source.encoding
         #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
@@ -373,6 +378,9 @@ def pull_event
             @document_status = :after_doctype
             return [ :end_doctype ]
           end
+          if @document_status == :in_doctype
+            raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
+          end
         end
         if @document_status == :after_doctype
           @source.match(/\s*/um, true)
diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb
index bf9a4254..0cb6f7cc 100644
--- a/lib/rexml/parsers/treeparser.rb
+++ b/lib/rexml/parsers/treeparser.rb
@@ -16,7 +16,6 @@ def add_listener( listener )
 
       def parse
         tag_stack = []
-        in_doctype = false
         entities = nil
         begin
           while true
@@ -39,17 +38,15 @@ def parse
               tag_stack.pop
               @build_context = @build_context.parent
             when :text
-              if not in_doctype
-                if @build_context[-1].instance_of? Text
-                  @build_context[-1] << event[1]
-                else
-                  @build_context.add(
-                    Text.new(event[1], @build_context.whitespace, nil, true)
-                  ) unless (
-                    @build_context.ignore_whitespace_nodes and
-                    event[1].strip.size==0
-                  )
-                end
+              if @build_context[-1].instance_of? Text
+                @build_context[-1] << event[1]
+              else
+                @build_context.add(
+                  Text.new(event[1], @build_context.whitespace, nil, true)
+                ) unless (
+                  @build_context.ignore_whitespace_nodes and
+                  event[1].strip.size==0
+                )
               end
             when :comment
               c = Comment.new( event[1] )
@@ -60,14 +57,12 @@ def parse
             when :processing_instruction
               @build_context.add( Instruction.new( event[1], event[2] ) )
             when :end_doctype
-              in_doctype = false
               entities.each { |k,v| entities[k] = @build_context.entities[k].value }
               @build_context = @build_context.parent
             when :start_doctype
               doctype = DocType.new( event[1..-1], @build_context )
               @build_context = doctype
               entities = {}
-              in_doctype = true
             when :attlistdecl
               n = AttlistDecl.new( event[1..-1] )
               @build_context.add( n )
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 8faa0b78..3ca0b536 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -53,6 +53,51 @@ def test_no_name
       end
     end
 
+    class TestUnclosed < self
+      def test_no_extra_node
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new("<!DOCTYPE foo [")
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: unclosed
+          Line: 1
+          Position: 15
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+
+      def test_start_element
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new(<<~DOCTYPE)
+            <!DOCTYPE foo [ <r>
+          DOCTYPE
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: invalid declaration
+          Line: 1
+          Position: 20
+          Last 80 unconsumed characters:
+          <r>#{' '}
+        DETAIL
+      end
+
+      def test_text
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new(<<~DOCTYPE)
+            <!DOCTYPE foo [ text
+          DOCTYPE
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: invalid declaration
+          Line: 1
+          Position: 21
+          Last 80 unconsumed characters:
+          text#{' '}
+        DETAIL
+      end
+    end
+
     class TestExternalID < self
       class TestSystem < self
         def test_left_bracket_in_system_literal

From 4c288086ed1869f224dc534182019c93d0317319 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 22 Jun 2024 10:42:44 +0900
Subject: [PATCH 08/40] Fix a bug that a large XML can't be parsed (#154)

GitHub: fix GH-150

If a parsed XML is later than `2 ** 31 - 1`, we can't parse it. Because
`StringScanner`s position is stored as `int`. We can avoid the
restriction by dropping large parsed content.

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb |  2 ++
 lib/rexml/source.rb             |  7 +++++++
 test/parser/test_base_parser.rb | 27 +++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 test/parser/test_base_parser.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5791ab1d..a003ac29 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -204,6 +204,8 @@ def peek depth=0
 
       # Returns the next event.  This is a +PullEvent+ object.
       def pull
+        @source.drop_parsed_content
+
         pull_event.tap do |event|
           @listeners.each do |listener|
             listener.receive event
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 67154832..f12ee172 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -55,6 +55,7 @@ class Source
     attr_reader :encoding
 
     module Private
+      SCANNER_RESET_SIZE = 100000
       PRE_DEFINED_TERM_PATTERNS = {}
       pre_defined_terms = ["'", '"', "<"]
       pre_defined_terms.each do |term|
@@ -84,6 +85,12 @@ def buffer
       @scanner.rest
     end
 
+    def drop_parsed_content
+      if @scanner.pos > Private::SCANNER_RESET_SIZE
+        @scanner.string = @scanner.rest
+      end
+    end
+
     def buffer_encoding=(encoding)
       @scanner.string.force_encoding(encoding)
     end
diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb
new file mode 100644
index 00000000..17d01979
--- /dev/null
+++ b/test/parser/test_base_parser.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: false
+
+require 'rexml/parsers/baseparser'
+
+module REXMLTests
+  class BaseParserTester < Test::Unit::TestCase
+    def test_large_xml
+      large_text = "a" * 100_000
+      xml = <<-XML
+        <?xml version="1.0"?>
+        <root>
+          <child>#{large_text}</child>
+          <child>#{large_text}</child>
+        </root>
+      XML
+
+      parser = REXML::Parsers::BaseParser.new(xml)
+      while parser.has_next?
+        parser.pull
+      end
+
+      assert do
+        parser.position < xml.bytesize
+      end
+    end
+  end
+end

From cfa8dd90077000f21f55a6b7e5f041e2b4fd5e04 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 22 Jun 2024 14:21:28 +0900
Subject: [PATCH 09/40] Don't include private_constant-ed module (#155)

Included constants are not private. So private constants in private
module aren't private.

See also: https://github.com/ruby/rexml/pull/154#discussion_r1649469269
---
 lib/rexml/parsers/baseparser.rb | 13 ++++++-------
 lib/rexml/source.rb             |  1 -
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index a003ac29..c83e7958 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -134,7 +134,6 @@ module Private
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
       end
       private_constant :Private
-      include Private
 
       def initialize( source )
         self.stream = source
@@ -302,7 +301,7 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
+              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
               ref = false
               if match[1] == '%'
                 ref = true
@@ -328,7 +327,7 @@ def pull_event
               match << '%' if ref
               return match
             elsif @source.match("ATTLIST", true)
-              md = @source.match(ATTLISTDECL_END, true)
+              md = @source.match(Private::ATTLISTDECL_END, true)
               raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
               element = md[1]
               contents = md[0]
@@ -397,7 +396,7 @@ def pull_event
             if @source.match("/", true)
               @nsstack.shift
               last_tag = @tags.pop
-              md = @source.match(CLOSE_PATTERN, true)
+              md = @source.match(Private::CLOSE_PATTERN, true)
               if md and !last_tag
                 message = "Unexpected top-level end tag (got '#{md[1]}')"
                 raise REXML::ParseException.new(message, @source)
@@ -431,7 +430,7 @@ def pull_event
               return process_instruction(start_position)
             else
               # Get the next tag
-              md = @source.match(TAG_PATTERN, true)
+              md = @source.match(Private::TAG_PATTERN, true)
               unless md
                 @source.position = start_position
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
@@ -539,7 +538,7 @@ def need_source_encoding_update?(xml_declaration_encoding)
       end
 
       def parse_name(base_error_message)
-        md = @source.match(NAME_PATTERN, true)
+        md = @source.match(Private::NAME_PATTERN, true)
         unless md
           if @source.match(/\s*\S/um)
             message = "#{base_error_message}: invalid name"
@@ -618,7 +617,7 @@ def parse_id_invalid_details(accept_external_id:,
       end
 
       def process_instruction(start_position)
-        match_data = @source.match(INSTRUCTION_END, true)
+        match_data = @source.match(Private::INSTRUCTION_END, true)
         unless match_data
           message = "Invalid processing instruction node"
           @source.position = start_position
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index f12ee172..5715c352 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -63,7 +63,6 @@ module Private
       end
     end
     private_constant :Private
-    include Private
 
     # Constructor
     # @param arg must be a String, and should be a valid XML document

From 22d206a657c9923b19fa3e3c7451800f875ba31d Mon Sep 17 00:00:00 2001
From: fynsta <63241108+fynsta@users.noreply.github.com>
Date: Sun, 23 Jun 2024 00:42:36 +0200
Subject: [PATCH 10/40] Add changelog_uri to gemspec (#156)

Supported here:
https://guides.rubygems.org/specification-reference/#metadata

Useful for running https://github.com/MaximeD/gem_updater
---
 rexml.gemspec | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rexml.gemspec b/rexml.gemspec
index 169e49dc..0de3e845 100644
--- a/rexml.gemspec
+++ b/rexml.gemspec
@@ -16,6 +16,10 @@ Gem::Specification.new do |spec|
   spec.homepage      = "https://github.com/ruby/rexml"
   spec.license       = "BSD-2-Clause"
 
+  spec.metadata = {
+    "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}"
+  }
+
   files = [
     "LICENSE.txt",
     "NEWS.md",

From e6e07f27c27a8b0955b61ee43ef73a5c283ad038 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 23 Jun 2024 20:50:25 +0900
Subject: [PATCH 11/40] Reuse of Set.new at prefixes variables (#157)

## Why?
`Set.new()` instances of the prefixes variable can be reused, reducing
initialization costs.

## Result
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.714      17.658        32.898       33.247 i/s -     100.000 times in 5.645176s 5.663160s 3.039707s 3.007755s
                 sax     25.280      25.281        47.483       49.990 i/s -     100.000 times in 3.955694s 3.955534s 2.106006s 2.000389s
                pull     29.048      29.061        59.944       61.498 i/s -     100.000 times in 3.442599s 3.441014s 1.668222s 1.626060s
              stream     28.181      28.440        52.340       55.078 i/s -     100.000 times in 3.548546s 3.516169s 1.910599s 1.815599s

Comparison:
                              dom
         after(YJIT):        33.2 i/s
        before(YJIT):        32.9 i/s - 1.01x  slower
              before:        17.7 i/s - 1.88x  slower
               after:        17.7 i/s - 1.88x  slower

                              sax
         after(YJIT):        50.0 i/s
        before(YJIT):        47.5 i/s - 1.05x  slower
               after:        25.3 i/s - 1.98x  slower
              before:        25.3 i/s - 1.98x  slower

                             pull
         after(YJIT):        61.5 i/s
        before(YJIT):        59.9 i/s - 1.03x  slower
               after:        29.1 i/s - 2.12x  slower
              before:        29.0 i/s - 2.12x  slower

                           stream
         after(YJIT):        55.1 i/s
        before(YJIT):        52.3 i/s - 1.05x  slower
               after:        28.4 i/s - 1.94x  slower
              before:        28.2 i/s - 1.95x  slower

```

YJIT=ON : 1.01x - 1.05x faster
YJIT=OFF : 0.99x - 1.00x faster
---
 lib/rexml/parsers/baseparser.rb | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index c83e7958..2f068e0c 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -138,6 +138,7 @@ module Private
       def initialize( source )
         self.stream = source
         @listeners = []
+        @prefixes = Set.new
       end
 
       def add_listener( listener )
@@ -253,7 +254,7 @@ def pull_event
                 @source.position = start_position
                 raise REXML::ParseException.new(message, @source)
               end
-              @nsstack.unshift(curr_ns=Set.new)
+              @nsstack.unshift(Set.new)
               name = parse_name(base_error_message)
               if @source.match(/\s*\[/um, true)
                 id = [nil, nil, nil]
@@ -437,12 +438,12 @@ def pull_event
               end
               tag = md[1]
               @document_status = :in_element
-              prefixes = Set.new
-              prefixes << md[2] if md[2]
+              @prefixes.clear
+              @prefixes << md[2] if md[2]
               @nsstack.unshift(curr_ns=Set.new)
-              attributes, closed = parse_attributes(prefixes, curr_ns)
+              attributes, closed = parse_attributes(@prefixes, curr_ns)
               # Verify that all of the prefixes have been defined
-              for prefix in prefixes
+              for prefix in @prefixes
                 unless @nsstack.find{|k| k.member?(prefix)}
                   raise UndefinedNamespaceException.new(prefix,@source,self)
                 end

From a579730f25ec7443796495541ec57c071b91805d Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Tue, 25 Jun 2024 09:07:11 +0900
Subject: [PATCH 12/40] Optimize BaseParser#unnormalize method (#158)

## Benchmark
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.704      18.106        34.215       33.806 i/s -     100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s
                 sax     25.664      25.302        48.429       48.602 i/s -     100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s
                pull     28.966      29.215        61.710       62.068 i/s -     100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s
              stream     28.291      28.426        53.860       55.548 i/s -     100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s

Comparison:
                              dom
        before(YJIT):        34.2 i/s
         after(YJIT):        33.8 i/s - 1.01x  slower
               after:        18.1 i/s - 1.89x  slower
              before:        17.7 i/s - 1.93x  slower

                              sax
         after(YJIT):        48.6 i/s
        before(YJIT):        48.4 i/s - 1.00x  slower
              before:        25.7 i/s - 1.89x  slower
               after:        25.3 i/s - 1.92x  slower

                             pull
         after(YJIT):        62.1 i/s
        before(YJIT):        61.7 i/s - 1.01x  slower
               after:        29.2 i/s - 2.12x  slower
              before:        29.0 i/s - 2.14x  slower

                           stream
         after(YJIT):        55.5 i/s
        before(YJIT):        53.9 i/s - 1.03x  slower
               after:        28.4 i/s - 1.95x  slower
              before:        28.3 i/s - 1.96x  slower

```

- YJIT=ON : 1.00x - 1.03x faster
- YJIT=OFF : 0.98x - 1.02x faster
---
 lib/rexml/parsers/baseparser.rb | 15 +++++++++++----
 test/test_pullparser.rb         | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 2f068e0c..275372ee 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -132,6 +132,13 @@ module Private
         GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
         PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
+        CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
+        CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
+        DEFAULT_ENTITIES_PATTERNS = {}
+        default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
+        default_entities.each do |term|
+          DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
+        end
       end
       private_constant :Private
 
@@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil )
 
       # Unescapes all possible entities
       def unnormalize( string, entities=nil, filter=nil )
-        rv = string.gsub( /\r\n?/, "\n" )
+        rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
         matches = rv.scan( REFERENCE_RE )
         return rv if matches.size == 0
-        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
+        rv.gsub!( Private::CHARACTER_REFERENCES ) {
           m=$1
           m = "0#{m}" if m[0] == ?x
           [Integer(m)].pack('U*')
@@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil )
             unless filter and filter.include?(entity_reference)
               entity_value = entity( entity_reference, entities )
               if entity_value
-                re = /&#{entity_reference};/
+                re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
               else
                 er = DEFAULT_ENTITIES[entity_reference]
@@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil )
               end
             end
           end
-          rv.gsub!( /&amp;/, '&' )
+          rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
         end
         rv
       end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 53a985ba..b6a48c93 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -62,6 +62,26 @@ def test_entity_replacement
       end
     end
 
+    def test_character_references
+      source = '<a>&#65;</a><b>&#x42;</b>'
+      parser = REXML::Parsers::PullParser.new( source )
+      element_name = ''
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :start_element
+          element_name = event[0]
+        when :text
+          case element_name
+          when 'a'
+            assert_equal('A', event[1])
+          when 'b'
+            assert_equal('B', event[1])
+          end
+        end
+      end
+    end
+
     def test_peek_unshift
       source = "<a><b/></a>"
       REXML::Parsers::PullParser.new(source)

From 20017eea807e8fa386aa5c79ae779004d8b366dd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 25 Jun 2024 11:26:33 +0900
Subject: [PATCH 13/40] Add 3.3.1 entry

---
 NEWS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index c8e9ecc0..3e406574 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,52 @@
 # News
 
+## 3.3.1 - 2024-06-25 {#version-3-3-1}
+
+### Improvements
+
+  * Added support for detecting malformed top-level comments.
+    * GH-145
+    * Patch by Hiroya Fujinami.
+
+  * Improved `REXML::Element#attribute` performance.
+    * GH-146
+    * Patch by Hiroya Fujinami.
+
+  * Added support for detecting malformed `<!-->` comments.
+    * GH-147
+    * Patch by Hiroya Fujinami.
+
+  * Added support for detecting unclosed `DOCTYPE`.
+    * GH-152
+    * Patch by Hiroya Fujinami.
+
+  * Added `changlog_uri` metadata to gemspec.
+    * GH-156
+    * Patch by fynsta.
+
+  * Improved parse performance.
+    * GH-157
+    * GH-158
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a bug that large XML can't be parsed.
+    * GH-154
+    * Patch by NAITOH Jun.
+
+  * Fixed a bug that private constants are visible.
+    * GH-155
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * Hiroya Fujinami
+
+  * NAITOH Jun
+
+  * fynsta
+
 ## 3.3.0 - 2024-06-11 {#version-3-3-0}
 
 ### Improvements

From 78b29137bf1ee46e7cf028f52cfa16f6e2578cfd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 25 Jun 2024 11:27:12 +0900
Subject: [PATCH 14/40] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 3af03ec7..573d0a13 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.1"
+  VERSION = "3.3.2"
   REVISION = ""
 
   Copyright = COPYRIGHT

From face9dd1fdde20351316c6c3b8090a65cd490305 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 27 Jun 2024 06:43:12 +0900
Subject: [PATCH 15/40] Optimize BaseParser#unnormalize method to replace
 "\r\n" with "\n" only when "\r\n" is included (#160)

## Why?

See: https://github.com/ruby/rexml/pull/158#issuecomment-2187663068

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.674      17.567        32.759       32.316 i/s -     100.000 times in 5.657973s 5.692371s 3.052595s 3.094448s
                 sax     25.261      25.377        48.889       49.911 i/s -     100.000 times in 3.958626s 3.940640s 2.045460s 2.003575s
                pull     28.968      29.121        61.584       61.774 i/s -     100.000 times in 3.452132s 3.433967s 1.623789s 1.618809s
              stream     28.395      28.803        55.289       57.970 i/s -     100.000 times in 3.521761s 3.471812s 1.808673s 1.725029s

Comparison:
                              dom
        before(YJIT):        32.8 i/s
         after(YJIT):        32.3 i/s - 1.01x  slower
              before:        17.7 i/s - 1.85x  slower
               after:        17.6 i/s - 1.86x  slower

                              sax
         after(YJIT):        49.9 i/s
        before(YJIT):        48.9 i/s - 1.02x  slower
               after:        25.4 i/s - 1.97x  slower
              before:        25.3 i/s - 1.98x  slower

                             pull
         after(YJIT):        61.8 i/s
        before(YJIT):        61.6 i/s - 1.00x  slower
               after:        29.1 i/s - 2.12x  slower
              before:        29.0 i/s - 2.13x  slower

                           stream
         after(YJIT):        58.0 i/s
        before(YJIT):        55.3 i/s - 1.05x  slower
               after:        28.8 i/s - 2.01x  slower
              before:        28.4 i/s - 2.04x  slower

```

- YJIT=ON : 0.98x - 1.05x faster
- YJIT=OFF : 0.98x - 1.02x faster

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb |  6 +++++-
 test/test_pullparser.rb         | 21 +++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 275372ee..02759e70 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -511,7 +511,11 @@ def normalize( input, entities=nil, entity_filter=nil )
 
       # Unescapes all possible entities
       def unnormalize( string, entities=nil, filter=nil )
-        rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
+        if string.include?("\r")
+          rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
+        else
+          rv = string.dup
+        end
         matches = rv.scan( REFERENCE_RE )
         return rv if matches.size == 0
         rv.gsub!( Private::CHARACTER_REFERENCES ) {
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index b6a48c93..073d896d 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -82,6 +82,27 @@ def test_character_references
       end
     end
 
+    def test_text_content_with_line_breaks
+      source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
+      parser = REXML::Parsers::PullParser.new( source )
+
+      events = {}
+      element_name = ''
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :start_element
+          element_name = event[0]
+        when :text
+          events[element_name] = event[1]
+        end
+      end
+
+      assert_equal('A', events['a'])
+      assert_equal("B\n", events['b'])
+      assert_equal("C\n", events['c'])
+    end
+
     def test_peek_unshift
       source = "<a><b/></a>"
       REXML::Parsers::PullParser.new(source)

From eb45c8dcca962c04e56f46b0040b2c33278ca3f9 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 8 Jul 2024 05:52:19 +0900
Subject: [PATCH 16/40] fix: Extra content at the end of the document (#161)

## Why?

XML with additional content at the end of the document is invalid.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]    Misc       ::=          Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]    PI         ::=          '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]    PITarget           ::=          Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```
---
 lib/rexml/parsers/baseparser.rb           |  9 ++++++
 test/parse/test_comment.rb                | 12 ++++++++
 test/parse/test_element.rb                | 34 +++++++++++++++++++++++
 test/parse/test_processing_instruction.rb | 12 ++++++++
 test/parse/test_text.rb                   | 25 +++++++++++++++++
 test/test_pullparser.rb                   | 14 +++++-----
 6 files changed, 99 insertions(+), 7 deletions(-)
 create mode 100644 test/parse/test_text.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 02759e70..900c19cc 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -460,8 +460,12 @@ def pull_event
                 @closed = tag
                 @nsstack.shift
               else
+                if @tags.empty? and @have_root
+                  raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
+                end
                 @tags.push( tag )
               end
+              @have_root = true
               return [ :start_element, tag, attributes ]
             end
           else
@@ -469,6 +473,11 @@ def pull_event
             if text.chomp!("<")
               @source.position -= "<".bytesize
             end
+            if @tags.empty? and @have_root
+              unless /\A\s*\z/.match?(text)
+                raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
+              end
+            end
             return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index ce6678e8..46a07409 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end
         DETAIL
       end
     end
+
+    def test_after_root
+      parser = REXML::Parsers::BaseParser.new('<a></a><!-- ok comment -->')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal(" ok comment ", events[:comment])
+    end
   end
 end
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 14d0703a..a65cfa85 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start
 </ </x>
         DETAIL
       end
+
+      def test_after_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a></a><b>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra tag at the end of the document (got '<b')
+          Line: 1
+          Position: 10
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+
+      def test_after_empty_element_tag_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a/><b>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra tag at the end of the document (got '<b')
+          Line: 1
+          Position: 7
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
     end
   end
 end
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index f0c0c24e..40dadd11 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -40,5 +40,17 @@ def test_garbage_text
                      ])
       end
     end
+
+    def test_after_root
+      parser = REXML::Parsers::BaseParser.new('<a></a><?abc version="1.0" ?>')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal("abc", events[:processing_instruction])
+    end
   end
 end
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
new file mode 100644
index 00000000..f1622b71
--- /dev/null
+++ b/test/parse/test_text.rb
@@ -0,0 +1,25 @@
+require "test/unit"
+require 'rexml/parsers/baseparser'
+
+module REXMLTests
+  class TestParseText < Test::Unit::TestCase
+    class TestInvalid < self
+      def test_after_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a></a>c')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra content at the end of the document (got 'c')
+          Line: 1
+          Position: 8
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+    end
+  end
+end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 073d896d..0aca46be 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -63,8 +63,10 @@ def test_entity_replacement
     end
 
     def test_character_references
-      source = '<a>&#65;</a><b>&#x42;</b>'
+      source = '<root><a>&#65;</a><b>&#x42;</b></root>'
       parser = REXML::Parsers::PullParser.new( source )
+
+      events = {}
       element_name = ''
       while parser.has_next?
         event = parser.pull
@@ -72,14 +74,12 @@ def test_character_references
         when :start_element
           element_name = event[0]
         when :text
-          case element_name
-          when 'a'
-            assert_equal('A', event[1])
-          when 'b'
-            assert_equal('B', event[1])
-          end
+          events[element_name] = event[1]
         end
       end
+
+      assert_equal('A', events['a'])
+      assert_equal("B", events['b'])
     end
 
     def test_text_content_with_line_breaks

From ebc3e85bfa2796fb4922c1932760bec8390ff87c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 8 Jul 2024 05:54:06 +0900
Subject: [PATCH 17/40] Add position check for XML declaration (#162)

## Why?
XML declaration must be the first item.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog

```
[22]   prolog   ::=   	XMLDecl Misc* (doctypedecl Misc*)?
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl

```
[23]   XMLDecl  ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
```

See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193
---
 lib/rexml/parsers/baseparser.rb           |  5 ++++-
 test/parse/test_processing_instruction.rb | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 900c19cc..2a448e13 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -644,7 +644,10 @@ def process_instruction(start_position)
           @source.position = start_position
           raise REXML::ParseException.new(message, @source)
         end
-        if @document_status.nil? and match_data[1] == "xml"
+        if match_data[1] == "xml"
+          if @document_status
+            raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
+          end
           content = match_data[2]
           version = VERSION.match(content)
           version = version[1] unless version.nil?
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 40dadd11..13384935 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -39,6 +39,23 @@ def test_garbage_text
                        pi.content,
                      ])
       end
+
+      def test_xml_declaration_not_at_document_start
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a><?xml version="1.0" ?></a>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: XML declaration is not at the start
+          Line: 1
+          Position: 25
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
     end
 
     def test_after_root

From b2ec329dc1dc7635b224a6d61687c24b1e1db6fd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Wed, 10 Jul 2024 09:50:12 +0900
Subject: [PATCH 18/40] test: move an attribute value test to
 parse/test_element.rb

---
 test/parse/test_element.rb | 11 +++++++++++
 test/test_document.rb      | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index a65cfa85..261f25c3 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseElement < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -120,5 +124,12 @@ def test_after_empty_element_tag_root
         DETAIL
       end
     end
+
+    def test_gt_linear_performance_attribute_value
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<test testing="' + ">" * n + '"></test>')
+      end
+    end
   end
 end
diff --git a/test/test_document.rb b/test/test_document.rb
index 2b0a8a73..ec0e8a5a 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
 # frozen_string_literal: false
 
-require 'core_assertions'
-
 module REXMLTests
   class TestDocument < Test::Unit::TestCase
-    include Test::Unit::CoreAssertions
-
     def test_version_attributes_to_s
       doc = REXML::Document.new(<<~eoxml)
         <?xml version="1.0" encoding="UTF-8" standalone="no"?>
@@ -202,13 +198,6 @@ def test_xml_declaration_standalone
       assert_equal('no', doc.stand_alone?, bug2539)
     end
 
-    def test_gt_linear_performance_attribute_value
-      seq = [10000, 50000, 100000, 150000, 200000]
-      assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<test testing="' + ">" * n + '"></test>')
-      end
-    end
-
     def test_each_recursive
       xml_source = <<~XML
         <?xml version="1.0" encoding="UTF-8" standalone="yes"?>

From 5e140edc3051741691e00bf96fa5119b44288a42 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 11 Jul 2024 09:49:56 +0900
Subject: [PATCH 19/40] Stop adding extra new line after XML declaration with
 pretty format (#164)

If the XML file does not end with a newline, a space is added to the end
of the first line.

```ruby
Failure: test_indent(REXMLTests::TestDocument::WriteTest::ArgumentsTest)
/Users/naitoh/ghq/github.com/naitoh/rexml/test/test_document.rb:270:in `test_indent'
     267:           output = ""
     268:           indent = 2
     269:           @document.write(output, indent)
  => 270:           assert_equal(<<-EOX.chomp, output)
     271: <?xml version='1.0' encoding='UTF-8'?>
     272: <message>
     273:   Hello world!
<"<?xml version='1.0' encoding='UTF-8'?>\n" +
"<message>\n" +
"  Hello world!\n" +
"</message>"> expected but was
<"<?xml version='1.0' encoding='UTF-8'?> \n" +
"<message>\n" +
"  Hello world!\n" +
"</message>">

diff:
? <?xml version='1.0' encoding='UTF-8'?>
  <message>
    Hello world!
  </message>

```

This is happen because `REXML::Formatters::Pretty#write_document` has a
logic that depends on the last text node.

We should ignore all top-level text nodes with pretty format.
---
 lib/rexml/formatters/pretty.rb |  2 +-
 test/test_document.rb          | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb
index a1198b7a..a838d835 100644
--- a/lib/rexml/formatters/pretty.rb
+++ b/lib/rexml/formatters/pretty.rb
@@ -111,7 +111,7 @@ def write_document( node, output )
         # itself, then we don't need a carriage return... which makes this
         # logic more complex.
         node.children.each { |child|
-          next if child == node.children[-1] and child.instance_of?(Text)
+          next if child.instance_of?(Text)
           unless child == node.children[0] or child.instance_of?(Text) or
             (child == node.children[1] and !node.children[0].writethis)
             output << "\n"
diff --git a/test/test_document.rb b/test/test_document.rb
index ec0e8a5a..9cd77c4e 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -236,7 +236,7 @@ def test_each_recursive
 
     class WriteTest < Test::Unit::TestCase
       def setup
-        @document = REXML::Document.new(<<-EOX)
+        @document = REXML::Document.new(<<-EOX.chomp)
 <?xml version="1.0" encoding="UTF-8"?>
 <message>Hello world!</message>
 EOX
@@ -246,7 +246,7 @@ class ArgumentsTest < self
         def test_output
           output = ""
           @document.write(output)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message>Hello world!</message>
 EOX
@@ -269,7 +269,7 @@ def test_transitive
           indent = 2
           transitive = true
           @document.write(output, indent, transitive)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message
 >Hello world!</message
@@ -298,7 +298,7 @@ def test_encoding
           japanese_text = "こんにちは"
           @document.root.text = japanese_text
           @document.write(output, indent, transitive, ie_hack, encoding)
-          assert_equal(<<-EOX.encode(encoding), output)
+          assert_equal(<<-EOX.chomp.encode(encoding), output)
 <?xml version='1.0' encoding='SHIFT_JIS'?>
 <message>#{japanese_text}</message>
 EOX
@@ -309,7 +309,7 @@ class OptionsTest < self
         def test_output
           output = ""
           @document.write(:output => output)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message>Hello world!</message>
 EOX
@@ -329,7 +329,7 @@ def test_indent
         def test_transitive
           output = ""
           @document.write(:output => output, :indent => 2, :transitive => true)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message
 >Hello world!</message
@@ -351,7 +351,7 @@ def test_encoding
           japanese_text = "こんにちは"
           @document.root.text = japanese_text
           @document.write(:output => output, :encoding => encoding)
-          assert_equal(<<-EOX.encode(encoding), output)
+          assert_equal(<<-EOX.chomp.encode(encoding), output)
 <?xml version='1.0' encoding='SHIFT_JIS'?>
 <message>#{japanese_text}</message>
 EOX

From 6d6400cdc03b612c3a3181b9055af87d3d2ddc68 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 11 Jul 2024 12:13:44 +0900
Subject: [PATCH 20/40] Add tests for REXML::Text.check (#165)

This patch will add missing REXML::Text.check tests.

This is the tests for the part that is checked using a regular
expression:

https://github.com/ruby/rexml/blob/b2ec329dc1dc7635b224a6d61687c24b1e1db6fd/lib/rexml/text.rb#L155-L172
---
 test/test_text_check.rb | 92 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 test/test_text_check.rb

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
new file mode 100644
index 00000000..d4076edf
--- /dev/null
+++ b/test/test_text_check.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: false
+
+module REXMLTests
+  class TextCheckTester < Test::Unit::TestCase
+
+    def check(string)
+      REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil)
+    end
+
+    def assert_check(string)
+      assert_nothing_raised { check(string) }
+    end
+
+    def assert_check_failed(string, illegal_part)
+      message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}"
+      assert_raise(RuntimeError.new(message)) do
+        check(string)
+      end
+    end
+
+    class TestValid < self
+      def test_entity_name_start_char_colon
+        assert_check('&:;')
+      end
+
+      def test_entity_name_start_char_under_score
+        assert_check('&_;')
+      end
+
+      def test_entity_name_mix
+        assert_check('&A.b-0123;')
+      end
+
+      def test_character_reference_decimal
+        assert_check('&#0162;')
+      end
+
+      def test_character_reference_hex
+        assert_check('&#x10FFFF;')
+      end
+
+      def test_entity_name_non_ascii
+        # U+3042 HIRAGANA LETTER A
+        # U+3044 HIRAGANA LETTER I
+        assert_check("&\u3042\u3044;")
+      end
+
+      def test_normal_string
+        assert_check("foo")
+      end
+    end
+
+    class TestInvalid < self
+      def test_lt
+        assert_check_failed('<;', '<')
+      end
+
+      def test_lt_mix
+        assert_check_failed('ab<cd', '<')
+      end
+
+      def test_entity_reference_missing_colon
+        assert_check_failed('&amp', '&')
+      end
+
+      def test_character_reference_decimal_invalid_value
+        # U+0008 BACKSPACE
+        assert_check_failed('&#8;', '&#8;')
+      end
+
+      def test_character_reference_format_hex_0x
+        # U+0041 LATIN CAPITAL LETTER A
+        assert_check_failed('&#0x41;', '&#0x41;')
+      end
+
+      def test_character_reference_format_hex_00x
+        # U+0041 LATIN CAPITAL LETTER A
+        assert_check_failed('&#00x41;', '&#00x41;')
+      end
+
+      def test_character_reference_hex_surrogate_block
+        # U+0D800 SURROGATE PAIR
+        assert_check_failed('&#xD800;', '&#xD800;')
+      end
+
+      def test_entity_name_non_ascii_symbol
+        # U+00BF INVERTED QUESTION MARK
+        assert_check_failed('&\u00BF;', '&')
+      end
+    end
+  end
+end

From 87f7cd0f492dbe4c0963d41e0fbec8f4d29d357a Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 11 Jul 2024 18:44:54 +0900
Subject: [PATCH 21/40] Fix test for Text.check (#166)

This patch will fix incorrect string in a case where unicode characters.
Because of the use of single quotes, it was simply an ASCII string.
---
 test/test_text_check.rb | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index d4076edf..56d00440 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -20,23 +20,23 @@ def assert_check_failed(string, illegal_part)
 
     class TestValid < self
       def test_entity_name_start_char_colon
-        assert_check('&:;')
+        assert_check("&:;")
       end
 
       def test_entity_name_start_char_under_score
-        assert_check('&_;')
+        assert_check("&_;")
       end
 
       def test_entity_name_mix
-        assert_check('&A.b-0123;')
+        assert_check("&A.b-0123;")
       end
 
       def test_character_reference_decimal
-        assert_check('&#0162;')
+        assert_check("&#0162;")
       end
 
       def test_character_reference_hex
-        assert_check('&#x10FFFF;')
+        assert_check("&#x10FFFF;")
       end
 
       def test_entity_name_non_ascii
@@ -52,40 +52,40 @@ def test_normal_string
 
     class TestInvalid < self
       def test_lt
-        assert_check_failed('<;', '<')
+        assert_check_failed("<;", "<")
       end
 
       def test_lt_mix
-        assert_check_failed('ab<cd', '<')
+        assert_check_failed("ab<cd", "<")
       end
 
       def test_entity_reference_missing_colon
-        assert_check_failed('&amp', '&')
+        assert_check_failed("&amp", "&")
       end
 
       def test_character_reference_decimal_invalid_value
         # U+0008 BACKSPACE
-        assert_check_failed('&#8;', '&#8;')
+        assert_check_failed("&#8;", "&#8;")
       end
 
       def test_character_reference_format_hex_0x
         # U+0041 LATIN CAPITAL LETTER A
-        assert_check_failed('&#0x41;', '&#0x41;')
+        assert_check_failed("&#0x41;", "&#0x41;")
       end
 
       def test_character_reference_format_hex_00x
         # U+0041 LATIN CAPITAL LETTER A
-        assert_check_failed('&#00x41;', '&#00x41;')
+        assert_check_failed("&#00x41;", "&#00x41;")
       end
 
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
-        assert_check_failed('&#xD800;', '&#xD800;')
+        assert_check_failed("&#xD800;", "&#xD800;")
       end
 
       def test_entity_name_non_ascii_symbol
         # U+00BF INVERTED QUESTION MARK
-        assert_check_failed('&\u00BF;', '&')
+        assert_check_failed("&\u00BF;", "&")
       end
     end
   end

From 18214a6460b6e6ba583acec656634edda50b2768 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 20:52:09 +0900
Subject: [PATCH 22/40] test Text.check: add empty reference case

---
 test/test_text_check.rb | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 56d00440..08cacbdb 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -59,6 +59,10 @@ def test_lt_mix
         assert_check_failed("ab<cd", "<")
       end
 
+      def test_reference_empty
+        assert_check_failed("&;", "&")
+      end
+
       def test_entity_reference_missing_colon
         assert_check_failed("&amp", "&")
       end

From 0832562f31375d33077f6a737cee2e3df8424b28 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:00:43 +0900
Subject: [PATCH 23/40] test Text.check: add garbage at the end in character
 reference cases

---
 test/test_text_check.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 08cacbdb..b2eebe92 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -67,6 +67,11 @@ def test_entity_reference_missing_colon
         assert_check_failed("&amp", "&")
       end
 
+      def test_character_reference_decimal_garbage_at_the_end
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#48x;", "&")
+      end
+
       def test_character_reference_decimal_invalid_value
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
@@ -82,6 +87,11 @@ def test_character_reference_format_hex_00x
         assert_check_failed("&#00x41;", "&#00x41;")
       end
 
+      def test_character_reference_hex_garbage_at_the_end
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#x48x;", "&")
+      end
+
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
         assert_check_failed("&#xD800;", "&#xD800;")

From 704044056df5bd03ffb60303f42999c8780b0770 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:03:54 +0900
Subject: [PATCH 24/40] test Text.check: use "why" for test name

---
 test/test_text_check.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index b2eebe92..1ba534fa 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -72,7 +72,7 @@ def test_character_reference_decimal_garbage_at_the_end
         assert_check_failed("&#48x;", "&")
       end
 
-      def test_character_reference_decimal_invalid_value
+      def test_character_reference_decimal_control_character
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
       end

From ddea83ff7a890b9d341fca1aa031d575aa88d1ac Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:06:08 +0900
Subject: [PATCH 25/40] test Text.check: add a space at the start in character
 reference cases

---
 test/test_text_check.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 1ba534fa..a1cc2149 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -72,6 +72,11 @@ def test_character_reference_decimal_garbage_at_the_end
         assert_check_failed("&#48x;", "&")
       end
 
+      def test_character_reference_decimal_space_at_the_start
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&# 48;", "&")
+      end
+
       def test_character_reference_decimal_control_character
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
@@ -92,6 +97,11 @@ def test_character_reference_hex_garbage_at_the_end
         assert_check_failed("&#x48x;", "&")
       end
 
+      def test_character_reference_hex_space_at_the_start
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#x 30;", "&")
+      end
+
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
         assert_check_failed("&#xD800;", "&#xD800;")

From 20f808478c4b5243adb24cae4fcc357db7116853 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:08:26 +0900
Subject: [PATCH 26/40] test Text.check: add entity reference with new line
 case

---
 test/test_text_check.rb | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index a1cc2149..11cf65a3 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -111,6 +111,11 @@ def test_entity_name_non_ascii_symbol
         # U+00BF INVERTED QUESTION MARK
         assert_check_failed("&\u00BF;", "&")
       end
+
+      def test_entity_name_new_line
+        # U+0026 AMPERSAND
+        assert_check_failed("&\namp\nx;", "&")
+      end
     end
   end
 end

From a5075c151d8e700057d7b3e1fd1db571ac2c4c4c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Fri, 12 Jul 2024 09:33:30 +0900
Subject: [PATCH 27/40] Do not output :text event after the root tag is closed
 (#167)

## Why?
GitHub: fix GH-163

## Change
- sax_test.rb
```
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'

require 'libxml-ruby'
require 'nokogiri'

xml = <<EOS
<root> a b  c
</root>
<!-- ok comment -->
<?abc version="1.0" ?>
EOS

class Listener
  def method_missing(name, *args)
    p [name, *args]
  end
end

puts "LibXML(SAX)"
parser = LibXML::XML::SaxParser.string(xml)
parser.callbacks = Listener.new
parser.parse

puts ""
puts "Nokogiri(SAX)"
parser = Nokogiri::XML::SAX::Parser.new(Listener.new)
parser.parse(xml)

puts ""
puts "REXML(SAX)"
parser = REXML::Parsers::SAX2Parser.new(xml)
parser.listen(Listener.new)
parser.parse

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
    res = parser.pull
    p res
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse
```

## Before (rexml 3.3.1)
```
LibXML(SAX)
[:on_start_document]
[:on_start_element_ns, "root", {}, nil, nil, {}]
[:on_characters, " a b  c \n"]
[:on_end_element_ns, "root", nil, nil]
[:on_comment, " ok comment "]
[:on_processing_instruction, "abc", "version=\"1.0\" "]
[:on_end_document]

Nokogiri(SAX)
[:start_document]
[:start_element_namespace, "root", [], nil, nil, []]
[:characters, " a b  c \n"]
[:end_element_namespace, "root", nil, nil]
[:comment, " ok comment "]
[:processing_instruction, "abc", "version=\"1.0\" "]
[:end_document]

REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, " a b  c \n"]
[:progress, 15]
[:end_element, nil, "root", "root"]
[:progress, 22]
[:characters, "\n"]
[:progress, 23]
[:comment, " ok comment "]
[:progress, 42]
[:characters, "\n"]
[:progress, 43]
[:processing_instruction, "abc", " version=\"1.0\" "]
[:progress, 65]
[:characters, "\n"]
[:progress, 66]
[:end_document]

REXML(Pull)
start_element: ["root", {}]
text: [" a b  c \n", " a b  c \n"]
end_element: ["root"]
text: ["\n", "\n"]
comment: [" ok comment "]
text: ["\n", "\n"]
processing_instruction: ["abc", " version=\"1.0\" "]
text: ["\n", "\n"]

REXML(Stream)
[:tag_start, "root", {}]
[:text, " a b  c \n"]
[:tag_end, "root"]
[:text, "\n"]
[:comment, " ok comment "]
[:text, "\n"]
[:instruction, "abc", " version=\"1.0\" "]
[:text, "\n"]
```

## After(This PR)
```
REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, " a b  c \n"]
[:progress, 15]
[:end_element, nil, "root", "root"]
[:progress, 22]
[:comment, " ok comment "]
[:progress, 42]
[:processing_instruction, "abc", " version=\"1.0\" "]
[:progress, 65]
[:end_document]

REXML(Pull)
start_element: ["root", {}]
text: [" a b  c \n", " a b  c \n"]
end_element: ["root"]
comment: [" ok comment "]
processing_instruction: ["abc", " version=\"1.0\" "]
end_document: []

REXML(Stream)
[:tag_start, "root", {}]
[:text, " a b  c \n"]
[:tag_end, "root"]
[:comment, " ok comment "]
[:instruction, "abc", " version=\"1.0\" "]
```
---
 lib/rexml/parsers/baseparser.rb |  1 +
 test/parse/test_text.rb         | 15 +++++++++++++++
 test/parser/test_ultra_light.rb |  1 -
 test/test_core.rb               |  2 +-
 test/test_document.rb           |  2 +-
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 2a448e13..5cf1af21 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -477,6 +477,7 @@ def pull_event
               unless /\A\s*\z/.match?(text)
                 raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
               end
+              return pull_event
             end
             return [ :text, text ]
           end
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
index f1622b71..1acefc40 100644
--- a/test/parse/test_text.rb
+++ b/test/parse/test_text.rb
@@ -21,5 +21,20 @@ def test_after_root
         DETAIL
       end
     end
+
+    def test_whitespace_characters_after_root
+      parser = REXML::Parsers::BaseParser.new('<a>b</a> ')
+
+      events = []
+      while parser.has_next?
+        event = parser.pull
+        case event[0]
+        when :text
+          events << event[1]
+        end
+      end
+
+      assert_equal(["b"], events)
+    end
   end
 end
diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb
index 44fd1d1e..b3f576ff 100644
--- a/test/parser/test_ultra_light.rb
+++ b/test/parser/test_ultra_light.rb
@@ -17,7 +17,6 @@ def test_entity_declaration
                        [:entitydecl, "name", "value"]
                      ],
                      [:start_element, :parent, "root", {}],
-                     [:text, "\n"],
                    ],
                    parse(<<-INTERNAL_SUBSET))
 <!ENTITY name "value">
diff --git a/test/test_core.rb b/test/test_core.rb
index 44e2e7ea..e1fba8a7 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -826,7 +826,7 @@ def test_deep_clone
     end
 
     def test_whitespace_before_root
-      a = <<EOL
+      a = <<EOL.chomp
 <?xml version='1.0'?>
   <blo>
     <wak>
diff --git a/test/test_document.rb b/test/test_document.rb
index 9cd77c4e..33cf4002 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -435,7 +435,7 @@ def test_utf_16
 
           actual_xml = ""
           document.write(actual_xml)
-          expected_xml = <<-EOX.encode("UTF-16BE")
+          expected_xml = <<-EOX.chomp.encode("UTF-16BE")
 \ufeff<?xml version='1.0' encoding='UTF-16'?>
 <message>Hello world!</message>
 EOX

From 4ebf21f686654af7254beb3721a5c57990eafc30 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 14 Jul 2024 20:22:00 +0900
Subject: [PATCH 28/40] Fix a bug that SAX2 parser doesn't expand the
 predefined entities for "characters" (#168)

## Why?

SAX2 parser expand user-defined entity references and character
references but doesn't expand predefined entity references.

## Change
- text_unnormalized.rb
```
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'

xml = <<EOS
<root>
  <A>&lt;P&gt;&https://github.com/ruby/rexml/pull/13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</A>
</root>
EOS

class Listener
  def method_missing(name, *args)
    p [name, *args]
  end
end

puts "REXML(DOM)"
REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text}

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
    res = parser.pull
    p res
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse

puts ""
puts "REXML(SAX)"
parser = REXML::Parsers::SAX2Parser.new(xml)
parser.listen(Listener.new)
parser.parse
```

## Before (master)
```
$ ruby text_unnormalized.rb
REXML(DOM)
 <I> <B> Text </B>  </I>

REXML(Pull)
start_element: ["root", {}]
text: ["\n  ", "\n  "]
start_element: ["A", {}]
text: ["&lt;P&gt;&https://github.com/ruby/rexml/pull/13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;", "<P>\r <I> <B> Text </B>  </I>"]
end_element: ["A"]
text: ["\n", "\n"]
end_element: ["root"]
end_document: []

REXML(Stream)
[:tag_start, "root", {}]
[:text, "\n  "]
[:tag_start, "A", {}]
[:text, "<P>\r <I> <B> Text </B>  </I>"]
[:tag_end, "A"]
[:text, "\n"]
[:tag_end, "root"]

REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "&lt;P&gt;\r &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;"] #<= This
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```

## After(This PR)
```
$ ruby text_unnormalized.rb
REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "<P>\r <I> <B> Text </B>  </I>"]
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```
---
 lib/rexml/parsers/sax2parser.rb   | 21 ++-------------------
 lib/rexml/parsers/streamparser.rb |  4 ++--
 test/test_pullparser.rb           | 16 ++++++++++++++++
 test/test_sax.rb                  | 11 +++++++++++
 4 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb
index 6a24ce22..36f98c2a 100644
--- a/lib/rexml/parsers/sax2parser.rb
+++ b/lib/rexml/parsers/sax2parser.rb
@@ -157,25 +157,8 @@ def parse
               end
             end
           when :text
-            #normalized = @parser.normalize( event[1] )
-            #handle( :characters, normalized )
-            copy = event[1].clone
-
-            esub = proc { |match|
-              if @entities.has_key?($1)
-                @entities[$1].gsub(Text::REFERENCE, &esub)
-              else
-                match
-              end
-            }
-
-            copy.gsub!( Text::REFERENCE, &esub )
-            copy.gsub!( Text::NUMERICENTITY ) {|m|
-              m=$1
-              m = "0#{m}" if m[0] == ?x
-              [Integer(m)].pack('U*')
-            }
-            handle( :characters, copy )
+            unnormalized = @parser.unnormalize( event[1], @entities )
+            handle( :characters, unnormalized )
           when :entitydecl
             handle_entitydecl( event )
           when :processing_instruction, :comment, :attlistdecl,
diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb
index 9e0eb0b3..fa3ac496 100644
--- a/lib/rexml/parsers/streamparser.rb
+++ b/lib/rexml/parsers/streamparser.rb
@@ -36,8 +36,8 @@ def parse
             @listener.tag_end( event[1] )
             @tag_stack.pop
           when :text
-            normalized = @parser.unnormalize( event[1] )
-            @listener.text( normalized )
+            unnormalized = @parser.unnormalize( event[1] )
+            @listener.text( unnormalized )
           when :processing_instruction
             @listener.instruction( *event[1,2] )
           when :start_doctype
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 0aca46be..096e8b7f 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -82,6 +82,22 @@ def test_character_references
       assert_equal("B", events['b'])
     end
 
+    def test_text_entity_references
+      source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</a></root>'
+      parser = REXML::Parsers::PullParser.new( source )
+
+      events = []
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :text
+          events << event[1]
+        end
+      end
+
+      assert_equal(["<P> <I> <B> Text </B>  </I>"], events)
+    end
+
     def test_text_content_with_line_breaks
       source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
       parser = REXML::Parsers::PullParser.new( source )
diff --git a/test/test_sax.rb b/test/test_sax.rb
index 8e905f2e..5a3f5e4e 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -31,6 +31,17 @@ def test_entity_replacement
       assert_equal '--1234--', results[1]
     end
 
+    def test_characters_predefined_entities
+      source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</a></root>'
+
+      sax = Parsers::SAX2Parser.new( source )
+      results = []
+      sax.listen(:characters) {|x| results << x }
+      sax.parse
+
+      assert_equal(["<P> <I> <B> Text </B>  </I>"], results)
+    end
+
     def test_sax2
       File.open(fixture_path("documentation.xml")) do |f|
         parser = Parsers::SAX2Parser.new( f )

From b8a5f4cd5c8fe29c65d7a00e67170223d9d2b50e Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:48:53 +0900
Subject: [PATCH 29/40] Fix performance issue caused by using repeated `>`
 characters inside `<?xml` (#170)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb           |  3 ++-
 lib/rexml/source.rb                       |  6 +++---
 test/parse/test_processing_instruction.rb | 11 +++++++++++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5cf1af21..b117e654 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -125,6 +125,7 @@ class BaseParser
 
       module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+        INSTRUCTION_TERM = "?>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -639,7 +640,7 @@ def parse_id_invalid_details(accept_external_id:,
       end
 
       def process_instruction(start_position)
-        match_data = @source.match(Private::INSTRUCTION_END, true)
+        match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
         unless match_data
           message = "Invalid processing instruction node"
           @source.position = start_position
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 5715c352..4c30532a 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -117,7 +117,7 @@ def read_until(term)
     def ensure_buffer
     end
 
-    def match(pattern, cons=false)
+    def match(pattern, cons=false, term: nil)
       if cons
         @scanner.scan(pattern).nil? ? nil : @scanner
       else
@@ -240,7 +240,7 @@ def ensure_buffer
     # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
     # - ">"
     # - "XXX>" (X is any string excluding '>')
-    def match( pattern, cons=false )
+    def match( pattern, cons=false, term: nil )
       while true
         if cons
           md = @scanner.scan(pattern)
@@ -250,7 +250,7 @@ def match( pattern, cons=false )
         break if md
         return nil if pattern.is_a?(String)
         return nil if @source.nil?
-        return nil unless read
+        return nil unless read(term)
       end
 
       md.nil? ? nil : @scanner
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 13384935..ac4c2ff0 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseProcessinInstruction < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -69,5 +73,12 @@ def test_after_root
 
       assert_equal("abc", events[:processing_instruction])
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')
+      end
+    end
   end
 end

From 0af55fa49d4c9369f90f239a9571edab800ed36e Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:57:39 +0900
Subject: [PATCH 30/40] Fix ReDoS caused by very large character references
 using repeated 0s (#169)

This patch will fix the ReDoS that is caused by large string of 0s on a
character reference (like `&#00000000...`).

This is occurred in Ruby 3.1 or earlier.
---
 lib/rexml/text.rb                      | 48 ++++++++++++++++++--------
 test/parse/test_character_reference.rb | 17 +++++++++
 2 files changed, 51 insertions(+), 14 deletions(-)
 create mode 100644 test/parse/test_character_reference.rb

diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
index b47bad3b..7e0befe9 100644
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@@ -151,25 +151,45 @@ def Text.check string, pattern, doctype
         end
       end
 
-      # context sensitive
-      string.scan(pattern) do
-        if $1[-1] != ?;
-          raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
-        elsif $1[0] == ?&
-          if $5 and $5[0] == ?#
-            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-            when *VALID_CHAR
+      pos = 0
+      while (index = string.index(/<|&/, pos))
+        if string[index] == "<"
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        unless (end_index = string.index(/[^\s];/, index + 1))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        value = string[(index + 1)..end_index]
+        if /\s/.match?(value)
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        if value[0] == "#"
+          character_reference = value[1..-1]
+
+          unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
+            if character_reference[0] == "x" || character_reference[-1] == "x"
+              raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
             else
-              raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
+              raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
             end
-          # FIXME: below can't work but this needs API change.
-          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
-          #   if !doctype or !doctype.entities.has_key?($3)
-          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
-          #   end
           end
+
+          case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
+          end
+        elsif !(/\A#{Entity::NAME}\z/um.match?(value))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
         end
+
+        pos = end_index + 1
       end
+
+      string
     end
 
     def node_type
diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb
new file mode 100644
index 00000000..8ddeccaa
--- /dev/null
+++ b/test/parse/test_character_reference.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseCharacterReference < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance_many_preceding_zeros
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<test testing="&#' + "0" * n + '97;"/>')
+      end
+    end
+  end
+end

From c1b64c174ec2e8ca2174c51332670e3be30c865f Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:57:50 +0900
Subject: [PATCH 31/40] Fix performance issue caused by using repeated `>`
 characters inside comments (#171)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb |  3 ++-
 test/parse/test_comment.rb      | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index b117e654..ba205175 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -126,6 +126,7 @@ class BaseParser
       module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         INSTRUCTION_TERM = "?>"
+        COMMENT_TERM = "-->"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -243,7 +244,7 @@ def pull_event
             return process_instruction(start_position)
           elsif @source.match("<!", true)
             if @source.match("--", true)
-              md = @source.match(/(.*?)-->/um, true)
+              md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
               if md.nil?
                 raise REXML::ParseException.new("Unclosed comment", @source)
               end
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 46a07409..543d9ad8 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseComment < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -117,5 +121,12 @@ def test_after_root
 
       assert_equal(" ok comment ", events[:comment])
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!-- ' + ">" * n + ' -->')
+      end
+    end
   end
 end

From 9f1415a2616c77cad44a176eee90e8457b4774b6 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:04:40 +0900
Subject: [PATCH 32/40] Fix performance issue caused by using repeated `>`
 characters inside `CDATA [ PAYLOAD ]` (#172)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb |  3 ++-
 test/parse/test_cdata.rb        | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_cdata.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index ba205175..e2c0fd80 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -127,6 +127,7 @@ module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
+        CDATA_TERM = "]]>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -431,7 +432,7 @@ def pull_event
 
                 return [ :comment, md[1] ]
               else
-                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
+                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
                 return [ :cdata, md[1] ] if md
               end
               raise REXML::ParseException.new( "Declarations can only occur "+
diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb
new file mode 100644
index 00000000..9e8fa8b2
--- /dev/null
+++ b/test/parse/test_cdata.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseCData < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<description><![CDATA[ ' + ">" * n + ' ]]></description>')
+      end
+    end
+  end
+end

From c33ea498102be65082940e8b7d6d31cb2c6e6ee2 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:11:17 +0900
Subject: [PATCH 33/40] Fix performance issue caused by using repeated `>`
 characters after ` <!DOCTYPE name` (#173)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb              |  3 ++-
 test/parse/test_document_type_declaration.rb | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e2c0fd80..7fe6c4e8 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -128,6 +128,7 @@ module Private
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
         CDATA_TERM = "]]>"
+        DOCTYPE_TERM = "]>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -384,7 +385,7 @@ def pull_event
               end
               return [ :comment, md[1] ] if md
             end
-          elsif match = @source.match(/(%.*?;)\s*/um, true)
+          elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
             return [ :externalentity, match[1] ]
           elsif @source.match(/\]\s*>/um, true)
             @document_status = :after_doctype
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 3ca0b536..61c3f04d 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -1,9 +1,13 @@
 # frozen_string_literal: false
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseDocumentTypeDeclaration < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     private
     def parse(doctype)
       REXML::Document.new(<<-XML).doctype
@@ -276,6 +280,16 @@ def test_notation_attlist
                      doctype.children.collect(&:class))
       end
 
+      def test_gt_linear_performance_malformed_entity
+        seq = [10000, 50000, 100000, 150000, 200000]
+        assert_linear_performance(seq, rehearsal: 10) do |n|
+          begin
+            REXML::Document.new('<!DOCTYPE root [' + "%>" * n + ']><test/>')
+          rescue
+          end
+        end
+      end
+
       private
       def parse(internal_subset)
         super(<<-DOCTYPE)

From a79ac8b4b42a9efabe33a0be31bd82d33fd50347 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:18:11 +0900
Subject: [PATCH 34/40] Fix performance issue caused by using repeated `>`
 characters inside `<!DOCTYPE root [<!-- PAYLOAD -->]>` (#174)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb              | 2 +-
 test/parse/test_document_type_declaration.rb | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 7fe6c4e8..4fcdaba7 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -378,7 +378,7 @@ def pull_event
                 raise REXML::ParseException.new(message, @source)
               end
               return [:notationdecl, name, *id]
-            elsif md = @source.match(/--(.*?)-->/um, true)
+            elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
               case md[1]
               when /--/, /-\z/
                 raise REXML::ParseException.new("Malformed comment", @source)
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 61c3f04d..3c3371ea 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -290,6 +290,13 @@ def test_gt_linear_performance_malformed_entity
         end
       end
 
+      def test_gt_linear_performance_comment
+        seq = [10000, 50000, 100000, 150000, 200000]
+        assert_linear_performance(seq, rehearsal: 10) do |n|
+          REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
+        end
+      end
+
       private
       def parse(internal_subset)
         super(<<-DOCTYPE)

From 67efb5951ed09dbb575c375b130a1e469f437d1f Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:26:57 +0900
Subject: [PATCH 35/40] Fix performance issue caused by using repeated `>`
 characters inside `<!DOCTYPE name [<!ENTITY>]>` (#175)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb       | 8 ++++++--
 test/parse/test_entity_declaration.rb | 7 +++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 4fcdaba7..e8f1a069 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -124,11 +124,15 @@ class BaseParser
       }
 
       module Private
-        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+        # Terminal requires two or more letters.
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
         CDATA_TERM = "]]>"
         DOCTYPE_TERM = "]>"
+        # Read to the end of DOCTYPE because there is no proper ENTITY termination
+        ENTITY_TERM = DOCTYPE_TERM
+
+        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -313,7 +317,7 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
+              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM).captures.compact]
               ref = false
               if match[1] == '%'
                 ref = true
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index e15deec6..07529016 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -32,5 +32,12 @@ def test_empty
  <!ENTITY>  ]> <r/>
       DETAIL
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">')
+      end
+    end
   end
 end

From 1cc1d9a74ede52f3d9ce774cafb11c57b3905165 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:27:57 +0900
Subject: [PATCH 36/40] Suppress have_root not initialized warnings on Ruby < 3

---
 lib/rexml/parsers/baseparser.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e8f1a069..860be203 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -165,6 +165,7 @@ def add_listener( listener )
       def stream=( source )
         @source = SourceFactory.create_from( source )
         @closed = nil
+        @have_root = false
         @document_status = nil
         @tags = []
         @stack = []

From 1f1e6e9b40bf339894e843dfd679c2fb1a5ddbf2 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:35:41 +0900
Subject: [PATCH 37/40] Fix ReDoS by using repeated space characters inside
 `<!DOCTYPE name [<!ATTLIST>]>` (#176)

Fix performance by removing unnecessary spaces.

This is occurred in Ruby 3.1 or earlier.
---
 lib/rexml/parsers/baseparser.rb |  2 +-
 test/parse/test_attlist.rb      | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_attlist.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 860be203..47380f0d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -350,7 +350,7 @@ def pull_event
               contents = md[0]
 
               pairs = {}
-              values = md[0].scan( ATTDEF_RE )
+              values = md[0].strip.scan( ATTDEF_RE )
               values.each do |attdef|
                 unless attdef[3] == "#IMPLIED"
                   attdef.compact!
diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb
new file mode 100644
index 00000000..eee9309c
--- /dev/null
+++ b/test/parse/test_attlist.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseAttlist < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!DOCTYPE schema SYSTEM "foo.dtd" [<!ATTLIST ' + " " * n + ' root v CDATA #FIXED "test">]>')
+      end
+    end
+  end
+end

From 910e5a2b487cb5a30989884a39f9cad2cc499cfc Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:36:05 +0900
Subject: [PATCH 38/40] Fix performance issue caused by using repeated `>`
 characters inside `<xml><!-- --></xml>` (#177)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb | 2 +-
 test/parse/test_comment.rb      | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 47380f0d..5688c773 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -430,7 +430,7 @@ def pull_event
               #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
               raise REXML::ParseException.new("Malformed node", @source) unless md
               if md[0][0] == ?-
-                md = @source.match(/--(.*?)-->/um, true)
+                md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
 
                 if md.nil? || /--|-\z/.match?(md[1])
                   raise REXML::ParseException.new("Malformed comment", @source)
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 543d9ad8..50c765f5 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -128,5 +128,12 @@ def test_gt_linear_performance
         REXML::Document.new('<!-- ' + ">" * n + ' -->')
       end
     end
+
+    def test_gt_linear_performance_in_element
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<xml><!-- ' + '>' * n + ' --></xml>')
+      end
+    end
   end
 end

From 0e33d3adfb5069b20622e5ed9393d10b8cc17b40 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:37:45 +0900
Subject: [PATCH 39/40] test: improve linear performance test names

Use "test_linear_performance_XXX" style.
---
 test/parse/test_attlist.rb                   | 2 +-
 test/parse/test_cdata.rb                     | 2 +-
 test/parse/test_character_reference.rb       | 2 +-
 test/parse/test_comment.rb                   | 4 ++--
 test/parse/test_document_type_declaration.rb | 4 ++--
 test/parse/test_element.rb                   | 2 +-
 test/parse/test_entity_declaration.rb        | 2 +-
 test/parse/test_processing_instruction.rb    | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb
index eee9309c..c1b4376c 100644
--- a/test/parse/test_attlist.rb
+++ b/test/parse/test_attlist.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseAttlist < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!DOCTYPE schema SYSTEM "foo.dtd" [<!ATTLIST ' + " " * n + ' root v CDATA #FIXED "test">]>')
diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb
index 9e8fa8b2..b5f1a3bc 100644
--- a/test/parse/test_cdata.rb
+++ b/test/parse/test_cdata.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseCData < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<description><![CDATA[ ' + ">" * n + ' ]]></description>')
diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb
index 8ddeccaa..bf8d2190 100644
--- a/test/parse/test_character_reference.rb
+++ b/test/parse/test_character_reference.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseCharacterReference < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance_many_preceding_zeros
+    def test_linear_performance_many_preceding_zeros
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<test testing="&#' + "0" * n + '97;"/>')
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 50c765f5..b7892232 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -122,14 +122,14 @@ def test_after_root
       assert_equal(" ok comment ", events[:comment])
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_top_level_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!-- ' + ">" * n + ' -->')
       end
     end
 
-    def test_gt_linear_performance_in_element
+    def test_linear_performance_in_element_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<xml><!-- ' + '>' * n + ' --></xml>')
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 3c3371ea..490a27d4 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -280,7 +280,7 @@ def test_notation_attlist
                      doctype.children.collect(&:class))
       end
 
-      def test_gt_linear_performance_malformed_entity
+      def test_linear_performance_percent_gt
         seq = [10000, 50000, 100000, 150000, 200000]
         assert_linear_performance(seq, rehearsal: 10) do |n|
           begin
@@ -290,7 +290,7 @@ def test_gt_linear_performance_malformed_entity
         end
       end
 
-      def test_gt_linear_performance_comment
+      def test_linear_performance_comment_gt
         seq = [10000, 50000, 100000, 150000, 200000]
         assert_linear_performance(seq, rehearsal: 10) do |n|
           REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 261f25c3..2b0746ea 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -125,7 +125,7 @@ def test_after_empty_element_tag_root
       end
     end
 
-    def test_gt_linear_performance_attribute_value
+    def test_linear_performance_attribute_value_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<test testing="' + ">" * n + '"></test>')
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index 07529016..7d750b90 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -33,7 +33,7 @@ def test_empty
       DETAIL
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">')
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index ac4c2ff0..7943cd3c 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -74,7 +74,7 @@ def test_after_root
       assert_equal("abc", events[:processing_instruction])
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')

From 2b285ac0804f2918de642f7ed4646dc6d645a7fc Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:38:07 +0900
Subject: [PATCH 40/40] Add 3.3.2 entry

---
 NEWS.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 3e406574..3b62f6aa 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,53 @@
 # News
 
+## 3.3.2 - 2024-07-16 {#version-3-3-2}
+
+### Improvements
+
+  * Improved parse performance.
+    * GH-160
+    * Patch by NAITOH Jun.
+
+  * Improved parse performance.
+    * GH-169
+    * GH-170
+    * GH-171
+    * GH-172
+    * GH-173
+    * GH-174
+    * Patch by Watson.
+
+  * Added support for raising a parse exception when an XML has extra
+    content after the root element.
+    * GH-161
+    * Patch by NAITOH Jun.
+
+  * Added support for raising a parse exception when an XML
+    declaration exists in wrong position.
+    * GH-162
+    * Patch by NAITOH Jun.
+
+  * Removed needless a space after XML declaration in pretty print mode.
+    * GH-164
+    * Patch by NAITOH Jun.
+
+  * Stopped to emit `:text` event after the root element.
+    * GH-167
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a bug that SAX2 parser doesn't expand predefined entities for
+    `characters` callback.
+    * GH-168
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * NAITOH Jun
+
+  * Watson
+
 ## 3.3.1 - 2024-06-25 {#version-3-3-1}
 
 ### Improvements