diff --git a/NEWS.md b/NEWS.md index c8e9ecc0..3b62f6aa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,100 @@ # News +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * Patch by Watson. + + * Added support for raising a parse exception when an XML has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed needless a space after XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped to emit `:text` event after the root element. + * GH-167 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that SAX2 parser doesn't expand predefined entities for + `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `` comments. + * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changlog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. + * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. + +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + ## 3.3.0 - 2024-06-11 {#version-3-3-0} ### Improvements diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index bf913a82..a5808d7c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: @@ -1284,16 +1276,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index a1198b7a..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index eadc78f7..5688c773 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,6 +124,14 @@ class BaseParser } module Private + # Terminal requires two or more letters. + INSTRUCTION_TERM = "?>" + COMMENT_TERM = "-->" + CDATA_TERM = "]]>" + DOCTYPE_TERM = "]>" + # Read to the end of DOCTYPE because there is no proper ENTITY termination + ENTITY_TERM = DOCTYPE_TERM + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um @@ -132,13 +140,20 @@ module Private GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private - include Private def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new end def add_listener( listener ) @@ -150,6 +165,7 @@ def add_listener( listener ) def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] @@ -204,6 +220,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event @@ -216,7 +234,12 @@ def pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" @@ -228,7 +251,14 @@ def pull_event return process_instruction(start_position) elsif @source.match("/um, true)[1] ] + md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM) + if md.nil? + raise REXML::ParseException.new("Unclosed comment", @source) + end + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] elsif @source.match("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match(/\s+/um, true) @@ -240,7 +270,7 @@ def pull_event @source.position = start_position raise REXML::ParseException.new(message, @source) end - @nsstack.unshift(curr_ns=Set.new) + @nsstack.unshift(Set.new) name = parse_name(base_error_message) if @source.match(/\s*\[/um, true) id = [nil, nil, nil] @@ -288,7 +318,7 @@ def pull_event raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? return [ :elementdecl, "/um, true) + elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) case md[1] when /--/, /-\z/ raise REXML::ParseException.new("Malformed comment", @source) end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true) + elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype @source.match(/\s*/um, true) @@ -380,7 +413,7 @@ def pull_event if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - md = @source.match(CLOSE_PATTERN, true) + md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) @@ -397,16 +430,15 @@ def pull_event #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) + md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM) - case md[1] - when /--/, /-\z/ + if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ @@ -415,19 +447,19 @@ def pull_event return process_instruction(start_position) else # Get the next tag - md = @source.match(TAG_PATTERN, true) + md = @source.match(Private::TAG_PATTERN, true) unless md @source.position = start_position raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] + @prefixes.clear + @prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + attributes, closed = parse_attributes(@prefixes, curr_ns) # Verify that all of the prefixes have been defined - for prefix in prefixes + for prefix in @prefixes unless @nsstack.find{|k| k.member?(prefix)} raise UndefinedNamespaceException.new(prefix,@source,self) end @@ -437,8 +469,12 @@ def pull_event @closed = tag @nsstack.shift else + if @tags.empty? and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end @tags.push( tag ) end + @have_root = true return [ :start_element, tag, attributes ] end else @@ -446,6 +482,12 @@ def pull_event if text.chomp!("<") @source.position -= "<".bytesize end + if @tags.empty? and @have_root + unless /\A\s*\z/.match?(text) + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + end + return pull_event + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException @@ -488,10 +530,14 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -502,7 +548,7 @@ def unnormalize( string, entities=nil, filter=nil ) unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) else er = DEFAULT_ENTITIES[entity_reference] @@ -510,7 +556,7 @@ def unnormalize( string, entities=nil, filter=nil ) end end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end @@ -523,7 +569,7 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(NAME_PATTERN, true) + md = @source.match(Private::NAME_PATTERN, true) unless md if @source.match(/\s*\S/um) message = "#{base_error_message}: invalid name" @@ -602,13 +648,16 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction(start_position) - match_data = @source.match(INSTRUCTION_END, true) + match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM) unless match_data message = "Invalid processing instruction node" @source.position = start_position raise REXML::ParseException.new(message, @source) end - if @document_status.nil? and match_data[1] == "xml" + if match_data[1] == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..36f98c2a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -157,25 +157,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..0cb6f7cc 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -16,7 +16,6 @@ def add_listener( listener ) def parse tag_stack = [] - in_doctype = false entities = nil begin while true @@ -39,17 +38,15 @@ def parse tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +57,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 3e870822..573d0a13 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.0" + VERSION = "3.3.2" REVISION = "" Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 67154832..4c30532a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -55,6 +55,7 @@ class Source attr_reader :encoding module Private + SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} pre_defined_terms = ["'", '"', "<"] pre_defined_terms.each do |term| @@ -62,7 +63,6 @@ module Private end end private_constant :Private - include Private # Constructor # @param arg must be a String, and should be a valid XML document @@ -84,6 +84,12 @@ def buffer @scanner.rest end + def drop_parsed_content + if @scanner.pos > Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + def buffer_encoding=(encoding) @scanner.string.force_encoding(encoding) end @@ -111,7 +117,7 @@ def read_until(term) def ensure_buffer end - def match(pattern, cons=false) + def match(pattern, cons=false, term: nil) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -234,7 +240,7 @@ def ensure_buffer # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: # - ">" # - "XXX>" (X is any string excluding '>') - def match( pattern, cons=false ) + def match( pattern, cons=false, term: nil ) while true if cons md = @scanner.scan(pattern) @@ -244,7 +250,7 @@ def match( pattern, cons=false ) break if md return nil if pattern.is_a?(String) return nil if @source.nil? - return nil unless read + return nil unless read(term) end md.nil? ? nil : @scanner diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index b47bad3b..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type diff --git a/rexml.gemspec b/rexml.gemspec index 169e49dc..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb new file mode 100644 index 00000000..c1b4376c --- /dev/null +++ b/test/parse/test_attlist.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttlist < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(']>') + end + end + end +end diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..b5f1a3bc --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..bf8d2190 --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..b7892232 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,139 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def parse(xml) + REXML::Document.new(xml) + end + + class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_linear_performance_top_level_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + + def test_linear_performance_in_element_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 8faa0b78..490a27d4 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -53,6 +57,51 @@ def test_no_name end end + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + " * n + ']>') + rescue + end + end + end + + def test_linear_performance_comment_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' -->]>') + end + end + private def parse(internal_subset) super(<<-DOCTYPE) diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 14d0703a..2b0746ea 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -85,6 +89,47 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '" * n + '">') + end end end end diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb index e15deec6..7d750b90 100644 --- a/test/parse/test_entity_declaration.rb +++ b/test/parse/test_entity_declaration.rb @@ -32,5 +32,12 @@ def test_empty ]> DETAIL end + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('' * n + '">') + end + end end end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..7943cd3c 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseProcessinInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -39,6 +43,42 @@ def test_garbage_text pi.content, ]) end + + def test_xml_declaration_not_at_document_start + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + + DETAIL + end + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ?>') + end end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..1acefc40 --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,40 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? + event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end + end +end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? + parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..b3f576ff 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -17,7 +17,6 @@ def test_entity_declaration [:entitydecl, "name", "value"] ], [:start_element, :parent, "root", {}], - [:text, "\n"], ], parse(<<-INTERNAL_SUBSET)) diff --git a/test/test_core.rb b/test/test_core.rb index 44e2e7ea..e1fba8a7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -826,7 +826,7 @@ def test_deep_clone end def test_whitespace_before_root - a = < diff --git a/test/test_document.rb b/test/test_document.rb index 7fccbacb..33cf4002 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- # frozen_string_literal: false -require 'core_assertions' - module REXMLTests class TestDocument < Test::Unit::TestCase - include Test::Unit::CoreAssertions - def test_version_attributes_to_s doc = REXML::Document.new(<<~eoxml) @@ -202,13 +198,6 @@ def test_xml_declaration_standalone assert_equal('no', doc.stand_alone?, bug2539) end - def test_gt_linear_performance_attribute_value - seq = [10000, 50000, 100000, 150000, 200000] - assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + '">') - end - end - def test_each_recursive xml_source = <<~XML @@ -237,7 +226,7 @@ def test_each_recursive document = REXML::Document.new(xml_source) # Node#each_recursive iterates elements only. - # This does not iterate XML declerations, comments, attributes, CDATA sections, etc. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. actual_names = [] document.each_recursive do |element| actual_names << element.attributes["name"] @@ -247,7 +236,7 @@ def test_each_recursive class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -257,7 +246,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -280,7 +269,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -320,7 +309,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -340,7 +329,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX @@ -446,7 +435,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..096e8b7f 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,63 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) + end + + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) diff --git a/test/test_sax.rb b/test/test_sax.rb index c2255bf3..5a3f5e4e 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) @@ -140,7 +151,7 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception + def test_doctype_with_missing_name_throws_exception xml = <<~END diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..11cf65a3 --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check("&:;") + end + + def test_entity_name_start_char_under_score + assert_check("&_;") + end + + def test_entity_name_mix + assert_check("&A.b-0123;") + end + + def test_character_reference_decimal + assert_check("¢") + end + + def test_character_reference_hex + assert_check("􏿿") + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed("<;", "<") + end + + def test_lt_mix + assert_check_failed("ab 1]").size assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size