From 22ae8f34c5725b25058a3bc3a17262d73d16eb78 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 21 Aug 2025 01:58:23 +0000
Subject: [PATCH 1/3] Initial plan


From 9676237adb8509028a228c4e1d1519340cff5f88 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 21 Aug 2025 02:08:39 +0000
Subject: [PATCH 2/3] Initial repository analysis and build validation

Co-authored-by: oalders <96205+oalders@users.noreply.github.com>
---
 eg/htext       |  2 +-
 test.html      | 18 ++++++++++++++++++
 test_parser.pl | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 test.html
 create mode 100644 test_parser.pl
diff --git a/eg/htext b/eg/htext
index 17f0d75..5525a6d 100755
--- a/eg/htext
+++ b/eg/htext
@@ -17,7 +17,7 @@ sub tag {
 
 sub text {
     return if $inside{script} || $inside{style};
-    print encode('utf8', $_[0]);
+    print Encode::encode('utf8', $_[0]);
 }
 
 HTML::Parser->new(
diff --git a/test.html b/test.html
new file mode 100644
index 0000000..f6736c6
--- /dev/null
+++ b/test.html
@@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test HTML Document</title>
+    <style>
+        body { background-color: #f0f0f0; }
+    </style>
+    <script>
+        console.log("Hello World");
+    </script>
+</head>
+<body bgcolor="white">
+    <h1>Welcome to HTML Parser Test</h1>
+    <p>This is a <b>test</b> document with <font color="red">some</font> text.</p>
+    <a href="https://example.com/" onclick="alert('clicked')">Link</a>
+    <!-- This is a comment -->
+</body>
+</html>
\ No newline at end of file
diff --git a/test_parser.pl b/test_parser.pl
new file mode 100644
index 0000000..cf39299
--- /dev/null
+++ b/test_parser.pl
@@ -0,0 +1,33 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(
+    api_version => 3,
+    start_h => [sub {
+        my ($tagname, $attr) = @_;
+        print "Start tag: $tagname";
+        if ($attr && %$attr) {
+            print " with attributes: ";
+            for my $key (keys %$attr) {
+                print "$key='$attr->{$key}' ";
+            }
+        }
+        print "\n";
+    }, "tagname,attr"],
+    
+    end_h => [sub {
+        my ($tagname) = @_;
+        print "End tag: $tagname\n";
+    }, "tagname"],
+    
+    text_h => [sub {
+        my ($text) = @_;
+        $text =~ s/^\s+|\s+$//g; # trim whitespace
+        print "Text: '$text'\n" if $text;
+    }, "dtext"],
+);
+
+$p->parse_file("test.html");
+$p->eof;
\ No newline at end of file

From 9d6d9aba56b79052603e03d5dcd694a997069aba Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 21 Aug 2025 02:10:54 +0000
Subject: [PATCH 3/3] Add comprehensive GitHub Copilot instructions

Co-authored-by: oalders <96205+oalders@users.noreply.github.com>
---
 .github/copilot-instructions.md | 153 ++++++++++++++++++++++++++++++++
 .gitignore                      |   2 +
 eg/htext                        |   2 +-
 test.html                       |  18 ----
 test_parser.pl                  |  33 -------
 5 files changed, 156 insertions(+), 52 deletions(-)
 create mode 100644 .github/copilot-instructions.md
 delete mode 100644 test.html
 delete mode 100644 test_parser.pl

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..a7a678d
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,153 @@
+# HTML::Parser - Perl HTML Parser Module
+
+HTML::Parser is a C/XS-based Perl module for parsing HTML documents. It's part of the libwww-perl organization and provides event-driven HTML parsing with support for multiple parser modes and extensive customization options.
+
+Always reference these instructions first and fall back to search or bash commands only when you encounter unexpected information that does not match the info here.
+
+## Working Effectively
+
+- Bootstrap, build, and test the repository:
+  - `perl Makefile.PL` -- generates Makefile (takes ~0.12 seconds)
+  - `make` -- builds the C/XS module (takes ~1.5 seconds, NEVER CANCEL)
+  - `make test` -- runs 464 tests (takes ~3 seconds, NEVER CANCEL)
+- Clean the build:
+  - `make clean` -- removes build artifacts (takes ~0.01 seconds)
+- Dependencies are automatically handled by the Perl build system:
+  - Runtime dependencies: HTML::Tagset, HTTP::Headers, URI, etc.
+  - Test dependencies: Test::More, File::Spec, Config, etc.
+  - All dependencies are typically available in standard Perl installations
+
+## Validation
+
+- ALWAYS test core functionality after making changes to the XS code or main Parser.pm:
+  ```bash
+  perl -MHTML::Parser -e 'print "HTML::Parser loads successfully\n"'
+  ```
+- ALWAYS test example scripts in eg/ directory (create a scratch `test.html` first; it is git-ignored and not checked in):
+  - `perl eg/htext test.html` -- extracts plain text from HTML
+  - `perl eg/hstrip test.html` -- strips unwanted tags and attributes
+- ALWAYS run a complete parsing scenario manually:
+  - Create test HTML file with various elements (tags, attributes, text, comments)
+  - Parse it with HTML::Parser using start_h, end_h, text_h handlers
+  - Verify all elements are parsed correctly
+- ALWAYS run the full test suite before committing: `make test`
+- The test suite covers 464 test cases across 50 test files and must ALL pass
+
+## Build System Details
+
+- Uses ExtUtils::MakeMaker build system (traditional Perl approach)
+- XS (C extension) compilation is handled automatically
+- Generated files during build: Parser.c (from Parser.xs), Parser.so, blib/ directory
+- Configuration: Makefile.PL defines build parameters including MARKED_SECTION support
+- Build artifacts are placed in blib/ directory structure
+
+## Project Structure
+
+### Key Files and Directories:
+- `lib/HTML/Parser.pm` -- Main Perl module with XS loading
+- `Parser.xs` -- XS interface between Perl and C code
+- `hparser.c` -- Core C parsing engine
+- `lib/HTML/` -- Additional modules (Entities, LinkExtor, HeadParser, etc.)
+- `t/` -- Test suite (50 test files, 464 tests total)
+- `eg/` -- Example scripts demonstrating usage
+- `cpanfile` -- Dependency specification
+- `Makefile.PL` -- Build configuration
+
+### Important Modules:
+- `HTML::Parser` -- Main parser class (lib/HTML/Parser.pm)
+- `HTML::Entities` -- HTML entity encoding/decoding (lib/HTML/Entities.pm)
+- `HTML::LinkExtor` -- Extract links from HTML (lib/HTML/LinkExtor.pm)
+- `HTML::HeadParser` -- Parse HTML head sections (lib/HTML/HeadParser.pm)
+- `HTML::PullParser` -- Pull-style parsing interface (lib/HTML/PullParser.pm)
+
+## Testing
+
+- Test suite is comprehensive with 464 tests across multiple scenarios
+- Tests cover: basic parsing, entity handling, filters, callbacks, edge cases
+- All tests use Test::More framework
+- Key test categories:
+  - Parser functionality (t/parser.t, t/callback.t)
+  - Entity handling (t/entities.t, t/uentities.t)
+  - Filter methods (t/filter.t, t/filter-methods.t)
+  - Unicode support (t/unicode.t)
+  - Various parser modes and options
+
+## Common Tasks
+
+### Building from scratch:
+```bash
+perl Makefile.PL
+make
+make test
+```
+
+### Testing specific functionality:
+```bash
+# Test entity handling
+perl -MHTML::Entities -e 'print HTML::Entities::encode_entities("<test>") . "\n"'
+
+# Test basic parsing
+perl -MHTML::Parser -e '
+  my $p = HTML::Parser->new(text_h => [sub { print "$_[0]\n" }, "dtext"]);
+  $p->parse("<p>Hello &amp; world</p>");
+  $p->eof;
+'
+```
+
+### Manual validation scenarios:
+1. **Basic HTML parsing**: Create HTML with tags, attributes, text, and entities. Parse and verify all components are extracted correctly.
+2. **Entity decoding**: Test that HTML entities like `&amp;`, `&lt;`, `&gt;`, and `&#39;` are properly decoded.
+3. **Filter functionality**: Test that `ignore_tags`, `report_tags`, and `ignore_elements` work correctly.
+4. **Callback handling**: Verify start_h, end_h, text_h callbacks receive correct parameters.
+
+### File outputs from commonly run commands:
+
+#### Repository root listing:
+```
+Changes          TODO             dist.ini         hparser.c        lib/             t/
+LICENSE          cpanfile         eg/              hparser.h        mkhctype         test.html
+Makefile.PL      .github/         entities.html    hints/           mkpfunc          test_parser.pl
+META.json        .gitignore       .perltidyrc      typemap          Parser.xs        tokenpos.h
+README.md        .mailmap         hctype.h         pfunc.h          ppport.h         util.c
+```
+
+#### Example scripts (eg/ directory):
+```
+hanchors  hbody  hdisable  hdump  hform  hlc  hrefsub  hstrip  htext  htextsub  htitle
+```
+
+#### Test directory:
+- 50 test files covering all functionality
+- Tests range from basic parsing to complex Unicode scenarios
+- All tests must pass for a valid build
+
+## CI/CD
+
+- GitHub Actions workflows for Linux, macOS, and Windows
+- Workflows test multiple Perl versions (5.10 to 5.40)
+- All builds must pass before merge
+- Located in .github/workflows/
+
+## Development Notes
+
+- This is a mature, stable codebase (version 3.84)
+- Changes should be minimal and well-tested
+- XS/C code changes require careful validation
+- Backward compatibility is important
+- Follow existing code style (see .perltidyrc)
+
+## Performance
+
+- Parser is optimized for speed with C implementation
+- Handles large documents efficiently
+- Event-driven approach minimizes memory usage
+- Build times are fast (~1.5 seconds total)
+- Test execution is quick (~3 seconds for full suite)
+
+## Troubleshooting
+
+- If build fails, ensure C compiler is available
+- Missing dependencies are usually auto-detected
+- Test failures indicate breaking changes
+- XS compilation errors suggest C code issues
+- Use `make clean` to reset build state
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 053e57a..0c3f0f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,6 @@ Makefile
 MYMETA.*
 .build/
 HTML-Parser-*/
+test.html
+test_parser.pl
 
diff --git a/eg/htext b/eg/htext
index 5525a6d..17f0d75 100755
--- a/eg/htext
+++ b/eg/htext
@@ -17,7 +17,7 @@ sub tag {
 
 sub text {
     return if $inside{script} || $inside{style};
-    print Encode::encode('utf8', $_[0]);
+    print encode('utf8', $_[0]);
 }
 
 HTML::Parser->new(
diff --git a/test.html b/test.html
deleted file mode 100644
index f6736c6..0000000
--- a/test.html
+++ /dev/null
@@ -1,18 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Test HTML Document</title>
-    <style>
-        body { background-color: #f0f0f0; }
-    </style>
-    <script>
-        console.log("Hello World");
-    </script>
-</head>
-<body bgcolor="white">
-    <h1>Welcome to HTML Parser Test</h1>
-    <p>This is a <b>test</b> document with <font color="red">some</font> text.</p>
-    <a href="https://example.com/" onclick="alert('clicked')">Link</a>
-    <!-- This is a comment -->
-</body>
-</html>
\ No newline at end of file
diff --git a/test_parser.pl b/test_parser.pl
deleted file mode 100644
index cf39299..0000000
--- a/test_parser.pl
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-use HTML::Parser;
-
-my $p = HTML::Parser->new(
-    api_version => 3,
-    start_h => [sub {
-        my ($tagname, $attr) = @_;
-        print "Start tag: $tagname";
-        if ($attr && %$attr) {
-            print " with attributes: ";
-            for my $key (keys %$attr) {
-                print "$key='$attr->{$key}' ";
-            }
-        }
-        print "\n";
-    }, "tagname,attr"],
-    
-    end_h => [sub {
-        my ($tagname) = @_;
-        print "End tag: $tagname\n";
-    }, "tagname"],
-    
-    text_h => [sub {
-        my ($text) = @_;
-        $text =~ s/^\s+|\s+$//g; # trim whitespace
-        print "Text: '$text'\n" if $text;
-    }, "dtext"],
-);
-
-$p->parse_file("test.html");
-$p->eof;
\ No newline at end of file