From 22ae8f34c5725b25058a3bc3a17262d73d16eb78 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 01:58:23 +0000 Subject: [PATCH 1/3] Initial plan From 9676237adb8509028a228c4e1d1519340cff5f88 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:08:39 +0000 Subject: [PATCH 2/3] Initial repository analysis and build validation Co-authored-by: oalders <96205+oalders@users.noreply.github.com> --- eg/htext | 2 +- test.html | 18 ++++++++++++++++++ test_parser.pl | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test.html create mode 100644 test_parser.pl diff --git a/eg/htext b/eg/htext index 17f0d75..5525a6d 100755 --- a/eg/htext +++ b/eg/htext @@ -17,7 +17,7 @@ sub tag { sub text { return if $inside{script} || $inside{style}; - print encode('utf8', $_[0]); + print Encode::encode('utf8', $_[0]); } HTML::Parser->new( diff --git a/test.html b/test.html new file mode 100644 index 0000000..f6736c6 --- /dev/null +++ b/test.html @@ -0,0 +1,18 @@ + + +
+This is a test document with some text.
+ Link + + + \ No newline at end of file diff --git a/test_parser.pl b/test_parser.pl new file mode 100644 index 0000000..cf39299 --- /dev/null +++ b/test_parser.pl @@ -0,0 +1,33 @@ +#!/usr/bin/perl +use strict; +use warnings; +use HTML::Parser; + +my $p = HTML::Parser->new( + api_version => 3, + start_h => [sub { + my ($tagname, $attr) = @_; + print "Start tag: $tagname"; + if ($attr && %$attr) { + print " with attributes: "; + for my $key (keys %$attr) { + print "$key='$attr->{$key}' "; + } + } + print "\n"; + }, "tagname,attr"], + + end_h => [sub { + my ($tagname) = @_; + print "End tag: $tagname\n"; + }, "tagname"], + + text_h => [sub { + my ($text) = @_; + $text =~ s/^\s+|\s+$//g; # trim whitespace + print "Text: '$text'\n" if $text; + }, "dtext"], +); + +$p->parse_file("test.html"); +$p->eof; \ No newline at end of file From 9d6d9aba56b79052603e03d5dcd694a997069aba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:10:54 +0000 Subject: [PATCH 3/3] Add comprehensive GitHub Copilot instructions Co-authored-by: oalders <96205+oalders@users.noreply.github.com> --- .github/copilot-instructions.md | 153 ++++++++++++++++++++++++++++++++ .gitignore | 2 + eg/htext | 2 +- test.html | 18 ---- test_parser.pl | 33 ------- 5 files changed, 156 insertions(+), 52 deletions(-) create mode 100644 .github/copilot-instructions.md delete mode 100644 test.html delete mode 100644 test_parser.pl diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..a7a678d --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,153 @@ +# HTML::Parser - Perl HTML Parser Module + +HTML::Parser is a C/XS-based Perl module for parsing HTML documents. It's part of the libwww-perl organization and provides event-driven HTML parsing with support for multiple parser modes and extensive customization options. 
+ +Always reference these instructions first and fall back to search or bash commands only when you encounter unexpected information that does not match the info here. + +## Working Effectively + +- Bootstrap, build, and test the repository: + - `perl Makefile.PL` -- generates Makefile (takes ~0.12 seconds) + - `make` -- builds the C/XS module (takes ~1.5 seconds, NEVER CANCEL) + - `make test` -- runs 464 tests (takes ~3 seconds, NEVER CANCEL) +- Clean the build: + - `make clean` -- removes build artifacts (takes ~0.01 seconds) +- Dependencies are automatically handled by the Perl build system: + - Runtime dependencies: HTML::Tagset, HTTP::Headers, URI, etc. + - Test dependencies: Test::More, File::Spec, Config, etc. + - All dependencies are typically available in standard Perl installations + +## Validation + +- ALWAYS test core functionality after making changes to the XS code or main Parser.pm: + ```perl + perl -MHTML::Parser -e 'print "HTML::Parser loads successfully\n"' + ``` +- ALWAYS test example scripts in eg/ directory: + - `perl eg/htext test.html` -- extracts plain text from HTML + - `perl eg/hstrip test.html` -- strips unwanted tags and attributes +- ALWAYS run a complete parsing scenario manually: + - Create test HTML file with various elements (tags, attributes, text, comments) + - Parse it with HTML::Parser using start_h, end_h, text_h handlers + - Verify all elements are parsed correctly +- ALWAYS run the full test suite before committing: `make test` +- The test suite covers 464 test cases across 50 test files and must ALL pass + +## Build System Details + +- Uses ExtUtils::MakeMaker build system (traditional Perl approach) +- XS (C extension) compilation is handled automatically +- Generated files during build: Parser.c (from Parser.xs), Parser.so, blib/ directory +- Configuration: Makefile.PL defines build parameters including MARKED_SECTION support +- Build artifacts are placed in blib/ directory structure + +## Project Structure + +### Key Files 
and Directories: +- `lib/HTML/Parser.pm` -- Main Perl module with XS loading +- `Parser.xs` -- XS interface between Perl and C code +- `hparser.c` -- Core C parsing engine +- `lib/HTML/` -- Additional modules (Entities, LinkExtor, HeadParser, etc.) +- `t/` -- Test suite (50 test files, 464 tests total) +- `eg/` -- Example scripts demonstrating usage +- `cpanfile` -- Dependency specification +- `Makefile.PL` -- Build configuration + +### Important Modules: +- `HTML::Parser` -- Main parser class (lib/HTML/Parser.pm) +- `HTML::Entities` -- HTML entity encoding/decoding (lib/HTML/Entities.pm) +- `HTML::LinkExtor` -- Extract links from HTML (lib/HTML/LinkExtor.pm) +- `HTML::HeadParser` -- Parse HTML head sections (lib/HTML/HeadParser.pm) +- `HTML::PullParser` -- Pull-style parsing interface (lib/HTML/PullParser.pm) + +## Testing + +- Test suite is comprehensive with 464 tests across multiple scenarios +- Tests cover: basic parsing, entity handling, filters, callbacks, edge cases +- All tests use Test::More framework +- Key test categories: + - Parser functionality (t/parser.t, t/callback.t) + - Entity handling (t/entities.t, t/uentities.t) + - Filter methods (t/filter.t, t/filter-methods.t) + - Unicode support (t/unicode.t) + - Various parser modes and options + +## Common Tasks + +### Building from scratch: +```bash +perl Makefile.PL +make +make test +``` + +### Testing specific functionality: +```bash +# Test entity handling +perl -MHTML::Entities -e 'print HTML::Entities::encode_entities("Hello & world
"); + $p->eof; +' +``` + +### Manual validation scenarios: +1. **Basic HTML parsing**: Create HTML with tags, attributes, text, and entities. Parse and verify all components are extracted correctly. +2. **Entity decoding**: Test that HTML entities for &, <, >, and ' (e.g. `&amp;`, `&lt;`, `&gt;`, `&#39;`) are properly decoded. +3. **Filter functionality**: Test that ignore_tags, report_tags, and ignore_elements work correctly. +4. **Callback handling**: Verify start_h, end_h, text_h callbacks receive correct parameters. + +### File outputs from commonly run commands: + +#### Repository root listing: +``` +Changes TODO dist.ini hparser.c lib/ t/ +LICENSE cpanfile eg/ hparser.h mkhctype test.html +Makefile.PL .github/ entities.html hints/ mkpfunc test_parser.pl +META.json .gitignore .perltidyrc typemap Parser.xs tokenpos.h +README.md .mailmap hctype.h pfunc.h ppport.h util.c +``` + +#### Example scripts (eg/ directory): +``` +hanchors hbody hdisable hdump hform hlc hrefsub hstrip htext htextsub htitle +``` + +#### Test directory: +- 50 test files covering all functionality +- Tests range from basic parsing to complex Unicode scenarios +- All tests must pass for a valid build + +## CI/CD + +- GitHub Actions workflows for Linux, macOS, and Windows +- Workflows test multiple Perl versions (5.10 to 5.40) +- All builds must pass before merge +- Located in .github/workflows/ + +## Development Notes + +- This is a mature, stable codebase (version 3.84) +- Changes should be minimal and well-tested +- XS/C code changes require careful validation +- Backward compatibility is important +- Follow existing code style (see .perltidyrc) + +## Performance + +- Parser is optimized for speed with C implementation +- Handles large documents efficiently +- Event-driven approach minimizes memory usage +- Build times are fast (~1.5 seconds total) +- Test execution is quick (~3 seconds for full suite) + +## Troubleshooting + +- If build fails, ensure C compiler is available +- Missing dependencies are usually auto-detected +- Test failures 
indicate breaking changes +- XS compilation errors suggest C code issues +- Use `make clean` to reset build state \ No newline at end of file diff --git a/.gitignore b/.gitignore index 053e57a..0c3f0f6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ Makefile MYMETA.* .build/ HTML-Parser-*/ +test.html +test_parser.pl diff --git a/eg/htext b/eg/htext index 5525a6d..17f0d75 100755 --- a/eg/htext +++ b/eg/htext @@ -17,7 +17,7 @@ sub tag { sub text { return if $inside{script} || $inside{style}; - print Encode::encode('utf8', $_[0]); + print encode('utf8', $_[0]); } HTML::Parser->new( diff --git a/test.html b/test.html deleted file mode 100644 index f6736c6..0000000 --- a/test.html +++ /dev/null @@ -1,18 +0,0 @@ - - - -This is a test document with some text.
- Link - - - \ No newline at end of file diff --git a/test_parser.pl b/test_parser.pl deleted file mode 100644 index cf39299..0000000 --- a/test_parser.pl +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use HTML::Parser; - -my $p = HTML::Parser->new( - api_version => 3, - start_h => [sub { - my ($tagname, $attr) = @_; - print "Start tag: $tagname"; - if ($attr && %$attr) { - print " with attributes: "; - for my $key (keys %$attr) { - print "$key='$attr->{$key}' "; - } - } - print "\n"; - }, "tagname,attr"], - - end_h => [sub { - my ($tagname) = @_; - print "End tag: $tagname\n"; - }, "tagname"], - - text_h => [sub { - my ($text) = @_; - $text =~ s/^\s+|\s+$//g; # trim whitespace - print "Text: '$text'\n" if $text; - }, "dtext"], -); - -$p->parse_file("test.html"); -$p->eof; \ No newline at end of file