From 6111a8630234be0dc9720cb067e6eb1bbef4707f Mon Sep 17 00:00:00 2001 From: FriedrichFroebel Date: Tue, 23 Apr 2024 20:57:36 +0200 Subject: [PATCH 01/11] add dependabot configuration for GitHub Actions --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..5ace4600 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From ff425c5e81622f423772837574616200412425e5 Mon Sep 17 00:00:00 2001 From: FriedrichFroebel Date: Wed, 24 Apr 2024 16:56:07 +0200 Subject: [PATCH 02/11] run monthly and group Co-authored-by: Rotzbua --- .github/dependabot.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 5ace4600..2390d8c8 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -3,4 +3,8 @@ updates: - package-ecosystem: "github-actions" directory: "/" schedule: - interval: "weekly" + interval: "monthly" + groups: + github-actions: + patterns: + - "*" From 9fdc1571a7ba875dbca5a3aedbd89aa14ed7e4db Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 28 May 2024 22:11:47 +0500 Subject: [PATCH 03/11] Replace lxml[html-clean] with lxml + lxml-html-clean, fix the deps in setup.py. 
--- requirements.txt | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 78ebfdea..92e1b91d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # project requirements, install them using following command: # pip install -r requirements.txt -lxml[html_clean] +lxml +lxml-html-clean requests rdflib>=6.0.0 pyrdfa3 diff --git a/setup.py b/setup.py index a7fb8ab2..bc4b0101 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ def get_version(): python_requires=">=3.8", install_requires=[ "lxml", + "lxml-html-clean", "rdflib>=6.0.0", "pyrdfa3", "mf2py", From 6ca6f1e9bfe020db6ba7feaa6876572f65866ce2 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 28 May 2024 22:08:48 +0500 Subject: [PATCH 04/11] Release notes for 0.17.0. --- HISTORY.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/HISTORY.rst b/HISTORY.rst index dffc396b..17c9b03d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,15 @@ History ======= +v0.17.0 (YYYY-MM-DD) +-------------------- + +* Added support for Python 3.12 (PR #218) +* Added support for lxml >= 5.2.0 (PR #217, #234) +* Cleaned up and modernized the code (PR #214, #219, #220, #222, #223, #224, + #225, #226, #227) +* Improved the pre-commit and CI configuration (PR #226, #233) + v0.16.0 (2023-07-07) -------------------- From 3bf7546b7c9740b5fa66ceea769790aa49dd04c8 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 29 May 2024 11:50:03 +0500 Subject: [PATCH 05/11] =?UTF-8?q?Bump=20version:=200.16.0=20=E2=86=92=200.?= =?UTF-8?q?17.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- HISTORY.rst | 2 +- extruct/VERSION | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 17c9b03d..41c94b28 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,7 @@ History ======= -v0.17.0 (YYYY-MM-DD) +v0.17.0 (2024-05-29) 
-------------------- * Added support for Python 3.12 (PR #218) diff --git a/extruct/VERSION b/extruct/VERSION index 04a373ef..c5523bd0 100644 --- a/extruct/VERSION +++ b/extruct/VERSION @@ -1 +1 @@ -0.16.0 +0.17.0 diff --git a/setup.cfg b/setup.cfg index ee9ef376..f78ab4b7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.16.0 +current_version = 0.17.0 commit = True tag = True From 2c8a3f3a83cf2f067048835114bdba81b7f092e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 25 Jul 2024 10:03:36 +0200 Subject: [PATCH 06/11] Add a minimal action I/O implementation to improve SearchAction support --- extruct/w3cmicrodata.py | 12 ++++++++++++ tests/samples/schema.org/SearchAction.001.html | 8 ++++++++ tests/samples/schema.org/SearchAction.001.json | 1 + tests/test_microdata.py | 15 +++++++++++++++ 4 files changed, 36 insertions(+) create mode 100644 tests/samples/schema.org/SearchAction.001.html create mode 100644 tests/samples/schema.org/SearchAction.001.json diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 89a79b3c..c2696eae 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -235,6 +235,18 @@ def _extract_property_value(self, node, items_seen, base_url, itemids, force=Fal elif node.get("content"): return node.get("content") + # https://schema.org/docs/actions.html#part-4 + elif ( + (itemprop := node.get("itemprop")) + and (itemprop.endswith("-input") or itemprop.endswith("-output")) + ): + result = {} + if "required" in node.attrib: + result["valueRequired"] = True + if name := node.get("name"): + result["valueName"] = name + return result + else: return self._extract_textContent(node) diff --git a/tests/samples/schema.org/SearchAction.001.html b/tests/samples/schema.org/SearchAction.001.html new file mode 100644 index 00000000..ef493125 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.html @@ -0,0 +1,8 @@ +
+ +
+ + + +
+
diff --git a/tests/samples/schema.org/SearchAction.001.json b/tests/samples/schema.org/SearchAction.001.json new file mode 100644 index 00000000..bd967365 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.json @@ -0,0 +1 @@ +[{"type": "https://schema.org/WebSite", "properties": {"url": "https://www.example.com/", "potentialAction": {"type": "https://schema.org/SearchAction", "properties": {"target": "https://query.example.com/search?q={search_term_string}", "query-input": {"valueRequired": true, "valueName": "search_term_string"}}}}}] \ No newline at end of file diff --git a/tests/test_microdata.py b/tests/test_microdata.py index c1168a02..69e305cb 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -10,6 +10,18 @@ class TestMicrodata(unittest.TestCase): maxDiff = None + def _test_schemaorg(self, schema, indexes=None): + indexes = indexes or [1] + for i in indexes: + body = get_testdata("schema.org", f"{schema}.{i:03d}.html") + expected = json.loads( + get_testdata("schema.org", f"{schema}.{i:03d}.json").decode() + ) + mde = MicrodataExtractor() + data = mde.extract(body) + self.assertEqual(data, expected) + + def test_schemaorg_CreativeWork(self): for i in [1]: body = get_testdata("schema.org", "CreativeWork.{:03d}.html".format(i)) @@ -63,6 +75,9 @@ def test_schemaorg_Event(self): self.assertEqual(data, expected) + def test_schemaorg_SearchAction(self): + self._test_schemaorg("SearchAction") + def test_w3c_textContent_values(self): body = get_testdata("w3c", "microdata.4.2.strings.html") expected = json.loads( From 1000c838a921372d77733bbddd6189f0296eee14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 25 Jul 2024 10:22:32 +0200 Subject: [PATCH 07/11] Run pre-commit --- extruct/w3cmicrodata.py | 5 ++--- tests/test_microdata.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index c2696eae..0a7dbe6a 
100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -236,9 +236,8 @@ def _extract_property_value(self, node, items_seen, base_url, itemids, force=Fal return node.get("content") # https://schema.org/docs/actions.html#part-4 - elif ( - (itemprop := node.get("itemprop")) - and (itemprop.endswith("-input") or itemprop.endswith("-output")) + elif (itemprop := node.get("itemprop")) and ( + itemprop.endswith("-input") or itemprop.endswith("-output") ): result = {} if "required" in node.attrib: diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 69e305cb..eda2d26c 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -21,7 +21,6 @@ def _test_schemaorg(self, schema, indexes=None): data = mde.extract(body) self.assertEqual(data, expected) - def test_schemaorg_CreativeWork(self): for i in [1]: body = get_testdata("schema.org", "CreativeWork.{:03d}.html".format(i)) From 8f540888b2ca4ac19ae6c6a7a1d7d17633952333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 8 Nov 2024 15:58:05 +0100 Subject: [PATCH 08/11] 0.18.0 release notes (#238) --- HISTORY.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.rst b/HISTORY.rst index 41c94b28..e1dc17f1 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,12 @@ History ======= +v0.18.0 (2024-11-08) +-------------------- + +* Added support for the ``valueRequired`` and ``valueName`` fields of `action + I/O <https://schema.org/docs/actions.html#part-4>`_ to the microdata parser. 
+ v0.17.0 (2024-05-29) -------------------- From 259dde016a5fb9e7d3061bd87def411f74edceb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 8 Nov 2024 15:58:44 +0100 Subject: [PATCH 09/11] =?UTF-8?q?Bump=20version:=200.17.0=20=E2=86=92=200.?= =?UTF-8?q?18.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extruct/VERSION | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extruct/VERSION b/extruct/VERSION index c5523bd0..66333910 100644 --- a/extruct/VERSION +++ b/extruct/VERSION @@ -1 +1 @@ -0.17.0 +0.18.0 diff --git a/setup.cfg b/setup.cfg index f78ab4b7..9c145abe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.17.0 +current_version = 0.18.0 commit = True tag = True From 2fd175e10e7e6ed670744f9b871e6323e982f1ca Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 5 Feb 2025 20:14:40 +0400 Subject: [PATCH 10/11] Add coverage upload. 
(#239) --- .github/workflows/python-package.yml | 4 ++++ tox.ini | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 73f42b50..387622fa 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -31,6 +31,10 @@ jobs: - name: tox run: | tox -e `python -c "import sys; print('py' + ''.join(sys.version.split('.')[:2]))"` + - name: Upload coverage report + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} check: runs-on: ubuntu-latest diff --git a/tox.ini b/tox.ini index 6a7f4eb1..b9f8c12f 100644 --- a/tox.ini +++ b/tox.ini @@ -6,11 +6,11 @@ envlist = py38, py39, py310, py311, py312 deps = -rrequirements-dev.txt commands = - py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} + py.test --cov-report=term --cov-report= --cov-report=xml --cov=extruct {posargs:extruct tests} [testenv:py39] commands = - py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} + py.test --cov-report=term --cov-report= --cov-report=xml --cov=extruct {posargs:extruct tests} python -m readme_renderer README.rst -o /tmp/README.html [testenv:linters] From a31daaadb82ec684b7d468d3b15734b8ae3b7265 Mon Sep 17 00:00:00 2001 From: Ihor Date: Mon, 24 Mar 2025 13:06:18 +0200 Subject: [PATCH 11/11] Skip empty JSON-LD scripts by trimming and skipping empty input (#240) --- extruct/jsonld.py | 4 +++- tests/test_jsonld.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 989ac04a..d25a4183 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -33,7 +33,9 @@ def extract_items(self, document, base_url=None): ] def _extract_items(self, node): - script = node.xpath("string()") + script = node.xpath("string()").strip() + if not script: + return try: # TODO: `strict=False` can be configurable if needed data = json.loads(script, 
strict=False) diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index d274b3d1..178b3229 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -63,3 +63,9 @@ def test_null(self): jsonlde = JsonLdExtractor() data = jsonlde.extract(body) self.assertEqual(data, expected) + + def test_empty_jsonld_script(self): + jsonlde = JsonLdExtractor() + body = '' + data = jsonlde.extract(body) + self.assertEqual(data, [])