diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..2390d8c8 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + groups: + github-actions: + patterns: + - "*" diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 73f42b50..387622fa 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -31,6 +31,10 @@ jobs: - name: tox run: | tox -e `python -c "import sys; print('py' + ''.join(sys.version.split('.')[:2]))"` + - name: Upload coverage report + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} check: runs-on: ubuntu-latest diff --git a/HISTORY.rst b/HISTORY.rst index dffc396b..e1dc17f1 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,21 @@ History ======= +v0.18.0 (2024-11-08) +-------------------- + +* Added support for the ``valueRequired`` and ``valueName`` fields of `action + I/O <https://schema.org/docs/actions.html#part-4>`_ to the microdata parser. 
+ +v0.17.0 (2024-05-29) +-------------------- + +* Added support for Python 3.12 (PR #218) +* Added support for lxml >= 5.2.0 (PR #217, #234) +* Cleaned up and modernized the code (PR #214, #219, #220, #222, #223, #224, + #225, #226, #227) +* Improved the pre-commit and CI configuration (PR #226, #233) + v0.16.0 (2023-07-07) -------------------- diff --git a/extruct/VERSION b/extruct/VERSION index 04a373ef..66333910 100644 --- a/extruct/VERSION +++ b/extruct/VERSION @@ -1 +1 @@ -0.16.0 +0.18.0 diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 989ac04a..d25a4183 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -33,7 +33,9 @@ def extract_items(self, document, base_url=None): ] def _extract_items(self, node): - script = node.xpath("string()") + script = node.xpath("string()").strip() + if not script: + return try: # TODO: `strict=False` can be configurable if needed data = json.loads(script, strict=False) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 89a79b3c..0a7dbe6a 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -235,6 +235,17 @@ def _extract_property_value(self, node, items_seen, base_url, itemids, force=Fal elif node.get("content"): return node.get("content") + # https://schema.org/docs/actions.html#part-4 + elif (itemprop := node.get("itemprop")) and ( + itemprop.endswith("-input") or itemprop.endswith("-output") + ): + result = {} + if "required" in node.attrib: + result["valueRequired"] = True + if name := node.get("name"): + result["valueName"] = name + return result + else: return self._extract_textContent(node) diff --git a/requirements.txt b/requirements.txt index 78ebfdea..92e1b91d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # project requirements, install them using following command: # pip install -r requirements.txt -lxml[html_clean] +lxml +lxml-html-clean requests rdflib>=6.0.0 pyrdfa3 diff --git a/setup.cfg b/setup.cfg index ee9ef376..9c145abe 100644 --- 
a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.16.0 +current_version = 0.18.0 commit = True tag = True diff --git a/setup.py b/setup.py index a7fb8ab2..bc4b0101 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ def get_version(): python_requires=">=3.8", install_requires=[ "lxml", + "lxml-html-clean", "rdflib>=6.0.0", "pyrdfa3", "mf2py", diff --git a/tests/samples/schema.org/SearchAction.001.html b/tests/samples/schema.org/SearchAction.001.html new file mode 100644 index 00000000..ef493125 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.html @@ -0,0 +1,8 @@ +
+ +
+ + + +
+
diff --git a/tests/samples/schema.org/SearchAction.001.json b/tests/samples/schema.org/SearchAction.001.json new file mode 100644 index 00000000..bd967365 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.json @@ -0,0 +1 @@ +[{"type": "https://schema.org/WebSite", "properties": {"url": "https://www.example.com/", "potentialAction": {"type": "https://schema.org/SearchAction", "properties": {"target": "https://query.example.com/search?q={search_term_string}", "query-input": {"valueRequired": true, "valueName": "search_term_string"}}}}}] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index d274b3d1..178b3229 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -63,3 +63,9 @@ def test_null(self): jsonlde = JsonLdExtractor() data = jsonlde.extract(body) self.assertEqual(data, expected) + + def test_empty_jsonld_script(self): + jsonlde = JsonLdExtractor() + body = '' + data = jsonlde.extract(body) + self.assertEqual(data, []) diff --git a/tests/test_microdata.py b/tests/test_microdata.py index c1168a02..eda2d26c 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -10,6 +10,17 @@ class TestMicrodata(unittest.TestCase): maxDiff = None + def _test_schemaorg(self, schema, indexes=None): + indexes = indexes or [1] + for i in indexes: + body = get_testdata("schema.org", f"{schema}.{i:03d}.html") + expected = json.loads( + get_testdata("schema.org", f"{schema}.{i:03d}.json").decode() + ) + mde = MicrodataExtractor() + data = mde.extract(body) + self.assertEqual(data, expected) + def test_schemaorg_CreativeWork(self): for i in [1]: body = get_testdata("schema.org", "CreativeWork.{:03d}.html".format(i)) @@ -63,6 +74,9 @@ def test_schemaorg_Event(self): self.assertEqual(data, expected) + def test_schemaorg_SearchAction(self): + self._test_schemaorg("SearchAction") + def test_w3c_textContent_values(self): body = get_testdata("w3c", "microdata.4.2.strings.html") 
expected = json.loads( diff --git a/tox.ini b/tox.ini index 6a7f4eb1..b9f8c12f 100644 --- a/tox.ini +++ b/tox.ini @@ -6,11 +6,11 @@ envlist = py38, py39, py310, py311, py312 deps = -rrequirements-dev.txt commands = - py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} + py.test --cov-report=term --cov-report= --cov-report=xml --cov=extruct {posargs:extruct tests} [testenv:py39] commands = - py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} + py.test --cov-report=term --cov-report= --cov-report=xml --cov=extruct {posargs:extruct tests} python -m readme_renderer README.rst -o /tmp/README.html [testenv:linters]