Skip to content

Commit 3cee0cf

Browse files
Add tuples of wikitext link data to senses
This adds a new field under a sense in `senses` called `links`. `links` is a list of tuples/lists, each containing (1) the visible text of the link and (2) the link target (anchor) itself.
1 parent 3b4a54a commit 3cee0cf

File tree

1 file changed

+59
-17
lines changed

1 file changed

+59
-17
lines changed

wiktextract/page.py

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,7 +1353,8 @@ def parse_sense_node(node, sense_base, pos):
13531353
# "indentation", like "#:" or "##:"
13541354
return False
13551355

1356-
# If a recursion call succeeds in push_sense(), bubble it up with added.
1356+
# If a recursion call succeeds in push_sense(), bubble it up with
1357+
# `added`.
13571358
# added |= push_sense() or added |= parse_sense_node(...) to OR.
13581359
added = False
13591360

@@ -1466,10 +1467,13 @@ def sense_template_fn(name, ht):
14661467
add_form_of_tags(ctx, name, config.FORM_OF_TEMPLATES, sense_base)
14671468
return None
14681469

1470+
link_tuples = []
1471+
14691472
def extract_link_texts(item):
14701473
"""Recursively extracts link texts from the gloss source. This
14711474
information is used to select whether to remove final "." from
14721475
form_of/alt_of (e.g., ihm/Hunsrik)."""
1476+
nonlocal link_tuples
14731477
if isinstance(item, (list, tuple)):
14741478
for x in item:
14751479
extract_link_texts(x)
@@ -1498,7 +1502,8 @@ def extract_link_texts(item):
14981502
# get the raw text of non-list contents of this node, and other stuff
14991503
# like tag and category data added to sense_base
15001504
rawgloss = clean_node(config, ctx, sense_base, contents,
1501-
template_fn=sense_template_fn)
1505+
template_fn=sense_template_fn,
1506+
collect_links=True)
15021507

15031508
if not rawgloss:
15041509
return False
@@ -3469,13 +3474,13 @@ def parse_page(ctx: Wtp, word: str, text: str, config: WiktionaryConfig) -> list
34693474
return ret
34703475

34713476

3472-
def clean_node(config, ctx, category_data, value, template_fn=None,
3473-
post_template_fn=None):
3477+
def clean_node(config, ctx, sense_data, value, template_fn=None,
3478+
post_template_fn=None, collect_links=False):
34743479
"""Expands the node to text, cleaning up any HTML and duplicate spaces.
34753480
This is intended for expanding things like glosses for a single sense."""
34763481
assert isinstance(config, WiktionaryConfig)
34773482
assert isinstance(ctx, Wtp)
3478-
assert category_data is None or isinstance(category_data, dict)
3483+
assert sense_data is None or isinstance(sense_data, dict)
34793484
assert template_fn is None or callable(template_fn)
34803485
assert post_template_fn is None or callable(post_template_fn)
34813486
# print("CLEAN_NODE:", repr(value))
@@ -3518,23 +3523,60 @@ def clean_node_handler_fn(node):
35183523
post_template_fn=post_template_fn)
35193524
# print("clean_node: v={!r}".format(v))
35203525

3521-
# Capture categories if category_data has been given. We also track
3526+
# Capture categories if sense_data has been given. We also track
35223527
# Lua execution errors here.
3523-
if category_data is not None:
3528+
# If collect_links=True (for glosses), capture links
3529+
if sense_data is not None:
35243530
# Check for Lua execution error
35253531
if v.find('<strong class="error">Lua execution error') >= 0:
3526-
data_append(ctx, category_data, "tags", "error-lua-exec")
3532+
data_append(ctx, sense_data, "tags", "error-lua-exec")
35273533
if v.find('<strong class="error">Lua timeout error') >= 0:
3528-
data_append(ctx, category_data, "tags", "error-lua-timeout")
3534+
data_append(ctx, sense_data, "tags", "error-lua-timeout")
35293535
# Capture Category tags
3530-
for m in re.finditer(r"(?is)\[\[:?\s*Category\s*:([^]|]+)", v):
3531-
cat = clean_value(config, m.group(1))
3532-
cat = re.sub(r"\s+", " ", cat)
3533-
cat = cat.strip()
3534-
if not cat:
3535-
continue
3536-
if cat not in category_data.get("categories", ()):
3537-
data_append(ctx, category_data, "categories", cat)
3536+
if not collect_links:
3537+
for m in re.finditer(r"(?is)\[\[:?\s*Category\s*:([^]|]+)", v):
3538+
cat = clean_value(config, m.group(1))
3539+
cat = re.sub(r"\s+", " ", cat)
3540+
cat = cat.strip()
3541+
if not cat:
3542+
continue
3543+
if cat not in sense_data.get("categories", ()):
3544+
data_append(ctx, sense_data, "categories", cat)
3545+
else:
3546+
for m in re.finditer(r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)"
3547+
r"(\|([^]|]+))?\]\]", v):
3548+
# Add here other stuff different "Something:restofthelink"
3549+
# things;
3550+
if m.group(1) and m.group(1).strip() == "Category":
3551+
cat = clean_value(config, m.group(3))
3552+
cat = re.sub(r"\s+", " ", cat)
3553+
cat = cat.strip()
3554+
if not cat:
3555+
continue
3556+
if cat not in sense_data.get("categories", ()):
3557+
data_append(ctx, sense_data, "categories", cat)
3558+
elif not m.group(1):
3559+
if m.group(5):
3560+
ltext = clean_value(config, m.group(5))
3561+
ltarget = clean_value(config, m.group(3))
3562+
elif not m.group(3):
3563+
continue
3564+
else:
3565+
txt = clean_value(config, m.group(3))
3566+
ltext = txt
3567+
ltarget = txt
3568+
ltarget = re.sub(r"\s+", " ", ltarget)
3569+
ltarget = ltarget.strip()
3570+
ltext = re.sub(r"\s+", " ", ltext)
3571+
ltext = ltext.strip()
3572+
if not ltext and not ltarget:
3573+
continue
3574+
if not ltext and ltarget:
3575+
ltext = ltarget
3576+
ltuple = (ltext, ltarget)
3577+
if ltuple not in sense_data.get("links", ()):
3578+
data_append(ctx, sense_data, "links", ltuple)
3579+
35383580

35393581
v = clean_value(config, v)
35403582
# print("After clean_value:", repr(v))

0 commit comments

Comments
 (0)