@@ -1353,7 +1353,8 @@ def parse_sense_node(node, sense_base, pos):
13531353 # "indentation", like "#:" or "##:"
13541354 return False
13551355
1356- # If a recursion call succeeds in push_sense(), bubble it up with added.
1356+ # If a recursive call succeeds in push_sense(), propagate that via
1357+ # `added`.
13571358 # Use added |= push_sense() or added |= parse_sense_node(...) to OR.
13581359 added = False
13591360
@@ -1466,10 +1467,13 @@ def sense_template_fn(name, ht):
14661467 add_form_of_tags (ctx , name , config .FORM_OF_TEMPLATES , sense_base )
14671468 return None
14681469
1470+ link_tuples = []
1471+
14691472 def extract_link_texts (item ):
14701473 """Recursively extracts link texts from the gloss source. This
14711474 information is used to select whether to remove final "." from
14721475 form_of/alt_of (e.g., ihm/Hunsrik)."""
1476+ nonlocal link_tuples
14731477 if isinstance (item , (list , tuple )):
14741478 for x in item :
14751479 extract_link_texts (x )
@@ -1498,7 +1502,8 @@ def extract_link_texts(item):
14981502 # get the raw text of non-list contents of this node, and other stuff
14991503 # like tag and category data added to sense_base
15001504 rawgloss = clean_node (config , ctx , sense_base , contents ,
1501- template_fn = sense_template_fn )
1505+ template_fn = sense_template_fn ,
1506+ collect_links = True )
15021507
15031508 if not rawgloss :
15041509 return False
@@ -3469,13 +3474,13 @@ def parse_page(ctx: Wtp, word: str, text: str, config: WiktionaryConfig) -> list
34693474 return ret
34703475
34713476
3472- def clean_node (config , ctx , category_data , value , template_fn = None ,
3473- post_template_fn = None ):
3477+ def clean_node (config , ctx , sense_data , value , template_fn = None ,
3478+ post_template_fn = None , collect_links = False ):
34743479 """Expands the node to text, cleaning up any HTML and duplicate spaces.
34753480 This is intended for expanding things like glosses for a single sense."""
34763481 assert isinstance (config , WiktionaryConfig )
34773482 assert isinstance (ctx , Wtp )
3478- assert category_data is None or isinstance (category_data , dict )
3483+ assert sense_data is None or isinstance (sense_data , dict )
34793484 assert template_fn is None or callable (template_fn )
34803485 assert post_template_fn is None or callable (post_template_fn )
34813486 # print("CLEAN_NODE:", repr(value))
@@ -3518,23 +3523,60 @@ def clean_node_handler_fn(node):
35183523 post_template_fn = post_template_fn )
35193524 # print("clean_node: v={!r}".format(v))
35203525
3521- # Capture categories if category_data has been given. We also track
3526+ # Capture categories if sense_data has been given. We also track
35223527 # Lua execution errors here.
3523- if category_data is not None :
3528+ # If collect_links=True (for glosses), capture links
3529+ if sense_data is not None :
35243530 # Check for Lua execution error
35253531 if v .find ('<strong class="error">Lua execution error' ) >= 0 :
3526- data_append (ctx , category_data , "tags" , "error-lua-exec" )
3532+ data_append (ctx , sense_data , "tags" , "error-lua-exec" )
35273533 if v .find ('<strong class="error">Lua timeout error' ) >= 0 :
3528- data_append (ctx , category_data , "tags" , "error-lua-timeout" )
3534+ data_append (ctx , sense_data , "tags" , "error-lua-timeout" )
35293535 # Capture Category tags
3530- for m in re .finditer (r"(?is)\[\[:?\s*Category\s*:([^]|]+)" , v ):
3531- cat = clean_value (config , m .group (1 ))
3532- cat = re .sub (r"\s+" , " " , cat )
3533- cat = cat .strip ()
3534- if not cat :
3535- continue
3536- if cat not in category_data .get ("categories" , ()):
3537- data_append (ctx , category_data , "categories" , cat )
3536+ if not collect_links :
3537+ for m in re .finditer (r"(?is)\[\[:?\s*Category\s*:([^]|]+)" , v ):
3538+ cat = clean_value (config , m .group (1 ))
3539+ cat = re .sub (r"\s+" , " " , cat )
3540+ cat = cat .strip ()
3541+ if not cat :
3542+ continue
3543+ if cat not in sense_data .get ("categories" , ()):
3544+ data_append (ctx , sense_data , "categories" , cat )
3545+ else :
3546+ for m in re .finditer (r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)"
3547+ r"(\|([^]|]+))?\]\]" , v ):
3548+ # Handle other "Namespace:rest-of-link" prefixes here as needed.
3549+ # NOTE(review): group(1) keeps the trailing ":"; compare group(2).
3550+ if m .group (1 ) and m .group (1 ).strip () == "Category" :
3551+ cat = clean_value (config , m .group (3 ))
3552+ cat = re .sub (r"\s+" , " " , cat )
3553+ cat = cat .strip ()
3554+ if not cat :
3555+ continue
3556+ if cat not in sense_data .get ("categories" , ()):
3557+ data_append (ctx , sense_data , "categories" , cat )
3558+ elif not m .group (1 ):
3559+ if m .group (5 ):
3560+ ltext = clean_value (config , m .group (5 ))
3561+ ltarget = clean_value (config , m .group (3 ))
3562+ elif not m .group (3 ):
3563+ continue
3564+ else :
3565+ txt = clean_value (config , m .group (3 ))
3566+ ltext = txt
3567+ ltarget = txt
3568+ ltarget = re .sub (r"\s+" , " " , ltarget )
3569+ ltarget = ltarget .strip ()
3570+ ltext = re .sub (r"\s+" , " " , ltext )
3571+ ltext = ltext .strip ()
3572+ if not ltext and not ltarget :
3573+ continue
3574+ if not ltext and ltarget :
3575+ ltext = ltarget
3576+ ltuple = (ltext , ltarget )
3577+ if ltuple not in sense_data .get ("links" , ()):
3578+ data_append (ctx , sense_data , "links" , ltuple )
3579+
35383580
35393581 v = clean_value (config , v )
35403582 # print("After clean_value:", repr(v))
0 commit comments