Skip to content

Commit 8ca3bf2

Browse files
Fixed a bug in classify_desc()
Previously, when doing unicodedata processing on the text, the unicode data was decomposed into component pieces (e.g., an ellipsis "…" -> "...") to make it easier to determine what data the characters contain (especially with COMBINING characters), but that decomposed data was then zip()ed together with the original text, causing mismatches when handling it; the indexes were off, so "character -> unicodedata" did not hold true. The description data is now normalized into normalized_desc, which is used where appropriate to analyze characters.
1 parent 6f16f57 commit 8ca3bf2

File tree

2 files changed

+20
-4
lines changed

2 files changed

+20
-4
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,7 @@ kj-wiktwords.sh
1313
tmp/
1414
.venv/
1515
.vscode/
16+
p
17+
pages/
18+
bacpages/
19+
outtmp/

wiktextract/form_descriptions.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2397,6 +2397,8 @@ def classify_desc(desc, allow_unknown_tags=False, no_unknown_starts=False):
23972397
if not desc:
23982398
return "other"
23992399

2400+
normalized_desc = unicodedata.normalize("NFKD", desc)
2401+
24002402
# If it can be fully decoded as tags without errors, treat as tags
24012403
tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
24022404
for tagset in tagsets:
@@ -2428,8 +2430,10 @@ def classify_desc(desc, allow_unknown_tags=False, no_unknown_starts=False):
24282430
if have_non_english >= len(lst) - 1 and have_non_english > 0:
24292431
return "taxonomic"
24302432

2431-
# If all words are in our English dictionary, interpret as English
2432-
if re.match(r"^[ -~―—“”…'‘’ʹ€]+$", desc) and len(desc) > 1:
2433+
# If all words are in our English dictionary, interpret as English.
2434+
# [ -~] is regex black magic, "all characters from space to tilde"
2435+
# in ASCII. Took me a while to figure out.
2436+
if re.match(r"^[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
24332437
if desc in english_words and desc[0].isalpha():
24342438
return "english" # Handles ones containing whitespace
24352439
desc1 = re.sub(tokenizer_fixup_re,
@@ -2488,11 +2492,15 @@ def classify_desc(desc, allow_unknown_tags=False, no_unknown_starts=False):
24882492
# treat as romanization
24892493
classes = list(unicodedata.category(x)
24902494
if x not in ("-", ",", ":", "/", '"') else "OK"
2491-
for x in unicodedata.normalize("NFKD", desc))
2495+
for x in normalized_desc)
24922496
classes1 = []
24932497
num_latin = 0
24942498
num_greek = 0
2495-
for ch, cl in zip(desc, classes):
2499+
# part = ""
2500+
# for ch, cl in zip(normalized_desc, classes):
2501+
# part += f"{ch}({cl})"
2502+
# print(part)
2503+
for ch, cl in zip(normalized_desc, classes):
24962504
if ch in ("'", # ' in Arabic, / in IPA-like parenthesized forms
24972505
".", # e.g., "..." in translations
24982506
";",
@@ -2504,7 +2512,11 @@ def classify_desc(desc, allow_unknown_tags=False, no_unknown_starts=False):
25042512
'“',
25052513
'”',
25062514
"/",
2515+
"?",
25072516
"…", # alternative to "..."
2517+
"⁉", # 見る/Japanese automatic transcriptions...
2518+
"?",
2519+
"!",
25082520
"⁻", # superscript -, used in some Cantonese roman, e.g. "we"
25092521
"ʔ",
25102522
"ʼ",

0 commit comments

Comments
 (0)