Skip to content

Commit 66413ce

Browse files
authored
Merge pull request syntactic#7 from syntactic/2025_update
Add comprehensive Unicode support and publish to PyPI (v2.1.1)

- Add Unicode support for 10+ major language scripts (Chinese, Japanese, Korean, Arabic, Russian, Hebrew, Greek, Thai, Hindi, and more) in both parsers (legacy and modern)
- Add 11 comprehensive Unicode tests covering all supported scripts
- Fix argparse support in DeterministicGenerator CLI (--help now works)
- Publish to PyPI as 'jsgf-tools' package
- Add modern pyproject.toml for Python packaging standards
- Add MANIFEST.in for proper file inclusion in distribution
- Update README with PyPI installation instructions and Unicode examples
- Update setup.py with enhanced metadata and keywords
2 parents 54a7470 + f0fad03 commit 66413ce

15 files changed

+1792
-15
lines changed

DeterministicGenerator.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
a segmentation fault.
3737
"""
3838

39-
import sys, itertools
39+
import sys, itertools, argparse
4040
import JSGFParser as parser
4141
import JSGFGrammar as gram
4242

@@ -124,20 +124,25 @@ def main():
124124
"""Main function for command line usage"""
125125
global grammar
126126

127-
if len(sys.argv) != 2:
128-
print("Usage: python DeterministicGenerator.py <grammarFile>")
129-
sys.exit(1)
127+
arg_parser = argparse.ArgumentParser(
128+
description='Generate all possible strings from a non-recursive JSGF grammar'
129+
)
130+
arg_parser.add_argument(
131+
'grammarFile',
132+
help='Path to the JSGF grammar file'
133+
)
134+
args = arg_parser.parse_args()
130135

131136
try:
132-
with open(sys.argv[1], 'r') as fileStream:
137+
with open(args.grammarFile, 'r') as fileStream:
133138
grammar = parser.getGrammarObject(fileStream)
134139

135140
for rule in grammar.publicRules:
136141
expansions = processRHS(rule.rhs)
137142
for expansion in expansions:
138143
print(expansion)
139144
except FileNotFoundError:
140-
print(f"Error: Grammar file '{sys.argv[1]}' not found")
145+
print(f"Error: Grammar file '{args.grammarFile}' not found")
141146
sys.exit(1)
142147
except Exception as e:
143148
print(f"Error processing grammar: {e}")

JSGFParser.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,30 @@
6262
import JSGFGrammar as gram
6363
from pyparsing import (Word, Literal, Group, Optional, ZeroOrMore, OneOrMore,
6464
Forward, MatchFirst, Combine, alphas, alphanums, nums,
65-
stringEnd)
65+
stringEnd, pyparsing_unicode)
6666

6767
sys.setrecursionlimit(100000)
6868
usePackrat = True
6969

70+
# Unicode support: Tier 1 + Tier 2 scripts for comprehensive language coverage
71+
# Covers 5+ billion speakers: Latin, CJK, Arabic, Cyrillic, Devanagari, Hangul, Hebrew, Greek, Thai
72+
# Note: Using printables for scripts with combining characters (Thai, Devanagari)
73+
_unicode_letters = (
74+
# Tier 1: Major scripts (Latin, CJK, Arabic, Cyrillic)
75+
pyparsing_unicode.Latin1.alphas +
76+
pyparsing_unicode.LatinA.alphas +
77+
pyparsing_unicode.LatinB.alphas +
78+
pyparsing_unicode.CJK.alphas +
79+
pyparsing_unicode.Arabic.alphas +
80+
pyparsing_unicode.Cyrillic.alphas +
81+
# Tier 2: Common scripts (using printables for scripts with combining marks)
82+
pyparsing_unicode.Devanagari.printables +
83+
pyparsing_unicode.Hangul.alphas +
84+
pyparsing_unicode.Hebrew.alphas +
85+
pyparsing_unicode.Greek.alphas +
86+
pyparsing_unicode.Thai.printables
87+
)
88+
7089
def foundWeight(s, loc, toks):
7190
"""
7291
PyParsing action to run when a weight is found.
@@ -165,11 +184,11 @@ def foundSeq(s, loc, toks):
165184
# PyParsing rule for a weight
166185
weight = (Literal('/').suppress() + (Word(nums + '.')).setResultsName('weightAmount') + Literal('/').suppress()).setParseAction(foundWeight).setResultsName("weight")
167186

168-
# PyParsing rule for a token
169-
token = Word(alphanums+"'_-,.?@").setResultsName('token').setParseAction(foundToken)
187+
# PyParsing rule for a token (with Unicode support)
188+
token = Word(alphanums + _unicode_letters + "'_-,.?@").setResultsName('token').setParseAction(foundToken)
170189

171-
# PyParsing rule for a nonterminal reference
172-
nonterminal = Combine(Literal('<') + Word(alphanums+'$_:;,=|/\\()[]@#%!^&~') + Literal('>')).setParseAction(foundNonterminal).setResultsName('NonTerminal')
190+
# PyParsing rule for a nonterminal reference (with Unicode support)
191+
nonterminal = Combine(Literal('<') + Word(alphanums + _unicode_letters + '$_:;,=|/\\()[]@#%!^&~') + Literal('>')).setParseAction(foundNonterminal).setResultsName('NonTerminal')
173192

174193
Sequence = Forward()
175194

MANIFEST.in

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include README.md
2+
include LICENSE
3+
include CLAUDE.md
4+
include requirements-dev.txt
5+
include pytest.ini
6+
include *.gram
7+
recursive-include jsgf *.py
8+
recursive-exclude * __pycache__
9+
recursive-exclude * *.py[co]

README.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ A Python library for parsing and generating strings from JSGF (Java Speech Gramm
1515

1616
## Installation
1717

18+
### From PyPI (Recommended)
19+
```bash
20+
pip install jsgf-tools
21+
```
22+
1823
### From Source
1924
```bash
2025
git clone https://github.com/syntactic/JSGFTools.git
@@ -26,7 +31,7 @@ pip install -e .
2631
```bash
2732
git clone https://github.com/syntactic/JSGFTools.git
2833
cd JSGFTools
29-
pip install -r requirements-dev.txt
34+
pip install -e ".[dev]"
3035
```
3136

3237
## Quick Start
@@ -101,6 +106,26 @@ public <start> = <greeting> <target>;
101106
- Grouping with parentheses
102107
- Comments (// and /* */)
103108
- Public and private rules
109+
- **Unicode support** for 10+ major language scripts
110+
111+
### Unicode Support
112+
113+
JSGFTools fully supports Unicode characters in both tokens and rule names, covering:
114+
- **Latin scripts** (English, Spanish, French, etc.)
115+
- **CJK** (Chinese, Japanese Kanji/Hiragana/Katakana, Korean Hanja)
116+
- **Arabic** (Arabic, Persian, Urdu)
117+
- **Cyrillic** (Russian, Ukrainian, Bulgarian)
118+
- **Devanagari** (Hindi, Sanskrit, Marathi)
119+
- **Hangul** (Korean)
120+
- **Hebrew**
121+
- **Greek**
122+
- **Thai**
123+
124+
Example:
125+
```jsgf
126+
public <greeting> = hello | 你好 | こんにちは | مرحبا | привет | שלום;
127+
public <问候> = 您好 | 欢迎;
128+
```
104129

105130
### Not Yet Supported
106131
- Kleene operators (* and +)
@@ -162,5 +187,7 @@ MIT License. See [LICENSE](LICENSE) file for details.
162187

163188
## Version History
164189

190+
- **2.1.1**: Fixed argparse support in DeterministicGenerator CLI (--help now works)
191+
- **2.1.0**: Added comprehensive Unicode support (10+ language scripts), published to PyPI
165192
- **2.0.0**: Complete Python 3 modernization, added test suite, improved packaging
166193
- **1.x**: Original Python 2.7 version

jsgf/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""
2+
JSGF Tools - Modern Python library for parsing JSGF grammars and generating strings from them.
3+
4+
This package provides a clean, object-oriented API for working with JSGF grammars.
5+
"""
6+
7+
from .grammar import Grammar
8+
from .generators import DeterministicGenerator, ProbabilisticGenerator
9+
from .exceptions import JSGFError, ParseError, GenerationError
10+
11+
__version__ = "2.0.0"
12+
__all__ = [
13+
"Grammar",
14+
"DeterministicGenerator",
15+
"ProbabilisticGenerator",
16+
"JSGFError",
17+
"ParseError",
18+
"GenerationError"
19+
]

jsgf/ast_nodes.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""
2+
Abstract Syntax Tree nodes for JSGF grammars.
3+
4+
This module provides the core AST node classes that represent different
5+
parts of a JSGF grammar structure.
6+
"""
7+
8+
from typing import List, Union, Any, Optional
9+
from abc import ABC, abstractmethod
10+
11+
12+
class JSGFNode(ABC):
13+
"""Base class for all JSGF AST nodes."""
14+
15+
@abstractmethod
16+
def __str__(self) -> str:
17+
"""Return a string representation of this node."""
18+
pass
19+
20+
def __repr__(self) -> str:
21+
return f"{self.__class__.__name__}({str(self)})"
22+
23+
24+
class Terminal(JSGFNode):
25+
"""Represents a terminal symbol (token) in the grammar."""
26+
27+
def __init__(self, value: str):
28+
self.value = value
29+
30+
def __str__(self) -> str:
31+
return self.value
32+
33+
def __eq__(self, other: Any) -> bool:
34+
return isinstance(other, Terminal) and self.value == other.value
35+
36+
def __hash__(self) -> int:
37+
return hash(self.value)
38+
39+
40+
class NonTerminal(JSGFNode):
41+
"""Represents a non-terminal symbol in the grammar."""
42+
43+
def __init__(self, name: str):
44+
self.name = name
45+
46+
def __str__(self) -> str:
47+
return self.name
48+
49+
def __eq__(self, other: Any) -> bool:
50+
return isinstance(other, NonTerminal) and self.name == other.name
51+
52+
def __hash__(self) -> int:
53+
return hash(self.name)
54+
55+
56+
class Sequence(JSGFNode):
57+
"""Represents a sequence of elements."""
58+
59+
def __init__(self, elements: List[JSGFNode]):
60+
self.elements = elements
61+
62+
def __str__(self) -> str:
63+
return " ".join(str(element) for element in self.elements)
64+
65+
def __iter__(self):
66+
return iter(self.elements)
67+
68+
def __len__(self) -> int:
69+
return len(self.elements)
70+
71+
def __getitem__(self, index: int) -> JSGFNode:
72+
return self.elements[index]
73+
74+
75+
class Alternative(JSGFNode):
76+
"""Represents alternatives (choices) in the grammar."""
77+
78+
def __init__(self, choices: List[Union[JSGFNode, tuple]]):
79+
"""
80+
Initialize alternatives.
81+
82+
Args:
83+
choices: List of choices. Each choice can be:
84+
- A JSGFNode (unweighted)
85+
- A tuple of (JSGFNode, weight) (weighted)
86+
"""
87+
self.choices = []
88+
for choice in choices:
89+
if isinstance(choice, tuple):
90+
node, weight = choice
91+
self.choices.append((node, float(weight)))
92+
else:
93+
self.choices.append((choice, 1.0)) # Default weight
94+
95+
def __str__(self) -> str:
96+
choice_strs = []
97+
for node, weight in self.choices:
98+
if weight != 1.0:
99+
choice_strs.append(f"/{weight}/ {node}")
100+
else:
101+
choice_strs.append(str(node))
102+
return "( " + " | ".join(choice_strs) + " )"
103+
104+
def __iter__(self):
105+
return iter(self.choices)
106+
107+
def __len__(self) -> int:
108+
return len(self.choices)
109+
110+
def get_weights(self) -> List[float]:
111+
"""Return the weights of all choices."""
112+
return [weight for _, weight in self.choices]
113+
114+
def get_nodes(self) -> List[JSGFNode]:
115+
"""Return the nodes of all choices."""
116+
return [node for node, _ in self.choices]
117+
118+
119+
class Optional(JSGFNode):
120+
"""Represents an optional element in the grammar."""
121+
122+
def __init__(self, element: JSGFNode):
123+
self.element = element
124+
125+
def __str__(self) -> str:
126+
return f"[ {self.element} ]"
127+
128+
129+
class Group(JSGFNode):
130+
"""Represents a grouped element."""
131+
132+
def __init__(self, element: JSGFNode):
133+
self.element = element
134+
135+
def __str__(self) -> str:
136+
return f"( {self.element} )"
137+
138+
139+
class Rule:
140+
"""Represents a complete grammar rule."""
141+
142+
def __init__(self, name: str, expansion: JSGFNode, is_public: bool = False):
143+
self.name = name
144+
self.expansion = expansion
145+
self.is_public = is_public
146+
147+
def __str__(self) -> str:
148+
prefix = "public " if self.is_public else ""
149+
return f"{prefix}<{self.name}> = {self.expansion};"
150+
151+
def __repr__(self) -> str:
152+
return f"Rule(name='{self.name}', is_public={self.is_public})"

0 commit comments

Comments
 (0)