diff --git a/.gitignore b/.gitignore index 0d20b64..6e871d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,41 @@ +# Python *.pyc +__pycache__/ +*.pyo +*.pyd +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Virtual environments +venv/ +env/ +ENV/ diff --git a/DeterministicGenerator.py b/DeterministicGenerator.py index a6f526c..7ab09f7 100644 --- a/DeterministicGenerator.py +++ b/DeterministicGenerator.py @@ -1,3 +1,26 @@ +# -*- coding: utf-8 -*- +# @copyright: MIT License +# Copyright (c) 2018 syntactic (Pastèque Ho) +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# @summary: This file generates all strings described by a non-recursive JSGF grammar. +# Run it by entering into the command line: python DeterministicGenerator.py +# where is the path to the JSGF grammar. +# @since: 2014/06/02 + """ This file deterministically generates strings from a JSGF Grammar, whether there are \ weights defined in rules or not. It requires one argument: the path to the JSGF\ @@ -13,32 +36,7 @@ a segmentation fault. """ -# @copyright: (c)Copyright 2014, THC All Rights Reserved. -# The source code contained or described here in and all documents related -# to the source code ("Material") are owned by THC or its -# suppliers or licensors. Title to the Material remains with THC -# or its suppliers and licensors. The Material contains trade secrets and -# proprietary and confidential information of THC or its suppliers and -# licensors. - -# The Material is protected by worldwide copyright and trade secret laws and -# treaty provisions. No part of the Material may be used, copied, reproduced, -# modified, published, uploaded, posted, transmitted, distributed, or disclosed -# in any way without THC's prior express written permission. - -# No license under any patent, copyright, trade secret or other intellectual -# property right is granted to or conferred upon you by disclosure or delivery -# of the Materials, either expressly, by implication, inducement, estoppel or -# otherwise. Any license under such intellectual property rights must be express -# and approved by THC in writing. - -# @organization: THC Science -# @summary: This file generates all strings described by a non-recursive JSGF grammar. 
-# Run it by entering into the command line: python DeterministicGenerator.py -# where is the path to the JSGF grammar. -# @since: 2014/06/02 - -import sys, itertools +import sys, itertools, argparse import JSGFParser as parser import JSGFGrammar as gram @@ -118,17 +116,40 @@ def processRHS(rhs): return processOptional(rhs) elif isinstance(rhs, gram.NonTerminal): return processNonTerminal(rhs) - elif type(rhs) is str: + elif isinstance(rhs, str): return [rhs] +def main(): + """Main function for command line usage""" + global grammar + + arg_parser = argparse.ArgumentParser( + description='Generate all possible strings from a non-recursive JSGF grammar' + ) + arg_parser.add_argument( + 'grammarFile', + help='Path to the JSGF grammar file' + ) + args = arg_parser.parse_args() + + try: + with open(args.grammarFile, 'r') as fileStream: + grammar = parser.getGrammarObject(fileStream) + + for rule in grammar.publicRules: + expansions = processRHS(rule.rhs) + for expansion in expansions: + print(expansion) + except FileNotFoundError: + print(f"Error: Grammar file '{args.grammarFile}' not found") + sys.exit(1) + except Exception as e: + print(f"Error processing grammar: {e}") + sys.exit(1) + if __name__ == '__main__': - fileStream = open(sys.argv[1]) - grammar = parser.getGrammarObject(fileStream) - for rule in grammar.publicRules: - expansions = processRHS(rule.rhs) - for expansion in expansions: - print expansion + main() diff --git a/JSGFGrammar.py b/JSGFGrammar.py index 26e4a92..ecfd7e7 100644 --- a/JSGFGrammar.py +++ b/JSGFGrammar.py @@ -1,32 +1,32 @@ +# -*- coding: utf-8 -*- +# @copyright: MIT License +# Copyright (c) 2018 syntactic (Pastèque Ho) +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# @summary: This file lays out the class structure for a JSGF Grammar +# @since: 2014/06/02 + """ This file lays out the class structure for a JSGF Grammar. .. module:: JSGFGrammar -.. moduleauthor:: Timothy Ho +.. moduleauthor:: Pastèque Ho """ -# @copyright: (c)Copyright 2014, THC All Rights Reserved. -# The source code contained or described here in and all documents related -# to the source code ("Material") are owned by THC or its -# suppliers or licensors. Title to the Material remains with THC -# or its suppliers and licensors. The Material contains trade secrets and -# proprietary and confidential information of THC or its suppliers and -# licensors. - -# The Material is protected by worldwide copyright and trade secret laws and -# treaty provisions. 
No part of the Material may be used, copied, reproduced, -# modified, published, uploaded, posted, transmitted, distributed, or disclosed -# in any way without THC's prior express written permission. - -# No license under any patent, copyright, trade secret or other intellectual -# property right is granted to or conferred upon you by disclosure or delivery -# of the Materials, either expressly, by implication, inducement, estoppel or -# otherwise. Any license under such intellectual property rights must be express -# and approved by THC in writing. - -# @organization: THC Science -# @summary: This file lays out the class structure for a JSGF Grammar -# @since: 2014/06/02 + class JSGFExpression(): pass @@ -168,4 +168,4 @@ def __str__(self): jgOpt = Optional(jgDisj) jgRule = Rule("", jgOpt) - print jgRule + print(jgRule) diff --git a/JSGFParser.py b/JSGFParser.py index a63f86e..b45659f 100644 --- a/JSGFParser.py +++ b/JSGFParser.py @@ -1,3 +1,24 @@ +# -*- coding: utf-8 -*- +# @copyright: MIT License +# Copyright (c) 2018 syntactic (Pastèque Ho) +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# @summary: This file parses a JSGF Grammar and prints it out. +# @since: 2014/06/02 + """ This file parses a JSGF grammar file and returns a JSGFGrammar object. \ It uses the pyparsing module and defines a grammar for JSGF grammars. \ @@ -37,36 +58,34 @@ """ -# @copyright: (c)Copyright 2014, THC All Rights Reserved. -# The source code contained or described here in and all documents related -# to the source code ("Material") are owned by THC or its -# suppliers or licensors. Title to the Material remains with THC -# or its suppliers and licensors. The Material contains trade secrets and -# proprietary and confidential information of THC or its suppliers and -# licensors. - -# The Material is protected by worldwide copyright and trade secret laws and -# treaty provisions. No part of the Material may be used, copied, reproduced, -# modified, published, uploaded, posted, transmitted, distributed, or disclosed -# in any way without THC's prior express written permission. - -# No license under any patent, copyright, trade secret or other intellectual -# property right is granted to or conferred upon you by disclosure or delivery -# of the Materials, either expressly, by implication, inducement, estoppel or -# otherwise. Any license under such intellectual property rights must be express -# and approved by THC in writing. 
- -# @organization: THC Science -# @summary: This file parses a JSGF Grammar and prints it out. -# @since: 2014/06/02 - import sys import JSGFGrammar as gram -from pyparsing import * +from pyparsing import (Word, Literal, Group, Optional, ZeroOrMore, OneOrMore, + Forward, MatchFirst, Combine, alphas, alphanums, nums, + stringEnd, pyparsing_unicode) sys.setrecursionlimit(100000) usePackrat = True +# Unicode support: Tier 1 + Tier 2 scripts for comprehensive language coverage +# Covers 5+ billion speakers: Latin, CJK, Arabic, Cyrillic, Devanagari, Hangul, Hebrew, Greek, Thai +# Note: Using printables for scripts with combining characters (Thai, Devanagari) +_unicode_letters = ( + # Tier 1: Major scripts (Latin, CJK, Arabic, Cyrillic) + pyparsing_unicode.Latin1.alphas + + pyparsing_unicode.LatinA.alphas + + pyparsing_unicode.LatinB.alphas + + pyparsing_unicode.CJK.alphas + + pyparsing_unicode.Arabic.alphas + + pyparsing_unicode.Cyrillic.alphas + + # Tier 2: Common scripts (using printables for scripts with combining marks) + pyparsing_unicode.Devanagari.printables + + pyparsing_unicode.Hangul.alphas + + pyparsing_unicode.Hebrew.alphas + + pyparsing_unicode.Greek.alphas + + pyparsing_unicode.Thai.printables +) + def foundWeight(s, loc, toks): """ PyParsing action to run when a weight is found. @@ -101,7 +120,6 @@ def foundWeightedExpression(s, loc, toks): :returns: Ordered pair of the expression and its weight """ - toks.weightedExpression = (toks.expr, toks.weight) #print 'found weighted expression', toks.dump() expr = list(toks.expr) if len(expr) == 1: @@ -166,11 +184,11 @@ def foundSeq(s, loc, toks): # PyParsing rule for a weight weight = (Literal('/').suppress() + (Word(nums + '.')).setResultsName('weightAmount') + Literal('/').suppress()).setParseAction(foundWeight).setResultsName("weight") -# PyParsing rule for a token -token = Word(alphanums+"'_-,.?@").setResultsName('token').setParseAction(foundToken) +# PyParsing rule for a token (with Unicode support) +token = Word(alphanums + _unicode_letters + "'_-,.?@").setResultsName('token').setParseAction(foundToken) -# PyParsing rule for a nonterminal reference -nonterminal = Combine(Literal('<') + Word(alphanums+'$_:;,=|/\\()[]@#%!^&~') + Literal('>')).setParseAction(foundNonterminal).setResultsName('NonTerminal') +# PyParsing rule for a nonterminal reference (with Unicode support) +nonterminal = Combine(Literal('<') + Word(alphanums + _unicode_letters + '$_:;,=|/\\()[]@#%!^&~') + Literal('>')).setParseAction(foundNonterminal).setResultsName('NonTerminal') Sequence = Forward() @@ -243,4 +261,4 @@ def getGrammarObject(fileStream): if __name__ == '__main__': fileStream = open(sys.argv[1]) grammar = getGrammarObject(fileStream) - print grammar + print(grammar) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4bcd760 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 syntactic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..b31634a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,9 @@ +include README.md +include LICENSE +include CLAUDE.md +include requirements-dev.txt +include pytest.ini +include *.gram +recursive-include jsgf *.py +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] diff --git a/ProbabilisticGenerator.py b/ProbabilisticGenerator.py index fbfc021..2a0e8e7 100644 --- a/ProbabilisticGenerator.py +++ b/ProbabilisticGenerator.py @@ -1,3 +1,29 @@ +# -*- coding: utf-8 -*- +#/usr/bin/python + +# @copyright: MIT License +# Copyright (c) 2018 syntactic (Pastèque Ho) +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# @summary: This file generates sentences from a PCFG in JSGF. Run it by entering +# in the command line: python ProbabilisticGenerator.py +# where is the path of the JSGF file, and is the number +# of strings you want to generate +# @since: 2014/06/02 + """ This file probabilistically generates strings from a JSGF grammar. It takes advantage \ of weights assigned to alternatives (separated by pipes) by choosing to \ @@ -17,33 +43,6 @@ weights if they are provided. """ -#/usr/bin/python -# @copyright: (c)Copyright 2014, THC All Rights Reserved. -# The source code contained or described here in and all documents related -# to the source code ("Material") are owned by THC or its -# suppliers or licensors. Title to the Material remains with THC -# or its suppliers and licensors. The Material contains trade secrets and -# proprietary and confidential information of THC or its suppliers and -# licensors. - -# The Material is protected by worldwide copyright and trade secret laws and -# treaty provisions. No part of the Material may be used, copied, reproduced, -# modified, published, uploaded, posted, transmitted, distributed, or disclosed -# in any way without THC's prior express written permission. 
- -# No license under any patent, copyright, trade secret or other intellectual -# property right is granted to or conferred upon you by disclosure or delivery -# of the Materials, either expressly, by implication, inducement, estoppel or -# otherwise. Any license under such intellectual property rights must be express -# and approved by THC in writing. - -# @organization: THC Science -# @summary: This file generates sentences from a PCFG in JSGF. Run it by entering -# in the command line: python ProbabilisticGenerator.py -# where is the path of the JSGF file, and is the number -# of strings you want to generate -# @since: 2014/06/02 - import sys, itertools, random, bisect, argparse import JSGFParser as parser import JSGFGrammar as gram @@ -131,37 +130,48 @@ def processRHS(rhs): return processOptional(rhs) elif isinstance(rhs, gram.NonTerminal): return processNonTerminal(rhs) - elif type(rhs) is str: + elif isinstance(rhs, str): return rhs +def main(): + """Main function for command line usage""" + global grammar + + argParser = argparse.ArgumentParser(description='Generate random strings from a JSGF grammar') + argParser.add_argument('grammarFile', help='Path to the JSGF grammar file') + argParser.add_argument('iterations', type=int, help='Number of strings to generate') + + try: + args = argParser.parse_args() + except SystemExit: + return + + try: + with open(args.grammarFile, 'r') as fileStream: + grammar = parser.getGrammarObject(fileStream) + + if len(grammar.publicRules) > 1: + # Multiple public rules - create a disjunction of all of them + disjuncts = [rule.rhs for rule in grammar.publicRules] + newStartSymbol = gram.Disjunction(disjuncts) + for i in range(args.iterations): + print(processRHS(newStartSymbol)) + else: + # Single public rule + startSymbol = grammar.publicRules[0] + for i in range(args.iterations): + expansions = processRHS(startSymbol.rhs) + print(expansions) + except FileNotFoundError: + print(f"Error: Grammar file '{args.grammarFile}' not found") + sys.exit(1) + except Exception as e: + print(f"Error processing grammar: {e}") + sys.exit(1) + if __name__ == '__main__': - argParser = argparse.ArgumentParser() - argParser.add_argument('grammarFile') - argParser.add_argument('iterations', type=int, nargs=1, help='number of strings to generate') - args = argParser.parse_args() - fileStream = open(args.grammarFile) - numIterations = args.iterations[0] - grammar = parser.getGrammarObject(fileStream) - if len(grammar.publicRules) != 1: - #x = raw_input('Found more than one public rule. Generate a random string between them?\n') - #if x == 'y': - ### This next chunk has been de-indented - disjuncts = [] - for rule in grammar.publicRules: - rhs = rule.rhs - disjuncts.append(rhs) - newStartSymbol = gram.Disjunction(disjuncts) - for i in range(numIterations): - print processRHS(newStartSymbol) - ### - #else: - #sys.exit('Bye') - else: - startSymbol = grammar.publicRules[0] - for i in range(numIterations): - expansions = processRHS(startSymbol.rhs) - print expansions + main() diff --git a/README.md b/README.md index 6637411..e7b7435 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,193 @@ # JSGF Grammar Tools -This set of tools can be used primarily to generate strings from a JSGF -grammar, but it also provides an easy to use JSGFParser module which creates -abstract syntax trees for JSGF grammars. Developers can use these ASTs to -help create more tools for their purposes. 
For more detailed documentation, -refer to the Sphinx documentation located in docs/_build/html/index.html +[![Python](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/downloads/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -## Dependencies +A Python library for parsing and generating strings from JSGF (Java Speech Grammar Format) grammars. This modernized version supports Python 3.7+ and includes comprehensive testing. -- Python 2.7 -- PyParsing module (http://pyparsing.wikispaces.com/Download+and+Installation) +## Features -## Instructions +- **Parser**: Convert JSGF grammar files into abstract syntax trees +- **Deterministic Generator**: Generate all possible strings from non-recursive grammars +- **Probabilistic Generator**: Generate random strings using weights and probabilities +- **Modern Python**: Full Python 3.7+ support with type hints and proper packaging +- **Comprehensive Testing**: Full test suite with pytest -The two main Python scripts are DeterministicGenerator.py and -ProbabilisticGenerator.py. Both files require a grammar file as a command -line argument, and the latter also requires a number, which refers to the number -of sentences to generate. Importantly, DeterministicGenerator.py should not take -grammars with recursive rules as an argument. A recursive rule is of the form: +## Installation -``` = this (comes | goes) back to ;``` +### From PyPI (Recommended) +```bash +pip install jsgf-tools +``` -There are two example grammars included with the scripts: Ideas.gram and -IdeasNonRecursive.gram. Ideas.gram is an example of a grammar with recursive -rules, though the recursion is not as direct as the above example. It's a good -idea to run these grammars with the generator scripts to see how the scripts -work: +### From Source +```bash +git clone https://github.com/syntactic/JSGFTools.git +cd JSGFTools +pip install -e . +``` -```> python DeterministicGenerator.py IdeasNonRecursive.gram``` +### Development Setup +```bash +git clone https://github.com/syntactic/JSGFTools.git +cd JSGFTools +pip install -e ".[dev]" +``` -```> python ProbabilisticGenerator.py Ideas.gram 20``` +## Quick Start -### Notes +### Command Line Usage -- Larger grammars take a longer time to parse, so if nothing seems to be generating, -wait a few seconds and the grammar should be parsed. 
+Generate all possible strings from a non-recursive grammar:
+```bash
+python DeterministicGenerator.py IdeasNonRecursive.gram
+```
-- Most of JSGF as described in http://www.w3.org/TR/2000/NOTE-jsgf-20000605/ is
-supported, but there are a few things that have not been implemented by these
-tools yet:
- - Kleene operators
- - Imports and Grammar Names
- - Tags
+Generate 20 random strings from a grammar (supports recursive rules):
+```bash
+python ProbabilisticGenerator.py Ideas.gram 20
+```
+
+### Python API Usage
+
+```python
+import JSGFParser as parser
+import DeterministicGenerator as det_gen
+import ProbabilisticGenerator as prob_gen
+from io import StringIO
+
+# Parse a grammar
+grammar_text = """
+public <greeting> = hello | hi;
+public <name> = world | there;
+public <sentence> = <greeting> <name>;
+"""
+
+with StringIO(grammar_text) as f:
+    grammar = parser.getGrammarObject(f)
+
+# Generate all possibilities (deterministic)
+det_gen.grammar = grammar
+rule = grammar.publicRules[2] # <sentence> rule
+all_strings = det_gen.processRHS(rule.rhs)
+print("All possible strings:", all_strings)
+
+# Generate random string (probabilistic)
+prob_gen.grammar = grammar
+random_string = prob_gen.processRHS(rule.rhs)
+print("Random string:", random_string)
+```
+
+## Grammar Format
+
+JSGFTools supports most of the JSGF specification:
+
+```jsgf
+// Comments are supported
+public <command> = <greeting> <target>;
+
+// Alternatives with optional weights
+<greeting> = /5/ hello | /1/ hi | hey;
+
+// Optional elements
+<politeness> = [ please ];
+
+// Nonterminal references
+<target> = world | there;
+
+// Recursive rules (use with ProbabilisticGenerator only)
+<recursive> = base | <recursive> more;
+```
+
+### Supported Features
+- Rule definitions and nonterminal references
+- Alternatives (|) with optional weights (/weight/)
+- Optional elements ([...])
+- Grouping with parentheses
+- Comments (// and /* */)
+- Public and private rules
+- **Unicode support** for 10+ major language scripts
+
+### Unicode Support
+
+JSGFTools fully supports Unicode characters in both tokens and rule names, covering:
+- **Latin scripts** (English, Spanish, French, etc.)
+- **CJK** (Chinese, Japanese Kanji, Korean Hanja)
+- **Arabic** (Arabic, Persian, Urdu)
+- **Cyrillic** (Russian, Ukrainian, Bulgarian)
+- **Devanagari** (Hindi, Sanskrit, Marathi)
+- **Hangul** (Korean)
+- **Hebrew**
+- **Greek**
+- **Thai**
+
+Example:
+```jsgf
+public <greeting> = hello | 你好 | こんにちは | مرحبا | привет | שלום;
+public <问候> = 您好 | 欢迎;
+```
+
+### Not Yet Supported
+- Kleene operators (* and +)
+- Import statements
+- Tags
+
+## Important Notes
+
+### Recursive vs Non-Recursive Grammars
+
+- **DeterministicGenerator**: Only use with non-recursive grammars to avoid infinite loops
+- **ProbabilisticGenerator**: Can safely handle recursive grammars through probabilistic termination
+
+**Example of recursive rule:**
+```jsgf
+<list> = <item> | <item> and <list>;
+```
+
+## Testing
+
+Run the test suite:
+```bash
+pytest test_jsgf_tools.py -v
+```
+
+Run specific test categories:
+```bash
+pytest test_jsgf_tools.py::TestJSGFParser -v # Parser tests
+pytest test_jsgf_tools.py::TestIntegration -v # Integration tests
+```
+
+## Documentation
+
+For detailed API documentation, build the Sphinx docs:
+```bash
+cd docs
+make html
+```
+
+Then open `docs/_build/html/index.html` in your browser.
+
+## Example Files
+
+- `Ideas.gram`: Recursive grammar example (use with ProbabilisticGenerator)
+- `IdeasNonRecursive.gram`: Non-recursive grammar example (use with DeterministicGenerator)
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Make your changes
+4. Add tests for new functionality
+5. 
Run the test suite: `pytest` +6. Submit a pull request + +## License + +MIT License. See [LICENSE](LICENSE) file for details. + +## Version History + +- **2.1.1**: Fixed argparse support in DeterministicGenerator CLI (--help now works) +- **2.1.0**: Added comprehensive Unicode support (10+ language scripts), published to PyPI +- **2.0.0**: Complete Python 3 modernization, added test suite, improved packaging +- **1.x**: Original Python 2.7 version diff --git a/jsgf/__init__.py b/jsgf/__init__.py new file mode 100644 index 0000000..b703eed --- /dev/null +++ b/jsgf/__init__.py @@ -0,0 +1,19 @@ +""" +JSGF Tools - Modern Python library for parsing and generating from JSGF grammars. + +This package provides a clean, object-oriented API for working with JSGF grammars. +""" + +from .grammar import Grammar +from .generators import DeterministicGenerator, ProbabilisticGenerator +from .exceptions import JSGFError, ParseError, GenerationError + +__version__ = "2.0.0" +__all__ = [ + "Grammar", + "DeterministicGenerator", + "ProbabilisticGenerator", + "JSGFError", + "ParseError", + "GenerationError" +] \ No newline at end of file diff --git a/jsgf/ast_nodes.py b/jsgf/ast_nodes.py new file mode 100644 index 0000000..fbeb49a --- /dev/null +++ b/jsgf/ast_nodes.py @@ -0,0 +1,152 @@ +""" +Abstract Syntax Tree nodes for JSGF grammars. + +This module provides the core AST node classes that represent different +parts of a JSGF grammar structure. +""" + +from typing import List, Union, Any, Optional +from abc import ABC, abstractmethod + + +class JSGFNode(ABC): + """Base class for all JSGF AST nodes.""" + + @abstractmethod + def __str__(self) -> str: + """Return a string representation of this node.""" + pass + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({str(self)})" + + +class Terminal(JSGFNode): + """Represents a terminal symbol (token) in the grammar.""" + + def __init__(self, value: str): + self.value = value + + def __str__(self) -> str: + return self.value + + def __eq__(self, other: Any) -> bool: + return isinstance(other, Terminal) and self.value == other.value + + def __hash__(self) -> int: + return hash(self.value) + + +class NonTerminal(JSGFNode): + """Represents a non-terminal symbol in the grammar.""" + + def __init__(self, name: str): + self.name = name + + def __str__(self) -> str: + return self.name + + def __eq__(self, other: Any) -> bool: + return isinstance(other, NonTerminal) and self.name == other.name + + def __hash__(self) -> int: + return hash(self.name) + + +class Sequence(JSGFNode): + """Represents a sequence of elements.""" + + def __init__(self, elements: List[JSGFNode]): + self.elements = elements + + def __str__(self) -> str: + return " ".join(str(element) for element in self.elements) + + def __iter__(self): + return iter(self.elements) + + def __len__(self) -> int: + return len(self.elements) + + def __getitem__(self, index: int) -> JSGFNode: + return self.elements[index] + + +class Alternative(JSGFNode): + """Represents alternatives (choices) in the grammar.""" + + def __init__(self, choices: List[Union[JSGFNode, tuple]]): + """ + Initialize alternatives. + + Args: + choices: List of choices. 
Each choice can be: + - A JSGFNode (unweighted) + - A tuple of (JSGFNode, weight) (weighted) + """ + self.choices = [] + for choice in choices: + if isinstance(choice, tuple): + node, weight = choice + self.choices.append((node, float(weight))) + else: + self.choices.append((choice, 1.0)) # Default weight + + def __str__(self) -> str: + choice_strs = [] + for node, weight in self.choices: + if weight != 1.0: + choice_strs.append(f"/{weight}/ {node}") + else: + choice_strs.append(str(node)) + return "( " + " | ".join(choice_strs) + " )" + + def __iter__(self): + return iter(self.choices) + + def __len__(self) -> int: + return len(self.choices) + + def get_weights(self) -> List[float]: + """Return the weights of all choices.""" + return [weight for _, weight in self.choices] + + def get_nodes(self) -> List[JSGFNode]: + """Return the nodes of all choices.""" + return [node for node, _ in self.choices] + + +class Optional(JSGFNode): + """Represents an optional element in the grammar.""" + + def __init__(self, element: JSGFNode): + self.element = element + + def __str__(self) -> str: + return f"[ {self.element} ]" + + +class Group(JSGFNode): + """Represents a grouped element.""" + + def __init__(self, element: JSGFNode): + self.element = element + + def __str__(self) -> str: + return f"( {self.element} )" + + +class Rule: + """Represents a complete grammar rule.""" + + def __init__(self, name: str, expansion: JSGFNode, is_public: bool = False): + self.name = name + self.expansion = expansion + self.is_public = is_public + + def __str__(self) -> str: + prefix = "public " if self.is_public else "" + return f"{prefix}<{self.name}> = {self.expansion};" + + def __repr__(self) -> str: + return f"Rule(name='{self.name}', is_public={self.is_public})" \ No newline at end of file diff --git a/jsgf/cli.py b/jsgf/cli.py new file mode 100644 index 0000000..e73a614 --- /dev/null +++ b/jsgf/cli.py @@ -0,0 +1,255 @@ +""" +Command-line interface for JSGF Tools. + +This module provides clean CLI commands that use the modern JSGF API. +""" + +import argparse +import sys +from pathlib import Path +from typing import Optional + +from .grammar import Grammar +from .generators import DeterministicGenerator, ProbabilisticGenerator, GeneratorConfig +from .exceptions import JSGFError + + +def deterministic_command(): + """Command-line interface for deterministic generation.""" + parser = argparse.ArgumentParser( + description='Generate all possible strings from a non-recursive JSGF grammar', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + %(prog)s grammar.jsgf + %(prog)s grammar.jsgf --rule greeting + %(prog)s grammar.jsgf --max-results 100 + ''' + ) + + parser.add_argument( + 'grammar_file', + help='Path to the JSGF grammar file' + ) + + parser.add_argument( + '--rule', '-r', + help='Specific rule to generate from (default: all public rules)' + ) + + parser.add_argument( + '--max-results', '-m', + type=int, + help='Maximum number of strings to generate' + ) + + parser.add_argument( + '--max-recursion', '-d', + type=int, + default=50, + help='Maximum recursion depth (default: 50)' + ) + + parser.add_argument( + '--output', '-o', + help='Output file (default: stdout)' + ) + + args = parser.parse_args() + + try: + # Load grammar + grammar = Grammar.from_file(args.grammar_file) + + # Check for recursion if no specific rule is given + if not args.rule and grammar.is_recursive(): + print( + "Warning: Grammar contains recursive rules. 
" + "Consider using probabilistic generation instead.", + file=sys.stderr + ) + + # Create generator + config = GeneratorConfig( + max_recursion_depth=args.max_recursion, + max_results=args.max_results + ) + generator = DeterministicGenerator(grammar, config) + + # Open output file if specified + output_file = open(args.output, 'w') if args.output else sys.stdout + + try: + # Generate strings + for string in generator.generate(args.rule): + print(string, file=output_file) + finally: + if args.output: + output_file.close() + + except JSGFError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + print("\\nGeneration interrupted", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(1) + + +def probabilistic_command(): + """Command-line interface for probabilistic generation.""" + parser = argparse.ArgumentParser( + description='Generate random strings from a JSGF grammar', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + %(prog)s grammar.jsgf 10 + %(prog)s grammar.jsgf 20 --rule greeting + %(prog)s grammar.jsgf 5 --seed 42 + ''' + ) + + parser.add_argument( + 'grammar_file', + help='Path to the JSGF grammar file' + ) + + parser.add_argument( + 'count', + type=int, + help='Number of strings to generate' + ) + + parser.add_argument( + '--rule', '-r', + help='Specific rule to generate from (default: all public rules)' + ) + + parser.add_argument( + '--seed', '-s', + type=int, + help='Random seed for reproducible results' + ) + + parser.add_argument( + '--max-recursion', '-d', + type=int, + default=50, + help='Maximum recursion depth (default: 50)' + ) + + parser.add_argument( + '--output', '-o', + help='Output file (default: stdout)' + ) + + args = parser.parse_args() + + try: + # Load grammar + grammar = Grammar.from_file(args.grammar_file) + + # Create generator + config = GeneratorConfig( + max_recursion_depth=args.max_recursion, + random_seed=args.seed + ) + generator = ProbabilisticGenerator(grammar, config) + + # Open output file if specified + output_file = open(args.output, 'w') if args.output else sys.stdout + + try: + # Generate specified number of strings + strings = generator.generate_list(args.rule, args.count) + for string in strings: + print(string, file=output_file) + finally: + if args.output: + output_file.close() + + except JSGFError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + print("\\nGeneration interrupted", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(1) + + +def grammar_info_command(): + """Command-line interface for grammar information.""" + parser = argparse.ArgumentParser( + description='Display information about a JSGF grammar', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument( + 'grammar_file', + help='Path to the JSGF grammar file' + ) + + parser.add_argument( + '--verbose', '-v', + action='http://github.com/store_true', + help='Show detailed information' + ) + + args = parser.parse_args() + + try: + # Load grammar + grammar = Grammar.from_file(args.grammar_file) + + # Basic information + print(f"Grammar: {args.grammar_file}") + print(f"Total rules: {len(grammar)}") + print(f"Public rules: 
{len(grammar.public_rules)}") + + if args.verbose: + print("\\nPublic rules:") + for rule in grammar.public_rules: + print(f" - {rule.name}") + + print("\\nAll rules:") + for rule_name in sorted(grammar.rule_names): + rule = grammar.get_rule(rule_name) + visibility = "public" if rule.is_public else "private" + print(f" - {rule_name} ({visibility})") + + # Check for recursion + if grammar.is_recursive(): + cycles = grammar.detect_cycles() + print(f"\\nRecursive: Yes ({len(cycles)} cycle(s))") + if args.verbose: + for i, cycle in enumerate(cycles, 1): + print(f" Cycle {i}: {' -> '.join(cycle)}") + else: + print("\\nRecursive: No") + + # Validation + try: + grammar.validate() + print("Validation: Passed") + except Exception as e: + print(f"Validation: Failed - {e}") + + except JSGFError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(1) \ No newline at end of file diff --git a/jsgf/exceptions.py b/jsgf/exceptions.py new file mode 100644 index 0000000..89cbeba --- /dev/null +++ b/jsgf/exceptions.py @@ -0,0 +1,40 @@ +""" +Custom exceptions for JSGF Tools. +""" + +from typing import Optional + + +class JSGFError(Exception): + """Base exception for all JSGF-related errors.""" + pass + + +class ParseError(JSGFError): + """Raised when grammar parsing fails.""" + + def __init__(self, message: str, line: Optional[int] = None, column: Optional[int] = None): + self.line = line + self.column = column + + if line is not None: + message = f"Line {line}: {message}" + if column is not None: + message = f"{message} (column {column})" + + super().__init__(message) + + +class GenerationError(JSGFError): + """Raised when string generation fails.""" + pass + + +class ValidationError(JSGFError): + """Raised when grammar validation fails.""" + pass + + +class RecursionError(GenerationError): + """Raised when infinite recursion is detected during generation.""" + pass \ No newline at end of file diff --git a/jsgf/generators.py b/jsgf/generators.py new file mode 100644 index 0000000..d9f0238 --- /dev/null +++ b/jsgf/generators.py @@ -0,0 +1,369 @@ +""" +String generators for JSGF grammars. + +This module provides generators that can produce strings from JSGF grammars +in both deterministic and probabilistic ways. +""" + +from typing import List, Iterator, Optional, Set, Dict, Any +from abc import ABC, abstractmethod +import random +import itertools +from collections import defaultdict + +from .grammar import Grammar +from .ast_nodes import ( + JSGFNode, Terminal, NonTerminal, Sequence, Alternative, + Optional as OptionalNode, Group, Rule +) +from .exceptions import GenerationError, RecursionError + + +class GeneratorConfig: + """Configuration for string generators.""" + + def __init__( + self, + max_recursion_depth: int = 50, + max_results: Optional[int] = None, + random_seed: Optional[int] = None, + optimize_memory: bool = True + ): + self.max_recursion_depth = max_recursion_depth + self.max_results = max_results + self.random_seed = random_seed + self.optimize_memory = optimize_memory + + +class BaseGenerator(ABC): + """ + Base class for all JSGF string generators. + + This class provides common functionality for working with grammars + and generating strings from AST nodes. 
+ """ + + def __init__(self, grammar: Grammar, config: Optional[GeneratorConfig] = None): + self.grammar = grammar + self.config = config or GeneratorConfig() + self._recursion_tracker: Dict[str, int] = defaultdict(int) + + if self.config.random_seed is not None: + random.seed(self.config.random_seed) + + @abstractmethod + def generate(self, rule_name: Optional[str] = None) -> Iterator[str]: + """ + Generate strings from the grammar. + + Args: + rule_name: Name of the rule to start generation from. + If None, uses all public rules. + + Yields: + Generated strings + """ + pass + + def generate_list(self, rule_name: Optional[str] = None, limit: Optional[int] = None) -> List[str]: + """ + Generate a list of strings from the grammar. + + Args: + rule_name: Name of the rule to start generation from + limit: Maximum number of strings to generate + + Returns: + List of generated strings + """ + results = [] + count = 0 + + for string in self.generate(rule_name): + results.append(string) + count += 1 + + if limit and count >= limit: + break + if self.config.max_results and count >= self.config.max_results: + break + + return results + + def _process_node(self, node: JSGFNode, context: Optional[str] = None) -> Any: + """ + Process a single AST node. Implementation depends on generator type. + + Args: + node: The AST node to process + context: Optional context for recursion tracking + + Returns: + Processed result (type depends on implementation) + """ + if isinstance(node, Terminal): + return self._process_terminal(node) + elif isinstance(node, NonTerminal): + return self._process_nonterminal(node, context) + elif isinstance(node, Sequence): + return self._process_sequence(node, context) + elif isinstance(node, Alternative): + return self._process_alternative(node, context) + elif isinstance(node, OptionalNode): + return self._process_optional(node, context) + elif isinstance(node, Group): + return self._process_group(node, context) + else: + raise GenerationError(f"Unknown node type: {type(node)}") + + @abstractmethod + def _process_terminal(self, node: Terminal) -> Any: + """Process a terminal node.""" + pass + + @abstractmethod + def _process_nonterminal(self, node: NonTerminal, context: Optional[str] = None) -> Any: + """Process a non-terminal node.""" + pass + + @abstractmethod + def _process_sequence(self, node: Sequence, context: Optional[str] = None) -> Any: + """Process a sequence node.""" + pass + + @abstractmethod + def _process_alternative(self, node: Alternative, context: Optional[str] = None) -> Any: + """Process an alternative node.""" + pass + + @abstractmethod + def _process_optional(self, node: OptionalNode, context: Optional[str] = None) -> Any: + """Process an optional node.""" + pass + + def _process_group(self, node: Group, context: Optional[str] = None) -> Any: + """Process a group node (default implementation).""" + return self._process_node(node.element, context) + + def _check_recursion(self, rule_name: str) -> None: + """Check for excessive recursion.""" + self._recursion_tracker[rule_name] += 1 + if self._recursion_tracker[rule_name] > self.config.max_recursion_depth: + raise RecursionError( + f"Maximum recursion depth ({self.config.max_recursion_depth}) " + f"exceeded for rule '{rule_name}'" + ) + + def _enter_rule(self, rule_name: str) -> None: + """Enter a rule (for recursion tracking).""" + self._check_recursion(rule_name) + + def _exit_rule(self, rule_name: str) -> None: + """Exit a rule (for recursion tracking).""" + self._recursion_tracker[rule_name] -= 1 + + 
+class DeterministicGenerator(BaseGenerator): + """ + Generator that produces all possible strings from a grammar. + + This generator exhaustively enumerates all possible strings that can be + generated from the grammar rules. It should only be used with non-recursive + grammars to avoid infinite generation. + """ + + def generate(self, rule_name: Optional[str] = None) -> Iterator[str]: + """ + Generate all possible strings from the grammar. + + Args: + rule_name: Name of the rule to start generation from. + If None, generates from all public rules. + + Yields: + All possible generated strings + + Raises: + GenerationError: If generation fails + RecursionError: If infinite recursion is detected + """ + if rule_name: + rule = self.grammar.get_rule(rule_name) + if not rule: + raise GenerationError(f"Rule '{rule_name}' not found") + + strings = self._process_node(rule.expansion, rule_name) + for string in strings: + yield string.strip() + else: + # Generate from all public rules + for rule in self.grammar.public_rules: + self._recursion_tracker.clear() + strings = self._process_node(rule.expansion, rule.name) + for string in strings: + yield string.strip() + + def _process_terminal(self, node: Terminal) -> List[str]: + """Process a terminal node.""" + return [node.value] + + def _process_nonterminal(self, node: NonTerminal, context: Optional[str] = None) -> List[str]: + """Process a non-terminal node.""" + rule = self.grammar.get_rule(node.name) + if not rule: + raise GenerationError(f"Undefined rule: {node.name}") + + self._enter_rule(node.name) + try: + result = self._process_node(rule.expansion, node.name) + finally: + self._exit_rule(node.name) + + return result + + def _process_sequence(self, node: Sequence, context: Optional[str] = None) -> List[str]: + """Process a sequence node.""" + if not node.elements: + return [""] + + # Get all possible strings for each element + element_strings = [] + for element in node.elements: + strings = self._process_node(element, context) + element_strings.append(strings) + + # Compute cross product + return self._combine_sequences(element_strings) + + def _process_alternative(self, node: Alternative, context: Optional[str] = None) -> List[str]: + """Process an alternative node.""" + all_strings = [] + for choice_node, weight in node.choices: + strings = self._process_node(choice_node, context) + all_strings.extend(strings) + return all_strings + + def _process_optional(self, node: OptionalNode, context: Optional[str] = None) -> List[str]: + """Process an optional node.""" + strings = self._process_node(node.element, context) + return [""] + strings # Empty string plus all possible strings + + def _combine_sequences(self, element_strings: List[List[str]]) -> List[str]: + """Combine lists of strings using cross product.""" + if not element_strings: + return [""] + + result = [] + for combination in itertools.product(*element_strings): + combined = " ".join(s for s in combination if s) + result.append(combined) + + return result + + +class ProbabilisticGenerator(BaseGenerator): + """ + Generator that produces random strings from a grammar. + + This generator randomly selects from alternatives based on weights and + can handle recursive grammars safely through probabilistic termination. + """ + + def generate(self, rule_name: Optional[str] = None) -> Iterator[str]: + """ + Generate random strings from the grammar. + + Args: + rule_name: Name of the rule to start generation from. + If None, randomly selects from public rules. 
+ + Yields: + Random generated strings (infinite iterator) + + Raises: + GenerationError: If generation fails + """ + while True: + self._recursion_tracker.clear() + + if rule_name: + rule = self.grammar.get_rule(rule_name) + if not rule: + raise GenerationError(f"Rule '{rule_name}' not found") + + yield self._process_node(rule.expansion, rule_name).strip() + else: + # Randomly select from public rules + if not self.grammar.public_rules: + raise GenerationError("No public rules available") + + if len(self.grammar.public_rules) > 1: + # Multiple public rules - create virtual alternative + choices = [(rule.expansion, 1.0) for rule in self.grammar.public_rules] + virtual_alt = Alternative([choice for choice, _ in choices]) + yield self._process_node(virtual_alt).strip() + else: + # Single public rule + rule = self.grammar.public_rules[0] + yield self._process_node(rule.expansion, rule.name).strip() + + def generate_one(self, rule_name: Optional[str] = None) -> str: + """ + Generate a single random string. + + Args: + rule_name: Name of the rule to start generation from + + Returns: + A single generated string + """ + return next(self.generate(rule_name)) + + def _process_terminal(self, node: Terminal) -> str: + """Process a terminal node.""" + return node.value + + def _process_nonterminal(self, node: NonTerminal, context: Optional[str] = None) -> str: + """Process a non-terminal node.""" + rule = self.grammar.get_rule(node.name) + if not rule: + raise GenerationError(f"Undefined rule: {node.name}") + + self._enter_rule(node.name) + try: + result = self._process_node(rule.expansion, node.name) + finally: + self._exit_rule(node.name) + + return result + + def _process_sequence(self, node: Sequence, context: Optional[str] = None) -> str: + """Process a sequence node.""" + if not node.elements: + return "" + + parts = [] + for element in node.elements: + result = self._process_node(element, context) + if result: # Only add non-empty results + parts.append(result) + + return " ".join(parts) + + def _process_alternative(self, node: Alternative, context: Optional[str] = None) -> str: + """Process an alternative node.""" + if not node.choices: + return "" + + # Use weighted random selection + choices, weights = zip(*node.choices) + selected_choice = random.choices(choices, weights=weights, k=1)[0] + return self._process_node(selected_choice, context) + + def _process_optional(self, node: OptionalNode, context: Optional[str] = None) -> str: + """Process an optional node.""" + # 50% chance of including the optional element + if random.random() < 0.5: + return self._process_node(node.element, context) + else: + return "" \ No newline at end of file diff --git a/jsgf/grammar.py b/jsgf/grammar.py new file mode 100644 index 0000000..0911e4e --- /dev/null +++ b/jsgf/grammar.py @@ -0,0 +1,319 @@ +""" +JSGF Grammar representation and parsing functionality. +""" + +from typing import Dict, List, Optional, Union, TextIO, Iterator +from pathlib import Path +import re +from io import StringIO + +from .ast_nodes import ( + JSGFNode, Terminal, NonTerminal, Sequence, Alternative, + Optional as OptionalNode, Group, Rule +) +from .exceptions import ParseError, ValidationError +from .legacy_adapter import LegacyAdapter + + +class Grammar: + """ + Represents a complete JSGF grammar with rules and provides parsing functionality. + + This class encapsulates all grammar rules and provides methods for parsing, + validation, and rule lookup. 
+ """ + + def __init__(self): + self._rules: Dict[str, Rule] = {} + self._public_rules: List[Rule] = [] + + @classmethod + def from_string(cls, grammar_text: str) -> 'Grammar': + """ + Parse a grammar from a string. + + Args: + grammar_text: The JSGF grammar text to parse + + Returns: + A Grammar instance + + Raises: + ParseError: If the grammar cannot be parsed + """ + grammar = cls() + adapter = LegacyAdapter() + + try: + with StringIO(grammar_text) as f: + adapter.parse_to_grammar(f, grammar) + except Exception as e: + raise ParseError(f"Failed to parse grammar: {e}") + + grammar.validate() + return grammar + + @classmethod + def from_file(cls, file_path: Union[str, Path]) -> 'Grammar': + """ + Parse a grammar from a file. + + Args: + file_path: Path to the JSGF grammar file + + Returns: + A Grammar instance + + Raises: + ParseError: If the grammar cannot be parsed + FileNotFoundError: If the file doesn't exist + """ + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"Grammar file not found: {file_path}") + + try: + with open(path, 'r', encoding='utf-8') as f: + return cls.from_stream(f) + except Exception as e: + raise ParseError(f"Failed to parse grammar file {file_path}: {e}") + + @classmethod + def from_stream(cls, stream: TextIO) -> 'Grammar': + """ + Parse a grammar from a text stream. + + Args: + stream: Text stream containing JSGF grammar + + Returns: + A Grammar instance + + Raises: + ParseError: If the grammar cannot be parsed + """ + grammar = cls() + adapter = LegacyAdapter() + + try: + adapter.parse_to_grammar(stream, grammar) + except Exception as e: + raise ParseError(f"Failed to parse grammar: {e}") + + grammar.validate() + return grammar + + def add_rule(self, rule: Rule) -> None: + """ + Add a rule to the grammar. + + Args: + rule: The rule to add + + Raises: + ValueError: If a rule with the same name already exists + """ + if rule.name in self._rules: + raise ValueError(f"Rule '{rule.name}' already exists") + + self._rules[rule.name] = rule + if rule.is_public: + self._public_rules.append(rule) + + def get_rule(self, name: str) -> Optional[Rule]: + """ + Get a rule by name. + + Args: + name: The rule name (with or without angle brackets) + + Returns: + The rule if found, None otherwise + """ + # Handle both and name formats + clean_name = name.strip('<>') + return self._rules.get(f"<{clean_name}>") + + def has_rule(self, name: str) -> bool: + """ + Check if a rule exists. + + Args: + name: The rule name (with or without angle brackets) + + Returns: + True if the rule exists, False otherwise + """ + return self.get_rule(name) is not None + + @property + def rules(self) -> Dict[str, Rule]: + """Get all rules in the grammar.""" + return self._rules.copy() + + @property + def public_rules(self) -> List[Rule]: + """Get all public rules in the grammar.""" + return self._public_rules.copy() + + @property + def rule_names(self) -> List[str]: + """Get all rule names.""" + return list(self._rules.keys()) + + @property + def public_rule_names(self) -> List[str]: + """Get all public rule names.""" + return [rule.name for rule in self._public_rules] + + def validate(self) -> None: + """ + Validate the grammar for consistency and completeness. 
+ + Raises: + ValidationError: If the grammar is invalid + """ + errors = [] + + # Check that all referenced non-terminals have rules + for rule in self._rules.values(): + undefined_refs = self._find_undefined_references(rule.expansion) + if undefined_refs: + errors.append( + f"Rule '{rule.name}' references undefined non-terminals: " + f"{', '.join(undefined_refs)}" + ) + + # Check for at least one public rule + if not self._public_rules: + errors.append("Grammar must have at least one public rule") + + if errors: + raise ValidationError("Grammar validation failed:\n" + "\n".join(errors)) + + def _find_undefined_references(self, node: JSGFNode) -> List[str]: + """Find all undefined non-terminal references in a node.""" + undefined = [] + + def visit(n: JSGFNode): + if isinstance(n, NonTerminal): + if not self.has_rule(n.name): + undefined.append(n.name) + elif isinstance(n, Sequence): + for element in n.elements: + visit(element) + elif isinstance(n, Alternative): + for choice_node, _ in n.choices: + visit(choice_node) + elif isinstance(n, (OptionalNode, Group)): + visit(n.element) + + visit(node) + return undefined + + def detect_cycles(self) -> List[List[str]]: + """ + Detect cycles in the grammar rules. + + Returns: + List of cycles, where each cycle is a list of rule names + """ + # Build dependency graph + graph = {} + for rule_name, rule in self._rules.items(): + graph[rule_name] = self._get_direct_dependencies(rule.expansion) + + # Find strongly connected components (cycles) + cycles = [] + visited = set() + rec_stack = set() + + def dfs(node: str, path: List[str]): + if node in rec_stack: + # Found a cycle + cycle_start = path.index(node) + cycle = path[cycle_start:] + [node] + cycles.append(cycle) + return + + if node in visited: + return + + visited.add(node) + rec_stack.add(node) + path.append(node) + + for neighbor in graph.get(node, []): + dfs(neighbor, path.copy()) + + rec_stack.remove(node) + + for rule_name in self._rules: + if rule_name not in visited: + dfs(rule_name, []) + + return cycles + + def _get_direct_dependencies(self, node: JSGFNode) -> List[str]: + """Get direct non-terminal dependencies of a node.""" + dependencies = [] + + def visit(n: JSGFNode): + if isinstance(n, NonTerminal): + dependencies.append(n.name) + elif isinstance(n, Sequence): + for element in n.elements: + visit(element) + elif isinstance(n, Alternative): + for choice_node, _ in n.choices: + visit(choice_node) + elif isinstance(n, (OptionalNode, Group)): + visit(n.element) + + visit(node) + return dependencies + + def is_recursive(self, rule_name: Optional[str] = None) -> bool: + """ + Check if the grammar (or a specific rule) contains recursion. + + Args: + rule_name: If provided, check if this specific rule is recursive. + If None, check if any rule in the grammar is recursive. 
+ + Returns: + True if recursion is detected, False otherwise + """ + cycles = self.detect_cycles() + + if rule_name is None: + return len(cycles) > 0 + + # Check if the specific rule is involved in any cycle + clean_name = rule_name.strip('<>') + full_name = f"<{clean_name}>" + + for cycle in cycles: + if full_name in cycle: + return True + + return False + + def __str__(self) -> str: + """Return a string representation of the grammar.""" + lines = [] + for rule in self._rules.values(): + lines.append(str(rule)) + return "\n".join(lines) + + def __len__(self) -> int: + """Return the number of rules in the grammar.""" + return len(self._rules) + + def __contains__(self, rule_name: str) -> bool: + """Check if a rule name exists in the grammar.""" + return self.has_rule(rule_name) + + def __iter__(self) -> Iterator[Rule]: + """Iterate over all rules in the grammar.""" + return iter(self._rules.values()) \ No newline at end of file diff --git a/jsgf/legacy_adapter.py b/jsgf/legacy_adapter.py new file mode 100644 index 0000000..0244720 --- /dev/null +++ b/jsgf/legacy_adapter.py @@ -0,0 +1,91 @@ +""" +Adapter to use the existing JSGFParser with the new Grammar architecture. + +This provides a bridge between the old parser and the new modern API. +""" + +from typing import TextIO +import sys +import os + +# Add the parent directory to the path to import the legacy modules +sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) + +import JSGFParser as legacy_parser +import JSGFGrammar as legacy_grammar + +from .ast_nodes import ( + JSGFNode, Terminal, NonTerminal, Sequence, Alternative, + Optional as OptionalNode, Group, Rule +) +from .exceptions import ParseError + + +class LegacyAdapter: + """Adapter to convert legacy grammar objects to new AST format.""" + + def parse_to_grammar(self, stream: TextIO, grammar: 'Grammar') -> None: + """ + Parse using the legacy parser and convert to new Grammar format. 
+ + Args: + stream: Text stream containing JSGF grammar + grammar: Grammar object to populate + """ + try: + # Use the legacy parser + legacy_gram = legacy_parser.getGrammarObject(stream) + + # Create a set of public rule names for easy lookup + public_rule_names = {rule.lhs.name for rule in legacy_gram.publicRules} + + # Convert all rules, marking public ones appropriately + for rule in legacy_gram.rules: + is_public = rule.lhs.name in public_rule_names + converted_rule = self._convert_rule(rule, is_public=is_public) + grammar.add_rule(converted_rule) + + except Exception as e: + raise ParseError(f"Failed to parse grammar: {e}") + + def _convert_rule(self, legacy_rule, is_public: bool = False) -> Rule: + """Convert a legacy rule to new Rule format.""" + rule_name = legacy_rule.lhs.name + expansion = self._convert_expansion(legacy_rule.rhs) + + return Rule( + name=rule_name, + expansion=expansion, + is_public=is_public + ) + + def _convert_expansion(self, rhs) -> JSGFNode: + """Convert legacy RHS to new AST format.""" + if isinstance(rhs, str): + return Terminal(rhs) + elif isinstance(rhs, list): + if len(rhs) == 1: + return self._convert_expansion(rhs[0]) + else: + # Convert list to sequence + elements = [self._convert_expansion(item) for item in rhs] + return Sequence(elements) + elif isinstance(rhs, legacy_grammar.Disjunction): + # Convert disjunction + choices = [] + for disjunct in rhs.disjuncts: + if isinstance(disjunct, tuple): + # Weighted choice + node, weight = disjunct + choices.append((self._convert_expansion(node), weight)) + else: + # Unweighted choice + choices.append((self._convert_expansion(disjunct), 1.0)) + return Alternative(choices) + elif isinstance(rhs, legacy_grammar.Optional): + return OptionalNode(self._convert_expansion(rhs.option)) + elif isinstance(rhs, legacy_grammar.NonTerminal): + return NonTerminal(rhs.name) + else: + # Fallback for unknown types + return Terminal(str(rhs)) \ No newline at end of file diff --git a/jsgf/parser.py b/jsgf/parser.py new file mode 100644 index 0000000..54204b0 --- /dev/null +++ b/jsgf/parser.py @@ -0,0 +1,294 @@ +""" +JSGF Grammar parser implementation. + +This module provides the JSGFParser class that converts JSGF grammar text +into Grammar objects with proper AST representation. 
+""" + +from typing import TextIO, List, Optional, Union, Any +import re +from pyparsing import ( + Word, Literal, Group, Optional as PyparsingOptional, Forward, MatchFirst, + Combine, alphas, alphanums, nums, stringEnd, ParseException, ParserElement, + pyparsing_unicode +) + +from .ast_nodes import ( + JSGFNode, Terminal, NonTerminal, Sequence, Alternative, + Optional as OptionalNode, Group as GroupNode, Rule +) +from .exceptions import ParseError + + +# Enable packrat parsing for performance +ParserElement.enablePackrat() + +# Unicode support: Tier 1 + Tier 2 scripts for comprehensive language coverage +# Covers 5+ billion speakers: Latin, CJK, Arabic, Cyrillic, Devanagari, Hangul, Hebrew, Greek, Thai +# Note: Using printables for scripts with combining characters (Thai, Devanagari) +_unicode_letters = ( + # Tier 1: Major scripts (Latin, CJK, Arabic, Cyrillic) + pyparsing_unicode.Latin1.alphas + + pyparsing_unicode.LatinA.alphas + + pyparsing_unicode.LatinB.alphas + + pyparsing_unicode.CJK.alphas + + pyparsing_unicode.Arabic.alphas + + pyparsing_unicode.Cyrillic.alphas + + # Tier 2: Common scripts (using printables for scripts with combining marks) + pyparsing_unicode.Devanagari.printables + + pyparsing_unicode.Hangul.alphas + + pyparsing_unicode.Hebrew.alphas + + pyparsing_unicode.Greek.alphas + + pyparsing_unicode.Thai.printables +) + + +class JSGFParser: + """ + Parser for JSGF grammar files. + + This parser converts JSGF grammar text into a Grammar object containing + properly structured AST nodes. + """ + + def __init__(self): + self._grammar_def = None + self._setup_parser() + + def _setup_parser(self): + """Set up the pyparsing grammar definition.""" + + # Basic tokens + weight = ( + Literal('/').suppress() + + Word(nums + '.').setResultsName('weight_value') + + Literal('/').suppress() + ).setParseAction(self._parse_weight) + + token = ( + Word(alphanums + _unicode_letters + "'_-,.?@!#$%^&*()+={}[]|\\:;\"~`") + ).setParseAction(self._parse_token) + + nonterminal = ( + Combine( + Literal('<') + + Word(alphanums + _unicode_letters + '$_:;,=|/\\()[]@#%!^&~') + + Literal('>') + ) + ).setParseAction(self._parse_nonterminal) + + # Forward declarations for recursive grammar + sequence = Forward() + alternative = Forward() + + # Weighted expressions + weighted_expr = ( + weight + Group(sequence).setResultsName("expr") + ).setParseAction(self._parse_weighted_expression) + + # Grouping and optional elements + grouping = ( + Literal('(').suppress() + + alternative + + Literal(')').suppress() + ).setParseAction(self._parse_group) + + optional_grouping = ( + Literal('[').suppress() + + Group(alternative).setResultsName("optional_content") + + Literal(']').suppress() + ).setParseAction(self._parse_optional) + + # Basic expression elements + expression = MatchFirst([ + nonterminal, + token, + grouping, + optional_grouping + ]) + + # Sequence definition + sequence <<= Group( + expression + + (expression)[...] 
+ ).setParseAction(self._parse_sequence) + + # Alternative definitions + weighted_alternatives = Forward() + weighted_prime = Literal('|').suppress() + weighted_alternatives + weighted_alternatives <<= MatchFirst([ + ( + Group(weighted_expr).setResultsName("choice1") + + Group(weighted_prime).setResultsName("choice2") + ).setParseAction(self._parse_weighted_alternatives), + Group(weighted_expr).setParseAction(self._parse_single_weighted) + ]) + + regular_alternatives = Forward() + regular_prime = Literal('|').suppress() + regular_alternatives + regular_alternatives <<= MatchFirst([ + ( + Group(sequence).setResultsName("choice1") + + Group(regular_prime).setResultsName("choice2") + ).setParseAction(self._parse_regular_alternatives), + Group(sequence).setParseAction(self._parse_single_regular) + ]) + + # Top-level alternative + alternative <<= MatchFirst([regular_alternatives, weighted_alternatives]) + + # Complete rule definition + rule_def = ( + PyparsingOptional(Literal('public')).setResultsName('is_public') + + nonterminal.setResultsName('rule_name') + + Literal('=').suppress() + + Group(alternative).setResultsName('expansion') + + Literal(';').suppress() + ).setParseAction(self._parse_rule) + + self._grammar_def = rule_def + + def parse(self, stream: TextIO, grammar: 'Grammar') -> None: + """ + Parse a JSGF grammar from a text stream into a Grammar object. + + Args: + stream: Text stream containing JSGF grammar + grammar: Grammar object to populate with parsed rules + + Raises: + ParseError: If parsing fails + """ + content = stream.read() + + # Remove comments + content = self._remove_comments(content) + + # Split into individual rules and parse each one + for line_num, line in enumerate(content.split('\n'), 1): + line = line.strip() + if not line: + continue + + try: + result = self._grammar_def.parseString(line, parseAll=True) + rule = self._extract_rule(result) + grammar.add_rule(rule) + except ParseException as e: + raise ParseError( + f"Failed to parse rule: {str(e)}", + line=line_num, + column=e.column if hasattr(e, 'column') else None + ) + except Exception as e: + raise ParseError(f"Unexpected error parsing rule: {str(e)}", line=line_num) + + def _remove_comments(self, text: str) -> str: + """Remove comments from JSGF text.""" + # Remove // style comments + text = re.sub(r'//.*?$', '', text, flags=re.MULTILINE) + # Remove /* */ style comments + text = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL) + return text + + def _parse_weight(self, s: str, loc: int, tokens: Any) -> float: + """Parse a weight value.""" + return float(tokens.weight_value) + + def _parse_token(self, s: str, loc: int, tokens: Any) -> Terminal: + """Parse a terminal token.""" + return Terminal(tokens[0]) + + def _parse_nonterminal(self, s: str, loc: int, tokens: Any) -> NonTerminal: + """Parse a non-terminal.""" + return NonTerminal(tokens[0]) + + def _parse_sequence(self, s: str, loc: int, tokens: Any) -> Union[JSGFNode, Sequence]: + """Parse a sequence of elements.""" + elements = list(tokens[0]) + if len(elements) == 1: + return elements[0] + return Sequence(elements) + + def _parse_group(self, s: str, loc: int, tokens: Any) -> GroupNode: + """Parse a grouped expression.""" + return GroupNode(tokens[0]) + + def _parse_optional(self, s: str, loc: int, tokens: Any) -> OptionalNode: + """Parse an optional expression.""" + return OptionalNode(tokens.optional_content[0]) + + def _parse_weighted_expression(self, s: str, loc: int, tokens: Any) -> tuple: + """Parse a weighted expression.""" + weight = tokens[0] # 
The weight value + expr = tokens.expr[0] # The expression + return (expr, weight) + + def _parse_weighted_alternatives(self, s: str, loc: int, tokens: Any) -> Alternative: + """Parse weighted alternatives.""" + choices = [] + + # Add first choice + first_choice = tokens.choice1[0] + if isinstance(first_choice, tuple): + choices.append(first_choice) + else: + choices.append((first_choice, 1.0)) + + # Add remaining choices + remaining = tokens.choice2[0] + if isinstance(remaining, Alternative): + choices.extend(remaining.choices) + else: + if isinstance(remaining, tuple): + choices.append(remaining) + else: + choices.append((remaining, 1.0)) + + return Alternative(choices) + + def _parse_single_weighted(self, s: str, loc: int, tokens: Any) -> Alternative: + """Parse a single weighted choice.""" + choice = tokens[0] + if isinstance(choice, tuple): + return Alternative([choice]) + else: + return Alternative([(choice, 1.0)]) + + def _parse_regular_alternatives(self, s: str, loc: int, tokens: Any) -> Alternative: + """Parse regular (unweighted) alternatives.""" + choices = [] + + # Add first choice + choices.append((tokens.choice1[0], 1.0)) + + # Add remaining choices + remaining = tokens.choice2[0] + if isinstance(remaining, Alternative): + choices.extend(remaining.choices) + else: + choices.append((remaining, 1.0)) + + return Alternative(choices) + + def _parse_single_regular(self, s: str, loc: int, tokens: Any) -> Union[JSGFNode, Alternative]: + """Parse a single regular choice.""" + choice = tokens[0] + # Don't wrap single elements in Alternative unnecessarily + return choice + + def _parse_rule(self, s: str, loc: int, tokens: Any) -> dict: + """Parse a complete rule definition.""" + return { + 'is_public': bool(tokens.is_public), + 'name': tokens.rule_name.name, + 'expansion': tokens.expansion[0] + } + + def _extract_rule(self, parse_result: Any) -> Rule: + """Extract a Rule object from parse results.""" + return Rule( + name=parse_result['name'], + expansion=parse_result['expansion'], + is_public=parse_result['is_public'] + ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e7cb1b2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,72 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "jsgf-tools" +version = "2.1.1" +description = "Complete JSGF toolkit: parse, generate, and test speech grammars with Unicode support" +readme = "README.md" +requires-python = ">=3.7" +license = {text = "MIT"} +authors = [ + {name = "Pastèque Ho", email = "timothyakho@gmail.com"} +] +keywords = ["jsgf", "grammar", "speech recognition", "nlp", "parsing", "generation", "unicode", "testing"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Software Development :: Testing", + "Topic :: Text Processing :: Linguistic", + "Natural Language :: Chinese (Simplified)", + "Natural Language :: Japanese", + "Natural Language :: Korean", + "Natural Language :: 
Arabic", + "Natural Language :: Russian", + "Natural Language :: Hebrew", + "Natural Language :: Greek", + "Natural Language :: Hindi", +] +dependencies = [ + "pyparsing>=3.0.0" +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=3.0.0", +] + +[project.urls] +Homepage = "https://github.com/syntactic/JSGFTools" +Documentation = "https://github.com/syntactic/JSGFTools#readme" +Repository = "https://github.com/syntactic/JSGFTools" +Issues = "https://github.com/syntactic/JSGFTools/issues" + +[project.scripts] +jsgf-deterministic = "DeterministicGenerator:main" +jsgf-probabilistic = "ProbabilisticGenerator:main" + +[tool.pytest.ini_options] +testpaths = ["."] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --strict-markers" + +[tool.setuptools] +py-modules = ["JSGFParser", "JSGFGrammar", "DeterministicGenerator", "ProbabilisticGenerator"] + +[tool.setuptools.packages.find] +exclude = ["tests*", "docs*", "examples*"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..7f6e68c --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[tool:pytest] +testpaths = . +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v --tb=short \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e9447cb --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +# Development dependencies for JSGFTools +pytest>=7.0.0 +pyparsing>=3.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..846df1c --- /dev/null +++ b/setup.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +from setuptools import setup, find_packages + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setup( + name='jsgf-tools', + version='2.1.1', + author='Pastèque Ho', + author_email='timothyakho@gmail.com', + description='Complete JSGF toolkit: parse, generate, and test speech grammars with Unicode support', + long_description=long_description, + long_description_content_type="text/markdown", + url='https://github.com/syntactic/JSGFTools', + project_urls={ + 'Bug Tracker': 'https://github.com/syntactic/JSGFTools/issues', + 'Documentation': 'https://github.com/syntactic/JSGFTools#readme', + 'Source Code': 'https://github.com/syntactic/JSGFTools', + }, + packages=find_packages(exclude=['tests*', 'docs*']), + py_modules=['JSGFParser', 'JSGFGrammar', 'DeterministicGenerator', 'ProbabilisticGenerator'], + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Software Development :: Testing", + "Topic :: Text Processing :: Linguistic", + "Natural Language :: Chinese (Simplified)", + "Natural Language :: Japanese", + "Natural Language :: Korean", + "Natural Language :: Arabic", + "Natural Language :: Russian", + "Natural Language :: Hebrew", + "Natural Language :: Greek", + "Natural 
Language :: Hindi", + ], + keywords='jsgf grammar speech recognition nlp parsing generation unicode testing', + python_requires=">=3.7", + install_requires=[ + "pyparsing>=3.0.0", + ], + extras_require={ + 'dev': [ + 'pytest>=7.0.0', + 'pytest-cov>=3.0.0', + ], + }, + entry_points={ + 'console_scripts': [ + 'jsgf-deterministic=DeterministicGenerator:main', + 'jsgf-probabilistic=ProbabilisticGenerator:main', + ], + }, +) \ No newline at end of file diff --git a/test_jsgf_tools.py b/test_jsgf_tools.py new file mode 100644 index 0000000..fbccd37 --- /dev/null +++ b/test_jsgf_tools.py @@ -0,0 +1,437 @@ +# -*- coding: utf-8 -*- +""" +Test suite for JSGFTools + +This module provides comprehensive tests for all components of JSGFTools: +- JSGFParser: grammar parsing functionality +- JSGFGrammar: grammar object structure and operations +- DeterministicGenerator: exhaustive string generation +- ProbabilisticGenerator: random string generation +""" + +import pytest +import tempfile +import os +from io import StringIO + +import JSGFParser as parser +import JSGFGrammar as gram +import DeterministicGenerator as det_gen +import ProbabilisticGenerator as prob_gen + + +class TestJSGFParser: + """Test the JSGF parser functionality""" + + def test_parse_simple_grammar(self): + """Test parsing a simple non-recursive grammar""" + grammar_text = """ + public = hello world; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + assert grammar.publicRules[0].lhs.name == "" + + def test_parse_weighted_grammar(self): + """Test parsing grammar with weights""" + grammar_text = """ + public = /5/ hello | /1/ hi; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + # The RHS should be a list containing a disjunction with weighted alternatives + rhs = grammar.publicRules[0].rhs + assert isinstance(rhs, list) + assert len(rhs) == 1 + assert isinstance(rhs[0], gram.Disjunction) + + def test_parse_optional_elements(self): + """Test parsing grammar with optional elements""" + grammar_text = """ + public = hello [ world ]; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + # Should contain an optional element in a list + rhs = grammar.publicRules[0].rhs + assert isinstance(rhs, list) + assert len(rhs) == 2 # "hello" and optional "world" + # The second element should be an Optional + assert isinstance(rhs[1], gram.Optional) + + def test_parse_recursive_grammar(self): + """Test parsing a recursive grammar""" + grammar_text = """ + public = ; + = the idea | the idea ; + = will suffice; + = that ; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + assert len(grammar.rules) == 4 # All rules including private ones + + def test_parse_multiple_public_rules(self): + """Test parsing grammar with multiple public rules""" + grammar_text = """ + public = hello; + public = world; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 2 + + def test_parse_comments(self): + """Test that comments are properly stripped""" + grammar_text = """ + // This is a comment + public = hello world; // Another comment + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_chinese(self): + """Test Chinese characters in grammar tokens""" + grammar_text = "public = 零 | 一 | 二 | 三;" + grammar = 
parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + assert grammar.publicRules[0].lhs.name == "" + + def test_unicode_japanese(self): + """Test Japanese characters (hiragana and katakana)""" + grammar_text = "public = こんにちは | さようなら | ありがとう;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + rhs = grammar.publicRules[0].rhs + assert isinstance(rhs, list) + + def test_unicode_arabic(self): + """Test Arabic characters""" + grammar_text = "public = مرحبا | السلام عليكم | شكرا;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_korean(self): + """Test Korean Hangul characters""" + grammar_text = "public = 안녕하세요 | 감사합니다;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_cyrillic(self): + """Test Cyrillic characters (Russian)""" + grammar_text = "public = привет | здравствуйте | спасибо;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_hebrew(self): + """Test Hebrew characters""" + grammar_text = "public = שלום | תודה;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_greek(self): + """Test Greek characters""" + grammar_text = "public = γεια σου | ευχαριστώ;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_thai(self): + """Test Thai characters""" + grammar_text = "public = สวัสดี | ขอบคุณ;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_devanagari(self): + """Test Devanagari characters (Hindi)""" + grammar_text = "public = नमस्ते | धन्यवाद;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_mixed_scripts(self): + """Test mixing different scripts in the same grammar""" + grammar_text = """ + public = hello | 你好 | こんにちは | مرحبا | привет | שלום; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + + def test_unicode_in_rule_names(self): + """Test Unicode characters in rule names (as JSGF spec allows)""" + grammar_text = "public <问候> = 你好 | 您好;" + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + assert len(grammar.publicRules) == 1 + assert grammar.publicRules[0].lhs.name == "<问候>" + + +class TestJSGFGrammar: + """Test the JSGF grammar objects""" + + def test_disjunction_creation(self): + """Test creating a disjunction""" + disjuncts = ["hello", "hi", "hey"] + disj = gram.Disjunction(disjuncts) + + assert len(disj.disjuncts) == 3 + assert "hello" in str(disj) + + def test_optional_creation(self): + """Test creating an optional element""" + opt = gram.Optional("world") + + assert opt.option == "world" + assert "world" in str(opt) + assert "[" in str(opt) and "]" in str(opt) + + def test_sequence_as_list(self): + """Test that sequences are represented as lists""" + # In this implementation, sequences are just lists + seq = ["hello", "world"] + + assert len(seq) == 2 + assert seq[0] == "hello" + + def test_nonterminal_creation(self): + """Test creating a nonterminal""" + nt = gram.NonTerminal("test") + + assert nt.name == "test" + assert "test" in str(nt) + + def test_rule_creation(self): + """Test creating a rule""" + lhs = 
gram.NonTerminal("start") + rhs = "hello world" + rule = gram.Rule(lhs, rhs) + + assert rule.lhs.name == "start" + assert rule.rhs == "hello world" + + def test_grammar_operations(self): + """Test grammar operations like adding rules""" + grammar = gram.Grammar() + lhs = gram.NonTerminal("start") + rhs = "hello" + rule = gram.Rule(lhs, rhs) + + grammar.addRule(rule) + assert len(grammar.rules) == 1 + + grammar.addPublicRule(rule) + assert len(grammar.publicRules) == 1 + + +class TestDeterministicGenerator: + """Test the deterministic string generator""" + + def setup_method(self): + """Set up test fixtures""" + self.simple_grammar_text = """ + public = hello world; + """ + + self.choice_grammar_text = """ + public = hello | hi; + """ + + self.optional_grammar_text = """ + public = hello [ world ]; + """ + + self.complex_grammar_text = """ + public = ; + = hello | hi; + = world | there; + """ + + def test_simple_generation(self): + """Test generating from a simple grammar""" + # Set up global grammar for the generator + det_gen.grammar = parser.getGrammarObject(StringIO(self.simple_grammar_text)) + + rule = det_gen.grammar.publicRules[0] + results = det_gen.processRHS(rule.rhs) + + assert len(results) == 1 + assert results[0] == "hello world" + + def test_choice_generation(self): + """Test generating from grammar with choices""" + det_gen.grammar = parser.getGrammarObject(StringIO(self.choice_grammar_text)) + + rule = det_gen.grammar.publicRules[0] + results = det_gen.processRHS(rule.rhs) + + assert len(results) == 2 + assert "hello" in results + assert "hi" in results + + def test_optional_generation(self): + """Test generating from grammar with optional elements""" + det_gen.grammar = parser.getGrammarObject(StringIO(self.optional_grammar_text)) + + rule = det_gen.grammar.publicRules[0] + results = det_gen.processRHS(rule.rhs) + + assert len(results) == 2 + assert "hello" in results + assert "hello world" in results + + def test_complex_generation(self): + """Test generating from a more complex grammar""" + det_gen.grammar = parser.getGrammarObject(StringIO(self.complex_grammar_text)) + + rule = det_gen.grammar.publicRules[0] + results = det_gen.processRHS(rule.rhs) + + # Should generate: hello world, hello there, hi world, hi there + assert len(results) == 4 + expected = {"hello world", "hello there", "hi world", "hi there"} + assert set(results) == expected + + +class TestProbabilisticGenerator: + """Test the probabilistic string generator""" + + def setup_method(self): + """Set up test fixtures""" + self.simple_grammar_text = """ + public = hello world; + """ + + self.weighted_grammar_text = """ + public = /5/ hello | /1/ hi; + """ + + def test_simple_generation(self): + """Test generating from a simple grammar""" + prob_gen.grammar = parser.getGrammarObject(StringIO(self.simple_grammar_text)) + + rule = prob_gen.grammar.publicRules[0] + result = prob_gen.processRHS(rule.rhs) + + assert result == "hello world" + + def test_choice_generation(self): + """Test that generation produces valid results from choices""" + prob_gen.grammar = parser.getGrammarObject(StringIO(self.weighted_grammar_text)) + + rule = prob_gen.grammar.publicRules[0] + + # Generate multiple times to test randomness + results = set() + for _ in range(20): + result = prob_gen.processRHS(rule.rhs) + results.add(result) + + # Should only produce "hello" or "hi" + assert results.issubset({"hello", "hi"}) + # With 20 iterations, we should get at least one of each (with high probability) + # But we can't guarantee this due 
to randomness, so we just check validity + + +class TestIntegration: + """Integration tests using the actual grammar files""" + + def test_ideas_grammar_parsing(self): + """Test parsing the Ideas.gram file""" + with open('Ideas.gram', 'r') as f: + grammar = parser.getGrammarObject(f) + + assert len(grammar.publicRules) == 1 + assert grammar.publicRules[0].lhs.name == "" + + def test_ideas_nonrecursive_grammar_parsing(self): + """Test parsing the IdeasNonRecursive.gram file""" + with open('IdeasNonRecursive.gram', 'r') as f: + grammar = parser.getGrammarObject(f) + + assert len(grammar.publicRules) == 1 + + def test_deterministic_generator_with_nonrecursive_grammar(self): + """Test deterministic generation with IdeasNonRecursive.gram""" + with open('IdeasNonRecursive.gram', 'r') as f: + det_gen.grammar = parser.getGrammarObject(f) + + rule = det_gen.grammar.publicRules[0] + results = det_gen.processRHS(rule.rhs) + + # Should generate multiple valid sentences + assert len(results) > 1 + # All results should contain "idea" + for result in results: + assert "idea" in result + + def test_probabilistic_generator_with_recursive_grammar(self): + """Test probabilistic generation with Ideas.gram""" + with open('Ideas.gram', 'r') as f: + prob_gen.grammar = parser.getGrammarObject(f) + + rule = prob_gen.grammar.publicRules[0] + + # Generate a few strings to ensure it works + for _ in range(5): + result = prob_gen.processRHS(rule.rhs) + assert isinstance(result, str) + assert len(result) > 0 + + +class TestErrorHandling: + """Test error handling and edge cases""" + + def test_invalid_grammar_syntax(self): + """Test handling of invalid grammar syntax""" + invalid_grammar = """ + public = hello world // Missing semicolon + """ + + # The parser may be tolerant of some syntax errors + # Let's check what actually happens + try: + grammar = parser.getGrammarObject(StringIO(invalid_grammar)) + # If it doesn't raise an exception, check if it parsed correctly + # A missing semicolon might result in no rules being parsed + assert len(grammar.publicRules) == 0 # Should fail to parse the rule + except Exception: + # If it does raise an exception, that's also acceptable + pass + + def test_empty_grammar(self): + """Test handling of empty grammar""" + empty_grammar = "" + + grammar = parser.getGrammarObject(StringIO(empty_grammar)) + assert len(grammar.publicRules) == 0 + + def test_undefined_nonterminal(self): + """Test handling of undefined nonterminals""" + grammar_text = """ + public = ; + """ + grammar = parser.getGrammarObject(StringIO(grammar_text)) + + # This should raise an error when trying to process + with pytest.raises(ValueError): + det_gen.grammar = grammar + rule = det_gen.grammar.publicRules[0] + det_gen.processRHS(rule.rhs) + + +if __name__ == "__main__": + # Run tests if executed directly + pytest.main([__file__, "-v"]) \ No newline at end of file
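
Reviewer note: a minimal usage sketch of the new API introduced in this patch. It assumes the `jsgf` package re-exports `Grammar`, `ParseError`, and `ValidationError` (the package `__init__.py` is not shown in this diff), and it uses only methods visible above: `Grammar.from_string`, `public_rule_names`, `is_recursive`, and `detect_cycles`. The grammar text below is a made-up example, not one of the repository's .gram files.

    # Hypothetical usage sketch; import path assumes the jsgf package exports these names.
    from jsgf import Grammar, ParseError, ValidationError

    GRAMMAR_TEXT = """
    public <greeting> = hello | hi <name>;
    <name> = world | there;
    """

    try:
        grammar = Grammar.from_string(GRAMMAR_TEXT)  # parses via LegacyAdapter, then validates
        print(grammar.public_rule_names)             # e.g. ['<greeting>']
        print(grammar.is_recursive())                # False: no rule reaches itself
        print(grammar.detect_cycles())               # [] when the dependency graph is acyclic
    except (ParseError, ValidationError) as err:
        print(f"Bad grammar: {err}")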