Skip to content

Commit fdf9ce9

Browse files
committed
Merge pull request nltk#1000 from sahutd/issue948
fix treebank tokenizer when string ends with comma
2 parents 0c2b97a + 267af42 commit fdf9ce9

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

nltk/tokenize/treebank.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,9 @@ class TreebankWordTokenizer(TokenizerI):
40 40   >>> s = "They'll save and invest more."
41 41   >>> TreebankWordTokenizer().tokenize(s)
42 42   ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
   43 + >>> s = "hi, my name can't hello,"
   44 + >>> TreebankWordTokenizer().tokenize(s)
   45 + ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
43 46   """
44 47
45 48   # List of contractions adapted from Robert MacIntyre's tokenizer.
@@ -64,6 +67,7 @@ def tokenize(self, text):
64 67
65 68   #punctuation
66 69   text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
   70 + text = re.sub(r'([:,])$', r' \1 ', text)
67 71   text = re.sub(r'\.\.\.', r' ... ', text)
68 72   text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
69 73   text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)

0 commit comments

Comments
 (0)