Description
Go version
go version go1.24.3 darwin/arm64
Output of go env
in your module/workspace:
AR='ar'
CC='cc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='c++'
GCCGO='gccgo'
GO111MODULE=''
GOARCH='arm64'
GOARM64='v8.0'
GOAUTH='netrc'
GOBIN=''
GOCACHE='/Users/evan.jones/Library/Caches/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/Users/evan.jones/Library/Application Support/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -arch arm64 -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -ffile-prefix-map=/var/folders/pp/tvwz4y2x2qz97pf8bftqxhrw0000gp/T/go-build572495338=/tmp/go-build -gno-record-gcc-switches -fno-common'
GOHOSTARCH='arm64'
GOHOSTOS='darwin'
GOINSECURE=''
GOMOD='/Users/evan.jones/go_x_text_bug/go.mod'
GOMODCACHE='/Users/evan.jones/go/pkg/mod'
GOOS='darwin'
GOPATH='/Users/evan.jones/go'
GOROOT='/opt/homebrew/Cellar/go/1.24.3/libexec'
GOSUMDB='sum.golang.org'
GOTELEMETRY='on'
GOTELEMETRYDIR='/Users/evan.jones/Library/Application Support/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/opt/homebrew/Cellar/go/1.24.3/libexec/pkg/tool/darwin_arm64'
GOVCS=''
GOVERSION='go1.24.3'
GOWORK=''
PKG_CONFIG='pkg-config'
What did you do?
When using `norm.Iter` on some invalid UTF-8 byte sequences, `Iter.Done` always returns false. In these cases `Iter.Next` returns an empty byte slice, so the iterator never makes progress. This causes code attempting to use the iterator to enter an infinite loop.
What did you see happen?
An infinite loop: the iterator always returns `Iter.Done() == false`.
What did you expect to see?
I expected `norm.Iter` to be equivalent to executing `norm.Form.String`, followed by rune-by-rune iteration over the resulting string. In particular, I was changing code that was using `for i, rune := range norm.NFC.String(input) { ... }` to try to use `norm.Iter` instead. I had a fuzz test to compare the two implementations, and it found these invalid cases.
Here is a unit test that I expect to pass:
// TestNFCIterBug reproduces the norm.Iter behavior reported above for an
// invalid UTF-8 input. It logs the raw input bytes and the runes of the
// NFC-normalized string, then walks that normalized string with norm.Iter,
// failing once the iteration count exceeds maxIterations so that a stuck
// iterator cannot hang the test run.
func TestNFCIterBug(t *testing.T) {
	const (
		maxIterations = 20
		badInput      = "\xf0\xd9\x95"
	)

	nfcString := norm.NFC.String(badInput)

	// Dump the raw input one byte at a time.
	for pos := 0; pos < len(badInput); pos++ {
		t.Logf("badInput byte i=%d b=0x%x", pos, badInput[pos])
	}
	// Dump the normalized string one rune at a time.
	for pos, r := range nfcString {
		t.Logf("nfcString rune i=%d r=0x%x", pos, r)
	}

	var it norm.Iter
	it.InitString(norm.NFC, nfcString)
	for step := 0; !it.Done(); step++ {
		segment := it.Next()
		t.Logf("norm Iter i=%d bytes=%#v", step, segment)
		// step counts completed iterations minus one, so this trips on the
		// iteration after maxIterations segments have been logged.
		if step >= maxIterations {
			t.Fatalf("stopping after %d iterations to avoid infinite loop", maxIterations)
		}
	}
}
The output of this test is the following:
=== RUN TestNFCBug
main_test.go:17: badInput byte i=0 b=0xf0
main_test.go:17: badInput byte i=1 b=0xd9
main_test.go:17: badInput byte i=2 b=0x95
main_test.go:21: nfcString rune i=0 r=0xfffd
main_test.go:21: nfcString rune i=1 r=0x655
main_test.go:29: norm Iter i=0 bytes=[]byte{}
main_test.go:29: norm Iter i=1 bytes=[]byte{}
main_test.go:29: norm Iter i=2 bytes=[]byte{}
main_test.go:29: norm Iter i=3 bytes=[]byte{}
main_test.go:29: norm Iter i=4 bytes=[]byte{}
main_test.go:29: norm Iter i=5 bytes=[]byte{}
main_test.go:29: norm Iter i=6 bytes=[]byte{}
main_test.go:29: norm Iter i=7 bytes=[]byte{}
main_test.go:29: norm Iter i=8 bytes=[]byte{}
main_test.go:29: norm Iter i=9 bytes=[]byte{}
main_test.go:29: norm Iter i=10 bytes=[]byte{}
main_test.go:29: norm Iter i=11 bytes=[]byte{}
main_test.go:29: norm Iter i=12 bytes=[]byte{}
main_test.go:29: norm Iter i=13 bytes=[]byte{}
main_test.go:29: norm Iter i=14 bytes=[]byte{}
main_test.go:29: norm Iter i=15 bytes=[]byte{}
main_test.go:29: norm Iter i=16 bytes=[]byte{}
main_test.go:29: norm Iter i=17 bytes=[]byte{}
main_test.go:29: norm Iter i=18 bytes=[]byte{}
main_test.go:29: norm Iter i=19 bytes=[]byte{}
main_test.go:29: norm Iter i=20 bytes=[]byte{}
main_test.go:32: stopping after 20 iterations to avoid infinite loop
I also have a fuzz test that I used to find this input that I am happy to contribute to the Go project if it is useful:
// FuzzNFCIterator compares the runes of norm.NFC.String(s) with the runes
// obtained by walking that same normalized string with norm.Iter.
//
// A fuzz input fails when the iterator returns an empty segment, yields more
// runes than the normalized string contains, yields a rune that differs from
// the normalized string at the same position, yields utf8.RuneError, or
// finishes having produced fewer runes than expected.
func FuzzNFCIterator(f *testing.F) {
	f.Add("")
	f.Add("ascii")
	f.Add("e\u0301 decomposed")
	f.Fuzz(func(t *testing.T, s string) {
		// check UTF-8 valid strings only: no problems
		// if !utf8.ValidString(s) {
		// 	return
		// }
		normalized := norm.NFC.String(s)
		runes := []rune(normalized)

		iter := norm.Iter{}
		iter.InitString(norm.NFC, normalized)
		runeI := 0
		for !iter.Done() {
			runeBytes := iter.Next()
			if len(runeBytes) == 0 {
				// The iterator made no progress; fail instead of spinning.
				t.Fatalf("iter.Next() returned empty byte slice for s=%#v %s",
					s, strconv.QuoteToASCII(s))
			}
			for len(runeBytes) > 0 {
				r, size := utf8.DecodeRune(runeBytes)
				runeBytes = runeBytes[size:]
				// Guard the index so surplus runes from the iterator produce
				// a diagnostic rather than an index-out-of-range panic.
				if runeI >= len(runes) {
					t.Fatalf("s=%#v %s: iter returned more than the expected %d runes (extra rune 0x%x)",
						s, strconv.QuoteToASCII(s), len(runes), r)
				}
				if runes[runeI] != r {
					t.Fatalf("s=%#v %s: runes[runeI=%d]=0x%x iter returned 0x%x",
						s, strconv.QuoteToASCII(s), runeI, runes[runeI], r)
				}
				if r == utf8.RuneError {
					t.Fatalf("s=%#v %s: iter returned utf8.RuneError at runeI=%d",
						s, strconv.QuoteToASCII(s), runeI)
				}
				runeI++
			}
		}
		if runeI != len(runes) {
			t.Fatalf("s=%#v %s: expected %d runes, got %d",
				s, strconv.QuoteToASCII(s), len(runes), runeI)
		}
	})
}
This code is available as a standalone git repository in case that is helpful: https://github.com/evanj/go_x_text_bug