Skip to content

Commit

Permalink
unicode/utf8: add AppendRune
Browse files Browse the repository at this point in the history
AppendRune appends the UTF-8 encoding of a rune to a []byte.
It is a generally more user friendly than EncodeRune.

    EncodeASCIIRune-4     2.35ns ± 2%
    EncodeJapaneseRune-4  4.60ns ± 2%
    AppendASCIIRune-4     0.30ns ± 3%
    AppendJapaneseRune-4  4.70ns ± 2%

The ASCII case is written to be inlineable.

Fixes #47609

Change-Id: If4f71eedffd2bd4ef0d7f960cb55b41c637eec54
Reviewed-on: https://go-review.googlesource.com/c/go/+/345571
Trust: Joe Tsai <[email protected]>
Reviewed-by: Rob Pike <[email protected]>
Run-TryBot: Rob Pike <[email protected]>
TryBot-Result: Go Bot <[email protected]>
  • Loading branch information
dsnet committed Aug 28, 2021
1 parent ef4cb2f commit f371b30
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/cmd/compile/internal/test/inl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func TestIntendedInlining(t *testing.T) {
"FullRune",
"FullRuneInString",
"RuneLen",
"AppendRune",
"ValidRune",
},
"reflect": {
Expand Down
26 changes: 26 additions & 0 deletions src/unicode/utf8/utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,32 @@ func EncodeRune(p []byte, r rune) int {
}
}

// AppendRune appends the UTF-8 encoding of r to the end of p and
// returns the extended buffer. If the rune is out of range,
// it appends the encoding of RuneError.
func AppendRune(p []byte, r rune) []byte {
// This function is inlineable for fast handling of ASCII.
if uint32(r) <= rune1Max {
return append(p, byte(r))
}
return appendRuneNonASCII(p, r)
}

func appendRuneNonASCII(p []byte, r rune) []byte {
// Negative values are erroneous. Making it unsigned addresses the problem.
switch i := uint32(r); {
case i <= rune2Max:
return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
r = RuneError
fallthrough
case i <= rune3Max:
return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
default:
return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
}
}

// RuneCount returns the number of runes in p. Erroneous and short
// encodings are treated as single runes of width 1 byte.
func RuneCount(p []byte) int {
Expand Down
25 changes: 25 additions & 0 deletions src/unicode/utf8/utf8_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,17 @@ func TestEncodeRune(t *testing.T) {
}
}

func TestAppendRune(t *testing.T) {
for _, m := range utf8map {
if buf := AppendRune(nil, m.r); string(buf) != m.str {
t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
}
if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
}
}
}

func TestDecodeRune(t *testing.T) {
for _, m := range utf8map {
b := []byte(m.str)
Expand Down Expand Up @@ -583,6 +594,20 @@ func BenchmarkEncodeJapaneseRune(b *testing.B) {
}
}

func BenchmarkAppendASCIIRune(b *testing.B) {
buf := make([]byte, UTFMax)
for i := 0; i < b.N; i++ {
AppendRune(buf[:0], 'a')
}
}

func BenchmarkAppendJapaneseRune(b *testing.B) {
buf := make([]byte, UTFMax)
for i := 0; i < b.N; i++ {
AppendRune(buf[:0], '本')
}
}

func BenchmarkDecodeASCIIRune(b *testing.B) {
a := []byte{'a'}
for i := 0; i < b.N; i++ {
Expand Down

0 comments on commit f371b30

Please sign in to comment.