Skip to content

Commit

Permalink
encoding/ianaindex: add ASCII, document Index.Encoding
Browse files Browse the repository at this point in the history
Index.Encoding returns a nil Encoding in case the charset is valid but
unsupported by the library. Document this behavior.

Because of this, US-ASCII is seen as unsupported.
Register it as a regular encoding. The decoder replaces non-ASCII bytes
with the Unicode replacement character. The encoder returns a
RepertoireError when a non-ASCII rune is encountered.

Fixes golang/go#19421

Change-Id: I4c24ba2114a5012be88488e63aa6e57df955eb96
GitHub-Last-Rev: 418ee6d
GitHub-Pull-Request: #10
Reviewed-on: https://go-review.googlesource.com/c/text/+/212077
Reviewed-by: Daniel Martí <[email protected]>
Reviewed-by: Marcel van Lohuizen <[email protected]>
Run-TryBot: Daniel Martí <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
  • Loading branch information
emersion authored and mvdan committed Aug 26, 2020
1 parent 79eda68 commit a8b4671
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 0 deletions.
74 changes: 74 additions & 0 deletions encoding/ianaindex/ascii.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ianaindex

import (
"unicode"
"unicode/utf8"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/transform"
"golang.org/x/text/encoding/internal/identifier"
)

// asciiDecoder is the decoding half of the US-ASCII encoding: its Transform
// method turns US-ASCII input into UTF-8. It embeds transform.NopResetter
// (presumably a no-op Reset, which fits a decoder with no fields of its own —
// confirm against the transform package).
type asciiDecoder struct {
transform.NopResetter
}

// Transform copies ASCII bytes from src to dst unchanged and writes the UTF-8
// encoding of unicode.ReplacementChar in place of every byte above
// unicode.MaxASCII. Each source byte is handled on its own, so atEOF is not
// consulted.
//
// It returns the number of bytes written to dst and consumed from src. If dst
// cannot hold the output for the next source byte, that byte is left
// unconsumed and transform.ErrShortDst is returned.
func (d asciiDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for ; nSrc < len(src); nSrc++ {
		switch b := src[nSrc]; {
		case b <= unicode.MaxASCII:
			// Plain ASCII: needs exactly one byte of room.
			if nDst >= len(dst) {
				return nDst, nSrc, transform.ErrShortDst
			}
			dst[nDst] = b
			nDst++
		case nDst+utf8.RuneLen(unicode.ReplacementChar) > len(dst):
			// Not enough room for the encoded replacement character.
			return nDst, nSrc, transform.ErrShortDst
		default:
			nDst += utf8.EncodeRune(dst[nDst:], unicode.ReplacementChar)
		}
	}
	return nDst, nSrc, nil
}

// asciiEncoder is the encoding half of the US-ASCII encoding: its Transform
// method passes ASCII bytes through and reports a repertoire error for
// anything else. It embeds transform.NopResetter (presumably a no-op Reset,
// which fits an encoder with no fields of its own — confirm against the
// transform package).
type asciiEncoder struct {
transform.NopResetter
}

// Transform copies src to dst byte for byte for as long as the input is
// ASCII, returning the number of bytes written and consumed.
//
// On reaching a byte above unicode.MaxASCII it stops without consuming it, so
// nSrc points at the start of the offending rune, and reports one of:
//
//   - transform.ErrShortSrc when atEOF is false and the remaining input is
//     only the beginning of a multi-byte UTF-8 sequence. The caller is
//     expected to retry with more input; this keeps a rune that is split
//     across two chunks from being reported (and, by error-handling wrappers
//     such as encoding.ReplaceUnsupported, presumably substituted) once per
//     fragment.
//   - internal.RepertoireError(encoding.ASCIISub) otherwise.
//
// transform.ErrShortDst is returned when dst is full before src is exhausted.
func (e asciiEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for _, c := range src {
		if c > unicode.MaxASCII {
			// nSrc advances in lockstep with the loop, so src[nSrc:] starts at c.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}
			err = internal.RepertoireError(encoding.ASCIISub)
			break
		}

		if nDst >= len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst] = c
		nDst++
		nSrc++
	}
	return nDst, nSrc, err
}

// asciiEnc is the US-ASCII encoding. It pairs asciiDecoder with asciiEncoder
// and is labelled with the name "US-ASCII" and the identifier.ASCII MIB.
//
// The SimpleEncoding fields are keyed so the literal does not depend on the
// field order of a struct declared in another package.
var asciiEnc = &internal.Encoding{
	Encoding: &internal.SimpleEncoding{
		Decoder: asciiDecoder{},
		Encoder: asciiEncoder{},
	},
	Name: "US-ASCII",
	MIB:  identifier.ASCII,
}
38 changes: 38 additions & 0 deletions encoding/ianaindex/ascii_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ianaindex

import (
"unicode"
"testing"

"golang.org/x/text/encoding"
)

// TestASCIIDecoder checks that decoding through asciiEnc keeps ASCII text
// intact and yields one replacement character per non-ASCII input byte. The
// accented letters in the input are two bytes each in UTF-8, so each one is
// expected to come out as a pair of replacement characters.
func TestASCIIDecoder(t *testing.T) {
	const input = "Comment Candide fut élevé dans un beau château"

	// What a single two-byte rune is expected to decode to.
	pair := string(unicode.ReplacementChar) + string(unicode.ReplacementChar)
	want := "Comment Candide fut " + pair + "lev" + pair + " dans un beau ch" + pair + "teau"

	got, err := asciiEnc.NewDecoder().String(input)
	switch {
	case err != nil:
		t.Fatalf("unexpected error: %v", err)
	case got != want:
		t.Fatalf("asciiEnc.NewDecoder().String() = %q, want %q", got, want)
	}
}

// TestASCIIEncoder checks that encoding through asciiEnc, wrapped in
// encoding.ReplaceUnsupported, keeps ASCII text intact and yields a single
// encoding.ASCIISub for each non-ASCII rune of the input.
func TestASCIIEncoder(t *testing.T) {
	const input = "Comment Candide fut élevé dans un beau château"

	sub := string(encoding.ASCIISub)
	want := "Comment Candide fut " + sub + "lev" + sub + " dans un beau ch" + sub + "teau"

	enc := encoding.ReplaceUnsupported(asciiEnc.NewEncoder())
	got, err := enc.String(input)
	switch {
	case err != nil:
		t.Fatalf("unexpected error: %v", err)
	case got != want:
		t.Fatalf("asciiEnc.NewEncoder().String() = %q, want %q", got, want)
	}
}
5 changes: 5 additions & 0 deletions encoding/ianaindex/ianaindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ var (

// Encoding returns an Encoding for IANA-registered names. Matching is
// case-insensitive.
//
// If the provided name doesn't match an IANA-registered charset, an error is
// returned. If the name matches an IANA-registered charset but isn't supported,
// a nil encoding and a nil error are returned.
func (x *Index) Encoding(name string) (encoding.Encoding, error) {
name = strings.TrimSpace(name)
// First try without lowercasing (possibly creating an allocation).
Expand Down Expand Up @@ -150,6 +154,7 @@ func mibName(x int) string {
}

var encodings = [numIANA]encoding.Encoding{
enc3: asciiEnc,
enc106: unicode.UTF8,
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
Expand Down
1 change: 1 addition & 0 deletions encoding/ianaindex/ianaindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ func TestEncoding(t *testing.T) {
{MIME, " l5 ", "ISO-8859-9", nil},
{MIME, "latin5 ", "ISO-8859-9", nil},
{MIME, "LATIN5 ", "ISO-8859-9", nil},
{MIME, "us-ascii", "US-ASCII", nil},
{MIME, "latin 5", "", errInvalidName},
{MIME, "latin-5", "", errInvalidName},

Expand Down

0 comments on commit a8b4671

Please sign in to comment.