diff --git a/testdata/bocchan.golden b/testdata/bocchan.golden index e1d59e5..9c4f4b5 100644 --- a/testdata/bocchan.golden +++ b/testdata/bocchan.golden @@ -69017,27 +69017,26 @@ 14:"けれども" (66: 22, 26) KNOWN [25526] [助詞 接続助詞 * *] 15:"、" (78: 26, 27) KNOWN [97] [記号 読点 * *] 16:"事情" (81: 27, 29) KNOWN [120441] [名詞 一般 * *] -17:"や" (87: 29, 30) KNOWN [75284] [助詞 並立助詞 * *] -18:"むをえん" (90: 30, 34) UNKNOWN [21] [名詞 一般 * *] -19:"から" (102: 34, 36) KNOWN [18770] [助詞 格助詞 一般 *] -20:"処決" (108: 36, 38) KNOWN [139140] [名詞 サ変接続 * *] -21:"《" (114: 38, 39) KNOWN [110] [記号 括弧開 * *] -22:"しょ" (117: 39, 41) KNOWN [33068] [名詞 一般 * *] -23:"けつ" (123: 41, 43) KNOWN [25314] [名詞 一般 * *] -24:"》" (129: 43, 44) KNOWN [111] [記号 括弧閉 * *] -25:"し" (132: 44, 45) KNOWN [30757] [動詞 自立 * *] -26:"て" (135: 45, 46) KNOWN [46599] [助詞 接続助詞 * *] -27:"くれ" (138: 46, 48) KNOWN [24504] [動詞 非自立 * *] -28:"と" (144: 48, 49) KNOWN [47727] [助詞 格助詞 引用 *] -29:"云わ" (147: 49, 51) KNOWN [121186] [動詞 自立 * *] -30:"れ" (153: 51, 52) KNOWN [79333] [動詞 接尾 * *] -31:"た" (156: 52, 53) KNOWN [39233] [助動詞 * * *] -32:"と" (159: 53, 54) KNOWN [47727] [助詞 格助詞 引用 *] -33:"の" (162: 54, 55) KNOWN [55829] [助詞 連体化 * *] -34:"事" (165: 55, 56) KNOWN [120342] [名詞 非自立 一般 *] -35:"だ" (168: 56, 57) KNOWN [41863] [助動詞 * * *] -36:"。" (171: 57, 58) KNOWN [98] [記号 句点 * *] -37:"EOS" (174: 58, 58) DUMMY [-1] [] +17:"やむをえん" (87: 29, 34) KNOWN [76155] [動詞 自立 * *] +18:"から" (102: 34, 36) KNOWN [18772] [助詞 接続助詞 * *] +19:"処決" (108: 36, 38) KNOWN [139140] [名詞 サ変接続 * *] +20:"《" (114: 38, 39) KNOWN [110] [記号 括弧開 * *] +21:"しょ" (117: 39, 41) KNOWN [33068] [名詞 一般 * *] +22:"けつ" (123: 41, 43) KNOWN [25314] [名詞 一般 * *] +23:"》" (129: 43, 44) KNOWN [111] [記号 括弧閉 * *] +24:"し" (132: 44, 45) KNOWN [30757] [動詞 自立 * *] +25:"て" (135: 45, 46) KNOWN [46599] [助詞 接続助詞 * *] +26:"くれ" (138: 46, 48) KNOWN [24504] [動詞 非自立 * *] +27:"と" (144: 48, 49) KNOWN [47727] [助詞 格助詞 引用 *] +28:"云わ" (147: 49, 51) KNOWN [121186] [動詞 自立 * *] +29:"れ" (153: 51, 52) KNOWN [79333] [動詞 接尾 * *] +30:"た" (156: 52, 53) KNOWN [39233] [助動詞 * * *] +31:"と" (159: 53, 54) KNOWN [47727] [助詞 格助詞 引用 *] +32:"の" (162: 54, 55) KNOWN [55829] [助詞 連体化 * *] +33:"事" (165: 55, 56) KNOWN [120342] [名詞 非自立 一般 *] +34:"だ" (168: 56, 57) KNOWN [41863] [助動詞 * * *] +35:"。" (171: 57, 58) KNOWN [98] [記号 句点 * *] +36:"EOS" (174: 58, 58) DUMMY [-1] [] 0:"BOS" (0: 0, 0) DUMMY [-1] [] 1:"「" (0: 0, 1) KNOWN [112] [記号 括弧開 * *] 2:"そんな" (3: 1, 4) KNOWN [39105] [連体詞 * * *] @@ -70925,48 +70924,46 @@ 41:"は" (177: 59, 60) KNOWN [57061] [助詞 係助詞 * *] 42:"二" (180: 60, 61) KNOWN [120507] [名詞 数 * *] 43:"人" (183: 61, 62) KNOWN [122955] [名詞 接尾 助数詞 *] -44:"づれ" (186: 62, 64) UNKNOWN [21] [名詞 一般 * *] -45:"だ" (192: 64, 65) KNOWN [41863] [助動詞 * * *] -46:"が" (195: 65, 66) KNOWN [19674] [助詞 接続助詞 * *] -47:"、" (198: 66, 67) KNOWN [97] [記号 読点 * *] -48:"――" (201: 67, 69) KNOWN [64] [記号 一般 * *] -49:"どうも" (207: 69, 72) KNOWN [50527] [副詞 一般 * *] -50:"有望" (216: 72, 74) KNOWN [245874] [名詞 形容動詞語幹 * *] -51:"らしい" (222: 74, 77) KNOWN [79005] [助動詞 * * *] -52:"」" (231: 77, 78) KNOWN [113] [記号 括弧閉 * *] -53:"「" (234: 78, 79) KNOWN [112] [記号 括弧開 * *] -54:"どうして" (237: 79, 83) KNOWN [50496] [副詞 一般 * *] -55:"」" (249: 83, 84) KNOWN [113] [記号 括弧閉 * *] -56:"「" (252: 84, 85) KNOWN [112] [記号 括弧開 * *] -57:"どうして" (255: 85, 89) KNOWN [50496] [副詞 一般 * *] -58:"って" (267: 89, 91) KNOWN [43419] [助詞 格助詞 連語 *] -59:"、" (273: 91, 92) KNOWN [97] [記号 読点 * *] -60:"ああ" (276: 92, 94) KNOWN [128] [副詞 助詞類接続 * *] -61:"云う" (282: 94, 96) KNOWN [121177] [動詞 自立 * *] -62:"狡" (288: 96, 97) KNOWN [286317] [名詞 一般 * *] -63:"《" (291: 97, 98) KNOWN [110] [記号 括弧開 * *] -64:"ずる" (294: 98, 100) KNOWN [36738] [名詞 一般 * *] -65:"》" (300: 100, 101) KNOWN [111] [記号 括弧閉 * *] -66:"い" (303: 101, 102) KNOWN [3664] [動詞 自立 * *] -67:"奴" (306: 102, 103) KNOWN [182552] [名詞 代名詞 一般 *] -68:"だ" (309: 103, 104) KNOWN [41863] [助動詞 * * *] -69:"から" (312: 104, 106) KNOWN [18772] [助詞 接続助詞 * *] -70:"、" (318: 106, 107) KNOWN [97] [記号 読点 * *] -71:"芸者" (321: 107, 109) KNOWN [329442] [名詞 一般 * *] -72:"を" (327: 109, 110) KNOWN [80580] [助詞 格助詞 一般 *] -73:"先" (330: 110, 111) KNOWN [133470] [名詞 一般 * *] -74:"へ" (333: 111, 112) KNOWN [65964] [助詞 格助詞 一般 *] -75:"よこし" (336: 112, 115) KNOWN [77728] [動詞 自立 * *] -76:"て" (345: 115, 116) KNOWN [46599] [助詞 接続助詞 * *] -77:"、" (348: 116, 117) KNOWN [97] [記号 読点 * *] -78:"後" (351: 117, 118) KNOWN [211861] [名詞 接尾 副詞可能 *] -79:"から" (354: 118, 120) KNOWN [18770] [助詞 格助詞 一般 *] -80:"忍ん" (360: 120, 122) KNOWN [214515] [動詞 自立 * *] -81:"で" (366: 122, 123) KNOWN [47317] [助詞 接続助詞 * *] -82:"くる" (369: 123, 125) KNOWN [24365] [動詞 非自立 * *] -83:"かも" (375: 125, 127) KNOWN [18664] [助詞 副助詞 * *] -84:"知" (381: 127, 128) KNOWN [299982] [名詞 一般 * *] -85:"EOS" (384: 128, 128) DUMMY [-1] [] +44:"づれだが" (186: 62, 66) UNKNOWN [19] [名詞 固有名詞 組織 *] +45:"、" (198: 66, 67) KNOWN [97] [記号 読点 * *] +46:"――" (201: 67, 69) KNOWN [64] [記号 一般 * *] +47:"どうも" (207: 69, 72) KNOWN [50527] [副詞 一般 * *] +48:"有望" (216: 72, 74) KNOWN [245874] [名詞 形容動詞語幹 * *] +49:"らしい" (222: 74, 77) KNOWN [79005] [助動詞 * * *] +50:"」" (231: 77, 78) KNOWN [113] [記号 括弧閉 * *] +51:"「" (234: 78, 79) KNOWN [112] [記号 括弧開 * *] +52:"どうして" (237: 79, 83) KNOWN [50496] [副詞 一般 * *] +53:"」" (249: 83, 84) KNOWN [113] [記号 括弧閉 * *] +54:"「" (252: 84, 85) KNOWN [112] [記号 括弧開 * *] +55:"どうして" (255: 85, 89) KNOWN [50496] [副詞 一般 * *] +56:"って" (267: 89, 91) KNOWN [43419] [助詞 格助詞 連語 *] +57:"、" (273: 91, 92) KNOWN [97] [記号 読点 * *] +58:"ああ" (276: 92, 94) KNOWN [128] [副詞 助詞類接続 * *] +59:"云う" (282: 94, 96) KNOWN [121177] [動詞 自立 * *] +60:"狡" (288: 96, 97) KNOWN [286317] [名詞 一般 * *] +61:"《" (291: 97, 98) KNOWN [110] [記号 括弧開 * *] +62:"ずる" (294: 98, 100) KNOWN [36738] [名詞 一般 * *] +63:"》" (300: 100, 101) KNOWN [111] [記号 括弧閉 * *] +64:"い" (303: 101, 102) KNOWN [3664] [動詞 自立 * *] +65:"奴" (306: 102, 103) KNOWN [182552] [名詞 代名詞 一般 *] +66:"だ" (309: 103, 104) KNOWN [41863] [助動詞 * * *] +67:"から" (312: 104, 106) KNOWN [18772] [助詞 接続助詞 * *] +68:"、" (318: 106, 107) KNOWN [97] [記号 読点 * *] +69:"芸者" (321: 107, 109) KNOWN [329442] [名詞 一般 * *] +70:"を" (327: 109, 110) KNOWN [80580] [助詞 格助詞 一般 *] +71:"先" (330: 110, 111) KNOWN [133470] [名詞 一般 * *] +72:"へ" (333: 111, 112) KNOWN [65964] [助詞 格助詞 一般 *] +73:"よこし" (336: 112, 115) KNOWN [77728] [動詞 自立 * *] +74:"て" (345: 115, 116) KNOWN [46599] [助詞 接続助詞 * *] +75:"、" (348: 116, 117) KNOWN [97] [記号 読点 * *] +76:"後" (351: 117, 118) KNOWN [211861] [名詞 接尾 副詞可能 *] +77:"から" (354: 118, 120) KNOWN [18770] [助詞 格助詞 一般 *] +78:"忍ん" (360: 120, 122) KNOWN [214515] [動詞 自立 * *] +79:"で" (366: 122, 123) KNOWN [47317] [助詞 接続助詞 * *] +80:"くる" (369: 123, 125) KNOWN [24365] [動詞 非自立 * *] +81:"かも" (375: 125, 127) KNOWN [18664] [助詞 副助詞 * *] +82:"知" (381: 127, 128) KNOWN [299982] [名詞 一般 * *] +83:"EOS" (384: 128, 128) DUMMY [-1] [] 0:"BOS" (0: 0, 0) DUMMY [-1] [] 1:"れ" (0: 0, 1) KNOWN [79334] [動詞 接尾 * *] 2:"ない" (3: 1, 3) KNOWN [50959] [助動詞 * * *] diff --git a/tokenizer/lattice/lattice.go b/tokenizer/lattice/lattice.go index b0c03b1..58d1967 100644 --- a/tokenizer/lattice/lattice.go +++ b/tokenizer/lattice/lattice.go @@ -161,14 +161,19 @@ func (la *Lattice) Build(inp string) { } } } + + prev := pos + if c, size := utf8.DecodeLastRuneInString(inp[pos:endPos]); c != utf8.RuneError { + prev = endPos - size + } id := la.dic.UnkDict.Index[int32(class)] - for i, w := pos, 0; i < endPos; i += w { - _, w = utf8.DecodeRuneInString(inp[i:]) - end := i + w - dup := la.dic.UnkDict.IndexDup[int32(class)] - for x := 0; x < int(dup)+1; x++ { - la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:end]) + dup := la.dic.UnkDict.IndexDup[int32(class)] + for x := 0; x < int(dup)+1; x++ { + if pos < prev { + // add the string with one character truncated at the end. + la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:prev]) } + la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:endPos]) } } } diff --git a/tokenizer/lattice/lattice_test.go b/tokenizer/lattice/lattice_test.go index 28881d1..cc4b654 100644 --- a/tokenizer/lattice/lattice_test.go +++ b/tokenizer/lattice/lattice_test.go @@ -143,16 +143,13 @@ func Test_LatticeBuildUnknown(t *testing.T) { t.Errorf("lattice initialize error: got %v, expected %v", *la.list[len(la.list)-1][0], eos) } - expected := 7 - if len(la.list[1]) != expected { + expected := 18 + if len(la.list[3]) != expected { t.Fatalf("lattice initialize error: got %v, expected %v", len(la.list[1]), expected) } - l := la.list[1] + l := la.list[3] var known, unknown, undef int for _, v := range l { - if v.Surface != string([]rune(inp)[0:1]) { - t.Errorf("lattice initialize error: got %+v, expected surface %c", v, []rune(inp)[0]) - } switch v.Class { case KNOWN: known++ @@ -162,11 +159,11 @@ func Test_LatticeBuildUnknown(t *testing.T) { undef++ } } - if known != 1 { - t.Errorf("lattice initialize error: got KNOWN %d, expected 1, %+v", known, l) + if known != 0 { + t.Errorf("lattice initialize error: got KNOWN %d, expected 0, %+v", known, l) } - if unknown != 6 { - t.Errorf("lattice initialize error: got UNKNOWN %d, expected 6, %+v", unknown, l) + if unknown != 18 { + t.Errorf("lattice initialize error: got UNKNOWN %d, expected 18, %+v", unknown, l) } if undef != 0 { t.Errorf("lattice initialize error: got unexpected class %d, %+v", undef, l) diff --git a/tokenizer/tokenizer_test.go b/tokenizer/tokenizer_test.go index fd1b6bc..72f60f5 100644 --- a/tokenizer/tokenizer_test.go +++ b/tokenizer/tokenizer_test.go @@ -478,3 +478,20 @@ func BenchmarkAnalyzeExtended(b *testing.B) { tnz.Analyze(benchSampleText, Extended) } } + +func BenchmarkTooLongUnknownToken(b *testing.B) { + input := `GOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO` + d, err := dict.LoadDictFile(testDictPath) + if err != nil { + b.Fatalf("unexpected error, %v", err) + } + tnz, err := New(d) + if err != nil { + b.Fatalf("unexpected error, %v", err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + tnz.Tokenize(input) + } +}