Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix to avoid adding too many nodes for unknown words #240

Merged
merged 3 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 60 additions & 63 deletions testdata/bocchan.golden
Original file line number Diff line number Diff line change
Expand Up @@ -69017,27 +69017,26 @@
14:"けれども" (66: 22, 26) KNOWN [25526] [助詞 接続助詞 * *]
15:"、" (78: 26, 27) KNOWN [97] [記号 読点 * *]
16:"事情" (81: 27, 29) KNOWN [120441] [名詞 一般 * *]
17:"や" (87: 29, 30) KNOWN [75284] [助詞 並立助詞 * *]
18:"むをえん" (90: 30, 34) UNKNOWN [21] [名詞 一般 * *]
19:"から" (102: 34, 36) KNOWN [18770] [助詞 格助詞 一般 *]
20:"処決" (108: 36, 38) KNOWN [139140] [名詞 サ変接続 * *]
21:"《" (114: 38, 39) KNOWN [110] [記号 括弧開 * *]
22:"しょ" (117: 39, 41) KNOWN [33068] [名詞 一般 * *]
23:"けつ" (123: 41, 43) KNOWN [25314] [名詞 一般 * *]
24:"》" (129: 43, 44) KNOWN [111] [記号 括弧閉 * *]
25:"し" (132: 44, 45) KNOWN [30757] [動詞 自立 * *]
26:"て" (135: 45, 46) KNOWN [46599] [助詞 接続助詞 * *]
27:"くれ" (138: 46, 48) KNOWN [24504] [動詞 非自立 * *]
28:"と" (144: 48, 49) KNOWN [47727] [助詞 格助詞 引用 *]
29:"云わ" (147: 49, 51) KNOWN [121186] [動詞 自立 * *]
30:"れ" (153: 51, 52) KNOWN [79333] [動詞 接尾 * *]
31:"た" (156: 52, 53) KNOWN [39233] [助動詞 * * *]
32:"と" (159: 53, 54) KNOWN [47727] [助詞 格助詞 引用 *]
33:"の" (162: 54, 55) KNOWN [55829] [助詞 連体化 * *]
34:"事" (165: 55, 56) KNOWN [120342] [名詞 非自立 一般 *]
35:"だ" (168: 56, 57) KNOWN [41863] [助動詞 * * *]
36:"。" (171: 57, 58) KNOWN [98] [記号 句点 * *]
37:"EOS" (174: 58, 58) DUMMY [-1] []
17:"やむをえん" (87: 29, 34) KNOWN [76155] [動詞 自立 * *]
18:"から" (102: 34, 36) KNOWN [18772] [助詞 接続助詞 * *]
19:"処決" (108: 36, 38) KNOWN [139140] [名詞 サ変接続 * *]
20:"《" (114: 38, 39) KNOWN [110] [記号 括弧開 * *]
21:"しょ" (117: 39, 41) KNOWN [33068] [名詞 一般 * *]
22:"けつ" (123: 41, 43) KNOWN [25314] [名詞 一般 * *]
23:"》" (129: 43, 44) KNOWN [111] [記号 括弧閉 * *]
24:"し" (132: 44, 45) KNOWN [30757] [動詞 自立 * *]
25:"て" (135: 45, 46) KNOWN [46599] [助詞 接続助詞 * *]
26:"くれ" (138: 46, 48) KNOWN [24504] [動詞 非自立 * *]
27:"と" (144: 48, 49) KNOWN [47727] [助詞 格助詞 引用 *]
28:"云わ" (147: 49, 51) KNOWN [121186] [動詞 自立 * *]
29:"れ" (153: 51, 52) KNOWN [79333] [動詞 接尾 * *]
30:"た" (156: 52, 53) KNOWN [39233] [助動詞 * * *]
31:"と" (159: 53, 54) KNOWN [47727] [助詞 格助詞 引用 *]
32:"の" (162: 54, 55) KNOWN [55829] [助詞 連体化 * *]
33:"事" (165: 55, 56) KNOWN [120342] [名詞 非自立 一般 *]
34:"だ" (168: 56, 57) KNOWN [41863] [助動詞 * * *]
35:"。" (171: 57, 58) KNOWN [98] [記号 句点 * *]
36:"EOS" (174: 58, 58) DUMMY [-1] []
0:"BOS" (0: 0, 0) DUMMY [-1] []
1:"「" (0: 0, 1) KNOWN [112] [記号 括弧開 * *]
2:"そんな" (3: 1, 4) KNOWN [39105] [連体詞 * * *]
Expand Down Expand Up @@ -70925,48 +70924,46 @@
41:"は" (177: 59, 60) KNOWN [57061] [助詞 係助詞 * *]
42:"二" (180: 60, 61) KNOWN [120507] [名詞 数 * *]
43:"人" (183: 61, 62) KNOWN [122955] [名詞 接尾 助数詞 *]
44:"づれ" (186: 62, 64) UNKNOWN [21] [名詞 一般 * *]
45:"だ" (192: 64, 65) KNOWN [41863] [助動詞 * * *]
46:"が" (195: 65, 66) KNOWN [19674] [助詞 接続助詞 * *]
47:"、" (198: 66, 67) KNOWN [97] [記号 読点 * *]
48:"――" (201: 67, 69) KNOWN [64] [記号 一般 * *]
49:"どうも" (207: 69, 72) KNOWN [50527] [副詞 一般 * *]
50:"有望" (216: 72, 74) KNOWN [245874] [名詞 形容動詞語幹 * *]
51:"らしい" (222: 74, 77) KNOWN [79005] [助動詞 * * *]
52:"」" (231: 77, 78) KNOWN [113] [記号 括弧閉 * *]
53:"「" (234: 78, 79) KNOWN [112] [記号 括弧開 * *]
54:"どうして" (237: 79, 83) KNOWN [50496] [副詞 一般 * *]
55:"」" (249: 83, 84) KNOWN [113] [記号 括弧閉 * *]
56:"「" (252: 84, 85) KNOWN [112] [記号 括弧開 * *]
57:"どうして" (255: 85, 89) KNOWN [50496] [副詞 一般 * *]
58:"って" (267: 89, 91) KNOWN [43419] [助詞 格助詞 連語 *]
59:"、" (273: 91, 92) KNOWN [97] [記号 読点 * *]
60:"ああ" (276: 92, 94) KNOWN [128] [副詞 助詞類接続 * *]
61:"云う" (282: 94, 96) KNOWN [121177] [動詞 自立 * *]
62:"狡" (288: 96, 97) KNOWN [286317] [名詞 一般 * *]
63:"《" (291: 97, 98) KNOWN [110] [記号 括弧開 * *]
64:"ずる" (294: 98, 100) KNOWN [36738] [名詞 一般 * *]
65:"》" (300: 100, 101) KNOWN [111] [記号 括弧閉 * *]
66:"い" (303: 101, 102) KNOWN [3664] [動詞 自立 * *]
67:"奴" (306: 102, 103) KNOWN [182552] [名詞 代名詞 一般 *]
68:"だ" (309: 103, 104) KNOWN [41863] [助動詞 * * *]
69:"から" (312: 104, 106) KNOWN [18772] [助詞 接続助詞 * *]
70:"、" (318: 106, 107) KNOWN [97] [記号 読点 * *]
71:"芸者" (321: 107, 109) KNOWN [329442] [名詞 一般 * *]
72:"を" (327: 109, 110) KNOWN [80580] [助詞 格助詞 一般 *]
73:"先" (330: 110, 111) KNOWN [133470] [名詞 一般 * *]
74:"へ" (333: 111, 112) KNOWN [65964] [助詞 格助詞 一般 *]
75:"よこし" (336: 112, 115) KNOWN [77728] [動詞 自立 * *]
76:"て" (345: 115, 116) KNOWN [46599] [助詞 接続助詞 * *]
77:"、" (348: 116, 117) KNOWN [97] [記号 読点 * *]
78:"後" (351: 117, 118) KNOWN [211861] [名詞 接尾 副詞可能 *]
79:"から" (354: 118, 120) KNOWN [18770] [助詞 格助詞 一般 *]
80:"忍ん" (360: 120, 122) KNOWN [214515] [動詞 自立 * *]
81:"で" (366: 122, 123) KNOWN [47317] [助詞 接続助詞 * *]
82:"くる" (369: 123, 125) KNOWN [24365] [動詞 非自立 * *]
83:"かも" (375: 125, 127) KNOWN [18664] [助詞 副助詞 * *]
84:"知" (381: 127, 128) KNOWN [299982] [名詞 一般 * *]
85:"EOS" (384: 128, 128) DUMMY [-1] []
44:"づれだが" (186: 62, 66) UNKNOWN [19] [名詞 固有名詞 組織 *]
45:"、" (198: 66, 67) KNOWN [97] [記号 読点 * *]
46:"――" (201: 67, 69) KNOWN [64] [記号 一般 * *]
47:"どうも" (207: 69, 72) KNOWN [50527] [副詞 一般 * *]
48:"有望" (216: 72, 74) KNOWN [245874] [名詞 形容動詞語幹 * *]
49:"らしい" (222: 74, 77) KNOWN [79005] [助動詞 * * *]
50:"」" (231: 77, 78) KNOWN [113] [記号 括弧閉 * *]
51:"「" (234: 78, 79) KNOWN [112] [記号 括弧開 * *]
52:"どうして" (237: 79, 83) KNOWN [50496] [副詞 一般 * *]
53:"」" (249: 83, 84) KNOWN [113] [記号 括弧閉 * *]
54:"「" (252: 84, 85) KNOWN [112] [記号 括弧開 * *]
55:"どうして" (255: 85, 89) KNOWN [50496] [副詞 一般 * *]
56:"って" (267: 89, 91) KNOWN [43419] [助詞 格助詞 連語 *]
57:"、" (273: 91, 92) KNOWN [97] [記号 読点 * *]
58:"ああ" (276: 92, 94) KNOWN [128] [副詞 助詞類接続 * *]
59:"云う" (282: 94, 96) KNOWN [121177] [動詞 自立 * *]
60:"狡" (288: 96, 97) KNOWN [286317] [名詞 一般 * *]
61:"《" (291: 97, 98) KNOWN [110] [記号 括弧開 * *]
62:"ずる" (294: 98, 100) KNOWN [36738] [名詞 一般 * *]
63:"》" (300: 100, 101) KNOWN [111] [記号 括弧閉 * *]
64:"い" (303: 101, 102) KNOWN [3664] [動詞 自立 * *]
65:"奴" (306: 102, 103) KNOWN [182552] [名詞 代名詞 一般 *]
66:"だ" (309: 103, 104) KNOWN [41863] [助動詞 * * *]
67:"から" (312: 104, 106) KNOWN [18772] [助詞 接続助詞 * *]
68:"、" (318: 106, 107) KNOWN [97] [記号 読点 * *]
69:"芸者" (321: 107, 109) KNOWN [329442] [名詞 一般 * *]
70:"を" (327: 109, 110) KNOWN [80580] [助詞 格助詞 一般 *]
71:"先" (330: 110, 111) KNOWN [133470] [名詞 一般 * *]
72:"へ" (333: 111, 112) KNOWN [65964] [助詞 格助詞 一般 *]
73:"よこし" (336: 112, 115) KNOWN [77728] [動詞 自立 * *]
74:"て" (345: 115, 116) KNOWN [46599] [助詞 接続助詞 * *]
75:"、" (348: 116, 117) KNOWN [97] [記号 読点 * *]
76:"後" (351: 117, 118) KNOWN [211861] [名詞 接尾 副詞可能 *]
77:"から" (354: 118, 120) KNOWN [18770] [助詞 格助詞 一般 *]
78:"忍ん" (360: 120, 122) KNOWN [214515] [動詞 自立 * *]
79:"で" (366: 122, 123) KNOWN [47317] [助詞 接続助詞 * *]
80:"くる" (369: 123, 125) KNOWN [24365] [動詞 非自立 * *]
81:"かも" (375: 125, 127) KNOWN [18664] [助詞 副助詞 * *]
82:"知" (381: 127, 128) KNOWN [299982] [名詞 一般 * *]
83:"EOS" (384: 128, 128) DUMMY [-1] []
0:"BOS" (0: 0, 0) DUMMY [-1] []
1:"れ" (0: 0, 1) KNOWN [79334] [動詞 接尾 * *]
2:"ない" (3: 1, 3) KNOWN [50959] [助動詞 * * *]
Expand Down
17 changes: 11 additions & 6 deletions tokenizer/lattice/lattice.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,19 @@ func (la *Lattice) Build(inp string) {
}
}
}

prev := pos
if c, size := utf8.DecodeLastRuneInString(inp[pos:endPos]); c != utf8.RuneError {
prev = endPos - size
}
id := la.dic.UnkDict.Index[int32(class)]
for i, w := pos, 0; i < endPos; i += w {
_, w = utf8.DecodeRuneInString(inp[i:])
end := i + w
dup := la.dic.UnkDict.IndexDup[int32(class)]
for x := 0; x < int(dup)+1; x++ {
la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:end])
dup := la.dic.UnkDict.IndexDup[int32(class)]
for x := 0; x < int(dup)+1; x++ {
if pos < prev {
// add the string with one character truncated at the end.
la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:prev])
}
la.addNode(runePos, int(id)+x, pos, runePos, UNKNOWN, inp[pos:endPos])
}
}
}
Expand Down
17 changes: 7 additions & 10 deletions tokenizer/lattice/lattice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,16 +143,13 @@ func Test_LatticeBuildUnknown(t *testing.T) {
t.Errorf("lattice initialize error: got %v, expected %v", *la.list[len(la.list)-1][0], eos)
}

expected := 7
if len(la.list[1]) != expected {
expected := 18
if len(la.list[3]) != expected {
t.Fatalf("lattice initialize error: got %v, expected %v", len(la.list[1]), expected)
}
l := la.list[1]
l := la.list[3]
var known, unknown, undef int
for _, v := range l {
if v.Surface != string([]rune(inp)[0:1]) {
t.Errorf("lattice initialize error: got %+v, expected surface %c", v, []rune(inp)[0])
}
switch v.Class {
case KNOWN:
known++
Expand All @@ -162,11 +159,11 @@ func Test_LatticeBuildUnknown(t *testing.T) {
undef++
}
}
if known != 1 {
t.Errorf("lattice initialize error: got KNOWN %d, expected 1, %+v", known, l)
if known != 0 {
t.Errorf("lattice initialize error: got KNOWN %d, expected 0, %+v", known, l)
}
if unknown != 6 {
t.Errorf("lattice initialize error: got UNKNOWN %d, expected 6, %+v", unknown, l)
if unknown != 18 {
t.Errorf("lattice initialize error: got UNKNOWN %d, expected 18, %+v", unknown, l)
}
if undef != 0 {
t.Errorf("lattice initialize error: got unexpected class %d, %+v", undef, l)
Expand Down
17 changes: 17 additions & 0 deletions tokenizer/tokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,3 +478,20 @@ func BenchmarkAnalyzeExtended(b *testing.B) {
tnz.Analyze(benchSampleText, Extended)
}
}

func BenchmarkTooLongUnknownToken(b *testing.B) {
input := `GOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO`
d, err := dict.LoadDictFile(testDictPath)
if err != nil {
b.Fatalf("unexpected error, %v", err)
}
tnz, err := New(d)
if err != nil {
b.Fatalf("unexpected error, %v", err)
}

b.ResetTimer()
for i := 0; i < b.N; i++ {
tnz.Tokenize(input)
}
}