Skip to content

Commit 42da864

Browse files
committed
In bogusComment, make sure unconsume is not called after a potential buffer up
Wasn't able to repro with the supplied test case, but it could previously happen. Audited the other uses of unconsume; all come immediately after a consume, so they are safe to call (as there's no path to a bufferUp in between). Fixes #1612
1 parent eba3e39 commit 42da864

File tree

4 files changed

+22
-7
lines changed

4 files changed

+22
-7
lines changed

src/main/java/org/jsoup/parser/Tokeniser.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import org.jsoup.internal.StringUtil;
55
import org.jsoup.nodes.Entities;
66

7+
import javax.annotation.Nullable;
78
import java.util.Arrays;
89

910
/**
@@ -152,7 +153,7 @@ void advanceTransition(TokeniserState state) {
152153

153154
final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
154155
final private int[] multipointHolder = new int[2];
155-
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
156+
@Nullable int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
156157
if (reader.isEmpty())
157158
return null;
158159
if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())

src/main/java/org/jsoup/parser/TokeniserState.java

+5-6
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ void read(Tokeniser t, CharacterReader r) {
106106
break;
107107
case '?':
108108
t.createBogusCommentPending();
109-
t.advanceTransition(BogusComment);
109+
t.transition(BogusComment);
110110
break;
111111
default:
112112
if (r.matchesAsciiAlpha()) {
@@ -136,7 +136,8 @@ void read(Tokeniser t, CharacterReader r) {
136136
} else {
137137
t.error(this);
138138
t.createBogusCommentPending();
139-
t.transition(BogusComment); // reconsume char
139+
t.commentPending.append('/'); // push the / back on that got us here
140+
t.transition(BogusComment);
140141
}
141142
}
142143
},
@@ -906,11 +907,9 @@ void read(Tokeniser t, CharacterReader r) {
906907
BogusComment {
907908
void read(Tokeniser t, CharacterReader r) {
908909
// todo: handle bogus comment starting from eof. when does that trigger?
909-
// rewind to capture character that lead us here
910-
r.unconsume();
911910
t.commentPending.append(r.consumeTo('>'));
912911
// todo: replace nullChar with replaceChar
913-
char next = r.consume();
912+
char next = r.current();
914913
if (next == '>' || next == eof) {
915914
t.emitCommentPending();
916915
t.transition(Data);
@@ -933,7 +932,7 @@ void read(Tokeniser t, CharacterReader r) {
933932
} else {
934933
t.error(this);
935934
t.createBogusCommentPending();
936-
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
935+
t.transition(BogusComment);
937936
}
938937
}
939938
},

src/test/java/org/jsoup/integration/FuzzFixesTest.java

+15
Original file line numberDiff line numberDiff line change
@@ -205,4 +205,19 @@ public void oob() throws IOException {
205205
Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
206206
assertNotNull(docXml);
207207
}
208+
209+
@Test
210+
public void unconsume() throws IOException {
211+
// https://github.com/jhy/jsoup/issues/1612
212+
// I wasn't able to repro this with different ways of loading strings - think somehow the fuzzer's input
213+
// buffer is different and the bufferUp() happened at a different point. Regardless, did find an unsafe use
214+
// of unconsume() after a buffer up in bogus comment, so cleaned that up.
215+
File in = ParseTest.getFile("/fuzztests/1612.html.gz");
216+
217+
Document doc = Jsoup.parse(in, "UTF-8");
218+
assertNotNull(doc);
219+
220+
Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
221+
assertNotNull(docXml);
222+
}
208223
}
2.32 KB
Binary file not shown.

0 commit comments

Comments
 (0)