In bogusComment, make sure unconsume is not called after a potential buffer up

jhy · jhy · commit 42da86439df7 · 2021-08-14T15:31:35.000+10:00
Wasn't able to repro with the supplied test case, but the failure could previously happen. Audited the other uses of unconsume: all are immediately after a consume, so they are safe to call (as there's no path to a bufferUp). Fixes #1612
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -4,6 +4,7 @@
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Entities;
 
+import javax.annotation.Nullable;
 import java.util.Arrays;
 
 /**
@@ -152,7 +153,7 @@ void advanceTransition(TokeniserState state) {
 
     final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
     final private int[] multipointHolder = new int[2];
-    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+    @Nullable int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
         if (reader.isEmpty())
             return null;
         if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -106,7 +106,7 @@ void read(Tokeniser t, CharacterReader r) {
                     break;
                 case '?':
                     t.createBogusCommentPending();
-                    t.advanceTransition(BogusComment);
+                    t.transition(BogusComment);
                     break;
                 default:
                     if (r.matchesAsciiAlpha()) {
@@ -136,7 +136,8 @@ void read(Tokeniser t, CharacterReader r) {
             } else {
                 t.error(this);
                 t.createBogusCommentPending();
-                t.transition(BogusComment); // reconsume char
+                t.commentPending.append('/'); // push the / back on that got us here
+                t.transition(BogusComment);
             }
         }
     },
@@ -906,11 +907,9 @@ void read(Tokeniser t, CharacterReader r) {
     BogusComment {
         void read(Tokeniser t, CharacterReader r) {
             // todo: handle bogus comment starting from eof. when does that trigger?
-            // rewind to capture character that lead us here
-            r.unconsume();
             t.commentPending.append(r.consumeTo('>'));
             // todo: replace nullChar with replaceChar
-            char next = r.consume();
+            char next = r.current();
             if (next == '>' || next == eof) {
                 t.emitCommentPending();
                 t.transition(Data);
@@ -933,7 +932,7 @@ void read(Tokeniser t, CharacterReader r) {
             } else {
                 t.error(this);
                 t.createBogusCommentPending();
-                t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
+                t.transition(BogusComment);
             }
         }
     },
diff --git a/src/test/java/org/jsoup/integration/FuzzFixesTest.java b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
@@ -205,4 +205,19 @@ public void oob() throws IOException {
         Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
         assertNotNull(docXml);
     }
+
+    @Test
+    public void unconsume() throws IOException {
+        // https://github.com/jhy/jsoup/issues/1612
+        // I wasn't able to repro this with different ways of loading strings - I think the fuzzer's input buffer
+        // is somehow different, so the bufferUp() happened at a different point. Regardless, I did find an unsafe
+        // use of unconsume() after a buffer up in bogus comment, so cleaned that up.
+        File in = ParseTest.getFile("/fuzztests/1612.html.gz");
+
+        Document doc = Jsoup.parse(in, "UTF-8");
+        assertNotNull(doc);
+
+        Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
+        assertNotNull(docXml);
+    }
 }
diff --git a/src/test/resources/fuzztests/1612.html.gz b/src/test/resources/fuzztests/1612.html.gz