Initial implementation of fuzzing harness

Added pipeline Fix path in build script Formatting Add Updated changelog to reflect PR number
jsvine · Jan 5, 2025 · c18609d · c18609d
1 parent c562774
commit c18609d
Show file tree

Hide file tree

Showing 6 changed files with 205 additions and 0 deletions.
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
@@ -0,0 +1,40 @@
+# Builds and runs the fuzzers of the OSS-Fuzz project "pdfplumber" on pushes
+# to the stable/develop branches and on every pull request.
+name: CIFuzz
+on:
+  push:
+    branches:
+      - stable
+      - develop
+  pull_request:
+# Drop all default token permissions; the job re-grants only what it needs.
+permissions: {}
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    permissions:
+      # Required by the "Upload Sarif" step to publish code-scanning results.
+      security-events: write
+    steps:
+    - name: Build Fuzzers
+      # The id lets later steps check whether the build itself succeeded.
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfplumber'
+        language: python
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfplumber'
+        language: python
+        fuzz-seconds: 800
+        output-sarif: true
+    - name: Upload Crash
+      # upload-artifact v3 is deprecated and no longer runs on github.com.
+      uses: actions/upload-artifact@v4
+      # Only upload when the fuzzers were built but a later step failed.
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
+    - name: Upload Sarif
+      # Publish results even when fuzzing failed, as long as the build worked.
+      if: always() && steps.build.outcome == 'success'
+      # CodeQL Action v2 is deprecated; v3 is its supported successor.
+      uses: github/codeql-action/upload-sarif@v3
+      with:
+        # Path to SARIF file relative to the root of the repository
+        sarif_file: cifuzz-sarif/results.sarif
+        checkout_path: cifuzz-sarif
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
 - Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
 - Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
 - Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
+- Add necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1245](https://github.com/jsvine/pdfplumber/pull/1245))
 
 ### Fixed
 

diff --git a/README.md b/README.md
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [@wodny](https://github.com/wodny)
 - [Michal Stolarczyk](https://github.com/stolarczyk)
 - [Brandon Roberts](https://github.com/brandonrobertz)
+- [@ennamarie19](https://github.com/ennamarie19/)
 
 ## Contributing
 

diff --git a/fuzz/build.sh b/fuzz/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -eu
+#
+# Fuzzer build script: installs pdfplumber, compiles every *_fuzzer.py harness
+# under fuzz/, and packages the repository's PDF files as a seed corpus.
+#
+# Requires $SRC and $OUT to be set (the -u flag aborts otherwise) and a
+# `compile_python_fuzzer` command, which is not defined in this script and is
+# expected to be supplied by the build environment.
+
+cd "$SRC"/pdfplumber
+pip3 install .
+
+# Build fuzzers in $OUT
+# Paths are read NUL-delimited so that unusual file names are not word-split.
+while IFS= read -r -d '' fuzzer; do
+  compile_python_fuzzer "$fuzzer"
+done < <(find fuzz -name '*_fuzzer.py' -print0)
+
+# Seed the corpus with every PDF in the repository. The corpus directory itself
+# is excluded so files already in it are not copied onto themselves.
+mkdir -p fuzz/corpus
+find . -name '*.pdf' -not -path './fuzz/corpus/*' -exec cp {} fuzz/corpus \;
+zip -q "$OUT/pdf_load_fuzzer_seed_corpus.zip" fuzz/corpus/*
diff --git a/fuzz/fuzz_helpers.py b/fuzz/fuzz_helpers.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+import contextlib
+import io
+import tempfile
+from enum import IntEnum
+from typing import Iterator, Protocol, Type, TypeVar, Union
+
+import atheris
+
+
+class HasMax(Protocol):
+    """Structural type for classes that expose an integer ``MAX`` attribute."""
+
+    # Not referenced anywhere else in this module as written.
+    MAX: int
+
+
+# Type variable used by ConsumeEnum: any IntEnum subclass. The bound is
+# IntEnum rather than HasMax, so the ``MAX`` attribute that ConsumeEnum reads
+# is not expressed in the type.
+T = TypeVar("T", bound=IntEnum)
+
+
+class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
+    """``atheris.FuzzedDataProvider`` with helpers for file-shaped fuzz inputs.
+
+    Adds consumers for variable-length and "everything that is left" payloads,
+    context managers that expose a payload as an in-memory or on-disk file,
+    and a helper that picks a member of an ``IntEnum``.
+    """
+
+    def ConsumeRandomBytes(self) -> bytes:
+        """Consume a fuzzer-chosen number of bytes, from 0 up to all remaining."""
+        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRandomString(self) -> str:
+        """Consume a surrogate-free string whose length argument is fuzzer-chosen."""
+        return self.ConsumeUnicodeNoSurrogates(
+            self.ConsumeIntInRange(0, self.remaining_bytes())
+        )
+
+    def ConsumeRemainingString(self) -> str:
+        """Consume the rest of the input as a surrogate-free string."""
+        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        """Consume the rest of the input as raw bytes."""
+        return self.ConsumeBytes(self.remaining_bytes())
+
+    def _consume_file_data(self, all_data: bool, as_bytes: bool) -> Union[bytes, str]:
+        """Consume the payload shared by the two file-producing context managers.
+
+        Args:
+            all_data: Take everything that is left instead of a fuzzer-chosen
+                amount.
+            as_bytes: Return ``bytes`` when true, ``str`` otherwise.
+        """
+        if all_data:
+            if as_bytes:
+                return self.ConsumeRemainingBytes()
+            return self.ConsumeRemainingString()
+        if as_bytes:
+            return self.ConsumeRandomBytes()
+        return self.ConsumeRandomString()
+
+    @contextlib.contextmanager
+    def ConsumeMemoryFile(
+        self, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[Union[io.BytesIO, io.StringIO]]:
+        """Yield an in-memory file holding consumed fuzz data.
+
+        Args:
+            all_data: Use all remaining input instead of a fuzzer-chosen amount.
+            as_bytes: Yield an ``io.BytesIO`` when true, an ``io.StringIO``
+                otherwise.
+
+        Yields:
+            The open in-memory file. It is closed when the ``with`` block
+            exits, including when the block raises.
+        """
+        file_data = self._consume_file_data(all_data, as_bytes)
+        file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)
+        # The exception from a failing ``with`` body is re-raised at the
+        # ``yield``; without try/finally the close would be skipped.
+        try:
+            yield file
+        finally:
+            file.close()
+
+    @contextlib.contextmanager
+    def ConsumeTemporaryFile(
+        self, suffix: str, all_data: bool = False, as_bytes: bool = True
+    ) -> Iterator[str]:
+        """Yield the path of a temporary file holding consumed fuzz data.
+
+        Args:
+            suffix: File-name suffix passed to ``tempfile.NamedTemporaryFile``.
+            all_data: Use all remaining input instead of a fuzzer-chosen amount.
+            as_bytes: Write the payload in binary mode when true, text mode
+                otherwise.
+
+        Yields:
+            The temporary file's path. The file is closed when the ``with``
+            block exits, including when the block raises.
+        """
+        file_data = self._consume_file_data(all_data, as_bytes)
+        mode = "w+b" if as_bytes else "w+"
+        # Using the temp file as a context manager guarantees it is closed
+        # even if the caller's block raises.
+        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
+            tfile.write(file_data)
+            # Flush so code that opens the file by name sees the full payload.
+            tfile.flush()
+            tfile.seek(0)
+            yield tfile.name
+
+    def ConsumeEnum(self, enum_type: Type[T]) -> T:
+        """Pick a member of ``enum_type`` from the fuzz input.
+
+        ``enum_type`` must define a ``MAX`` member. An integer is drawn with
+        ``ConsumeIntInRange(0, enum_type.MAX)`` and converted with
+        ``enum_type(...)``, so a drawn value that has no member raises
+        ``ValueError``.
+
+        NOTE(review): ``ConsumeIntInRange`` is presumably inclusive of both
+        bounds, in which case ``MAX`` itself can be returned - confirm against
+        the Atheris documentation.
+        """
+        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))
diff --git a/fuzz/pdf_load_fuzzer.py b/fuzz/pdf_load_fuzzer.py
@@ -0,0 +1,59 @@
+import sys
+from enum import IntEnum
+
+import atheris
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports(include=["pdfplumber"]):
+    from pdfminer.pdftypes import PDFException
+    from pdfminer.psparser import PSException
+
+    import pdfplumber
+
+
+class CastType(IntEnum):
+    """Selects which export path TestOneInput runs on the opened PDF."""
+
+    CSV = 0
+    IMAGE = 1
+    JSON = 2
+    DICT = 3
+    # Upper bound read by EnhancedFuzzedDataProvider.ConsumeEnum. TestOneInput
+    # has no branch for it, so selecting MAX runs no export at all.
+    MAX = 4
+
+
+def TestOneInput(data: bytes):
+    """Fuzz entry point: open part of ``data`` as a PDF and run one export.
+
+    A fuzzer-chosen slice of the input becomes the PDF stream; a later value
+    drawn from the input selects which ``CastType`` export to run.
+
+    Args:
+        data: Raw fuzz input supplied by the fuzzing engine.
+
+    Returns:
+        ``-1`` when the input fails in one of the anticipated ways handled
+        below, ``None`` otherwise. Any other exception propagates to the
+        fuzzing engine.
+    """
+    fdp = EnhancedFuzzedDataProvider(data)
+
+    try:
+        # Open the PDF as a context manager so it is closed even when one of
+        # the exports below raises.
+        with fdp.ConsumeMemoryFile(
+            all_data=False, as_bytes=True
+        ) as f, pdfplumber.open(f) as pdf:
+            # Test casting
+            cast_ty = fdp.ConsumeEnum(CastType)
+
+            if cast_ty is CastType.CSV:
+                pdf.to_csv()
+            elif cast_ty is CastType.IMAGE and pdf.pages:
+                pdf.pages[0].to_image()
+            elif cast_ty is CastType.JSON:
+                pdf.to_json()
+            elif cast_ty is CastType.DICT:
+                pdf.to_dict()
+
+    except (PDFException, PSException, AssertionError):
+        # Anticipated parser failures on malformed input.
+        return -1
+    except ValueError as e:
+        # Only the known integer-parsing failure is tolerated; any other
+        # ValueError is re-raised with its original traceback.
+        if "invalid literal for int" in str(e):
+            return -1
+        raise
+    except TypeError as e:
+        # Likewise, only the known "argument must be a string" failure.
+        if "argument must be a string" in str(e):
+            return -1
+        raise
+
+
+def main():
+    """Register TestOneInput with Atheris and start the fuzzing loop."""
+    # Setup receives the command-line arguments along with the entry point,
+    # and must be called before Fuzz.
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+# Run the fuzzer only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()