Skip to content

Commit 20dfaad

Browse files
committed
feat: add get_pdf function to convert various sources to PDF and update README with examples
1 parent d7718e0 commit 20dfaad

File tree

3 files changed

+246
-2
lines changed

3 files changed

+246
-2
lines changed

README.md

+28
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ To install: ```pip install pdfdol```
99

1010
# Examples
1111

12+
## Pdf "Stores"
13+
1214
Get a dict-like object to list and read the pdfs of a folder, as text:
1315

1416
>>> from pdfdol import PdfFilesReader
@@ -51,3 +53,29 @@ from pdfdol import concat_pdfs
5153
s = Files('~/Downloads/cosmograph_documentation_pdfs/')
5254
concat_pdfs(s, key_order=sorted)
5355
```
56+
57+
58+
## Get a PDF from various sources
59+
60+
Example with a URL
61+
62+
```py
63+
from pdfdol import get_pdf

pdf_data = get_pdf("https://pypi.org", src_kind="url")
64+
print("Got PDF data of length:", len(pdf_data))
65+
```
66+
67+
Example with HTML content
68+
69+
```py
70+
html_content = "<html><body><h1>Hello, PDF!</h1></body></html>"
71+
pdf_data = get_pdf(html_content, src_kind="html")
72+
print("Got PDF data of length:", len(pdf_data))
73+
```
74+
75+
Example saving to file
76+
77+
```py
78+
filepath = get_pdf("https://pypi.org", egress="output.pdf", src_kind="url")
79+
print("PDF saved to:", filepath)
80+
```
81+

pdfdol/__init__.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,12 @@
1212
1313
"""
1414

15-
from pdfdol.base import PdfFilesReader, pdf_files_reader_wrap
16-
from pdfdol.util import concat_pdfs
15+
from pdfdol.base import (
16+
PdfReader, # just pypdf's PdfReader
17+
PdfFilesReader, # A Mapping giving you a dict-like API to pdf files in a folder.
18+
pdf_files_reader_wrap, # To create a PdfFilesReader for different sources than a folder.
19+
)
20+
from pdfdol.util import concat_pdfs # concatenate pdfs
21+
from pdfdol.tools import (
22+
get_pdf, # Convert the given source to a PDF (bytes) and process it using the specified egress.
23+
)

pdfdol/tools.py

+209
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""Pdf Tools."""
2+
3+
import os
from functools import partial
from typing import Any, Callable, Literal, Optional, Union
6+
7+
# The kinds of source that get_pdf knows how to convert.
SrcKind = Literal["url", "html", "file"]


def _resolve_src_kind(src: str) -> SrcKind:
    """
    Heuristically determine the kind of source provided.

    The checks are applied in this order, on the whitespace-stripped input:

    1. ``"url"`` if it starts with ``http://`` or ``https://`` (the scheme is
       matched case-insensitively, so ``HTTP://...`` is also a URL),
    2. ``"html"`` if it contains ``<html`` (case-insensitively),
    3. ``"file"`` if it is an existing filesystem path,
    4. ``"html"`` otherwise (fallback).

    Args:
        src (str): The source input which can be a URL, HTML string, or a file path.

    Returns:
        SrcKind: One of "url", "html" or "file".

    Examples:

    >>> _resolve_src_kind("https://example.com")
    'url'
    >>> _resolve_src_kind("HTTP://example.com")
    'url'
    >>> _resolve_src_kind("<html><body>Test</body></html>")
    'html'
    >>> import tempfile, os
    >>> with tempfile.NamedTemporaryFile(delete=False) as tmp:
    ...     _ = tmp.write(b"dummy")
    ...     tmp_name = tmp.name
    >>> _resolve_src_kind(tmp_name) == 'file'
    True
    >>> os.remove(tmp_name)
    """
    s = src.strip()
    # One lowered copy serves both case-insensitive checks below.
    lowered = s.lower()
    if lowered.startswith(("http://", "https://")):
        return "url"
    if "<html" in lowered:
        return "html"
    if os.path.exists(s):
        return "file"
    # Fallback: neither a URL nor an existing path, so treat it as an HTML
    # string (this covers fragments that lack an <html> tag).
    return "html"
47+
48+
49+
def _resolve_bytes_egress(
    egress: Union[None, str, os.PathLike, Callable[[bytes], Any]],
) -> Callable[[bytes], Any]:
    """
    Return a callable that processes PDF bytes based on the given egress.

    Args:
        egress (Union[None, str, os.PathLike, Callable]):
            - If None, the callable returns the PDF bytes as-is.
            - If a string or path-like object, the callable writes the PDF bytes
              to that file path and returns the path exactly as it was given.
            - If a callable, it is returned directly.

    Returns:
        Callable[[bytes], Any]: A function that processes PDF bytes.

    Raises:
        ValueError: If egress is none of the supported kinds.

    Examples:

    >>> f = _resolve_bytes_egress(None)
    >>> f(b'pdf data') == b'pdf data'
    True
    >>> import tempfile, os
    >>> with tempfile.NamedTemporaryFile(delete=False) as tmp:
    ...     tmp_name = tmp.name
    >>> f = _resolve_bytes_egress(tmp_name)
    >>> result = f(b'pdf data')
    >>> result == tmp_name
    True
    >>> os.remove(tmp_name)
    """
    if egress is None:
        return lambda b: b

    # `callable` is tested before `os.PathLike` so that every input that was
    # accepted before path-like support was added is still handled the same way.
    if not isinstance(egress, str) and callable(egress):
        return egress

    if isinstance(egress, (str, os.PathLike)):

        def write_to_file(b: bytes):
            from pathlib import Path

            Path(egress).write_bytes(b)
            # Hand back the destination so the caller knows where the PDF went.
            return egress

        return write_to_file

    raise ValueError("egress must be None, a file path string, or a callable.")
91+
92+
93+
def get_pdf(
    src: str,
    egress: Union[None, str, Callable] = None,
    *,
    src_kind: Optional[SrcKind] = None,
    # extra options for pdfkit.from_* functions
    options=None,
    toc=None,
    cover=None,
    css=None,
    configuration=None,
    cover_first=False,
    verbose=False,
    **kwargs,
) -> Union[bytes, Any]:
    """
    Convert the given source to a PDF (bytes) and process it using the specified egress.

    The source (src) can be:
      - a URL (e.g. "https://example.com")
      - an HTML string
      - a file path to an HTML file

    The egress parameter determines how the PDF bytes are returned:
      - If None, returns the PDF as bytes.
      - If a string, treats it as a file path where the PDF is saved.
      - If a callable, applies it to the PDF bytes and returns its result.
        For example, you may want to specify egress=pypdf.PdfReader to get an object
        that provides an interface of all PDF components, or you might want to
        upload the PDF to a cloud storage service.

    The src_kind parameter allows explicit specification of the source kind ("url", "html", or "file").
    If not provided, it is determined heuristically using _resolve_src_kind.

    Args:
        src (str): The source to convert.
        egress (Union[None, str, Callable], optional): How to handle the PDF bytes.
        src_kind (SrcKind, optional): Explicit source kind; if omitted, determined automatically.
        options: (optional) dict with wkhtmltopdf options, with or w/o '--'
        toc: (optional) dict with toc-specific wkhtmltopdf options, with or w/o '--'
        cover: (optional) string with url/filename with a cover html page
        css: (optional) string with path to css file which will be added to a single input file.
            Only forwarded to pdfkit when it is not None, because pdfkit.from_url
            takes no ``css`` argument (CSS can only be added to an HTML string or file).
        configuration: (optional) instance of pdfkit.configuration.Configuration()
        cover_first: (optional) if True, cover always precedes TOC
        verbose: (optional) By default '--quiet' is passed to all calls, set this to True
            to get wkhtmltopdf output to stdout.
        **kwargs: Any further keyword arguments, forwarded unchanged to the pdfkit function.

    Returns:
        Union[bytes, Any]: The PDF bytes, or the result of processing them via the egress callable.

    Raises:
        ValueError: If src_kind is not one of "url", "html" or "file".

    Examples:

        # Example with a URL:
        pdf_data = get_pdf("https://pypi.org", src_kind="url")
        print("Got PDF data of length:", len(pdf_data))

        # Example with HTML content:
        html_content = "<html><body><h1>Hello, PDF!</h1></body></html>"
        pdf_data = get_pdf(html_content, src_kind="html")
        print("Got PDF data of length:", len(pdf_data))

        # Example saving to file:
        filepath = get_pdf("https://pypi.org", egress="output.pdf", src_kind="url")
        print("PDF saved to:", filepath)

    """
    # Imported lazily so that importing this module does not require pdfkit.
    import pdfkit

    # Determine the source kind if not explicitly provided.
    if src_kind is None:
        src_kind = _resolve_src_kind(src)

    # Map the source kind to the corresponding pdfkit function.
    converters = {
        "url": pdfkit.from_url,
        "html": pdfkit.from_string,
        "file": pdfkit.from_file,
    }
    convert = converters.get(src_kind)
    if convert is None:
        raise ValueError(f"Unsupported src_kind: {src_kind}")

    call_options = dict(
        options=options,
        toc=toc,
        cover=cover,
        configuration=configuration,
        cover_first=cover_first,
        verbose=verbose,
        **kwargs,
    )
    # pdfkit.from_url has no `css` parameter, so sending css=None along with
    # every call would make all URL conversions fail. Forward it only when
    # the caller actually supplied a stylesheet.
    if css is not None:
        call_options["css"] = css

    # Generate the PDF bytes; passing False returns the bytes instead of writing to a file.
    pdf_bytes = convert(src, False, **call_options)

    # Resolve the egress processing function and apply it.
    egress_func = _resolve_bytes_egress(egress)
    return egress_func(pdf_bytes)
194+
195+
196+
# Example usage:
if __name__ == "__main__":
    # Running this demo needs network access and a working pdfkit/wkhtmltopdf
    # installation.
    demo_url = "https://pypi.org"

    # Example with a URL:
    url_pdf = get_pdf(demo_url, src_kind="url")
    print("Got PDF data of length:", len(url_pdf))

    # Example with HTML content:
    html_pdf = get_pdf(
        "<html><body><h1>Hello, PDF!</h1></body></html>", src_kind="html"
    )
    print("Got PDF data of length:", len(html_pdf))

    # Example saving to file:
    saved_to = get_pdf(demo_url, egress="output.pdf", src_kind="url")
    print("PDF saved to:", saved_to)

0 commit comments

Comments
 (0)