Skip to content

Commit

Permalink
feat: Add support for CSV input with new backend to transform CSV fil…
Browse files Browse the repository at this point in the history
…es to DoclingDocument (#945)

* feat: Implement csv backend and format detection

Signed-off-by: Tobias Strebitzer <[email protected]>

* test: Implement csv parsing and format tests

Signed-off-by: Tobias Strebitzer <[email protected]>

* docs: Add example and CSV format documentation

Signed-off-by: Tobias Strebitzer <[email protected]>

* feat: Add support for various CSV dialects and update documentation

Signed-off-by: Tobias Strebitzer <[email protected]>

* feat: Add validation for delimiters and tests for inconsistent csv files

Signed-off-by: Tobias Strebitzer <[email protected]>

---------

Signed-off-by: Tobias Strebitzer <[email protected]>
  • Loading branch information
tobiasstrebitzer authored Feb 14, 2025
1 parent 7493d5b commit 00d9405
Show file tree
Hide file tree
Showing 42 changed files with 9,885 additions and 0 deletions.
125 changes: 125 additions & 0 deletions docling/backend/csv_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import csv
import logging
import warnings
from io import BytesIO, StringIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that turns a CSV input into a DoclingDocument.

    The complete input is decoded as UTF-8 and kept in memory in ``content``.
    ``convert`` sniffs the delimiter from the first line, parses every row and
    emits one table whose first row is flagged as column header.
    """

    content: StringIO

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Read and decode the CSV input.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: In-memory bytes or a filesystem path of the CSV.

        Raises:
            RuntimeError: If the input cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        # Only report the backend as valid once content has really been loaded;
        # an unsupported input type leaves it invalid instead of half-initialised.
        self.valid = False

        # Load content
        try:
            # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise end up inside the first header cell.
            if isinstance(self.path_or_stream, BytesIO):
                text = self.path_or_stream.getvalue().decode("utf-8-sig")
                self.content = StringIO(text)
                self.valid = True
            elif isinstance(self.path_or_stream, Path):
                text = self.path_or_stream.read_text("utf-8-sig")
                self.content = StringIO(text)
                self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    def is_valid(self) -> bool:
        """Return True when the CSV content was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """CSV input is not paginated."""
        return False

    def unload(self):
        """Close an in-memory input stream and drop the reference to the input."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats handled by this backend (CSV only)."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """
        Parses the CSV data into a structured document model.

        Returns:
            A DoclingDocument containing one table built from the CSV rows.

        Raises:
            RuntimeError: If the backend failed to initialise, or the sniffed
                delimiter is not one of ``, ; tab | :``.
            csv.Error: Propagated from ``csv.Sniffer`` / ``csv.reader`` when no
                dialect can be determined from the first line or a row is
                malformed (the reader runs with ``strict=True``).
        """
        # Fail fast, before touching ``content`` which may not exist.
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        # Detect CSV dialect. Rewind first so that convert() can be called more
        # than once on the same backend instance.
        self.content.seek(0)
        head = self.content.readline()
        dialect = csv.Sniffer().sniff(head, ",;\t|:")
        _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
            raise RuntimeError(
                f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
            )

        # Parse CSV
        self.content.seek(0)
        result = csv.reader(self.content, dialect=dialect, strict=True)
        self.csv_data = list(result)
        _log.info(f"Detected {len(self.csv_data)} lines")

        # Ensure uniform column length (warn only; ragged rows are still converted)
        if self.csv_data:
            expected_length = len(self.csv_data[0])
            if any(len(row) != expected_length for row in self.csv_data):
                warnings.warn(
                    "Inconsistent column lengths detected in CSV data. "
                    f"Expected {expected_length} columns, but found rows with varying lengths. "
                    "Ensure all rows have the same number of columns."
                )

        # Parse the CSV into a structured document model
        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)

        # Convert CSV data to table
        if self.csv_data:
            num_rows = len(self.csv_data)
            # The widest row defines the table width when rows are ragged.
            num_cols = max(len(row) for row in self.csv_data)

            table_data = TableData(
                num_rows=num_rows,
                num_cols=num_cols,
                table_cells=[],
            )

            # Convert each cell to TableCell
            for row_idx, row in enumerate(self.csv_data):
                for col_idx, cell_value in enumerate(row):
                    cell = TableCell(
                        text=str(cell_value),
                        row_span=1,  # CSV doesn't support merged cells
                        col_span=1,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + 1,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + 1,
                        col_header=row_idx == 0,  # First row as header
                        row_header=False,
                    )
                    table_data.table_cells.append(cell)

            doc.add_table(data=table_data)

        return doc
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class InputFormat(str, Enum):
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
CSV = "csv"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling"
Expand All @@ -61,6 +62,7 @@ class OutputFormat(str, Enum):
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
Expand Down Expand Up @@ -88,6 +90,7 @@ class OutputFormat(str, Enum):
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
Expand Down
33 changes: 33 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import logging
import re
from enum import Enum
Expand Down Expand Up @@ -296,6 +297,7 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
mime = _DocumentConversionInput._mime_from_extension(ext)

mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
Expand Down Expand Up @@ -352,6 +354,8 @@ def _mime_from_extension(ext):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.CSV]:
mime = FormatToMimeType[InputFormat.CSV][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
elif ext in FormatToExtensions[InputFormat.PDF]:
Expand Down Expand Up @@ -392,3 +396,32 @@ def _detect_html_xhtml(
return "application/xml"

return None

@staticmethod
def _detect_csv(
    content: bytes,
) -> Optional[Literal["text/csv"]]:
    """Guess whether a leading chunk of a document is CSV.

    Args:
        content: A short piece of a document from its beginning.

    Returns:
        "text/csv" when the sniffed delimiter is one of the common ones
        (comma, semicolon, tab, pipe); None otherwise.
    """
    # Non-ASCII bytes are dropped; only the delimiter structure matters here.
    text = content.decode("ascii", errors="ignore").strip()

    # A single line is not treated as CSV (CSV is usually multi-line).
    if "\n" not in text:
        return None

    # Let csv.Sniffer work out the delimiter; a sniffing failure means "not CSV".
    try:
        delimiter = csv.Sniffer().sniff(text).delimiter
    except csv.Error:
        return None

    common_delimiters = {",", ";", "\t", "|"}
    return "text/csv" if delimiter in common_delimiters else None
9 changes: 9 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
Expand Down Expand Up @@ -61,6 +62,11 @@ def set_optional_field_default(self) -> "FormatOption":
return self


# Format option for CSV input: pairs SimplePipeline with CsvDocumentBackend.
class CsvFormatOption(FormatOption):
# Pipeline class used for CSV documents.
pipeline_cls: Type = SimplePipeline
# Backend class that parses the CSV input.
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend


# Format option for Excel input: pairs SimplePipeline with MsExcelDocumentBackend.
class ExcelFormatOption(FormatOption):
# Pipeline class used for Excel documents.
pipeline_cls: Type = SimplePipeline
# Backend class that parses the Excel input.
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
Expand Down Expand Up @@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):

def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = {
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
Expand Down
80 changes: 80 additions & 0 deletions docs/examples/backend_csv.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion of CSV files\n",
"\n",
"This example shows how to convert CSV files to a structured Docling Document.\n",
"\n",
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example Code"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Convert CSV to Docling document\n",
"converter = DocumentConverter()\n",
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
"output = result.document.export_to_markdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This code generates the following output:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | [email protected] | 2020-08-24 | http://www.stephenson.com/ |\n",
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | [email protected] | 2021-04-23 | http://www.hobbs.com/ |\n",
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | [email protected] | 2020-03-25 | http://www.lawrence.com/ |\n",
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | [email protected] | 2020-06-02 | http://www.good-lyons.com/ |\n",
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | [email protected] | 2021-04-17 | https://goodwin-ingram.com/ |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "docling-TtEIaPrw-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def main():
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
InputFormat.CSV,
InputFormat.MD,
], # whitelist formats, non-matching files are ignored.
format_options={
Expand Down
1 change: 1 addition & 0 deletions docs/supported_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Below you can find a listing of all supported input and output formats.
| Markdown | |
| AsciiDoc | |
| HTML, XHTML | |
| CSV | |
| PNG, JPEG, TIFF, BMP | Image formats |

Schema-specific support:
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ nav:
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
- "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py
- examples/backend_csv.ipynb
- examples/backend_xml_rag.ipynb
- ✂️ Chunking:
- examples/hybrid_chunking.ipynb
Expand Down
5 changes: 5 additions & 0 deletions tests/data/csv/csv-comma-in-cell.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,2,3,4
a,b,c,d
a,",",c,d
a,b,c,d
a,b,c,d
6 changes: 6 additions & 0 deletions tests/data/csv/csv-comma.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,[email protected],2020-08-24,http://www.stephenson.com/
2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,[email protected],2021-04-23,http://www.hobbs.com/
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,[email protected],2020-03-25,http://www.lawrence.com/
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,[email protected],2020-06-02,http://www.good-lyons.com/
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,[email protected],2021-04-17,https://goodwin-ingram.com/
5 changes: 5 additions & 0 deletions tests/data/csv/csv-inconsistent-header.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,2,3
a,b,c,d
a,b,c,d
a,b,c,d
a,b,c,d
6 changes: 6 additions & 0 deletions tests/data/csv/csv-pipe.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|[email protected]|2020-08-24|http://www.stephenson.com/
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|[email protected]|2021-04-23|http://www.hobbs.com/
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|[email protected]|2020-03-25|http://www.lawrence.com/
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|[email protected]|2020-06-02|http://www.good-lyons.com/
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|[email protected]|2021-04-17|https://goodwin-ingram.com/
6 changes: 6 additions & 0 deletions tests/data/csv/csv-semicolon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;[email protected];2020-08-24;http://www.stephenson.com/
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;[email protected];2021-04-23;http://www.hobbs.com/
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;[email protected];2020-03-25;http://www.lawrence.com/
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;[email protected];2020-06-02;http://www.good-lyons.com/
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;[email protected];2021-04-17;https://goodwin-ingram.com/
6 changes: 6 additions & 0 deletions tests/data/csv/csv-tab.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 [email protected] 2020-08-24 http://www.stephenson.com/
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 [email protected] 2021-04-23 http://www.hobbs.com/
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 [email protected] 2020-03-25 http://www.lawrence.com/
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 [email protected] 2020-06-02 http://www.good-lyons.com/
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 [email protected] 2021-04-17 https://goodwin-ingram.com/
5 changes: 5 additions & 0 deletions tests/data/csv/csv-too-few-columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,2,3,4
a,'b',c,d
a,b,c
a,b,c,d
a,b,c,d
5 changes: 5 additions & 0 deletions tests/data/csv/csv-too-many-columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,2,3,4
a,b,c,d
a,b,c,d,e
a,b,c,d
a,b,c,d
2 changes: 2 additions & 0 deletions tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.itxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [5x4]
Loading

0 comments on commit 00d9405

Please sign in to comment.