-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add support for CSV input with new backend to transform CSV fil…
…es to DoclingDocument (#945) * feat: Implement csv backend and format detection Signed-off-by: Tobias Strebitzer <[email protected]> * test: Implement csv parsing and format tests Signed-off-by: Tobias Strebitzer <[email protected]> * docs: Add example and CSV format documentation Signed-off-by: Tobias Strebitzer <[email protected]> * feat: Add support for various CSV dialects and update documentation Signed-off-by: Tobias Strebitzer <[email protected]> * feat: Add validation for delimiters and tests for inconsistent csv files Signed-off-by: Tobias Strebitzer <[email protected]> --------- Signed-off-by: Tobias Strebitzer <[email protected]>
- Loading branch information
1 parent
7493d5b
commit 00d9405
Showing
42 changed files
with
9,885 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import csv | ||
import logging | ||
import warnings | ||
from io import BytesIO, StringIO | ||
from pathlib import Path | ||
from typing import Set, Union | ||
|
||
from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData | ||
|
||
from docling.backend.abstract_backend import DeclarativeDocumentBackend | ||
from docling.datamodel.base_models import InputFormat | ||
from docling.datamodel.document import InputDocument | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
|
||
class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts CSV input into a ``DoclingDocument``.

    The whole input is decoded as UTF-8 and kept in memory. ``convert()``
    sniffs the CSV dialect from the first line, parses every row with the
    standard ``csv`` module and emits a single table whose first row is
    flagged as the column header.
    """

    # Decoded text of the complete CSV input.
    content: StringIO

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Load and decode the CSV input.

        Args:
            in_doc: The input document this backend belongs to; forwarded to
                the base class.
            path_or_stream: Either an in-memory byte stream or a filesystem
                path pointing at the CSV data.

        Raises:
            RuntimeError: If the input cannot be read or decoded as UTF-8.
        """
        super().__init__(in_doc, path_or_stream)

        # Define both attributes up front so is_valid() and convert() never
        # hit a missing attribute, whatever kind of input was passed in.
        self.valid = False
        self.content = StringIO()

        # Load content
        try:
            # "utf-8-sig" decodes plain UTF-8 unchanged but strips a leading
            # byte-order mark, which would otherwise end up inside the first
            # header cell.
            if isinstance(self.path_or_stream, BytesIO):
                self.content = StringIO(
                    self.path_or_stream.getvalue().decode("utf-8-sig")
                )
                self.valid = True
            elif isinstance(self.path_or_stream, Path):
                self.content = StringIO(self.path_or_stream.read_text("utf-8-sig"))
                self.valid = True
            else:
                _log.warning(
                    "CsvDocumentBackend received unsupported input type %s",
                    type(self.path_or_stream).__name__,
                )
        except Exception as e:
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    def is_valid(self) -> bool:
        """Return True when the CSV content was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """CSV input has no notion of pages."""
        return False

    def unload(self):
        """Close an in-memory input stream and drop the reference to the input."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.CSV}

    def _read_rows(self):
        """Sniff the dialect and parse the loaded content into a list of rows.

        Returns:
            A list with one list of strings per CSV record; empty when the
            input holds no data at all.

        Raises:
            RuntimeError: If the sniffed delimiter is not a supported one.
            csv.Error: If the dialect cannot be determined from the first
                line or the data is malformed (the reader runs in strict mode).
        """
        # An empty or whitespace-only input has nothing to sniff; report it
        # as "no rows" instead of failing inside csv.Sniffer.
        if not self.content.getvalue().strip():
            _log.warning("CSV input is empty; no table will be created")
            return []

        supported_delimiters = ",;\t|:"

        # Detect CSV dialect from the first line. Rewind first so that
        # convert() can be called more than once on the same backend.
        self.content.seek(0)
        head = self.content.readline()
        dialect = csv.Sniffer().sniff(head, supported_delimiters)
        _log.info('Parsing CSV with delimiter: "%s"', dialect.delimiter)
        if dialect.delimiter not in set(supported_delimiters):
            raise RuntimeError(
                f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
            )

        # Parse CSV
        self.content.seek(0)
        rows = list(csv.reader(self.content, dialect=dialect, strict=True))
        _log.info("Detected %s lines", len(rows))

        # Ensure uniform column length; ragged input is tolerated but reported.
        if rows:
            expected_length = len(rows[0])
            if any(len(row) != expected_length for row in rows):
                warnings.warn(
                    f"Inconsistent column lengths detected in CSV data. "
                    f"Expected {expected_length} columns, but found rows with varying lengths. "
                    f"Ensure all rows have the same number of columns."
                )

        return rows

    def convert(self) -> DoclingDocument:
        """
        Parses the CSV data into a structured document model.

        Returns:
            A document containing one table built from the CSV rows, or a
            document without any table when the input is empty.

        Raises:
            RuntimeError: If the backend failed to initialise or the
                delimiter is not supported.
            csv.Error: If the dialect cannot be detected or the data is
                malformed.
        """
        # Fail fast before any parsing work when loading did not succeed.
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        # Parsed rows stay available on the instance after conversion.
        self.csv_data = self._read_rows()

        # Parse the CSV into a structured document model
        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)

        if not self.csv_data:
            return doc

        # Convert each value to a TableCell. Rows shorter than the widest
        # row contribute fewer cells than num_cols.
        cells = [
            TableCell(
                text=str(cell_value),
                row_span=1,  # CSV doesn't support merged cells
                col_span=1,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + 1,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + 1,
                col_header=row_idx == 0,  # First row as header
                row_header=False,
            )
            for row_idx, row in enumerate(self.csv_data)
            for col_idx, cell_value in enumerate(row)
        ]

        table_data = TableData(
            num_rows=len(self.csv_data),
            num_cols=max(len(row) for row in self.csv_data),
            table_cells=cells,
        )
        doc.add_table(data=table_data)

        return doc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Conversion of CSV files\n", | ||
"\n", | ||
"This example shows how to convert CSV files to a structured Docling Document.\n", | ||
"\n", | ||
"* Multiple delimiters are supported: `,` `;` `|` `:` `[tab]`\n", | ||
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Example Code" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 59, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pathlib import Path\n", | ||
"\n", | ||
"from docling.document_converter import DocumentConverter\n", | ||
"\n", | ||
"# Convert CSV to Docling document\n", | ||
"converter = DocumentConverter()\n", | ||
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n", | ||
"output = result.document.export_to_markdown()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"This code generates the following output:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n", | ||
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n", | ||
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | [email protected] | 2020-08-24 | http://www.stephenson.com/ |\n", | ||
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | [email protected] | 2021-04-23 | http://www.hobbs.com/ |\n", | ||
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | [email protected] | 2020-03-25 | http://www.lawrence.com/ |\n", | ||
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | [email protected] | 2020-06-02 | http://www.good-lyons.com/ |\n", | ||
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | [email protected] | 2021-04-17 | https://goodwin-ingram.com/ |" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "docling-TtEIaPrw-py3.12", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
1,2,3,4 | ||
a,b,c,d | ||
a,",",c,d | ||
a,b,c,d | ||
a,b,c,d |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website | ||
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,[email protected],2020-08-24,http://www.stephenson.com/ | ||
2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,[email protected],2021-04-23,http://www.hobbs.com/ | ||
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,[email protected],2020-03-25,http://www.lawrence.com/ | ||
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,[email protected],2020-06-02,http://www.good-lyons.com/ | ||
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,[email protected],2021-04-17,https://goodwin-ingram.com/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
1,2,3 | ||
a,b,c,d | ||
a,b,c,d | ||
a,b,c,d | ||
a,b,c,d |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website | ||
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|[email protected]|2020-08-24|http://www.stephenson.com/ | ||
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|[email protected]|2021-04-23|http://www.hobbs.com/ | ||
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|[email protected]|2020-03-25|http://www.lawrence.com/ | ||
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|[email protected]|2020-06-02|http://www.good-lyons.com/ | ||
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|[email protected]|2021-04-17|https://goodwin-ingram.com/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website | ||
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;[email protected];2020-08-24;http://www.stephenson.com/ | ||
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;[email protected];2021-04-23;http://www.hobbs.com/ | ||
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;[email protected];2020-03-25;http://www.lawrence.com/ | ||
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;[email protected];2020-06-02;http://www.good-lyons.com/ | ||
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;[email protected];2021-04-17;https://goodwin-ingram.com/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website | ||
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 [email protected] 2020-08-24 http://www.stephenson.com/ | ||
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 [email protected] 2021-04-23 http://www.hobbs.com/ | ||
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 [email protected] 2020-03-25 http://www.lawrence.com/ | ||
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 [email protected] 2020-06-02 http://www.good-lyons.com/ | ||
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 [email protected] 2021-04-17 https://goodwin-ingram.com/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
1,2,3,4 | ||
a,'b',c,d | ||
a,b,c | ||
a,b,c,d | ||
a,b,c,d |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
1,2,3,4 | ||
a,b,c,d | ||
a,b,c,d,e | ||
a,b,c,d | ||
a,b,c,d |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
item-0 at level 0: unspecified: group _root_ | ||
item-1 at level 1: table with [5x4] |
Oops, something went wrong.