Example Scripts: Read JSON and CSV data with Polars and databpy #3

kolibril13 opened this issue Dec 20, 2024 · 5 comments

@kolibril13 (Contributor):

For reference, here are two scripts to load data into Blender spreadsheets using polars.
My plan is to incorporate them into https://extensions.blender.org/add-ons/csv-importer/ next month, together with databpy.

Read JSON

import polars as pl
import databpy as db
from io import StringIO
import numpy as np


# Example JSON data
json_file = StringIO(
    """
    {
    "Star": [
        [58.2136, 91.8819, 0.0],
        [58.1961, 92.215, 0.0]
    ],
    "Is_Visible": [[true], [false]],
    "Intensity": [[10], [20]]
    }
"""
)

# here's how you'd load a custom JSON file instead:
# from pathlib import Path
# json_file = Path.cwd() / "data.json"

df = pl.read_json(json_file)
columns_to_explode = [col for col in df.columns if df[col].dtype == pl.List(pl.List)]
df = df.explode(columns_to_explode)

vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="Hello JSON")

for col in df.columns:
    data = np.vstack(df.get_column(col).to_numpy())
    bob.store_named_attribute(data, col)

print(bob.named_attribute("Star"))
print(bob.named_attribute("Is_Visible"))
print(bob.named_attribute("Intensity"))
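
For context on the explode step above: pl.read_json turns the JSON object into a single row of nested list columns, and explode unnests them so each vertex gets its own row. A minimal polars-only sketch with made-up values:

import polars as pl

df = pl.DataFrame({"Star": [[[1.0, 2.0, 0.0], [3.0, 4.0, 0.0]]]})
print(df.explode("Star"))
# The one row holding two inner lists becomes two rows,
# each holding a single [x, y, z] list.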

Read CSV

import polars as pl
import databpy as db
from io import StringIO
import numpy as np

csv_data = StringIO(
    """MyFloat,Is_Visible,Intensity
42.12,true,10
12.33,false,20
"""
)

# here's how you'd load a custom CSV file instead:
# from pathlib import Path
# csv_data = Path.cwd() / "data.csv"


df = pl.read_csv(csv_data)

# Since we no longer have nested arrays as in read_json, there's no need to explode columns
vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="Hello CSV")

# Store each column as an attribute
# Note: .to_numpy() returns a 1D array, so we reshape to 2D if needed.
for col in df.columns:
    data = df[col].to_numpy().reshape(-1, 1)
    bob.store_named_attribute(data, col)

# Print the stored attributes
print(bob.named_attribute("MyFloat"))
print(bob.named_attribute("Is_Visible"))
print(bob.named_attribute("Intensity"))
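
To illustrate the reshape note: a quick numpy-only check, using the two values from the CSV sample:

import numpy as np

a = np.array([42.12, 12.33])  # .to_numpy() gives shape (2,)
a = a.reshape(-1, 1)          # shape (2, 1): one value per vertex
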
@BradyAJohnston (Owner):

Thanks for the reference! If you'd like to contribute any documentation / examples, I would welcome it. If you'd like to implement the polars stuff inside databpy, I would welcome PRs!

@kolibril13 (Author):

If you'd like to contribute any documentation / examples

Sure, I might be able to contribute that early next month.
I've already made some attempts to document data_attributes here:
https://kolibril13.github.io/bpy-gallery/n3data_attributes/#adding-a-float
It still needs some refactoring, but maybe I can make a databpy version of this chapter.

If you'd like to implement the polars stuff inside databpy, I would welcome PRs

That's good to hear! I'll see if I find time for this next month as well.

@kolibril13 (Author):

While I'm at it, here are snippets for the Parquet and XLSX data formats (the latter is Excel's format).

Read Parquet

# First script: write sample data to a Parquet file
import polars as pl

data_polars = pl.DataFrame({
    "MyFloat": [42.12, 12.33],
    "Is_Visible": [True, False],
    "Intensity": [10, 20]
})

data_polars.write_parquet("simple_data.parquet")

# Second script: read the Parquet file back into Blender
import polars as pl
import databpy as db
import numpy as np

df = pl.read_parquet("simple_data.parquet")

vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="Hello Parquet")

for col in df.columns:
    data = df[col].to_numpy().reshape(-1, 1)
    bob.store_named_attribute(data, col)

Read Excel file

# First script: write sample data to an Excel file
import polars as pl

data_polars = pl.DataFrame({
    "MyFloat": [42.12, 12.33],
    "Is_Visible": [True, False],
    "Intensity": [10, 20]
})

# Save the DataFrame to an Excel file
# pip install xlsxwriter
data_polars.write_excel("simple_excel_polars.xlsx")

# Second script: read the Excel file back into Blender
import polars as pl
import databpy as db
import numpy as np

# Read the Excel file using Polars
# pip install fastexcel pyarrow 
df = pl.read_excel("simple_excel_polars.xlsx", sheet_name="Sheet1")

vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="Hello Excel")

for col in df.columns:
    data = df[col].to_numpy().reshape(-1, 1)
    bob.store_named_attribute(data, col)
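
As in the JSON and CSV examples above, the stored attributes can be read back for a quick sanity check:

print(bob.named_attribute("MyFloat"))
print(bob.named_attribute("Is_Visible"))
print(bob.named_attribute("Intensity"))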

@kolibril13 (Author):

Next iteration, which intentionally skips columns with string data:

Read CSV refined

import polars as pl
import databpy as db
import numpy as np
from io import StringIO
import time

start_time = time.perf_counter()
 
csv_file = StringIO(
    """
Is_Visible,Intensity,My Strings
True,10,A
False,20,B
"""
)

df = pl.read_csv(csv_file)
# Create Bob object
vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="CSV_MyBob")

for col in df.columns:
    col_dtype = df[col].dtype
    print(col_dtype)
    
    # Skip columns with string data types:
    # pl.Utf8 covers plain string columns (as read from CSV),
    # pl.List(pl.Utf8) covers list-of-string columns (as read from JSON)
    if col_dtype in [pl.Utf8, pl.List(pl.Utf8)]:
        print(f"Skipping column '{col}' as it contains string data.")
        continue

    data = df[col].to_numpy()
    if df[col].dtype == pl.List: # Handle nested lists if necessary
        data = np.vstack(data)
    bob.store_named_attribute(data, col)

elapsed_time_ms = (time.perf_counter() - start_time) * 1000

print(f" 🐻‍❄️ 📥  Added {bob.name} in {elapsed_time_ms:.2f} ms")

Read JSON refined:

import polars as pl
import databpy as db
import numpy as np
from io import StringIO
import time

# Updated JSON data
json_file = StringIO(
"""
{
"Dino": [
    [55.3846, 97.1795, 0.0],
    [51.5385, 96.0256, 0.0]
],
"Star": [
    [58.2136, 91.8819, 0.0],
    [58.1961, 92.215, 0.0]
],
"Is_Visible": [
    [true],
    [false]
],
"Intensity": [
    [10],
    [20]
],
"My Strings": [
    ["A"],
    ["B"]
]
}
"""
)

start_time = time.perf_counter()
 
df = pl.read_json(json_file)

columns_to_explode = [
    col for col in df.columns if df[col].dtype == pl.List(pl.List)
]
df = df.explode(columns_to_explode)

# Create Bob object
vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="JSON_MyBob")


for col in df.columns:
    col_dtype = df[col].dtype
    print(col_dtype)
    
    # Skip columns with string data types:
    # pl.Utf8 covers plain string columns (as read from CSV),
    # pl.List(pl.Utf8) covers list-of-string columns (as read from JSON)
    if col_dtype in [pl.Utf8, pl.List(pl.Utf8)]:
        print(f"Skipping column '{col}' as it contains string data.")
        continue

    data = df[col].to_numpy()
    if df[col].dtype == pl.List: # Handle nested lists if necessary
        data = np.vstack(data)
        
    bob.store_named_attribute(data, col)

elapsed_time_ms = (time.perf_counter() - start_time) * 1000

print(f" 🐻‍❄️ 📥  Added {bob.name} in {elapsed_time_ms:.2f} ms")

@kolibril13 (Author):

Read polars DataFrame

import polars as pl
import numpy as np
import databpy as db

df = pl.DataFrame({
    "Star": [
        [58.2, 91.8, 0.0],
        [58.1, 92.2, 0.0]
    ],
    "Is_Visible": [True, False],
    "Intensity": [10, 20],
})

vertices = np.zeros((len(df), 3), dtype=np.float32)
bob = db.create_bob(vertices, name="DataWithVector")

for col in df.columns:
    data = np.vstack(df[col].to_numpy())
    bob.store_named_attribute(data, col)
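
As in the earlier scripts, a quick way to verify the result:

print(bob.named_attribute("Star"))
print(bob.named_attribute("Is_Visible"))
print(bob.named_attribute("Intensity"))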
