mahmoodlab · pauldoucet · Nov 23, 2024 · Nov 6, 2024 · Nov 7, 2024 · Nov 16, 2024
diff --git a/docs/source/api.md b/docs/source/api.md
@@ -42,6 +42,18 @@
     HESTData
 ```
 
+## Pooling of transcripts, binning
+
+```{eval-rst}
+.. currentmodule:: hest.readers
+
+.. autosummary::
+    :toctree: generated
+
+    pool_transcripts_xenium
+    pool_bins_visiumhd
+```
+
 ## Batch effect visualization/correction
 
 ```{eval-rst}

diff --git a/src/hest/HESTData.py b/src/hest/HESTData.py
@@ -681,6 +681,10 @@ def read_hest_wsi(wsi: WSI, width, height):
 
         return SpatialData(tables=new_table, images=images, shapes=shapes)
 
+    def ensembleID_to_gene(self):
+        ensembleID_to_gene(self)
+
+
 class VisiumHESTData(HESTData): 
     def __init__(self, 
         adata: sc.AnnData, # type: ignore
@@ -1239,11 +1243,44 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa
 
     if drop:
         adata = adata[:, ~remaining]
-    
+
     # TODO return dict map of renamed, and remaining
-
     return adata
 
+def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData:
+    """
+    Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID
+
+    Args: 
+        st (HESTData): HESTData object
+        filter_na (bool): whenever to filter genes that are not valid ensemble IDs. Defaults to False.
+
+    Returns: 
+        HESTData: HESTData object with gene names instead of ensemble gene IDs
+    """
+
+    import scanpy as sc
+    species = st.meta['species']
+    org = "hsapiens" if species == "Homo sapiens" else "mmusculus"
+
+    annotations = sc.queries.biomart_annotations(org=org,attrs=['ensembl_gene_id', 'external_gene_name'], use_cache=True)
+    ensembl_to_gene_name = dict(zip(annotations['ensembl_gene_id'], annotations['external_gene_name']))
+
+
+    st.adata.var['gene_name'] = st.adata.var_names.map(ensembl_to_gene_name, na_action=None)
+
+    if filter_na: 
+        st.adata.var_names = st.adata.var['gene_name'].fillna('')
+    else: 
+        st.adata.var['gene_name'] = st.adata.var['gene_name'].where(st.adata.var['gene_name'].notna(), st.adata.var_names)
+
+    valid_genes = st.adata.var['gene_name'].notna()
+    st.adata = st.adata[:, valid_genes]
+
+
+    return st
+
+
 def save_spatial_plot(adata: sc.AnnData, save_path: str, name: str='', key='total_counts', pl_kwargs={}):
     """Save the spatial plot from that sc.AnnData
 
@@ -1255,7 +1292,7 @@ def save_spatial_plot(adata: sc.AnnData, save_path: str, name: str='', key='tota
     """
     import scanpy as sc
 
-    fig = sc.pl.spatial(adata, show=None, img_key="downscaled_fullres", color=[key], title=f"in_tissue spots", return_fig=True, **pl_kwargs)
+    fig = sc.pl.spatial(adata, show=False, img_key="downscaled_fullres", color=[key], title=f"in_tissue spots", return_fig=True, **pl_kwargs)
 
     filename = f"{name}spatial_plots.png"
 

diff --git a/src/hest/__init__.py b/src/hest/__init__.py
@@ -3,7 +3,7 @@
 from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking
 from .autoalign import autoalign_visium
 from .readers import *
-from .HESTData import HESTData, read_HESTData, load_hest, iter_hest
+from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene
 from .segmentation.cell_segmenters import segment_cellvit
 
 __all__ = [
@@ -20,5 +20,6 @@
     'autoalign_visium',
     'write_10X_h5',
     'HESTData',
-    'segment_cellvit'
+    'segment_cellvit', 
+    'ensembleID_to_gene'
 ]
diff --git a/src/hest/pipeline.py b/src/hest/pipeline.py
@@ -308,12 +308,12 @@ def preprocess_cells_visium_hd(
     nuclei_path = None
 ) -> Tuple[sc.AnnData, gpd.GeoDataFrame, gpd.GeoDataFrame]:
 
-    segment_kwargs['save_dir'] = full_exp_dir
+
 
     if nuclei_path is None:
         segmenter = cell_segmenter_factory(segment_method)
         logger.info('Segmenting cells...')
-        path_geojson = segmenter.segment_cells(he_wsi, '', pixel_size, **segment_kwargs)
+        path_geojson = segmenter.segment_cells(he_wsi, 'seg', pixel_size, save_dir=full_exp_dir, **segment_kwargs)
         nuc_gdf = GeojsonCellReader().read_gdf(path_geojson)  
     else:
         nuc_gdf = read_gdf(nuclei_path)
@@ -342,7 +342,8 @@ def process_meta_df(
     segment_tissue=True,
     registration_kwargs={},
     read_kwargs={},
-    segment_kwargs={}
+    segment_kwargs={},
+    preprocess_kwargs={}
 ):
     """Internal use method, process all the raw ST data in the meta_df"""
     for _, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
@@ -404,21 +405,22 @@ def process_meta_df(
                     write_geojson(warped_cells, os.path.join(path, 'processed', f'he_cell_seg.geojson'), '', chunk=True)
                     write_geojson(warped_nuclei, os.path.join(path, 'processed', f'he_nucleus_seg.geojson'), '', chunk=True)
                 elif isinstance(st, VisiumHDHESTData):
-                    segment_config = {'method': 'cellvit'}
+                    segment_config = {}
                     binning_config = {}
 
                     bc_matrix_path = find_first_file_endswith(os.path.join(path, 'binned_outputs', 'square_002um'), 'filtered_feature_bc_matrix.h5')
                     bin_positions_path = find_first_file_endswith(os.path.join(path, 'binned_outputs', 'square_002um', 'spatial'), 'tissue_positions.parquet')
 
+                    del st.wsi
                     preprocess_cells_visium_hd(
                         os.path.join(path, 'processed', ALIGNED_HE_FILENAME),
                         full_exp_dir,
-                        row['id'],
                         st.pixel_size,
                         bc_matrix_path,
                         bin_positions_path,
                         segment_config,
                         binning_config,
+                        **preprocess_kwargs
                     )
 
             if isinstance(st, XeniumHESTData):