Skip to content

sopa.patches

sopa.patches.Patches2D

Compute 2D-patches with overlaps. This can be done on an image or a DataFrame.

Attributes:

Name Type Description
polygons list[Polygon]

List of shapely polygons representing the patches

bboxes ndarray

Array of shape (n_patches, 4) containing the (xmin, ymin, xmax, ymax) coordinates of the patches bounding boxes

ilocs ndarray

Array of shape (n_patches, 2) containing the (x,y) indices of the patches

Source code in sopa/patches/patches.py
class Patches2D:
    """
    Compute 2D-patches with overlaps. This can be done on an image or a DataFrame.

    Attributes:
        polygons (list[Polygon]): List of `shapely` polygons representing the patches
        bboxes (np.ndarray): Array of shape `(n_patches, 4)` containing the (xmin, ymin, xmax, ymax) coordinates of the patches bounding boxes
        ilocs (np.ndarray): Array of shape `(n_patches, 2)` containing the (x,y) indices of the patches
    """

    polygons: list[Polygon]
    bboxes: np.ndarray
    ilocs: np.ndarray

    def __init__(
        self,
        sdata: SpatialData,
        element_name: str,
        patch_width: float | int,
        patch_overlap: float | int = 50,
    ):
        """
        Args:
            sdata: A `SpatialData` object
            element_name: Name of the element on with patches will be made (image or points)
            patch_width: Width of the patches (in the unit of the coordinate system of the element)
            patch_overlap: Overlap width between the patches
        """
        self.sdata = sdata
        self.element_name = element_name
        self.element = sdata[element_name]

        if isinstance(self.element, DataTree):
            self.element = get_spatial_image(sdata, element_name)

        if isinstance(self.element, DataArray):
            xmin, ymin = 0, 0
            xmax, ymax = len(self.element.coords["x"]), len(self.element.coords["y"])
            tight, int_coords = False, True
        elif isinstance(self.element, dd.DataFrame):
            xmin, ymin = self.element.x.min().compute(), self.element.y.min().compute()
            xmax, ymax = self.element.x.max().compute(), self.element.y.max().compute()
            tight, int_coords = True, False
        else:
            raise ValueError(f"Invalid element type: {type(self.element)}")

        self.patch_x = Patches1D(xmin, xmax, patch_width, patch_overlap, tight, int_coords)
        self.patch_y = Patches1D(ymin, ymax, patch_width, patch_overlap, tight, int_coords)

        self.roi = None
        if ROI.KEY in sdata.shapes:
            geo_df = to_intrinsic(sdata, sdata[ROI.KEY], element_name)

            assert all(
                isinstance(geom, Polygon) for geom in geo_df.geometry
            ), f"All sdata['{ROI.KEY}'] geometries must be polygons"

            if len(geo_df) == 1:
                self.roi: Polygon = geo_df.geometry[0]
            else:
                self.roi = MultiPolygon(list(geo_df.geometry))

        self._init_patches()

    def _init_patches(self):
        self.ilocs, self.polygons, self.bboxes = [], [], []

        for i in range(self.patch_x._count * self.patch_y._count):
            self._try_register_patch(i)

        self.ilocs = np.array(self.ilocs)
        self.bboxes = np.array(self.bboxes)

    def _try_register_patch(self, i: int):
        """Check that the patch is valid, and, if valid, register it"""
        iy, ix = divmod(i, self.patch_x._count)
        bounds = self._bbox_iloc(ix, iy)
        patch = box(*bounds)

        if self.roi is not None and not self.roi.intersects(patch):
            return

        patch = patch if self.roi is None else patch.intersection(self.roi)

        if isinstance(patch, GeometryCollection):
            geoms = [geom for geom in patch.geoms if isinstance(geom, Polygon)]
            if not geoms:
                return
            patch = max(geoms, key=lambda polygon: polygon.area)

        if not isinstance(patch, Polygon) and not isinstance(patch, MultiPolygon):
            return

        self.polygons.append(patch)
        self.ilocs.append((ix, iy))
        self.bboxes.append(bounds)

    def __repr__(self):
        return f"{self.__class__.__name__} object with {len(self)} patches on {self.element_name}"

    @property
    def shape(self) -> tuple[int, int]:
        return (self.patch_y._count, self.patch_x._count)

    def _bbox_iloc(self, ix: int, iy: int) -> list[int]:
        """Coordinates of the rectangle bounding box of the patch at the given indices

        Args:
            ix: Patch index in the x-axis
            iy: Patch indes in the y-axis

        Returns:
            A list `[xmin, ymin, xmax, ymax]` representing the bounding box of the patch
        """
        xmin, xmax = self.patch_x[ix]
        ymin, ymax = self.patch_y[iy]
        return [xmin, ymin, xmax, ymax]

    def __len__(self):
        """Number of patches"""
        return len(self.bboxes)

    def write(self, overwrite: bool = True, shapes_key: str | None = None) -> gpd.GeoDataFrame:
        """Save patches in `sdata.shapes["sopa_patches"]` (or by the key specified)

        Args:
            overwrite: Whether to overwrite patches if existing
            shapes_key: Optional name of the shapes to be saved. By default, uses "sopa_patches".

        Returns:
            The saved GeoDataFrame
        """
        shapes_key = SopaKeys.PATCHES if shapes_key is None else shapes_key

        geo_df = gpd.GeoDataFrame(
            {
                "geometry": self.polygons,
                SopaKeys.BOUNDS: self.bboxes.tolist(),
                SopaKeys.PATCHES_ILOCS: self.ilocs.tolist(),
            }
        )
        geo_df = ShapesModel.parse(
            geo_df, transformations=get_transformation(self.element, get_all=True).copy()
        )

        self.sdata.shapes[shapes_key] = geo_df
        if self.sdata.is_backed():
            self.sdata.write_element(shapes_key, overwrite=overwrite)

        log.info(f"{len(geo_df)} patches were saved in sdata['{shapes_key}']")

        return geo_df

    def patchify_transcripts(
        self,
        temp_dir: str,
        cell_key: str = None,
        unassigned_value: int | str = None,
        use_prior: bool = False,
        config: dict = {},
        config_path: str | None = None,
        config_name: str = SopaFiles.TOML_CONFIG_FILE,
        csv_name: str = SopaFiles.TRANSCRIPTS_FILE,
        min_transcripts_per_patch: int = 4000,
        shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
    ) -> list[int]:
        """Creation of patches for the transcripts.

        !!! info "Prior segmentation usage"
            To save assign a prior segmentation to each transcript, you can either use `cell_key` or `use_prior`:

            - If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use `cell_key` to denote the column of the transcript dataframe containing the cell IDs (and optionaly `unassigned_value`).
            - If you have already run Cellpose with Sopa, use `use_prior` (no need to provide `cell_key` and `unassigned_value`).

        Args:
            temp_dir: Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.
            cell_key: Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.
            unassigned_value: If `cell_key` has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.
            use_prior: Whether to use Cellpose as a prior segmentation. If `True`, make sure you have already run Cellpose with Sopa, and no need to provide `cell_key` and `unassigned_value`. Note that, if you have MERFISH data, the prior has already been run, so just use `cell_key` and `unassigned_value`.
            config: Dictionnary of segmentation parameters
            config_path: Path to the segmentation config file (you can also directly provide the argument via the `config` option)
            config_name: Name of the config file to be saved in each patch subdirectory
            csv_name: Name of the CSV file to be saved in each patch subdirectory
            min_transcripts_per_patch: Minimum number of transcripts for a patch to be considered for segmentation

        Returns:
            A list of patches indices. Each index correspond to the name of a subdirectory inside `temp_dir`
        """
        return TranscriptPatches(
            self, self.element, config_name, csv_name, min_transcripts_per_patch
        ).write(temp_dir, cell_key, unassigned_value, use_prior, config, config_path, shapes_key)

    def patchify_centroids(
        self,
        temp_dir: str,
        shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
        csv_name: str = SopaFiles.CENTROIDS_FILE,
        cell_key: str | None = None,
        min_cells_per_patch: int = 1,
    ) -> list[int]:
        assert isinstance(self.element, dd.DataFrame)

        centroids = to_intrinsic(self.sdata, shapes_key, self.element).geometry.centroid
        centroids = gpd.GeoDataFrame(geometry=centroids)
        centroids[cell_key or SopaKeys.DEFAULT_CELL_KEY] = range(1, len(centroids) + 1)
        centroids["x"] = centroids.geometry.x
        centroids["y"] = centroids.geometry.y
        centroids["z"] = 0

        return TranscriptPatches(self, centroids, None, csv_name, min_cells_per_patch).write(
            temp_dir, shapes_key=shapes_key
        )

__init__(sdata, element_name, patch_width, patch_overlap=50)

Parameters:

Name Type Description Default
sdata SpatialData

A SpatialData object

required
element_name str

Name of the element on with patches will be made (image or points)

required
patch_width float | int

Width of the patches (in the unit of the coordinate system of the element)

required
patch_overlap float | int

Overlap width between the patches

50
Source code in sopa/patches/patches.py
def __init__(
    self,
    sdata: SpatialData,
    element_name: str,
    patch_width: float | int,
    patch_overlap: float | int = 50,
):
    """
    Args:
        sdata: A `SpatialData` object
        element_name: Name of the element on with patches will be made (image or points)
        patch_width: Width of the patches (in the unit of the coordinate system of the element)
        patch_overlap: Overlap width between the patches
    """
    self.sdata = sdata
    self.element_name = element_name
    self.element = sdata[element_name]

    if isinstance(self.element, DataTree):
        self.element = get_spatial_image(sdata, element_name)

    if isinstance(self.element, DataArray):
        xmin, ymin = 0, 0
        xmax, ymax = len(self.element.coords["x"]), len(self.element.coords["y"])
        tight, int_coords = False, True
    elif isinstance(self.element, dd.DataFrame):
        xmin, ymin = self.element.x.min().compute(), self.element.y.min().compute()
        xmax, ymax = self.element.x.max().compute(), self.element.y.max().compute()
        tight, int_coords = True, False
    else:
        raise ValueError(f"Invalid element type: {type(self.element)}")

    self.patch_x = Patches1D(xmin, xmax, patch_width, patch_overlap, tight, int_coords)
    self.patch_y = Patches1D(ymin, ymax, patch_width, patch_overlap, tight, int_coords)

    self.roi = None
    if ROI.KEY in sdata.shapes:
        geo_df = to_intrinsic(sdata, sdata[ROI.KEY], element_name)

        assert all(
            isinstance(geom, Polygon) for geom in geo_df.geometry
        ), f"All sdata['{ROI.KEY}'] geometries must be polygons"

        if len(geo_df) == 1:
            self.roi: Polygon = geo_df.geometry[0]
        else:
            self.roi = MultiPolygon(list(geo_df.geometry))

    self._init_patches()

__len__()

Number of patches

Source code in sopa/patches/patches.py
def __len__(self):
    """Number of patches"""
    return len(self.bboxes)

patchify_transcripts(temp_dir, cell_key=None, unassigned_value=None, use_prior=False, config={}, config_path=None, config_name=SopaFiles.TOML_CONFIG_FILE, csv_name=SopaFiles.TRANSCRIPTS_FILE, min_transcripts_per_patch=4000, shapes_key=SopaKeys.CELLPOSE_BOUNDARIES)

Creation of patches for the transcripts.

Prior segmentation usage

To save assign a prior segmentation to each transcript, you can either use cell_key or use_prior:

  • If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use cell_key to denote the column of the transcript dataframe containing the cell IDs (and optionaly unassigned_value).
  • If you have already run Cellpose with Sopa, use use_prior (no need to provide cell_key and unassigned_value).

Parameters:

Name Type Description Default
temp_dir str

Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.

required
cell_key str

Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.

None
unassigned_value int | str

If cell_key has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.

None
use_prior bool

Whether to use Cellpose as a prior segmentation. If True, make sure you have already run Cellpose with Sopa, and no need to provide cell_key and unassigned_value. Note that, if you have MERFISH data, the prior has already been run, so just use cell_key and unassigned_value.

False
config dict

Dictionnary of segmentation parameters

{}
config_path str | None

Path to the segmentation config file (you can also directly provide the argument via the config option)

None
config_name str

Name of the config file to be saved in each patch subdirectory

TOML_CONFIG_FILE
csv_name str

Name of the CSV file to be saved in each patch subdirectory

TRANSCRIPTS_FILE
min_transcripts_per_patch int

Minimum number of transcripts for a patch to be considered for segmentation

4000

Returns:

Type Description
list[int]

A list of patches indices. Each index correspond to the name of a subdirectory inside temp_dir

Source code in sopa/patches/patches.py
def patchify_transcripts(
    self,
    temp_dir: str,
    cell_key: str = None,
    unassigned_value: int | str = None,
    use_prior: bool = False,
    config: dict = {},
    config_path: str | None = None,
    config_name: str = SopaFiles.TOML_CONFIG_FILE,
    csv_name: str = SopaFiles.TRANSCRIPTS_FILE,
    min_transcripts_per_patch: int = 4000,
    shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
) -> list[int]:
    """Creation of patches for the transcripts.

    !!! info "Prior segmentation usage"
        To save assign a prior segmentation to each transcript, you can either use `cell_key` or `use_prior`:

        - If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use `cell_key` to denote the column of the transcript dataframe containing the cell IDs (and optionaly `unassigned_value`).
        - If you have already run Cellpose with Sopa, use `use_prior` (no need to provide `cell_key` and `unassigned_value`).

    Args:
        temp_dir: Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.
        cell_key: Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.
        unassigned_value: If `cell_key` has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.
        use_prior: Whether to use Cellpose as a prior segmentation. If `True`, make sure you have already run Cellpose with Sopa, and no need to provide `cell_key` and `unassigned_value`. Note that, if you have MERFISH data, the prior has already been run, so just use `cell_key` and `unassigned_value`.
        config: Dictionnary of segmentation parameters
        config_path: Path to the segmentation config file (you can also directly provide the argument via the `config` option)
        config_name: Name of the config file to be saved in each patch subdirectory
        csv_name: Name of the CSV file to be saved in each patch subdirectory
        min_transcripts_per_patch: Minimum number of transcripts for a patch to be considered for segmentation

    Returns:
        A list of patches indices. Each index correspond to the name of a subdirectory inside `temp_dir`
    """
    return TranscriptPatches(
        self, self.element, config_name, csv_name, min_transcripts_per_patch
    ).write(temp_dir, cell_key, unassigned_value, use_prior, config, config_path, shapes_key)

write(overwrite=True, shapes_key=None)

Save patches in sdata.shapes["sopa_patches"] (or by the key specified)

Parameters:

Name Type Description Default
overwrite bool

Whether to overwrite patches if existing

True
shapes_key str | None

Optional name of the shapes to be saved. By default, uses "sopa_patches".

None

Returns:

Type Description
GeoDataFrame

The saved GeoDataFrame

Source code in sopa/patches/patches.py
def write(self, overwrite: bool = True, shapes_key: str | None = None) -> gpd.GeoDataFrame:
    """Save patches in `sdata.shapes["sopa_patches"]` (or by the key specified)

    Args:
        overwrite: Whether to overwrite patches if existing
        shapes_key: Optional name of the shapes to be saved. By default, uses "sopa_patches".

    Returns:
        The saved GeoDataFrame
    """
    shapes_key = SopaKeys.PATCHES if shapes_key is None else shapes_key

    geo_df = gpd.GeoDataFrame(
        {
            "geometry": self.polygons,
            SopaKeys.BOUNDS: self.bboxes.tolist(),
            SopaKeys.PATCHES_ILOCS: self.ilocs.tolist(),
        }
    )
    geo_df = ShapesModel.parse(
        geo_df, transformations=get_transformation(self.element, get_all=True).copy()
    )

    self.sdata.shapes[shapes_key] = geo_df
    if self.sdata.is_backed():
        self.sdata.write_element(shapes_key, overwrite=overwrite)

    log.info(f"{len(geo_df)} patches were saved in sdata['{shapes_key}']")

    return geo_df

sopa.patches.infer.infer_wsi_patches(sdata, model, patch_width, patch_overlap=0, level=0, magnification=None, image_key=None, batch_size=32, device=None)

Create an image made of patch based predictions of a WSI image.

Info

The image will be saved into the SpatialData object with the key sopa_{model_name} (see the argument below).

Parameters:

Name Type Description Default
sdata SpatialData

A SpatialData object

required
model Callable | str

Callable that takes as an input a tensor of size (batch_size, channels, x, y) and returns a vector for each tile (batch_size, emb_dim), or a string with the name of one of the available models (Resnet50Features, HistoSSLFeatures, or DINOv2Features).

required
patch_width int

Width (pixels) of the patches.

required
patch_overlap int

Width (pixels) of the overlap between the patches.

0
level int | None

Image level on which the processing is performed. Either level or magnification should be provided.

0
magnification int | None

The target magnification on which the processing is performed. If magnification is provided, the level argument will be automatically computed.

None
image_key str | None

Optional image key of the WSI image, unecessary if there is only one image.

None
batch_size int

Mini-batch size used during inference.

32
device str

Device used for the computer vision model.

None

Returns:

Type Description
DataArray | bool

If the processing was successful, returns the DataArray of shape (C,Y,X) containing the model predictions, else False

Source code in sopa/patches/infer.py
def infer_wsi_patches(
    sdata: SpatialData,
    model: Callable | str,
    patch_width: int,
    patch_overlap: int = 0,
    level: int | None = 0,
    magnification: int | None = None,
    image_key: str | None = None,
    batch_size: int = 32,
    device: str = None,
) -> DataArray | bool:
    """Create an image made of patch based predictions of a WSI image.

    !!! info
        The image will be saved into the `SpatialData` object with the key `sopa_{model_name}` (see the argument below).

    Args:
        sdata: A `SpatialData` object
        model: Callable that takes as an input a tensor of size (batch_size, channels, x, y) and returns a vector for each tile (batch_size, emb_dim), or a string with the name of one of the available models (`Resnet50Features`, `HistoSSLFeatures`, or `DINOv2Features`).
        patch_width: Width (pixels) of the patches.
        patch_overlap: Width (pixels) of the overlap between the patches.
        level: Image level on which the processing is performed. Either `level` or `magnification` should be provided.
        magnification: The target magnification on which the processing is performed. If `magnification` is provided, the `level` argument will be automatically computed.
        image_key: Optional image key of the WSI image, unecessary if there is only one image.
        batch_size: Mini-batch size used during inference.
        device: Device used for the computer vision model.

    Returns:
        If the processing was successful, returns the `DataArray` of shape `(C,Y,X)` containing the model predictions, else `False`
    """
    image_key = get_key(sdata, "images", image_key)
    image = sdata.images[image_key]

    infer = Inference(image, model, patch_width, level, magnification, device)
    patches = Patches2D(
        sdata, image_key, infer.patch_width_scale0, infer.downsample * patch_overlap
    )

    log.info(f"Processing {len(patches)} patches extracted from level {infer.level}")

    predictions = []
    for i in tqdm.tqdm(range(0, len(patches), batch_size)):
        prediction = infer.infer_bboxes(patches.bboxes[i : i + batch_size])
        predictions.extend(prediction)
    predictions = torch.stack(predictions)

    if len(predictions.shape) == 1:
        predictions = torch.unsqueeze(predictions, 1)

    output_image = np.zeros((predictions.shape[1], *patches.shape), dtype=np.float32)
    for (loc_x, loc_y), pred in zip(patches.ilocs, predictions):
        output_image[:, loc_y, loc_x] = pred

    patch_step = infer.patch_width_scale0 - infer.downsample * patch_overlap
    output_image = DataArray(output_image, dims=("c", "y", "x"))
    output_image = Image2DModel.parse(
        output_image,
        transformations={infer.cs: Scale([patch_step, patch_step], axes=("x", "y"))},
    )

    output_key = f"sopa_{infer.model_str}"
    sdata.images[output_key] = output_image

    log.info(f"Patch predictions saved as an image in sdata['{output_key}']")

    patches.write(shapes_key=SopaKeys.PATCHES_INFERENCE_KEY)

    return sdata[output_key]

sopa.patches.cluster.cluster_embeddings(sdata, element, method='leiden', key_added='cluster', **method_kwargs)

Cluster the patches embeddings using a clustering method

Parameters:

Name Type Description Default
sdata SpatialData

A SpatialData object

required
element DataArray | str

The DataArray containing the embeddings, or the name of the element

required
method Callable | str

Callable that takes as an input an array of size (n_patches x embedding_size) and returns an array of clusters of size n_patches, or an available method name (leiden)

'leiden'
key_added str

The key containing the clusters to be added to the patches GeoDataFrame

'cluster'
method_kwargs str

kwargs provided to the method callable

{}

Returns:

Type Description
GeoDataFrame

The patches GeoDataFrame with a new column key_added containing the patches clusters

Source code in sopa/patches/cluster.py
def cluster_embeddings(
    sdata: SpatialData,
    element: DataArray | str,
    method: Callable | str = "leiden",
    key_added: str = "cluster",
    **method_kwargs: str,
) -> gpd.GeoDataFrame:
    """Cluster the patches embeddings using a clustering method

    Args:
        sdata: A `SpatialData` object
        element: The `DataArray` containing the embeddings, or the name of the element
        method: Callable that takes as an input an array of size `(n_patches x embedding_size)` and returns an array of clusters of size `n_patches`, or an available method name (`leiden`)
        key_added: The key containing the clusters to be added to the patches `GeoDataFrame`
        method_kwargs: kwargs provided to the method callable

    Returns:
        The patches `GeoDataFrame` with a new column `key_added` containing the patches clusters
    """
    if isinstance(element, str):
        element = sdata.images[element]

    if isinstance(method, str):
        assert (
            method in METHODS_DICT
        ), f"Method {method} is not available. Use one of: {', '.join(METHODS_DICT.keys())}"
        method = METHODS_DICT[method]

    gdf_patches = sdata[SopaKeys.PATCHES_INFERENCE_KEY]

    ilocs = np.array(list(gdf_patches.ilocs))
    embeddings = element.compute().data[:, ilocs[:, 1], ilocs[:, 0]].T

    gdf_patches[key_added] = method(embeddings, **method_kwargs)
    gdf_patches[key_added] = gdf_patches[key_added].astype("category")

    return gdf_patches