Skip to content

sopa.segmentation.patching

sopa.segmentation.Patches2D

Compute 2D-patches with overlaps. This can be done on an image or a DataFrame.

Attributes:

Name Type Description
polygons list[Polygon]

List of shapely polygons representing the patches

bboxes ndarray

Array of shape (n_patches, 4) containing the (xmin, ymin, xmax, ymax) coordinates of the patches bounding boxes

ilocs ndarray

Array of shape (n_patches, 2) containing the (x,y) indices of the patches

Source code in sopa/patches/patches.py
class Patches2D:
    """
    Compute 2D-patches with overlaps. This can be done on an image or a DataFrame.

    Attributes:
        polygons (list[Polygon]): List of `shapely` polygons representing the patches
        bboxes (np.ndarray): Array of shape `(n_patches, 4)` containing the (xmin, ymin, xmax, ymax) coordinates of the patches bounding boxes
        ilocs (np.ndarray): Array of shape `(n_patches, 2)` containing the (x,y) indices of the patches
    """

    polygons: list[Polygon]
    bboxes: np.ndarray
    ilocs: np.ndarray

    def __init__(
        self,
        sdata: SpatialData,
        element_name: str,
        patch_width: float | int,
        patch_overlap: float | int = 50,
    ):
        """
        Args:
            sdata: A `SpatialData` object
            element_name: Name of the element on with patches will be made (image or points)
            patch_width: Width of the patches (in the unit of the coordinate system of the element)
            patch_overlap: Overlap width between the patches
        """
        self.sdata = sdata
        self.element_name = element_name
        self.element = sdata[element_name]

        if isinstance(self.element, DataTree):
            self.element = get_spatial_image(sdata, element_name)

        if isinstance(self.element, DataArray):
            xmin, ymin = 0, 0
            xmax, ymax = len(self.element.coords["x"]), len(self.element.coords["y"])
            tight, int_coords = False, True
        elif isinstance(self.element, dd.DataFrame):
            xmin, ymin = self.element.x.min().compute(), self.element.y.min().compute()
            xmax, ymax = self.element.x.max().compute(), self.element.y.max().compute()
            tight, int_coords = True, False
        else:
            raise ValueError(f"Invalid element type: {type(self.element)}")

        self.patch_x = Patches1D(xmin, xmax, patch_width, patch_overlap, tight, int_coords)
        self.patch_y = Patches1D(ymin, ymax, patch_width, patch_overlap, tight, int_coords)

        self.roi = None
        if ROI.KEY in sdata.shapes:
            geo_df = to_intrinsic(sdata, sdata[ROI.KEY], element_name)

            assert all(
                isinstance(geom, Polygon) for geom in geo_df.geometry
            ), f"All sdata['{ROI.KEY}'] geometries must be polygons"

            if len(geo_df) == 1:
                self.roi: Polygon = geo_df.geometry[0]
            else:
                self.roi = MultiPolygon(list(geo_df.geometry))

        self._init_patches()

    def _init_patches(self):
        self.ilocs, self.polygons, self.bboxes = [], [], []

        for i in range(self.patch_x._count * self.patch_y._count):
            self._try_register_patch(i)

        self.ilocs = np.array(self.ilocs)
        self.bboxes = np.array(self.bboxes)

    def _try_register_patch(self, i: int):
        """Check that the patch is valid, and, if valid, register it"""
        iy, ix = divmod(i, self.patch_x._count)
        bounds = self._bbox_iloc(ix, iy)
        patch = box(*bounds)

        if self.roi is not None and not self.roi.intersects(patch):
            return

        patch = patch if self.roi is None else patch.intersection(self.roi)

        if isinstance(patch, GeometryCollection):
            geoms = [geom for geom in patch.geoms if isinstance(geom, Polygon)]
            if not geoms:
                return
            patch = max(geoms, key=lambda polygon: polygon.area)

        if not isinstance(patch, Polygon) and not isinstance(patch, MultiPolygon):
            return

        self.polygons.append(patch)
        self.ilocs.append((ix, iy))
        self.bboxes.append(bounds)

    def __repr__(self):
        return f"{self.__class__.__name__} object with {len(self)} patches on {self.element_name}"

    @property
    def shape(self) -> tuple[int, int]:
        return (self.patch_y._count, self.patch_x._count)

    def _bbox_iloc(self, ix: int, iy: int) -> list[int]:
        """Coordinates of the rectangle bounding box of the patch at the given indices

        Args:
            ix: Patch index in the x-axis
            iy: Patch indes in the y-axis

        Returns:
            A list `[xmin, ymin, xmax, ymax]` representing the bounding box of the patch
        """
        xmin, xmax = self.patch_x[ix]
        ymin, ymax = self.patch_y[iy]
        return [xmin, ymin, xmax, ymax]

    def __len__(self):
        """Number of patches"""
        return len(self.bboxes)

    def write(self, overwrite: bool = True, shapes_key: str | None = None) -> gpd.GeoDataFrame:
        """Save patches in `sdata.shapes["sopa_patches"]` (or by the key specified)

        Args:
            overwrite: Whether to overwrite patches if existing
            shapes_key: Optional name of the shapes to be saved. By default, uses "sopa_patches".

        Returns:
            The saved GeoDataFrame
        """
        shapes_key = SopaKeys.PATCHES if shapes_key is None else shapes_key

        geo_df = gpd.GeoDataFrame(
            {
                "geometry": self.polygons,
                SopaKeys.BOUNDS: self.bboxes.tolist(),
                SopaKeys.PATCHES_ILOCS: self.ilocs.tolist(),
            }
        )
        geo_df = ShapesModel.parse(geo_df, transformations=get_transformation(self.element, get_all=True).copy())

        self.sdata.shapes[shapes_key] = geo_df
        if self.sdata.is_backed():
            self.sdata.write_element(shapes_key, overwrite=overwrite)

        log.info(f"{len(geo_df)} patches were saved in sdata['{shapes_key}']")

        return geo_df

    def patchify_transcripts(
        self,
        temp_dir: str,
        cell_key: str = None,
        unassigned_value: int | str = None,
        use_prior: bool = False,
        config: dict = {},
        config_path: str | None = None,
        config_name: str = SopaFiles.TOML_CONFIG_FILE,
        csv_name: str = SopaFiles.TRANSCRIPTS_FILE,
        min_transcripts_per_patch: int = 4000,
        shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
    ) -> list[int]:
        """Creation of patches for the transcripts.

        !!! info "Prior segmentation usage"
            To save assign a prior segmentation to each transcript, you can either use `cell_key` or `use_prior`:

            - If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use `cell_key` to denote the column of the transcript dataframe containing the cell IDs (and optionaly `unassigned_value`).
            - If you have already run Cellpose with Sopa, use `use_prior` (no need to provide `cell_key` and `unassigned_value`).

        Args:
            temp_dir: Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.
            cell_key: Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.
            unassigned_value: If `cell_key` has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.
            use_prior: Whether to use Cellpose as a prior segmentation. If `True`, make sure you have already run Cellpose with Sopa, and no need to provide `cell_key` and `unassigned_value`. Note that, if you have MERFISH data, the prior has already been run, so just use `cell_key` and `unassigned_value`.
            config: Dictionnary of segmentation parameters
            config_path: Path to the segmentation config file (you can also directly provide the argument via the `config` option)
            config_name: Name of the config file to be saved in each patch subdirectory
            csv_name: Name of the CSV file to be saved in each patch subdirectory
            min_transcripts_per_patch: Minimum number of transcripts for a patch to be considered for segmentation

        Returns:
            A list of patches indices. Each index correspond to the name of a subdirectory inside `temp_dir`
        """
        return TranscriptPatches(self, self.element, config_name, csv_name, min_transcripts_per_patch).write(
            temp_dir, cell_key, unassigned_value, use_prior, config, config_path, shapes_key
        )

    def patchify_centroids(
        self,
        temp_dir: str,
        shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
        csv_name: str = SopaFiles.CENTROIDS_FILE,
        cell_key: str | None = None,
        min_cells_per_patch: int = 1,
    ) -> list[int]:
        assert isinstance(self.element, dd.DataFrame)

        centroids = to_intrinsic(self.sdata, shapes_key, self.element).geometry.centroid
        centroids = gpd.GeoDataFrame(geometry=centroids)
        centroids[cell_key or SopaKeys.DEFAULT_CELL_KEY] = range(1, len(centroids) + 1)
        centroids["x"] = centroids.geometry.x
        centroids["y"] = centroids.geometry.y
        centroids["z"] = 0

        return TranscriptPatches(self, centroids, None, csv_name, min_cells_per_patch).write(
            temp_dir, shapes_key=shapes_key
        )

__init__(sdata, element_name, patch_width, patch_overlap=50)

Parameters:

Name Type Description Default
sdata SpatialData

A SpatialData object

required
element_name str

Name of the element on with patches will be made (image or points)

required
patch_width float | int

Width of the patches (in the unit of the coordinate system of the element)

required
patch_overlap float | int

Overlap width between the patches

50
Source code in sopa/patches/patches.py
def __init__(
    self,
    sdata: SpatialData,
    element_name: str,
    patch_width: float | int,
    patch_overlap: float | int = 50,
):
    """
    Args:
        sdata: A `SpatialData` object
        element_name: Name of the element on with patches will be made (image or points)
        patch_width: Width of the patches (in the unit of the coordinate system of the element)
        patch_overlap: Overlap width between the patches
    """
    self.sdata = sdata
    self.element_name = element_name
    self.element = sdata[element_name]

    if isinstance(self.element, DataTree):
        self.element = get_spatial_image(sdata, element_name)

    if isinstance(self.element, DataArray):
        xmin, ymin = 0, 0
        xmax, ymax = len(self.element.coords["x"]), len(self.element.coords["y"])
        tight, int_coords = False, True
    elif isinstance(self.element, dd.DataFrame):
        xmin, ymin = self.element.x.min().compute(), self.element.y.min().compute()
        xmax, ymax = self.element.x.max().compute(), self.element.y.max().compute()
        tight, int_coords = True, False
    else:
        raise ValueError(f"Invalid element type: {type(self.element)}")

    self.patch_x = Patches1D(xmin, xmax, patch_width, patch_overlap, tight, int_coords)
    self.patch_y = Patches1D(ymin, ymax, patch_width, patch_overlap, tight, int_coords)

    self.roi = None
    if ROI.KEY in sdata.shapes:
        geo_df = to_intrinsic(sdata, sdata[ROI.KEY], element_name)

        assert all(
            isinstance(geom, Polygon) for geom in geo_df.geometry
        ), f"All sdata['{ROI.KEY}'] geometries must be polygons"

        if len(geo_df) == 1:
            self.roi: Polygon = geo_df.geometry[0]
        else:
            self.roi = MultiPolygon(list(geo_df.geometry))

    self._init_patches()

__len__()

Number of patches

Source code in sopa/patches/patches.py
def __len__(self):
    """Number of patches"""
    return len(self.bboxes)

patchify_transcripts(temp_dir, cell_key=None, unassigned_value=None, use_prior=False, config={}, config_path=None, config_name=SopaFiles.TOML_CONFIG_FILE, csv_name=SopaFiles.TRANSCRIPTS_FILE, min_transcripts_per_patch=4000, shapes_key=SopaKeys.CELLPOSE_BOUNDARIES)

Creation of patches for the transcripts.

Prior segmentation usage

To save assign a prior segmentation to each transcript, you can either use cell_key or use_prior:

  • If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use cell_key to denote the column of the transcript dataframe containing the cell IDs (and optionaly unassigned_value).
  • If you have already run Cellpose with Sopa, use use_prior (no need to provide cell_key and unassigned_value).

Parameters:

Name Type Description Default
temp_dir str

Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.

required
cell_key str

Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.

None
unassigned_value int | str

If cell_key has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.

None
use_prior bool

Whether to use Cellpose as a prior segmentation. If True, make sure you have already run Cellpose with Sopa, and no need to provide cell_key and unassigned_value. Note that, if you have MERFISH data, the prior has already been run, so just use cell_key and unassigned_value.

False
config dict

Dictionnary of segmentation parameters

{}
config_path str | None

Path to the segmentation config file (you can also directly provide the argument via the config option)

None
config_name str

Name of the config file to be saved in each patch subdirectory

TOML_CONFIG_FILE
csv_name str

Name of the CSV file to be saved in each patch subdirectory

TRANSCRIPTS_FILE
min_transcripts_per_patch int

Minimum number of transcripts for a patch to be considered for segmentation

4000

Returns:

Type Description
list[int]

A list of patches indices. Each index correspond to the name of a subdirectory inside temp_dir

Source code in sopa/patches/patches.py
def patchify_transcripts(
    self,
    temp_dir: str,
    cell_key: str = None,
    unassigned_value: int | str = None,
    use_prior: bool = False,
    config: dict = {},
    config_path: str | None = None,
    config_name: str = SopaFiles.TOML_CONFIG_FILE,
    csv_name: str = SopaFiles.TRANSCRIPTS_FILE,
    min_transcripts_per_patch: int = 4000,
    shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES,
) -> list[int]:
    """Creation of patches for the transcripts.

    !!! info "Prior segmentation usage"
        To save assign a prior segmentation to each transcript, you can either use `cell_key` or `use_prior`:

        - If a segmentation has already been performed (for example an existing 10X-Genomics segmentation), use `cell_key` to denote the column of the transcript dataframe containing the cell IDs (and optionaly `unassigned_value`).
        - If you have already run Cellpose with Sopa, use `use_prior` (no need to provide `cell_key` and `unassigned_value`).

    Args:
        temp_dir: Temporary directory where each patch will be stored. Note that each patch will have its own subdirectory.
        cell_key: Optional key of the transcript dataframe containing the cell IDs. This is useful if a prior segmentation has been run, assigning each transcript to a cell.
        unassigned_value: If `cell_key` has been provided, this corresponds to the value given in the 'cell ID' column for transcript that are not inside any cell.
        use_prior: Whether to use Cellpose as a prior segmentation. If `True`, make sure you have already run Cellpose with Sopa, and no need to provide `cell_key` and `unassigned_value`. Note that, if you have MERFISH data, the prior has already been run, so just use `cell_key` and `unassigned_value`.
        config: Dictionnary of segmentation parameters
        config_path: Path to the segmentation config file (you can also directly provide the argument via the `config` option)
        config_name: Name of the config file to be saved in each patch subdirectory
        csv_name: Name of the CSV file to be saved in each patch subdirectory
        min_transcripts_per_patch: Minimum number of transcripts for a patch to be considered for segmentation

    Returns:
        A list of patches indices. Each index correspond to the name of a subdirectory inside `temp_dir`
    """
    return TranscriptPatches(self, self.element, config_name, csv_name, min_transcripts_per_patch).write(
        temp_dir, cell_key, unassigned_value, use_prior, config, config_path, shapes_key
    )

write(overwrite=True, shapes_key=None)

Save patches in sdata.shapes["sopa_patches"] (or by the key specified)

Parameters:

Name Type Description Default
overwrite bool

Whether to overwrite patches if existing

True
shapes_key str | None

Optional name of the shapes to be saved. By default, uses "sopa_patches".

None

Returns:

Type Description
GeoDataFrame

The saved GeoDataFrame

Source code in sopa/patches/patches.py
def write(self, overwrite: bool = True, shapes_key: str | None = None) -> gpd.GeoDataFrame:
    """Save patches in `sdata.shapes["sopa_patches"]` (or by the key specified)

    Args:
        overwrite: Whether to overwrite patches if existing
        shapes_key: Optional name of the shapes to be saved. By default, uses "sopa_patches".

    Returns:
        The saved GeoDataFrame
    """
    shapes_key = SopaKeys.PATCHES if shapes_key is None else shapes_key

    geo_df = gpd.GeoDataFrame(
        {
            "geometry": self.polygons,
            SopaKeys.BOUNDS: self.bboxes.tolist(),
            SopaKeys.PATCHES_ILOCS: self.ilocs.tolist(),
        }
    )
    geo_df = ShapesModel.parse(geo_df, transformations=get_transformation(self.element, get_all=True).copy())

    self.sdata.shapes[shapes_key] = geo_df
    if self.sdata.is_backed():
        self.sdata.write_element(shapes_key, overwrite=overwrite)

    log.info(f"{len(geo_df)} patches were saved in sdata['{shapes_key}']")

    return geo_df