Readers

The readers below read your raw spatial data into a SpatialData object. Choose the function matching your technology of interest.

Warning

Due to frequent updates in the data formats provided by the different companies, you might have issues loading your data. In that case, consider opening an issue detailing the machine and software version you used, the error log, and an example of the file names you are trying to read.

sopa.io.xenium(path, image_models_kwargs=None, imread_kwargs=None, cells_boundaries=False, cells_table=False, **kwargs)

Read Xenium data as a SpatialData object. For more information, refer to spatialdata-io.

This function reads the following files:
  • transcripts.parquet: transcripts locations and names
  • experiment.xenium: metadata file
  • morphology_focus.ome.tif: morphology image (or a directory, for recent Xenium versions)

Parameters:

  • path (str | Path, required): Path to the Xenium directory containing all the experiment files
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.
  • imread_kwargs (dict | None, default None): Keyword arguments passed to dask_image.imread.imread.
  • cells_boundaries (bool, default False): Whether to also read the cell boundaries (polygons).
  • cells_table (bool, default False): Whether to also read the cell-by-gene table.

Returns:

  • SpatialData: A SpatialData object representing the Xenium experiment

Source code in sopa/io/reader/xenium.py
def xenium(
    path: str | Path,
    image_models_kwargs: dict | None = None,
    imread_kwargs: dict | None = None,
    cells_boundaries: bool = False,
    cells_table: bool = False,
    **kwargs,
) -> SpatialData:
    """Read Xenium data as a `SpatialData` object. For more information, refer to [spatialdata-io](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.xenium.html).

    This function reads the following files:
        - `transcripts.parquet`: transcripts locations and names
        - `experiment.xenium`: metadata file
        - `morphology_focus.ome.tif`: morphology image (or a directory, for recent Xenium versions)


    Args:
        path: Path to the Xenium directory containing all the experiment files
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
        imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.
        cells_boundaries: Whether to also read the cell boundaries (polygons).
        cells_table: Whether to also read the cell-by-gene table.

    Returns:
        A `SpatialData` object representing the Xenium experiment
    """
    from spatialdata_io.readers.xenium import xenium as xenium_spatialdata_io

    image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)

    sdata: SpatialData = xenium_spatialdata_io(
        path,
        cells_table=cells_table,
        nucleus_labels=False,
        cells_labels=False,
        cells_as_circles=False,
        nucleus_boundaries=False,
        cells_boundaries=cells_boundaries,
        image_models_kwargs=image_models_kwargs,
        imread_kwargs=imread_kwargs,
        **kwargs,
    )

    if "table" in sdata.tables:
        sdata["table"].uns[ATTRS_KEY]["region"] = "cell_boundaries"
        sdata["table"].obs["region"] = "cell_boundaries"
        sdata["table"].obs["region"] = sdata["table"].obs["region"].astype("category")

    ensure_string_channel_names(sdata)

    ### Add Sopa attributes to detect the spatial elements
    if "morphology_focus" in sdata.images:
        sdata.attrs[SopaAttrs.CELL_SEGMENTATION] = "morphology_focus"

    if "he_image" in sdata.images:
        sdata.attrs[SopaAttrs.TISSUE_SEGMENTATION] = "he_image"

    if "transcripts" in sdata.points:
        sdata.attrs[SopaAttrs.TRANSCRIPTS] = "transcripts"

    return sdata
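
A minimal usage sketch (the path below is a placeholder for your own Xenium output directory):

```python
import sopa

# Placeholder path: point this to the directory containing transcripts.parquet, etc.
sdata = sopa.io.xenium("/path/to/xenium_directory")
print(sdata)  # inspect the loaded images, points, and attrs
```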

sopa.io.merscope(path, backend=None, z_layers=3, region_name=None, slide_name=None, image_models_kwargs=None, imread_kwargs=None, **kwargs)

Read MERSCOPE data as a SpatialData object. For more information, refer to spatialdata-io.

This function reads the following files:
  • detected_transcripts.csv: transcripts locations and names
  • all the images under the images directory
  • images/micron_to_mosaic_pixel_transform.csv: affine transformation

Parameters:

  • path (str | Path, required): Path to the MERSCOPE directory containing all the experiment files
  • backend (Literal['dask_image', 'rioxarray'] | None, default None): Either "dask_image" or "rioxarray" (the latter uses less RAM, but requires rioxarray to be installed). By default, uses "rioxarray" if and only if the rioxarray library is installed.
  • z_layers (int | list[int] | None, default 3): Indices of the z-layers to consider. Either one int index, or a list of int indices. If None, then no image is loaded. By default, only the middle layer is considered (that is, layer 3).
  • region_name (str | None, default None): Name of the region of interest, e.g., 'region_0'. If None then the name of the path directory is used.
  • slide_name (str | None, default None): Name of the slide/run. If None then the name of the parent directory of path is used (whose name starts with a date).
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.
  • imread_kwargs (dict | None, default None): Keyword arguments passed to dask_image.imread.imread.

Returns:

  • SpatialData: A SpatialData object representing the MERSCOPE experiment

Source code in sopa/io/reader/merscope.py
def merscope(
    path: str | Path,
    backend: Literal["dask_image", "rioxarray"] | None = None,
    z_layers: int | list[int] | None = 3,
    region_name: str | None = None,
    slide_name: str | None = None,
    image_models_kwargs: dict | None = None,
    imread_kwargs: dict | None = None,
    **kwargs,
) -> SpatialData:
    """Read MERSCOPE data as a `SpatialData` object. For more information, refer to [spatialdata-io](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html).

    This function reads the following files:
        - `detected_transcripts.csv`: transcripts locations and names
        - all the images under the `images` directory
        - `images/micron_to_mosaic_pixel_transform.csv`: affine transformation

    Args:
        path: Path to the MERSCOPE directory containing all the experiment files
        backend: Either `"dask_image"` or `"rioxarray"` (the latter uses less RAM, but requires `rioxarray` to be installed). By default, uses `"rioxarray"` if and only if the `rioxarray` library is installed.
        z_layers: Indices of the z-layers to consider. Either one `int` index, or a list of `int` indices. If `None`, then no image is loaded. By default, only the middle layer is considered (that is, layer 3).
        region_name: Name of the region of interest, e.g., `'region_0'`. If `None` then the name of the `path` directory is used.
        slide_name: Name of the slide/run. If `None` then the name of the parent directory of `path` is used (whose name starts with a date).
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
        imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.

    Returns:
        A `SpatialData` object representing the MERSCOPE experiment
    """
    from spatialdata_io.readers.merscope import merscope as merscope_spatialdata_io

    image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)

    sdata = merscope_spatialdata_io(
        path,
        backend=backend,
        z_layers=z_layers,
        region_name=region_name,
        slide_name=slide_name,
        image_models_kwargs=image_models_kwargs,
        imread_kwargs=imread_kwargs,
        cells_boundaries=False,
        cells_table=False,
        **kwargs,
    )

    ### Add Sopa attributes to detect the spatial elements
    if z_layers is not None:
        if not isinstance(z_layers, int) and len(z_layers) == 1:
            z_layers = z_layers[0]
        if isinstance(z_layers, int):
            for key in sdata.images.keys():
                if key.endswith(f"_z{z_layers}"):
                    sdata.attrs[SopaAttrs.CELL_SEGMENTATION] = key
        else:
            log.warning(
                f"Multiple z-layers provided: {z_layers}. Not deciding which image should be used for cell segmentation."
            )

    for key in sdata.points.keys():
        if key.endswith("_transcripts"):
            sdata.attrs[SopaAttrs.TRANSCRIPTS] = key

    sdata.attrs[SopaAttrs.GENE_EXCLUDE_PATTERN] = "blank"

    return sdata
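
A minimal usage sketch (the path is a placeholder; z_layers=3 is the default middle layer):

```python
import sopa

# Placeholder path: a MERSCOPE region directory, e.g. .../20240101_run/region_0
sdata = sopa.io.merscope("/path/to/region_0", z_layers=3)
```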

sopa.io.cosmx(path, dataset_id=None, fov=None, read_proteins=False, image_models_kwargs=None, imread_kwargs=None)

Read CosMx NanoString data. The fields of view are stitched together, unless fov is provided.

This function reads the following files:
  • *_fov_positions_file.csv or *_fov_positions_file.csv.gz: FOV locations
  • Morphology2D directory: all the FOVs morphology images
  • Morphology_ChannelID_Dictionary.txt: Morphology channels names
  • *_tx_file.csv.gz or *_tx_file.csv: Transcripts location and names
  • If read_proteins is True, all the images under the nested ProteinImages directories will be read

These files must be exported as flat files from AtoMx. That is: within a study, click on "Export" and then select files from the "Flat CSV Files" section (transcripts flat and FOV position flat).

Parameters:

  • path (str | Path, required): Path to the root directory containing NanoString files.
  • dataset_id (Optional[str], default None): Optional name of the dataset (must be provided if it cannot be inferred).
  • fov (int | str | None, default None): Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs.
  • read_proteins (bool, default False): Whether to read the proteins or the transcripts.
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.
  • imread_kwargs (dict | None, default None): Keyword arguments passed to dask_image.imread.imread.

Returns:

  • SpatialData: A SpatialData object representing the CosMx experiment

Source code in sopa/io/reader/cosmx.py
def cosmx(
    path: str | Path,
    dataset_id: Optional[str] = None,
    fov: int | str | None = None,
    read_proteins: bool = False,
    image_models_kwargs: dict | None = None,
    imread_kwargs: dict | None = None,
) -> SpatialData:
    """
    Read *CosMx NanoString* data. The fields of view are stitched together, unless `fov` is provided.

    This function reads the following files:
        - `*_fov_positions_file.csv` or `*_fov_positions_file.csv.gz`: FOV locations
        - `Morphology2D` directory: all the FOVs morphology images
        - `Morphology_ChannelID_Dictionary.txt`: Morphology channels names
        - `*_tx_file.csv.gz` or `*_tx_file.csv`: Transcripts location and names
        - If `read_proteins` is `True`, all the images under the nested `ProteinImages` directories will be read

        These files must be exported as flat files from AtoMx. That is: within a study, click on "Export" and then select files from the "Flat CSV Files" section (transcripts flat and FOV position flat).

    Args:
        path: Path to the root directory containing *Nanostring* files.
        dataset_id: Optional name of the dataset (must be provided if it cannot be inferred).
        fov: Name or number of one single field of view to be read. If a string is provided, an example of correct syntax is "F008". By default, reads all FOVs.
        read_proteins: Whether to read the proteins or the transcripts.
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
        imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.

    Returns:
        A `SpatialData` object representing the CosMx experiment
    """
    path = Path(path)
    image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)

    dataset_id = _infer_dataset_id(path, dataset_id)
    fov_locs = _read_fov_locs(path, dataset_id)
    fov_id, fov = _check_fov_id(fov)

    protein_dir_dict = {}
    if read_proteins:
        protein_dir_dict = {
            int(protein_dir.parent.name[3:]): protein_dir for protein_dir in list(path.rglob("**/FOV*/ProteinImages"))
        }
        assert len(protein_dir_dict), f"No directory called 'ProteinImages' was found under {path}"

    ### Read image(s)
    images_dir = _find_dir(path, "Morphology2D")
    morphology_coords = _cosmx_morphology_coords(path)

    if fov is None:
        image, c_coords = _read_stitched_image(
            images_dir,
            fov_locs,
            protein_dir_dict,
            morphology_coords,
            **imread_kwargs,
        )
        image_name = "stitched_image"
    else:
        pattern = f"*{fov_id}.TIF"
        fov_files = list(images_dir.rglob(pattern))

        assert len(fov_files), f"No file matches the pattern {pattern} inside {images_dir}"
        assert len(fov_files) == 1, f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}"

        image, c_coords = _read_fov_image(fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs)
        image_name = f"{fov}_image"

    parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs)

    if read_proteins:
        return SpatialData(images={image_name: parsed_image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})

    ### Read transcripts
    transcripts_data = _read_transcripts_csv(path, dataset_id)

    if fov is None:
        transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min()
        transcripts_data["y"] = transcripts_data["y_global_px"] - fov_locs["ymin"].min()
        coordinates = None
        points_name = "points"
    else:
        transcripts_data = transcripts_data[transcripts_data["fov"] == fov]
        coordinates = {"x": "x_local_px", "y": "y_local_px"}
        points_name = f"{fov}_points"

    from spatialdata_io._constants._constants import CosmxKeys

    transcripts = PointsModel.parse(
        transcripts_data,
        coordinates=coordinates,
        feature_key=CosmxKeys.TARGET_OF_TRANSCRIPT,
    )

    return SpatialData(
        images={image_name: parsed_image},
        points={points_name: transcripts},
        attrs={SopaAttrs.CELL_SEGMENTATION: image_name, SopaAttrs.TRANSCRIPTS: points_name},
    )
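
A minimal usage sketch (placeholder path; leave fov=None, the default, to stitch all fields of view):

```python
import sopa

# Placeholder path: the AtoMx flat-file export directory
sdata = sopa.io.cosmx("/path/to/cosmx_export", fov="F008")  # read a single FOV
```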

sopa.io.macsima(path, **kwargs)

Read MACSIMA data as a SpatialData object

Notes

For duplicated channel names, an index is appended in parentheses; for instance, you may find DAPI (1).

Parameters:

  • path (Path, required): Path to the directory containing the MACSIMA .tif images
  • kwargs (default {}): Keyword arguments for the _general_tif_directory_reader

Returns:

  • SpatialData: A SpatialData object with a 2D-image of shape (C, Y, X)

Source code in sopa/io/reader/macsima.py
def macsima(path: Path, **kwargs) -> SpatialData:
    """Read MACSIMA data as a `SpatialData` object

    Notes:
        For duplicated channel names, an index is appended in parentheses; for instance, you may find `DAPI (1)`.

    Args:
        path: Path to the directory containing the MACSIMA `.tif` images
        kwargs: Kwargs for the `_general_tif_directory_reader`

    Returns:
        A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
    """
    files = list(Path(path).glob("*.tif"))

    if any("A-" in file.name for file in files):  # non-ome.tif format
        return _general_tif_directory_reader(path, files_to_channels=_get_channel_names_macsima, **kwargs)

    return _general_tif_directory_reader(path, **kwargs)
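
A minimal usage sketch (placeholder path):

```python
import sopa

# Placeholder path: a directory of MACSIMA .tif images
sdata = sopa.io.macsima("/path/to/macsima_directory")
```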

sopa.io.phenocycler(path, channels_renaming=None, image_models_kwargs=None)

Read Phenocycler data as a SpatialData object

Parameters:

  • path (str | Path, required): Path to a .qptiff file, or a .tif file (if exported from QuPath)
  • channels_renaming (dict | None, default None): A dictionary mapping channel names to their new names. Not all channels need to be renamed.
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.

Returns:

  • SpatialData: A SpatialData object with a 2D-image of shape (C, Y, X)

Source code in sopa/io/reader/phenocycler.py
def phenocycler(
    path: str | Path, channels_renaming: dict | None = None, image_models_kwargs: dict | None = None
) -> SpatialData:
    """Read Phenocycler data as a `SpatialData` object

    Args:
        path: Path to a `.qptiff` file, or a `.tif` file (if exported from QuPath)
        channels_renaming: A dictionary mapping channel names to their new names. Not all channels need to be renamed.
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.

    Returns:
        A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
    """
    image_models_kwargs, _ = _default_image_kwargs(image_models_kwargs)

    path = Path(path)
    image_name = path.absolute().stem

    if path.suffix == ".qptiff":
        with tf.TiffFile(path) as tif:
            series = tif.series[0]
            names = _deduplicate_names([_get_channel_name_qptiff(page.description) for page in series])

            delayed_image = delayed(lambda series: series.asarray())(tif)
            image = da.from_delayed(delayed_image, dtype=series.dtype, shape=series.shape)
    elif path.suffix == ".tif":
        image = imread(path)
        names = _get_IJ_channel_names(path)
    else:
        raise ValueError(f"Unsupported file extension {path.suffix}. Must be '.qptiff' or '.tif'.")

    names = _rename_channels(names, channels_renaming)
    image = image.rechunk(chunks=image_models_kwargs["chunks"])

    image = Image2DModel.parse(
        image,
        dims=("c", "y", "x"),
        transformations={"pixels": Identity()},
        c_coords=names,
        **image_models_kwargs,
    )

    return SpatialData(images={image_name: image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})
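
A minimal usage sketch (placeholder path; the renaming dictionary below is only an example):

```python
import sopa

sdata = sopa.io.phenocycler(
    "/path/to/image.qptiff",
    channels_renaming={"DAPI-02": "DAPI"},  # example renaming, adjust to your channels
)
```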

sopa.io.hyperion(path, image_models_kwargs=None, imread_kwargs=None)

Read Hyperion data as a SpatialData object

Parameters:

  • path (Path, required): Path to the directory containing the Hyperion .tiff images
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.
  • imread_kwargs (dict | None, default None): Keyword arguments passed to dask_image.imread.imread.

Returns:

  • SpatialData: A SpatialData object with a 2D-image of shape (C, Y, X)

Source code in sopa/io/reader/hyperion.py
def hyperion(path: Path, image_models_kwargs: dict | None = None, imread_kwargs: dict | None = None) -> SpatialData:
    """Read Hyperion data as a `SpatialData` object

    Args:
        path: Path to the directory containing the Hyperion `.tiff` images
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
        imread_kwargs: Keyword arguments passed to `dask_image.imread.imread`.

    Returns:
        A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
    """
    image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)

    files = [file for file in Path(path).iterdir() if file.suffix == ".tiff"]

    names = _get_channel_names_hyperion(files)
    image = da.concatenate(
        [imread(file, **imread_kwargs) for file in files],
        axis=0,
    )

    image = image.rechunk(chunks=image_models_kwargs["chunks"])

    log.info(f"Found channel names {names}")

    image_name = Path(path).absolute().stem

    image = DataArray(image, dims=["c", "y", "x"], name=image_name, coords={"c": names})
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        image = _clip_intensity_values(image)

    image = Image2DModel.parse(
        image,
        transformations={"pixels": Identity()},
        c_coords=image.coords["c"].values,
        **image_models_kwargs,
    )

    return SpatialData(images={image_name: image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})
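
A minimal usage sketch (placeholder path):

```python
import sopa

# Placeholder path: a directory of Hyperion .tiff images (one per channel)
sdata = sopa.io.hyperion("/path/to/hyperion_directory")
```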

sopa.io.aicsimageio(path, z_stack=0, image_models_kwargs=None, aics_kwargs=None)

Read an image using AICSImageIO. It supports special formats such as ND2, CZI, LIF, or DV.

Extra dependencies

To use this reader, you'll need the aicsimageio dependency (pip install aicsimageio). To read .czi images, you'll also need to install aicspylibczi (for instance pip install aicspylibczi).

Parameters:

  • path (Path, required): Path to the image file
  • z_stack (int, default 0): (Only for 3D images) Index of the stack in the z-axis to use.
  • image_models_kwargs (dict | None, default None): Keyword arguments passed to spatialdata.models.Image2DModel.
  • aics_kwargs (dict | None, default None): Keyword arguments passed to aicsimageio.AICSImage.

Returns:

  • SpatialData: A SpatialData object with a 2D-image of shape (C, Y, X)

Source code in sopa/io/reader/aics.py
def aicsimageio(
    path: Path,
    z_stack: int = 0,
    image_models_kwargs: dict | None = None,
    aics_kwargs: dict | None = None,
) -> SpatialData:
    """Read an image using [AICSImageIO](https://github.com/AllenCellModeling/aicsimageio). It supports special formats such as `ND2`, `CZI`, `LIF`, or `DV`.

    !!! note "Extra dependencies"
        To use this reader, you'll need the `aicsimageio` dependency (`pip install aicsimageio`). To read `.czi` images, you'll also need to install `aicspylibczi` (for instance `pip install aicspylibczi`).

    Args:
        path: Path to the image file
        z_stack: (Only for 3D images) Index of the stack in the z-axis to use.
        image_models_kwargs: Keyword arguments passed to `spatialdata.models.Image2DModel`.
        aics_kwargs: Keyword arguments passed to `aicsimageio.AICSImage`.

    Returns:
        A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
    """
    image_models_kwargs, _ = _default_image_kwargs(image_models_kwargs, None)
    aics_kwargs = {} if aics_kwargs is None else aics_kwargs

    try:
        from aicsimageio import AICSImage
    except ImportError:
        raise ImportError("You need to install aicsimageio, e.g. by running `pip install aicsimageio`")

    xarr: xr.DataArray = AICSImage(path, **aics_kwargs).xarray_dask_data

    assert len(xarr.coords["T"]) == 1, f"Only one time dimension is supported, found {len(xarr.coords['T'])}."

    if len(xarr.coords["Z"]) > 1:
        log.info(f"3D image found, only reading {z_stack:=}")

    xarr = xarr.isel(T=0, Z=z_stack).rename({"C": "c", "Y": "y", "X": "x"})
    xarr = _image_int_dtype(xarr)

    image = Image2DModel.parse(xarr, c_coords=xarr.coords["c"].values, **image_models_kwargs)

    return SpatialData(images={"image": image}, attrs={SopaAttrs.CELL_SEGMENTATION: "image"})
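
A minimal usage sketch (placeholder path; requires the optional aicsimageio dependency):

```python
import sopa

# Placeholder path: e.g. an ND2 or CZI file; z_stack selects the z-slice for 3D images
sdata = sopa.io.aicsimageio("/path/to/image.czi", z_stack=0)
```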

sopa.io.ome_tif(path, as_image=False)

Read an .ome.tif image. This image should be a 2D image (with possibly multiple channels). Typically, this function can be used to open Xenium IF images.

Parameters:

  • path (Path, required): Path to the .ome.tif image
  • as_image (bool, default False): If True, will return a DataArray object

Returns:

  • DataArray | SpatialData: A DataArray or a SpatialData object

Source code in sopa/io/reader/utils.py
def ome_tif(path: Path, as_image: bool = False) -> DataArray | SpatialData:
    """Read an `.ome.tif` image. This image should be a 2D image (with possibly multiple channels).
    Typically, this function can be used to open Xenium IF images.

    Args:
        path: Path to the `.ome.tif` image
        as_image: If `True`, will return a `DataArray` object

    Returns:
        A `DataArray` or a `SpatialData` object
    """
    image_models_kwargs, _ = _default_image_kwargs()
    image_name = Path(path).absolute().name.split(".")[0]
    image: da.Array = imread(path)

    if image.ndim == 4:
        assert image.shape[0] == 1, "4D images not supported"
        image = da.moveaxis(image[0], 2, 0)
        log.info(f"Transformed 4D image into a 3D image of shape (c, y, x) = {image.shape}")
    elif image.ndim != 3:
        raise ValueError(f"Number of dimensions not supported: {image.ndim}")

    image = image.rechunk(chunks=image_models_kwargs["chunks"])

    try:
        channel_names = _ome_channels_names(path)
    except Exception:
        channel_names = []
    if len(channel_names) != len(image):
        channel_names = [str(i) for i in range(len(image))]
        log.warning(f"Channel names couldn't be read. Using {channel_names} instead.")

    image = DataArray(image, dims=["c", "y", "x"], name=image_name, coords={"c": channel_names})
    image = _image_int_dtype(image)

    if as_image:
        return image

    image = Image2DModel.parse(
        image,
        c_coords=channel_names,
        transformations={"pixels": Identity()},
        **image_models_kwargs,
    )

    return SpatialData(images={image_name: image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})
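
A minimal usage sketch (placeholder path):

```python
import sopa

sdata = sopa.io.ome_tif("/path/to/image.ome.tif")                 # SpatialData object
image = sopa.io.ome_tif("/path/to/image.ome.tif", as_image=True)  # DataArray only
```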

sopa.io.wsi(path, chunks=(3, 256, 256), as_image=False, backend='tiffslide')

Read a WSI into a SpatialData object

Parameters:

  • path (str | Path, required): Path to the WSI
  • chunks (tuple[int, int, int], default (3, 256, 256)): Tuple representing the chunksize for the dimensions (C, Y, X).
  • as_image (bool, default False): If True, returns an image instead of a SpatialData object
  • backend (str, default 'tiffslide'): The library to use as a backend in order to load the WSI. One of: "openslide", "tiffslide".

Returns:

  • SpatialData | DataTree: A SpatialData object with a multiscale 2D-image of shape (C, Y, X), or just the DataTree if as_image=True

Source code in sopa/io/reader/wsi.py
def wsi(
    path: str | Path,
    chunks: tuple[int, int, int] = (3, 256, 256),
    as_image: bool = False,
    backend: str = "tiffslide",
) -> SpatialData | DataTree:
    """Read a WSI into a `SpatialData` object

    Args:
        path: Path to the WSI
        chunks: Tuple representing the chunksize for the dimensions `(C, Y, X)`.
        as_image: If `True`, returns an image instead of a `SpatialData` object
        backend: The library to use as a backend in order to load the WSI. One of: `"openslide"`, `"tiffslide"`.

    Returns:
        A `SpatialData` object with a multiscale 2D-image of shape `(C, Y, X)`, or just the DataTree if `as_image=True`
    """
    image_name, img, slide, slide_metadata = _open_wsi(path, backend=backend)

    images = {}
    for level, key in enumerate(list(img.keys())):
        suffix = key if key != "0" else ""

        scale_image = DataArray(
            img[key].transpose("S", f"Y{suffix}", f"X{suffix}"),
            dims=("c", "y", "x"),
        ).chunk(chunks)

        scale_factor = slide.level_downsamples[level]

        scale_image = Image2DModel.parse(
            scale_image[:3, :, :],
            transformations={"pixels": _get_scale_transformation(scale_factor)},
            c_coords=("r", "g", "b"),
        )
        scale_image.coords["y"] = scale_factor * scale_image.coords["y"]
        scale_image.coords["x"] = scale_factor * scale_image.coords["x"]

        images[f"scale{key}"] = scale_image

    multiscale_image = DataTree.from_dict(images)
    sdata = SpatialData(images={image_name: multiscale_image}, attrs={SopaAttrs.TISSUE_SEGMENTATION: image_name})
    sdata[image_name].attrs["metadata"] = slide_metadata
    sdata[image_name].attrs["backend"] = backend
    sdata[image_name].name = image_name

    if as_image:
        return multiscale_image

    return sdata
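
A minimal usage sketch (placeholder path; tiffslide is the default backend):

```python
import sopa

sdata = sopa.io.wsi("/path/to/slide.svs", backend="tiffslide")
```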

sopa.io.toy_dataset(*_, length=2048, cell_density=0.0001, n_points_per_cell=100, c_coords=['DAPI', 'CK', 'CD3', 'CD20'], genes=['EPCAM', 'CD3E', 'CD20', 'CXCL4', 'CXCL10'], sigma_factor=0.05, pixel_size=0.1, seed=0, include_vertices=False, include_image=True, include_he_image=True, apply_blur=True, as_output=False, transcript_cell_id_as_merscope=False, add_nan_gene_name=False)

Generate a dummy dataset composed of cells placed uniformly in a square, together with transcripts.

Parameters:

  • length (int, default 2048): Size of the square, in pixels
  • cell_density (float, default 0.0001): Density of cells per pixel^2
  • n_points_per_cell (int, default 100): Mean number of transcripts per cell
  • c_coords (list[str], default ['DAPI', 'CK', 'CD3', 'CD20']): Channel names
  • genes (int | list[str], default ['EPCAM', 'CD3E', 'CD20', 'CXCL4', 'CXCL10']): Number of different genes, or list of gene names
  • sigma_factor (float, default 0.05): Factor used to determine sigma for the gaussian blur.
  • pixel_size (float, default 0.1): Number of microns in one pixel.
  • seed (int, default 0): Numpy random seed
  • include_vertices (bool, default False): Whether to include the vertices of the cells (as points) in the spatialdata object
  • include_image (bool, default True): Whether to include the image in the spatialdata object
  • include_he_image (bool, default True): Whether to include a toy H&E image in the spatialdata object
  • apply_blur (bool, default True): Whether to apply gaussian blur on the image (without blur, cells are just one pixel)
  • as_output (bool, default False): If True, the data will have the same format as an output of Sopa
  • transcript_cell_id_as_merscope (bool, default False): If True, shift transcript cell IDs by one so that unassigned transcripts get -1 (as in MERSCOPE outputs)
  • add_nan_gene_name (bool, default False): If True, set one gene name to NaN (used for tests)

Returns:

  • SpatialData: A SpatialData object with a 2D image (sdata["image"]), the cells polygon boundaries (sdata["cells"]), the transcripts (sdata["transcripts"]), and optional cell vertices (sdata["vertices"]) if include_vertices is True.

Source code in sopa/utils/data.py
def toy_dataset(
    *_,
    length: int = 2_048,
    cell_density: float = 1e-4,
    n_points_per_cell: int = 100,
    c_coords: list[str] = ["DAPI", "CK", "CD3", "CD20"],
    genes: int | list[str] = ["EPCAM", "CD3E", "CD20", "CXCL4", "CXCL10"],
    sigma_factor: float = 0.05,
    pixel_size: float = 0.1,
    seed: int = 0,
    include_vertices: bool = False,
    include_image: bool = True,
    include_he_image: bool = True,
    apply_blur: bool = True,
    as_output: bool = False,
    transcript_cell_id_as_merscope: bool = False,
    add_nan_gene_name: bool = False,
) -> SpatialData:
    """Generate a dummy dataset composed of cells generated uniformly in a square. It also has transcripts.

    Args:
        length: Size of the square, in pixels
        cell_density: Density of cells per pixel^2
        n_points_per_cell: Mean number of transcripts per cell
        c_coords: Channel names
        genes: Number of different genes, or list of gene names
        sigma_factor: Factor used to determine `sigma` for the gaussian blur.
        pixel_size: Number of microns in one pixel.
        seed: Numpy random seed
        include_vertices: Whether to include the vertices of the cells (as points) in the spatialdata object
        include_image: Whether to include the image in the spatialdata object
        include_he_image: Whether to include a toy H&E image in the spatialdata object
        apply_blur: Whether to apply gaussian blur on the image (without blur, cells are just one pixel)
        as_output: If `True`, the data will have the same format as an output of Sopa
        transcript_cell_id_as_merscope: If `True`, shift transcript cell IDs by one so that unassigned transcripts get `-1` (as in MERSCOPE outputs)
        add_nan_gene_name: If `True`, set one gene name to NaN (used for tests)

    Returns:
        A SpatialData object with a 2D image (`sdata["image"]`), the cells polygon boundaries (`sdata["cells"]`), the transcripts (`sdata["transcripts"]`), and optional cell vertices (`sdata["vertices"]`) if `include_vertices` is `True`.
    """
    np.random.seed(seed)

    grid_width = max(1, int(length * np.sqrt(cell_density)))
    dx = length / grid_width
    sigma = dx * sigma_factor
    n_cells = grid_width**2
    radius = int(dx) // 4
    cell_types_index = np.random.randint(0, max(1, len(c_coords) - 1), n_cells)

    log.info(
        f"Image of size ({len(c_coords), length, length}) with {n_cells} cells and {n_points_per_cell} transcripts per cell"
    )

    ### Compute cell vertices (xy array)
    vertices_x = dx / 2 + np.arange(grid_width) * dx
    x, y = np.meshgrid(vertices_x, vertices_x)
    xy = np.stack([x.ravel(), y.ravel()], axis=1)
    xy += np.random.uniform(-dx / 2, dx / 2, size=xy.shape)
    xy = xy.clip(0, length - 1).astype(int)

    vertices = pd.DataFrame(xy, columns=["x", "y"])

    ### Create images
    images = {}

    if include_image:
        x_circle, y_circle = _circle_coords(radius)

        image = np.zeros((len(c_coords), length, length))
        for i, (x, y) in enumerate(xy):
            y_coords = (y + y_circle).clip(0, image.shape[1] - 1)
            x_coords = (x + x_circle).clip(0, image.shape[2] - 1)
            image[0, y_coords, x_coords] = 1
            if len(c_coords) > 1:
                image[cell_types_index[i] + 1, y_coords, x_coords] = 1
        if apply_blur:
            image = gaussian_filter(image, sigma=sigma, axes=(1, 2))
        image = (image / image.max() * 255).astype(np.uint8)
        image = da.from_array(image, chunks=(1, 1024, 1024))
        images["image"] = Image2DModel.parse(image, c_coords=c_coords, dims=["c", "y", "x"])

    if include_he_image:
        he_image = _he_image(length // 2)
        scale = length / (length // 2)
        images["he_image"] = Image2DModel.parse(
            he_image,
            dims=["c", "y", "x"],
            transformations={"global": Scale([scale, scale], axes=["x", "y"])},
            scale_factors=[2, 2],
        )

    ### Create cell boundaries
    cells = [Point(vertex).buffer(radius).simplify(tolerance=1) for vertex in xy]
    bbox = box(0, 0, length - 1, length - 1)
    cells = [cell.intersection(bbox) for cell in cells]
    gdf = gpd.GeoDataFrame(geometry=cells)
    shapes_key = "cellpose_boundaries" if as_output else "cells"
    shapes = {shapes_key: ShapesModel.parse(gdf)}

    ### Create transcripts
    n_genes = n_cells * n_points_per_cell
    point_cell_index = np.arange(n_cells).repeat(n_points_per_cell)
    points_coords = radius / 2 * np.random.randn(n_genes, 2) + xy[point_cell_index]
    points_coords = points_coords.clip(0, length - 1)

    if isinstance(genes, int):
        gene_names = np.random.choice([chr(97 + i) for i in range(genes)], size=n_genes)
    elif len(genes) and len(genes) >= len(c_coords) - 1:
        gene_names = np.full(n_genes, "", dtype="<U5")
        for i in range(len(genes)):
            where_cell_type = np.where(cell_types_index[point_cell_index] == i)[0]
            probabilities = np.full(len(genes), 0.2 / (len(genes) - 1))
            probabilities[i] = 0.8
            gene_names[where_cell_type] = np.random.choice(genes, len(where_cell_type), p=probabilities)
    else:
        gene_names = np.random.choice(genes, size=n_genes)

    gene_names = gene_names.astype(object)
    if add_nan_gene_name:
        gene_names[3] = np.nan  # Add a nan value for tests

    df = pd.DataFrame(
        {
            "x": points_coords[:, 0],
            "y": points_coords[:, 1],
            "z_": 1,  # TODO: add back as 'z'?
            "genes": gene_names,
        }
    )

    # apply an arbitrary transformation for a more complete test case
    affine = np.array([[pixel_size, 0, 100], [0, pixel_size, 600], [0, 0, 1]])
    df[["x", "y", "z_"]] = df[["x", "y", "z_"]] @ affine.T
    affine = Affine(affine, input_axes=["x", "y"], output_axes=["x", "y"]).inverse()

    df = dd.from_pandas(df, chunksize=2_000_000)
    misc_df = pd.DataFrame({"x": [0, 1], "y": [0, 1]})  # dummy dataframe for testing purposes

    points = {
        "transcripts": PointsModel.parse(
            df, transformations={"global": affine, "microns": Identity()}, feature_key="genes"
        ),
        "misc": PointsModel.parse(misc_df, transformations={"global": Identity()}),
    }
    if include_vertices:
        points["vertices"] = PointsModel.parse(vertices)

    sdata = SpatialData(
        images=images,
        points=points,
        shapes=shapes,
        attrs={SopaAttrs.TRANSCRIPTS: "transcripts"},
    )

    if include_image:
        sdata.attrs[SopaAttrs.CELL_SEGMENTATION] = "image"

    if include_he_image:
        sdata.attrs[SopaAttrs.TISSUE_SEGMENTATION] = "he_image"

    from ..spatial import assign_transcript_to_cell

    assign_transcript_to_cell(sdata, "transcripts", shapes_key, "cell_id", unassigned_value=0)

    sdata["transcripts"]["cell_id"] = sdata["transcripts"]["cell_id"].astype(int) - int(transcript_cell_id_as_merscope)

    if as_output:
        _add_table(sdata)

    return sdata
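
A minimal usage sketch; toy_dataset needs no input files, so it is handy for testing a pipeline end-to-end:

```python
import sopa

sdata = sopa.io.toy_dataset(length=1000)  # small synthetic dataset
print(sdata)
```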