Source code for scphylo.datasets._simulate

import math
import random

import anndata as ad
import numpy as np
import pandas as pd

import scphylo as scp


[docs]def simulate(n_cells=10, n_muts=10, n_clones=3, alpha=0.00001, beta=0.1, missing=0):
    """Simulate single-cell noisy genotype matrix.

    This function is using :cite:`OncoNEM`.

    Parameters
    ----------
    n_cells : :obj:`int`, optional
        Number of cells, by default 10
    n_muts : :obj:`int`, optional
        Number of mutations, by default 10
    n_clones : :obj:`int`, optional
        Number of clones, by default 3
    alpha : :obj:`float`, optional
        False positive rate, by default 0.00001
    beta : :obj:`float`, optional
        False negative rate, by default 0.1
    missing : :obj:`int`, optional
        Missing entry rate, by default 0

    Returns
    -------
    :class:`pandas.DataFrame`
        A genotype matrix where 0 is absent, 1 is present and 3 is missing.
    """
    # TODO: replace

    onconem, onconem_is_not_imported = scp.ul.import_rpy2(
        "oncoNEM",
        "BiocManager::install('graph')\n"
        "devtools::install_bitbucket('edith_ross/oncoNEM')\n",
    )
    if onconem_is_not_imported:
        scp.logg.error("Unable to import a package!")

    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri

    dat = onconem.simulateData(
        N_cells=n_cells,
        N_normalContam=0,
        N_clones=n_clones,
        N_unobs=0,
        N_sites=n_muts,
        FPR=alpha,
        FNR=beta,
        p_missing=missing,
        randomizeOrder=False,
    )

    with ro.conversion.localconverter(ro.default_converter + pandas2ri.converter):
        dat = ro.conversion.rpy2py(dat.rx2("D"))
    dat[dat == 2] = 3
    df = pd.DataFrame(dat.T, dtype=int)
    df.columns = [f"mut{x}" for x in df.columns]
    df.index = [f"cell{x}" for x in df.index]

    return df


[docs]def add_noise(df_in, alpha, beta, missing):
    """Add noise to the input genotype matrix.

    These noise includes:
    1) False positive errors (alpha)
    2) False negative errors (beta)
    3) Missing entry errors (missing)

    Parameters
    ----------
    df_in : :class:`pandas.DataFrame`
        Input genotype matrix.
    alpha : :obj:`float`
        False positive error rate.
    beta : :obj:`float`
        False negative error rate.
    missing : :obj:`float`
        Missing entry error rate.

    Returns
    -------
    :class:`pandas.DataFrame`
        A noisy genotype matrix where 0 is absent, 1 is present and 3 is missing.
    """

    def toss(p):
        return True if random.random() < p else False

    data = df_in.values
    n, m = df_in.shape
    data2 = -1 * np.ones(shape=(n, m)).astype(int)
    countFP = 0
    countFN = 0
    countNA = 0
    countOneZero = 0
    indexNA = []
    changedBefore = []
    for i in range(n):
        for j in range(m):
            indexNA.append([i, j])
            countOneZero = countOneZero + 1
    random.shuffle(indexNA)
    nas = math.ceil(countOneZero * missing)
    for i in range(int(nas)):
        [a, b] = indexNA[i]
        changedBefore.append([a, b])
        data2[a][b] = 3
        countNA = countNA + 1
    for i in range(n):
        for j in range(m):
            if data2[i][j] != 3:
                if data[i][j] == 1:
                    if toss(beta):
                        data2[i][j] = 0
                        countFN = countFN + 1
                    else:
                        data2[i][j] = data[i][j]
                elif data[i][j] == 0:
                    if toss(alpha):
                        data2[i][j] = 1
                        countFP = countFP + 1
                    else:
                        data2[i][j] = data[i][j]
                else:
                    scp.logg.error("Wrong Input")

    # scp.logg.info(f"FNs={countFN}, FPs={countFP}, NAs={countNA}")

    df_out = pd.DataFrame(data2)
    df_out.columns = df_in.columns
    df_out.index = df_in.index
    df_out.index.name = "cellIDxmutID"

    return df_out


def add_doublets(df_ground, df_noisy, alpha, beta, missing, doublet):
    df_doublet = df_noisy.copy()
    doublet_cells = []
    for _ in range(int(doublet * df_ground.shape[0])):
        r1 = np.random.choice(df_ground.index, replace=False, size=1)
        while r1 in doublet_cells:
            r1 = np.random.choice(df_ground.index, replace=False, size=1)
        doublet_cells.append(r1)
        r2 = np.random.choice(df_ground.index, replace=False, size=1)
        df_doublet.loc[r1] = 1 * np.logical_or(df_ground.loc[r1], df_ground.loc[r2])
        df_doublet.loc[r1] = scp.datasets.add_noise(
            df_doublet.loc[r1], alpha=alpha, beta=beta, missing=missing
        )
    return df_doublet


[docs]def add_readcount(df_in, mean_coverage=50, seed=None):
    """Add readcount to the input genotype matrix.

    Parameters
    ----------
    df_in : :class:`pandas.DataFrame`
        Input genotype matrix.
    mean_coverage : :obj:`int`, optional
        The average coverage, by default 50
    seed : :obj:`int`, optional
        The seed for numpy random function, by default None

    Returns
    -------
    adata : :class:`anndata.AnnData`
        The genotype data with readcount info contains layers of mutant and total.
    """
    if seed is not None:
        np.random.seed(seed)
    n, m = df_in.shape
    I_mtx = df_in.values
    T_mtx = np.random.binomial(n=mean_coverage, p=0.5, size=(n, m))

    R_mtx = np.zeros(I_mtx.shape, dtype=int)
    for i in range(I_mtx.shape[0]):
        for j in range(I_mtx.shape[1]):
            if I_mtx[i, j] == 0:
                # rr allele
                R_mtx[i, j] = np.ceil(T_mtx[i, j] * np.random.beta(20, 1))
            elif I_mtx[i, j] == 1:
                if np.random.uniform(0, 1) < 0.5:
                    # aa allele
                    R_mtx[i, j] = np.ceil(T_mtx[i, j] * np.random.beta(1, 20))
                else:
                    # ra allele
                    R_mtx[i, j] = np.ceil(T_mtx[i, j] * np.random.beta(20, 20))
            elif I_mtx[i, j] == 3:
                R_mtx[i, j] = 0
                T_mtx[i, j] = 0
            else:
                scp.logg.error("Wrong Input")
    adata = ad.AnnData(df_in, dtype=int)
    adata.layers["total"] = T_mtx
    adata.layers["mutant"] = T_mtx - R_mtx
    return adata