Source code for scphylo.tl.solver._siclonefit

import glob
import os
import time

import numpy as np
import pandas as pd

import scphylo as scp


[docs]def siclonefit(
    df_input,
    alpha,
    beta,
    n_restarts=3,
    n_iters=500,
    n_burnin=100,
    return_tree=False,
    experiment=False,
):
    """Solving using SiCloneFit.

    Bayesian inference of population structure, genotype, and phylogeny of tumor clones
    from single-cell genome sequencing data :cite:`SiCloneFit`.

    Parameters
    ----------
    df_input : :class:`pandas.DataFrame`
        Input genotype matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1), absence (0) and missing
        entires (3).
    alpha : :obj:`float`
        False positive error rate.
    beta : :obj:`float`
        False negative error rate.
    n_restarts : :obj:`int`, optional
        Number of restarts, by default 3
    n_iters : :obj:`int`, optional
        Number of iterations for each Markov Chain after burnin, by default 500
    n_burnin : :obj:`int`, optional
        Number of iterations for burnin of each Markov Chain, by default 100
    return_tree : :obj:`bool`, optional
        Return the inferred cell-lineage tree, by default False
    experiment : :obj:`bool`, optional
        Is in the experiment mode (the log won't be shown), by default False

    Returns
    -------
    :class:`pandas.DataFrame`
        A conflict-free matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1) and absence (0).
    """
    executable = scp.ul.executable("SiCloneFiTComplete.jar", "SiCloneFit")

    if not experiment:
        scp.logg.info(
            f"running SiCloneFit with alpha={alpha}, beta={beta}, n_iters={n_iters}"
        )

    tmpdir = scp.ul.tmpdirsys(suffix=".siclonefit")

    df_input.T.reset_index(drop=True).to_csv(
        f"{tmpdir.name}/siclonefit.input", sep=" ", header=None
    )
    with open(f"{tmpdir.name}/siclonefit.cellnames", "w") as fout:
        fout.write(" ".join(df_input.index))
    with open(f"{tmpdir.name}/siclonefit.genenames", "w") as fout:
        fout.write(" ".join(df_input.columns))
    I_mtr = df_input.values

    cmd = (
        f"java -jar {executable} "
        f"-m {df_input.shape[0]} "
        f"-n {df_input.shape[1]} "
        f"-ipMat {tmpdir.name}/siclonefit.input "
        f"-fp {alpha} "
        f"-fn {beta} "
        "-df 0 "
        f"-missing {np.sum(I_mtr == 3)/(I_mtr.size)} "
        f"-iter {n_iters} "
        f"-cellNames {tmpdir.name}/siclonefit.cellnames "
        f"-geneNames {tmpdir.name}/siclonefit.genenames "
        f"-r {n_restarts} "
        f"-burnin {n_burnin} "
        # "-recurProb 0 "
        # "-delProb 0 "
        # "-LOHProb 0 "
        # "-doublet "
        # "-printIter "
        # "-treeIter "
        f"-outDir {tmpdir.name} > {tmpdir.name}/siclonefit.log"
    )
    s_time = time.time()
    os.system(cmd)
    e_time = time.time()
    running_time = e_time - s_time

    out_dir = glob.glob(f"{tmpdir.name}/*samples/best")[0]

    df_output = pd.read_csv(
        f"{out_dir}/best_MAP_predicted_genotype.txt",
        sep=" ",
        header=None,
        index_col=0,
    ).T
    df_output.columns = df_input.columns
    df_output.index = df_input.index
    df_output.index.name = "cellIDxmutID"

    with open(f"{out_dir}/best_MAP_tree.txt") as fin:
        tree = fin.readline().strip()

    tmpdir.cleanup()

    if not experiment:
        scp.ul.stat(df_input, df_output, alpha, beta, running_time)
        if return_tree:
            return df_output, tree
        else:
            return df_output
    else:
        is_cf = scp.ul.is_conflict_free_gusfield(df_output)
        nll = scp.ul.calc_nll_matrix(df_input, df_output, alpha, beta)
        return df_output, running_time, is_cf, nll