Source code for scphylo.tl.solver._siclonefit

import glob
import os
import time

import numpy as np
import pandas as pd

import scphylo as scp


[docs]def siclonefit( df_input, alpha, beta, n_restarts=3, n_iters=500, n_burnin=100, return_tree=False, experiment=False, ): """Solving using SiCloneFit. Bayesian inference of population structure, genotype, and phylogeny of tumor clones from single-cell genome sequencing data :cite:`SiCloneFit`. Parameters ---------- df_input : :class:`pandas.DataFrame` Input genotype matrix in which rows are cells and columns are mutations. Values inside this matrix show the presence (1), absence (0) and missing entires (3). alpha : :obj:`float` False positive error rate. beta : :obj:`float` False negative error rate. n_restarts : :obj:`int`, optional Number of restarts, by default 3 n_iters : :obj:`int`, optional Number of iterations for each Markov Chain after burnin, by default 500 n_burnin : :obj:`int`, optional Number of iterations for burnin of each Markov Chain, by default 100 return_tree : :obj:`bool`, optional Return the inferred cell-lineage tree, by default False experiment : :obj:`bool`, optional Is in the experiment mode (the log won't be shown), by default False Returns ------- :class:`pandas.DataFrame` A conflict-free matrix in which rows are cells and columns are mutations. Values inside this matrix show the presence (1) and absence (0). """ executable = scp.ul.executable("SiCloneFiTComplete.jar", "SiCloneFit") if not experiment: scp.logg.info( f"running SiCloneFit with alpha={alpha}, beta={beta}, n_iters={n_iters}" ) tmpdir = scp.ul.tmpdirsys(suffix=".siclonefit") df_input.T.reset_index(drop=True).to_csv( f"{tmpdir.name}/siclonefit.input", sep=" ", header=None ) with open(f"{tmpdir.name}/siclonefit.cellnames", "w") as fout: fout.write(" ".join(df_input.index)) with open(f"{tmpdir.name}/siclonefit.genenames", "w") as fout: fout.write(" ".join(df_input.columns)) I_mtr = df_input.values cmd = ( f"java -jar {executable} " f"-m {df_input.shape[0]} " f"-n {df_input.shape[1]} " f"-ipMat {tmpdir.name}/siclonefit.input " f"-fp {alpha} " f"-fn {beta} " "-df 0 " f"-missing {np.sum(I_mtr == 3)/(I_mtr.size)} " f"-iter {n_iters} " f"-cellNames {tmpdir.name}/siclonefit.cellnames " f"-geneNames {tmpdir.name}/siclonefit.genenames " f"-r {n_restarts} " f"-burnin {n_burnin} " # "-recurProb 0 " # "-delProb 0 " # "-LOHProb 0 " # "-doublet " # "-printIter " # "-treeIter " f"-outDir {tmpdir.name} > {tmpdir.name}/siclonefit.log" ) s_time = time.time() os.system(cmd) e_time = time.time() running_time = e_time - s_time out_dir = glob.glob(f"{tmpdir.name}/*samples/best")[0] df_output = pd.read_csv( f"{out_dir}/best_MAP_predicted_genotype.txt", sep=" ", header=None, index_col=0, ).T df_output.columns = df_input.columns df_output.index = df_input.index df_output.index.name = "cellIDxmutID" with open(f"{out_dir}/best_MAP_tree.txt") as fin: tree = fin.readline().strip() tmpdir.cleanup() if not experiment: scp.ul.stat(df_input, df_output, alpha, beta, running_time) if return_tree: return df_output, tree else: return df_output else: is_cf = scp.ul.is_conflict_free_gusfield(df_output) nll = scp.ul.calc_nll_matrix(df_input, df_output, alpha, beta) return df_output, running_time, is_cf, nll