Source code for scphylo.tl.solver._scistree

import copy
import time

import numpy as np
import pandas as pd
from Bio.Phylo import BaseTree
from Bio.Phylo.TreeConstruction import DistanceMatrix

import scphylo as scp
from scphylo.external._scistree import run_scistree
from scphylo.external._scprob import run_scprob

# from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


[docs]def scistree(df_input, alpha, beta, n_threads=1, experiment=False):
    """Solving using ScisTree.

    Accurate and efficient cell lineage tree inference from noisy
    single cell data: the maximum likelihood perfect phylogeny approach
    :cite:`ScisTree`.

    Parameters
    ----------
    df_input : :class:`pandas.DataFrame`
        Input genotype matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1), absence (0) and missing
        entires (3).
    alpha : :obj:`float`
        False positive error rate.
    beta : :obj:`float`
        False negative error rate.
    n_threads : :obj:`int`
        Number of threads.
    experiment : :obj:`bool`, optional
        Is in the experiment mode (the log won't be shown), by default False

    Returns
    -------
    :class:`pandas.DataFrame`
        A conflict-free matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1) and absence (0).
    """
    if not experiment:
        scp.logg.info(
            f"running ScisTree with alpha={alpha}, beta={beta}, n_threads={n_threads}"
        )
    tmpdir = scp.ul.tmpdirsys(suffix=".scistree")
    cells = df_input.index
    snvs = df_input.columns
    df = df_input.transpose()

    df = df.replace(3, 0.5)
    df = df.replace(0, 1 - beta)
    df = df.replace(1, alpha)

    file1 = f"{tmpdir.name}/scistree.input"
    df.index.name = f"HAPLOID {df.shape[0]} {df.shape[1]}"
    df.to_csv(file1, sep=" ")
    with open(file1) as ifile:
        data = ifile.read()
    with open(file1, "w") as ofile:
        data = data.replace('"', "")
        ofile.write(data)

    cmd = [
        "scistree",
        "-v",
        "-d",
        "0",
        "-e",
        "-k",
        f"{n_threads}",
        "-o",
        f"{tmpdir.name}/scistree.gml",
        f"{tmpdir.name}/scistree.input",
    ]
    s_time = time.time()
    run_scistree(cmd)
    e_time = time.time()
    running_time = e_time - s_time

    data = []
    detail = {"cost": "\n"}
    with open(f"{tmpdir.name}/scistree.output") as infile:
        now_store = False
        for line in infile:
            line = line.strip()
            if "Imputed genotypes:" in line:
                now_store = True
            if line[:4] == "Site" and now_store:
                line = "".join(line.split(":")[1])
                line = line.replace("\t", "")
                data.append([int(x) for x in line.split(" ")])
            if "current cost: " in line:
                cost = float(line.split("current cost: ")[1].split(", opt tree: ")[0])
                detail["cost"] += f"    current best cost = {cost}\n"

    data = np.array(data)
    matrix_output = data.T

    df_output = pd.DataFrame(matrix_output)
    df_output.columns = snvs
    df_output.index = cells
    df_output.index.name = "cellIDxmutID"

    tmpdir.cleanup()

    if not experiment:
        scp.ul.stat(df_input, df_output, alpha, beta, running_time)
        return df_output
    else:
        return df_output, running_time


def rscistree(adata, alpha=0, beta=0, n_threads=1, mode="haploid"):
    """Solving using read-count ScisTree.

    Accurate and efficient cell lineage tree inference from noisy
    single cell data: the maximum likelihood perfect phylogeny approach
    :cite:`ScisTree`.

    Parameters
    ----------
    adata : :class:`anndata.AnnData`
        Input data contains layers of mutant and total.
    alpha : :obj:`float`
        False positive error rate.
    beta : :obj:`float`
        False negative error rate.
    n_threads : :obj:`int`
        Number of threads.
    mode : :obj:`str`
        Mode of calculating the probability from read-count.
        In {'haploid', 'ternary'}, by default haploid
    experiment : :obj:`bool`, optional
        Is in the experiment mode (the log won't be shown), by default False

    Returns
    -------
    :class:`pandas.DataFrame`
        A conflict-free matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1) and absence (0).
    """
    scp.logg.info(f"running rScisTree with n_threads={n_threads}, mode={mode}")
    tmpdir = scp.ul.tmpdirsys(suffix=".rscistree", dirname=".")

    cells = adata.obs_names
    snvs = adata.var_names
    df_input = adata.to_df()

    V = adata.layers["mutant"]
    R = adata.layers["total"] - V
    with open(f"{tmpdir.name}/rscistree.counts", "w") as fout:
        fout.write(f"{mode.upper()} {len(snvs)} {len(cells)}\n")
        for j in range(len(snvs)):
            for i in range(len(cells)):
                fout.write(f"{R[i,j]} {V[i,j]}     ")
            fout.write("\n")

    cmd = [
        "scprob",
        f"{tmpdir.name}/rscistree.counts",
    ]
    if mode.lower() == "haploid":
        cmd += ["0"]
    elif mode.lower() == "ternary":
        cmd += ["1"]
    else:
        scp.logg.error("Wrong mode!")
    run_scprob(cmd)

    cmd = [
        "scistree",
        "-v",
        "-d",
        "0",
        "-e",
        "-k",
        f"{n_threads}",
        "-o",
        f"{tmpdir.name}/rscistree.gml",
        f"{tmpdir.name}/rscistree.input",
    ]
    s_time = time.time()
    run_scistree(cmd)
    e_time = time.time()
    running_time = e_time - s_time

    data = []
    detail = {"cost": "\n"}
    with open(f"{tmpdir.name}/rscistree.output") as infile:
        now_store = False
        for line in infile:
            line = line.strip()
            if "Imputed genotypes:" in line:
                now_store = True
            if line[:4] == "Site" and now_store:
                line = "".join(line.split(":")[1])
                line = line.replace("\t", "")
                data.append([int(x) for x in line.split(" ")])
            if "current cost: " in line:
                cost = float(line.split("current cost: ")[1].split(", opt tree: ")[0])
                detail["cost"] += f"    current best cost = {cost}\n"

    data = np.array(data)
    matrix_output = data.T

    df_output = pd.DataFrame(matrix_output)
    df_output.columns = snvs
    df_output.index = cells
    df_output.index.name = "cellIDxmutID"

    tmpdir.cleanup()

    scp.ul.stat(df_input, df_output, alpha, beta, running_time)
    return df_output


def iscistree(df_input, alpha, beta, n_iters=np.inf):
    """Solving using my own implementation of ScisTree.

    Accurate and efficient cell lineage tree inference from noisy
    single cell data: the maximum likelihood perfect phylogeny approach
    :cite:`ScisTree`.

    Parameters
    ----------
    df_input : :class:`pandas.DataFrame`
        Input genotype matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1), absence (0) and missing
        entires (3).
    alpha : :obj:`float`
        False positive error rate.
    beta : :obj:`float`
        False negative error rate.
    n_iters : :obj:`int`
        Number of iterations to search for the neighboring trees, by default inf.
    experiment : :obj:`bool`, optional
        Is in the experiment mode (the log won't be shown), by default False

    Returns
    -------
    :class:`pandas.DataFrame`
        A conflict-free matrix in which rows are cells and columns are mutations.
        Values inside this matrix show the presence (1) and absence (0).
    """
    scp.logg.info(
        f"running iScisTree with alpha={alpha}, beta={beta}, n_iters={n_iters}"
    )

    def get_initial_tree(D):
        Q = []
        for i in range(D.shape[0]):
            Q.append(list(D[i, : i + 1]))
        dm = DistanceMatrix(names=[f"{i}" for i in range(D.shape[0])], matrix=Q)
        tree = nj(dm)

        node = None
        for clade in tree.find_clades():
            if clade.name == f"{D.shape[0]-1}":
                node = clade
        tree.root_with_outgroup(node)
        tree.prune(f"{D.shape[0]-1}")
        return tree

    def denoise_linear(I_mtr, alpha, beta, opt_tree):
        tree = {}
        for clade in list(opt_tree.find_clades(order="level"))[::-1]:
            children = list(clade.find_clades(order="level"))
            if len(children) > 2:
                child_l = children[2]
                child_r = children[1]
                tree[clade.name] = [child_l.name, child_r.name]
            else:
                tree[clade.name] = []

        def get_cells_in_best(cells_in_best, best):
            for node in tree[best]:
                if "Inner" in node:
                    for child in tree[node]:
                        get_cells_in_best(cells_in_best, child)
                else:
                    cells_in_best.append(node)
            if "Inner" not in best:
                cells_in_best.append(best)

        output = np.zeros(I_mtr.shape, dtype=int)
        total_cost = 0
        for c in range(I_mtr.shape[1]):
            qs = {}
            best = None
            best_v = 0
            for k, v in tree.items():
                if len(v) == 0:
                    obs = I_mtr[int(k), c] == 1
                    p0 = (1 - obs) * (1 - beta) + obs * alpha
                    qs[k] = np.log((1 - p0) / p0)
                else:
                    qs[k] = qs[v[0]] + qs[v[1]]
                if qs[k] > best_v:
                    best = k
                    best_v = qs[k]

            cells_in_best = []
            get_cells_in_best(cells_in_best, best)
            output[list(map(int, cells_in_best)), c] = 1

            total_cost += -best_v
        return output, total_cost

    def get_neighbors(tree):
        """
        Return neighbors.

        For a tree with n taxa, there are n - 3 internal branches.
        Thus there are 2(n - 3) NNI rearrangements for any tree
        """
        # make child to parent dict
        parents = {}
        for clade in tree.find_clades():
            if clade != tree.root:
                node_path = tree.get_path(clade)
                # cannot get the parent if the parent is root. Bug?
                if len(node_path) == 1:
                    parents[clade] = tree.root
                else:
                    parents[clade] = node_path[-2]
        neighbors = []
        root_childs = []
        for clade in tree.get_nonterminals(order="level"):
            if clade == tree.root:
                left = clade.clades[0]
                right = clade.clades[1]
                root_childs.append(left)
                root_childs.append(right)
                if not left.is_terminal() and not right.is_terminal():
                    # make changes around the left_left clade
                    # left_left = left.clades[0]
                    left_right = left.clades[1]
                    right_left = right.clades[0]
                    right_right = right.clades[1]
                    # neightbor 1 (left_left + right_right)
                    del left.clades[1]
                    del right.clades[1]
                    left.clades.append(right_right)
                    right.clades.append(left_right)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (left_left + right_left)
                    del left.clades[1]
                    del right.clades[0]
                    left.clades.append(right_left)
                    right.clades.append(right_right)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # change back (left_left + left_right)
                    del left.clades[1]
                    del right.clades[0]
                    left.clades.append(left_right)
                    right.clades.insert(0, right_left)
            elif clade in root_childs:
                # skip root child
                continue
            else:
                # method for other clades
                # make changes around the parent clade
                left = clade.clades[0]
                right = clade.clades[1]
                parent = parents[clade]
                if clade == parent.clades[0]:
                    sister = parent.clades[1]
                    # neighbor 1 (parent + right)
                    del parent.clades[1]
                    del clade.clades[1]
                    parent.clades.append(right)
                    clade.clades.append(sister)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (parent + left)
                    del parent.clades[1]
                    del clade.clades[0]
                    parent.clades.append(left)
                    clade.clades.append(right)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # change back (parent + sister)
                    del parent.clades[1]
                    del clade.clades[0]
                    parent.clades.append(sister)
                    clade.clades.insert(0, left)
                else:
                    sister = parent.clades[0]
                    # neighbor 1 (parent + right)
                    del parent.clades[0]
                    del clade.clades[1]
                    parent.clades.insert(0, right)
                    clade.clades.append(sister)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (parent + left)
                    del parent.clades[0]
                    del clade.clades[0]
                    parent.clades.insert(0, left)
                    clade.clades.append(right)
                    temp_tree = copy.deepcopy(tree)
                    neighbors.append(temp_tree)
                    # change back (parent + sister)
                    del parent.clades[0]
                    del clade.clades[0]
                    parent.clades.insert(0, sister)
                    clade.clades.insert(0, left)
        return neighbors

    def nj(distance_matrix):
        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]
        # init node distance
        node_dist = [0] * len(dm)
        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        # special cases for Minimum Alignment Matrices
        if len(dm) == 1:
            root = clades[0]
            return BaseTree.Tree(root, rooted=False)
        elif len(dm) == 2:
            # minimum distance will always be [1,0]
            min_i = 1
            min_j = 0
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            clade1.branch_length = dm[min_i, min_j] / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
            inner_clade = BaseTree.Clade(None, "Inner")
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            clades[0] = inner_clade
            root = clades[0]
            return BaseTree.Tree(root, rooted=False)
        while len(dm) > 2:
            # calculate nodeDist
            for i in range(0, len(dm)):
                node_dist[i] = 0
                for j in range(0, len(dm)):
                    node_dist[i] += dm[i, j]
                node_dist[i] = node_dist[i] / (len(dm) - 2)

            # find minimum distance pair
            min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
            min_i = 0
            min_j = 1
            for i in range(1, len(dm)):
                for j in range(0, i):
                    temp = dm[i, j] - node_dist[i] - node_dist[j]
                    if min_dist > temp:
                        min_dist = temp
                        min_i = i
                        min_j = j
            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length
            clade1.branch_length = (
                dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j]
            ) / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
            if clade1.branch_length < 0:
                clade1.branch_length = 0
            if clade2.branch_length < 0:
                clade2.branch_length = 0

            # update node list
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (
                        dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j]
                    ) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)
            del dm[min_i]

        # set the last clade as one of the child of the inner_clade
        root = None
        if clades[0] == inner_clade:
            clades[0].branch_length = 0
            clades[1].branch_length = dm[1, 0]
            clades[0].clades.append(clades[1])
            root = clades[0]
        else:
            clades[0].branch_length = dm[1, 0]
            clades[1].branch_length = 0
            clades[1].clades.append(clades[0])
            root = clades[1]

        return BaseTree.Tree(root, rooted=False)

    cells = list(df_input.index)
    snvs = list(df_input.columns)
    I_mtr = df_input.values

    s_time = time.time()
    Ip = np.vstack([I_mtr, np.zeros(I_mtr.shape[1])])  # add root with profile zero
    scp.logg.debug("now calculating distance!", time=True)
    dist = scp.ul.dist_l1_ignore_na(Ip)
    scp.logg.debug("distance is done!", time=True)
    opt_tree = get_initial_tree(dist)
    # opt_subtrees = get_subtrees(opt_tree)
    # opt_O, opt_cost = denoise_quadratic(I, alpha, beta, opt_subtrees)
    scp.logg.debug("init tree!", time=True)
    opt_O, opt_cost = denoise_linear(I_mtr, alpha, beta, opt_tree)
    scp.logg.info("current best cost =", opt_cost, time=True)

    n_iter = 1
    is_done = False
    already_seen = set()
    already_seen.add(str(opt_tree))
    while not is_done and n_iter < n_iters:
        is_done = True
        neighbors = get_neighbors(opt_tree)
        for nbr_tree in neighbors:
            if str(nbr_tree) in already_seen:
                continue
            else:
                already_seen.add(str(nbr_tree))
            # nbr_subtrees = get_subtrees(nbr_tree)
            # nbr_O, nbr_cost = denoise_quadratic(I, alpha, beta, nbr_subtrees)
            nbr_O, nbr_cost = denoise_linear(I_mtr, alpha, beta, nbr_tree)
            if nbr_cost < opt_cost:
                opt_tree = nbr_tree
                # opt_subtrees = nbr_subtrees
                opt_O = nbr_O
                opt_cost = nbr_cost
                is_done = False
                scp.logg.info("current best cost =", nbr_cost, time=True)
        n_iter += 1
    e_time = time.time()
    running_time = e_time - s_time

    df_output = pd.DataFrame(opt_O)
    df_output.columns = snvs
    df_output.index = cells
    df_output.index.name = "cellIDxmutID"

    scp.ul.stat(df_input, df_output, alpha, beta, running_time)

    return df_output