# Source code for scphylo.tl.score._ours

import time

import apted
import numpy as np

import scphylo as scp
from scphylo.external._mltd import run_mltd
from scphylo.ul._trees import _split_labels, _to_apted


def gs(df_grnd, df_sol):
    """Genotype-similarity accuracy.

    The score is one minus the mean absolute entry-wise difference between
    the two matrices, restricted to the cells (rows) and mutations (columns)
    that both matrices share.

    This measure was introduced in :cite:`SiCloneFit`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one.
    """
    muts = np.intersect1d(df_grnd.columns, df_sol.columns)
    cells = np.intersect1d(df_grnd.index, df_sol.index)
    if len(muts) == 0:
        scp.logg.error("No common mutations found between two trees!")
    if len(cells) == 0:
        scp.logg.error("No common cells found between two trees!")
    # Selecting with the same sorted labels aligns rows and columns of the two
    # matrices. Cast to float before subtracting: unsigned integer matrices
    # would wrap around (0 - 1 == 255 for uint8) and boolean matrices do not
    # support `-` at all.
    M_grnd = np.asarray(df_grnd.loc[cells, muts].values, dtype=float)
    M_sol = np.asarray(df_sol.loc[cells, muts].values, dtype=float)
    return 1 - np.abs(M_grnd - M_sol).sum() / M_grnd.size


def ad(df_grnd, df_sol):
    """Ancestor-descendent accuracy.

    For each pair of mutations in ground truth tree that are in
    ancestor-descendant relation (same nodes excluded) we check
    whether this relationship is preserved in the inferred tree.

    Two mutations are treated as an ancestor-descendant pair when their
    columns share at least one cell and have different column sums; the
    mutation with the larger column sum is the ancestor.

    This measure was introduced in :cite:`B-SCITE`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one.
    """
    inter = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(inter) == 0:
        scp.logg.error("No common mutations found between two trees!")
    M_grnd = df_grnd[inter].values
    M_sol = df_sol[inter].values

    # Column sums do not depend on the pair being inspected; compute them once
    # instead of inside the quadratic loop.
    size_grnd = M_grnd.sum(axis=0)
    size_sol = M_sol.sum(axis=0)
    n_muts = M_grnd.shape[1]

    n_adpairs = 0
    n_errors = 0
    for i in range(n_muts):
        # j == i can never qualify (equal column sums), so start at i + 1.
        for j in range(i + 1, n_muts):
            if size_grnd[i] > size_grnd[j]:
                anc, des = i, j
            elif size_grnd[j] > size_grnd[i]:
                anc, des = j, i
            else:
                # Equal sizes: same node (or unrelated), not an A-D pair.
                continue
            if not np.sum(M_grnd[:, i] * M_grnd[:, j]) > 0:
                # No shared cell in the ground truth: different lineages.
                continue
            n_adpairs += 1
            # The pair is wrong in the solution when the two mutations no
            # longer share a cell, or when the ground-truth ancestor is not
            # strictly larger than its descendant anymore.
            if (
                np.sum(M_sol[:, i] * M_sol[:, j]) == 0
                or size_sol[anc] <= size_sol[des]
            ):
                n_errors += 1
    if n_adpairs == 0:
        # NOTE(review): presumably `scp.logg.error` aborts; if it only logs,
        # the division below raises ZeroDivisionError - confirm.
        scp.logg.error("No pair of mutations exists in the given inputs!")
    return 1 - n_errors / n_adpairs


def dl(df_grnd, df_sol):
    """Different-lineage accuracy.

    For each pair of mutations in ground truth tree that are
    in different-lineages relation we check whether the same relationship
    is preserved in the inferred tree.

    Two mutations are treated as being in different lineages when both
    columns are non-empty and they do not share any cell.

    This measure was introduced in :cite:`B-SCITE`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one.
    """
    inter = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(inter) == 0:
        scp.logg.error("No common mutations found between two trees!")
    M_grnd = df_grnd[inter].values
    M_sol = df_sol[inter].values

    # Column sums do not depend on the pair being inspected; compute them once
    # instead of inside the quadratic loop.
    size_grnd = M_grnd.sum(axis=0)
    size_sol = M_sol.sum(axis=0)
    n_muts = M_grnd.shape[1]

    n_dlpairs1 = 0  # different-lineage pairs in the ground truth
    n_dlpairs2 = 0  # ... of which are still different-lineage in the solution
    for i in range(n_muts):
        if size_grnd[i] == 0:
            continue
        # j == i can never qualify: a non-empty column overlaps with itself.
        for j in range(i + 1, n_muts):
            if size_grnd[j] == 0 or np.sum(M_grnd[:, i] * M_grnd[:, j]) != 0:
                continue
            n_dlpairs1 += 1
            if (
                size_sol[i] != 0
                and size_sol[j] != 0
                and np.sum(M_sol[:, i] * M_sol[:, j]) == 0
            ):
                n_dlpairs2 += 1
    if n_dlpairs1 == 0:
        # Report the degenerate case the same way `ad` does rather than
        # falling straight into a bare ZeroDivisionError.
        # NOTE(review): presumably `scp.logg.error` aborts; if it only logs,
        # the division below still raises ZeroDivisionError - confirm.
        scp.logg.error("No pair of mutations exists in the given inputs!")
    return n_dlpairs2 / n_dlpairs1


def cc(df_grnd, df_sol):
    """Co-clustering accuracy (not implemented yet).

    For each pair of mutations in ground truth tree that are on the same node
    we check whether the same relationship is preserved in the inferred tree.

    This measure was introduced in :cite:`B-SCITE`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    None
        The measure itself is still a TODO; only the input handling below is
        carried out.
    """
    shared_muts = np.intersect1d(df_grnd.columns, df_sol.columns)
    if shared_muts.size == 0:
        scp.logg.error("No common mutations found between two trees!")

    # Both matrices restricted to the shared mutations; kept as the starting
    # point for the future implementation.
    grnd_values, sol_values = df_grnd[shared_muts].values, df_sol[shared_muts].values
    _ = (grnd_values, sol_values)

    # TODO: implement
    return


def mltd(df_grnd, df_sol):
    """Multi-labeled tree dissimilarity measure (MLTD).

    Both matrices are restricted to their shared mutations, converted to
    trees, written to temporary MLTD input files and handed to the external
    MLTD runner. The input DataFrames are left untouched.

    This measure was introduced in :cite:`MLTD`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`dict`
        {'distance', 'similarity', 'normalized_similarity'}
    """

    def _sanitize_columns(df):
        # ':' and '=' are the separators used in the MLTD input files written
        # below, so they must not appear inside mutation names. Work on a copy
        # so the caller's DataFrame keeps its original column names.
        out = df.copy()
        out.columns = out.columns.str.replace(":", "_").str.replace("=", "_")
        return out

    def _convert_tree_to_mtld_input(tree, file):
        # Writes one "<node>=<comma separated mutations>" line per edge target
        # (plus an empty-label line for a node without incoming edges),
        # followed by one "<node>:<comma separated children>" line per
        # internal node.
        with open(file, "w") as fout:
            for u, v, label in tree.edges.data("label"):
                if tree.in_degree(u) == 0:
                    # NOTE(review): emitted once per edge leaving the root, so
                    # the line repeats when the root has several children -
                    # confirm the MLTD parser tolerates this.
                    fout.write(f"{u}=\n")
                muts = label.split(tree.graph["splitter_mut"])
                fout.write(f"{v}={','.join(muts)}\n")
            for u in tree.nodes:
                children = [str(n) for n in tree.neighbors(u)]
                if len(children) == 0:
                    continue
                fout.write(f"{u}:{','.join(children)}\n")

    df_grnd = _sanitize_columns(df_grnd)
    df_sol = _sanitize_columns(df_sol)
    inter = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(inter) == 0:
        scp.logg.error("No common mutations found between two trees!")

    tree_grnd = scp.ul.to_tree(df_grnd[inter])
    tree_sol = scp.ul.to_tree(df_sol[inter])

    tmpdir = scp.ul.tmpdirsys(suffix=".mltd")
    try:
        grnd_file = f"{tmpdir.name}/grnd.in"
        sol_file = f"{tmpdir.name}/sol.in"
        _convert_tree_to_mtld_input(tree_grnd, grnd_file)
        _convert_tree_to_mtld_input(tree_sol, sol_file)
        result = run_mltd(grnd_file, sol_file)
    finally:
        # Remove the temporary directory even when writing the inputs or the
        # MLTD run itself fails.
        tmpdir.cleanup()

    if result is None:
        scp.logg.error("MLTD core failed!")

    return result


def tpted(df_grnd, df_sol):
    """Tumor phylogeny tree edit distance measure (TPTED).

    This measure was introduced in :cite:`PhISCS`. This implementation uses
    `APTED <https://github.com/JoaoFelipe/apted>`_.

    The trees are built in two passes: the first pass only collects the
    mutations each tree lists under ``graph["become_germline"]``; those are
    dropped from the shared mutation set and the trees that are actually
    compared are rebuilt from the remaining mutations.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth)
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred)
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one.
    """
    shared = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(shared) == 0:
        scp.logg.error("No common mutations found between two trees!")

    # First pass: find the mutations either tree reports as "become_germline"
    # and exclude them from the comparison.
    first_grnd = scp.ul.to_tree(df_grnd[shared])
    first_sol = scp.ul.to_tree(df_sol[shared])
    germline = np.union1d(
        first_grnd.graph["become_germline"], first_sol.graph["become_germline"]
    )
    kept = np.setdiff1d(shared, germline)

    # Second pass: the trees that are actually compared.
    tree_grnd = scp.ul.to_tree(df_grnd[kept])
    tree_sol = scp.ul.to_tree(df_sol[kept])

    mt_grnd = scp.ul.to_mtree(tree_grnd)
    mt_sol = scp.ul.to_mtree(tree_sol)

    sl_grnd, sl_sol = _split_labels(mt_grnd, mt_sol)

    tree1 = apted.helpers.Tree.from_text(_to_apted(sl_grnd))
    tree2 = apted.helpers.Tree.from_text(_to_apted(sl_sol))

    ed = apted.APTED(tree1, tree2).compute_edit_distance()

    # FIXME: `python -m apted -t {a{b}{c}} {a{c}{b}}` returns 2!
    # looks like the above isomorphic trees have TED of 2!
    # The edit distance is scaled by twice the number of compared mutations
    # plus one before being turned into a similarity.
    return 1 - ed / (2 * (len(kept) + 1))