Source code for scphylo.tl.score._others

import itertools

import ete3
import networkx as nx
import numpy as np

import scphylo as scp
from scphylo.external._mp3 import build_tree, similarity
from scphylo.ul._trees import _to_newick


def bourque(df_grnd, df_sol):
    """Bourque distances for mutation trees (not implemented yet).

    This measure was introduced in :cite:`Bourque`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    None
        Placeholder: the score is not computed yet, so both arguments are
        ignored and ``None`` is always returned. The intended result is a
        :obj:`float` similarity out of one.
    """
    # TODO: implement
    return None


def pcss(df_grnd, df_sol):
    """Pairwise cell shortest-path similarity score (not implemented yet).

    For every pair of cells :math:`i` and :math:`j`, the shortest-path distance
    :math:`d_{ij}` between the two cells is computed in each tree. If the two
    cells belong to the same clone, their shortest-path distance is 0; otherwise
    it equals the number of edges (regardless of direction) that separate the
    clones of the two cells. Finally, the absolute differences between the
    shortest-path distances of all unordered pairs of cells in the two trees
    are summed up.
    This measure is a metric. The proof is given in the paper.

    This measure was introduced in :cite:`OncoNEM`.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    None
        Placeholder: the score is not computed yet, so both arguments are
        ignored and ``None`` is always returned. The intended result is a
        :obj:`float` similarity out of one.
    """
    # TODO: implement
    return None


def mp3(df_grnd, df_sol):
    """Triplet-based similarity score.

    Designed for fully multilabeled trees with poly-occurring labels.
    This measure was introduced in :cite:`MP3`.

    Both matrices are restricted to their shared mutations (columns), and
    mutations that either tree reports under ``graph["become_germline"]`` are
    dropped before the trees are rebuilt and compared.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one.
    """
    matrices = (df_grnd, df_sol)

    shared = np.intersect1d(df_grnd.columns, df_sol.columns)
    if shared.size == 0:
        scp.logg.error("No common mutations found between two trees!")

    # First pass: build both trees only to learn which mutations turn germline.
    draft_grnd, draft_sol = (scp.ul.to_tree(df[shared]) for df in matrices)
    germline = np.union1d(
        draft_grnd.graph["become_germline"], draft_sol.graph["become_germline"]
    )
    kept = np.setdiff1d(shared, germline)

    # Second pass: rebuild on the surviving mutations, then convert both trees.
    trees = [scp.ul.to_tree(df[kept]) for df in matrices]
    mtrees = [scp.ul.to_mtree(tree) for tree in trees]

    # Every non-root node gets its label collapsed to one comma-separated string.
    for mtree in mtrees:
        for node in mtree.nodes:
            if mtree.in_degree(node) > 0:
                attrs = mtree.nodes[node]
                attrs["label"] = ",".join(attrs["label"])

    mp3_grnd = build_tree(mtrees[0])
    mp3_sol = build_tree(mtrees[1])
    return similarity(mp3_grnd, mp3_sol)
def caset(df_grnd, df_sol):
    """Commonly Ancestor Sets score.

    This measure was introduced in :cite:`CASet_DISC`.

    Both matrices are restricted to their shared mutations (columns), and
    mutations that either tree reports under ``graph["become_germline"]`` are
    dropped. For every unordered pair of remaining mutations, the set of
    ancestors common to both mutations is computed in each tree; the score is
    the mean Jaccard index (intersection size over union size) of those sets
    across the two trees.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one. If fewer than two mutations remain there is no
        pair to score and the mean of an empty list (``nan``) is returned.
    """

    def _get_ancesteral_set(tree):
        # Map every mutation to the labels met on the path root -> its node.
        root = scp.ul.root_id(tree)
        ancesteral_set = {}
        for n in tree.nodes:
            if tree.in_degree(n) == 0 or "––" in tree.nodes[n]["label"]:
                continue
            # The ancestors depend only on the node, so build the list once and
            # share it between all mutations labelling that node.
            ancester_set = ["root"]
            # The path ends at ``n`` itself, so each mutation is also counted
            # as its own ancestor (self ancestor is ok).
            for m in nx.shortest_path(tree, root, n):
                if tree.in_degree(m) > 0 and "––" not in tree.nodes[m]["label"]:
                    ancester_set += tree.nodes[m]["label"]
            for mut in tree.nodes[n]["label"]:
                ancesteral_set[mut] = ancester_set
        return ancesteral_set

    def _get_common_ancesteral_set(muts, ancesteral_set):
        # Ancestors shared by both mutations of every unordered pair.
        return {
            (x, y): np.intersect1d(ancesteral_set[x], ancesteral_set[y])
            for x, y in itertools.combinations(muts, 2)
        }

    inter = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(inter) == 0:
        scp.logg.error("No common mutations found between two trees!")

    # First pass: build both trees only to learn which mutations turn germline.
    tree_grnd = scp.ul.to_tree(df_grnd[inter])
    tree_sol = scp.ul.to_tree(df_sol[inter])
    inter = np.setdiff1d(
        inter,
        np.union1d(
            tree_grnd.graph["become_germline"], tree_sol.graph["become_germline"]
        ),
    )

    # Second pass: rebuild on the surviving mutations, then convert both trees.
    tree_grnd = scp.ul.to_tree(df_grnd[inter])
    tree_sol = scp.ul.to_tree(df_sol[inter])
    tree_grnd = scp.ul.to_mtree(tree_grnd)
    tree_sol = scp.ul.to_mtree(tree_sol)

    common_ancesteral_set_grnd = _get_common_ancesteral_set(
        inter, _get_ancesteral_set(tree_grnd)
    )
    common_ancesteral_set_sol = _get_common_ancesteral_set(
        inter, _get_ancesteral_set(tree_sol)
    )

    final = []
    for pair in itertools.combinations(inter, 2):
        grnd = common_ancesteral_set_grnd[pair]
        sol = common_ancesteral_set_sol[pair]
        a = len(np.intersect1d(grnd, sol))
        # Every ancestor list starts with "root", so the union is never empty.
        b = len(np.union1d(grnd, sol))
        final.append(a / b)
    return np.mean(final)
def disc(df_grnd, df_sol):
    """Distinctly Inherited Sets score.

    This measure was introduced in :cite:`CASet_DISC`.

    Both matrices are restricted to their shared mutations (columns), and
    mutations that either tree reports under ``graph["become_germline"]`` are
    dropped. For a pair of remaining mutations ``(x, y)``, the distinctly
    inherited set holds the ancestors of ``x`` that are not ancestors of ``y``.
    The score is the mean Jaccard index (intersection size over union size) of
    those sets across the two trees.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one. If no pair can be scored, the mean of an empty
        list (``nan``) is returned.
    """

    def _get_ancesteral_set(tree):
        # Map every mutation to the labels met on the path root -> its node.
        root = scp.ul.root_id(tree)
        ancesteral_set = {}
        for n in tree.nodes:
            if tree.in_degree(n) == 0 or "––" in tree.nodes[n]["label"]:
                continue
            # The ancestors depend only on the node, so build the list once and
            # share it between all mutations labelling that node.
            ancester_set = ["root"]
            # The path ends at ``n`` itself, so each mutation is also counted
            # as its own ancestor (self ancestor is ok).
            for m in nx.shortest_path(tree, root, n):
                if tree.in_degree(m) > 0 and "––" not in tree.nodes[m]["label"]:
                    ancester_set += tree.nodes[m]["label"]
            for mut in tree.nodes[n]["label"]:
                ancesteral_set[mut] = ancester_set
        return ancesteral_set

    def _get_distinctly_inherited_set(muts, ancesteral_set):
        # Ancestors of x that are not ancestors of y, for every ordered pair.
        return {
            (x, y): np.setdiff1d(ancesteral_set[x], ancesteral_set[y])
            for x, y in itertools.permutations(muts, 2)
        }

    inter = np.intersect1d(df_grnd.columns, df_sol.columns)
    if len(inter) == 0:
        scp.logg.error("No common mutations found between two trees!")

    # First pass: build both trees only to learn which mutations turn germline.
    tree_grnd = scp.ul.to_tree(df_grnd[inter])
    tree_sol = scp.ul.to_tree(df_sol[inter])
    inter = np.setdiff1d(
        inter,
        np.union1d(
            tree_grnd.graph["become_germline"], tree_sol.graph["become_germline"]
        ),
    )

    # Second pass: rebuild on the surviving mutations, then convert both trees.
    tree_grnd = scp.ul.to_tree(df_grnd[inter])
    tree_sol = scp.ul.to_tree(df_sol[inter])
    tree_grnd = scp.ul.to_mtree(tree_grnd)
    tree_sol = scp.ul.to_mtree(tree_sol)

    distinctly_inherited_set_grnd = _get_distinctly_inherited_set(
        inter, _get_ancesteral_set(tree_grnd)
    )
    distinctly_inherited_set_sol = _get_distinctly_inherited_set(
        inter, _get_ancesteral_set(tree_sol)
    )

    final = []
    # NOTE(review): the sets exist for both orderings of every pair, but only
    # the (x, y) ordering produced by ``combinations`` is scored here. Confirm
    # against the reference definition whether (y, x) should be scored too.
    for pair in itertools.combinations(inter, 2):
        grnd = distinctly_inherited_set_grnd[pair]
        sol = distinctly_inherited_set_sol[pair]
        a = len(np.intersect1d(grnd, sol))
        b = len(np.union1d(grnd, sol))
        # FIXME: if x and y are in the same node the distinctly inherited set
        # is empty in both trees (b == 0); such pairs are skipped for now.
        if b > 0:
            final.append(a / b)
    return np.mean(final)
def rf(df_grnd, df_sol):
    """Robinson-Foulds score.

    The Robinson–Foulds or symmetric difference metric is defined as (A + B)
    where A is the number of partitions of data implied by the first tree but
    not the second tree and B is the number of partitions of data implied by
    the second tree but not the first tree (although some software
    implementations divide the RF metric by 2 and others scale the RF distance
    to have a maximum value of 1).

    Here both matrices are restricted to their shared cells (rows), each is
    converted to a Newick tree, and the two trees are compared as unrooted
    trees. The distance is scaled by its maximum possible value and turned
    into a similarity.

    Parameters
    ----------
    df_grnd : :class:`pandas.DataFrame`
        The first genotype matrix (e.g. ground truth).
        This matrix must be conflict-free.
    df_sol : :class:`pandas.DataFrame`
        The second genotype matrix (e.g. solution/inferred).
        This matrix must be conflict-free.

    Returns
    -------
    :obj:`float`
        Similarity out of one, i.e. ``1 - RF / max_RF``. When the maximum
        distance is zero (no partition can differ between the two trees) the
        trees are reported as identical and ``1.0`` is returned.
    """
    inter = np.intersect1d(df_grnd.index, df_sol.index)
    if len(inter) == 0:
        scp.logg.error("No common cells found between two trees!")

    tree_grnd = scp.ul.to_tree(df_grnd.loc[inter])
    tree_sol = scp.ul.to_tree(df_sol.loc[inter])

    nwk_grnd = _to_newick(tree_grnd)
    nwk_sol = _to_newick(tree_sol)

    # format=1 is kept from the original call; it selects how ete3 parses the
    # Newick string.
    ete_grnd = ete3.Tree(nwk_grnd, format=1)
    ete_sol = ete3.Tree(nwk_sol, format=1)

    # The first two entries of the result are the RF distance and its maximum.
    result = ete_grnd.robinson_foulds(ete_sol, unrooted_trees=True)
    rf_dist, rf_max = result[0], result[1]
    if rf_max == 0:
        # Avoid dividing by zero: with no possible difference the trees agree.
        return 1.0
    return 1 - rf_dist / rf_max