Source code for scphylo.pp._bifiltering

import math

import numpy as np

import scphylo as scp


[docs]def bifiltering(df, cellr, mutr, time_limit=3600): """Bi-filtering to find maximally inforemed submatrix. This function runs an ILP to find maximally inforemed submatrix where the number of mutant genotypes is maximized. Parameters ---------- df : :class:`pandas.DataFrame` The input noisy genotype matrix where entries are 0,1 and 3. cellr : :obj:`float` ratio for picking how many cells. mutr : :obj:`float` ratio for picking how many mutations. time_limit : :obj:`int`, optional Time limit for the ILP solver, by default 3600. Returns ------- :class:`pandas.DataFrame` The output gentoype submatrix. """ gp, gp_is_not_imported = scp.ul.import_gurobi() if gp_is_not_imported: scp.logg.error("Unable to import a package!") M = df.values.copy() # mutant and reference M[M != 3] = 1 M[M == 3] = -1 n, m = M.shape n_cells, n_sites = math.floor(cellr * n), math.floor(mutr * m) model = gp.Model("bi-filtering") model.Params.OutputFlag = 0 model.Params.LogFile = "" model.Params.Threads = 1 model.Params.TimeLimit = time_limit scp.logg.info("1. declaring variables", time=True) A, C, S = {}, {}, {} for i in range(n): A[i] = {} for j in range(m): A[i][j] = model.addVar(vtype=gp.GRB.BINARY) for i in range(n): C[i] = model.addVar(vtype=gp.GRB.BINARY) for j in range(m): S[j] = model.addVar(vtype=gp.GRB.BINARY) scp.logg.info("2. adding constraints", time=True) for i in range(n): for j in range(m): model.addConstr(A[i][j] <= C[i]) model.addConstr(A[i][j] <= S[j]) model.addConstr(C[i] + S[j] - 1 <= A[i][j]) model.addConstr(gp.quicksum(C[i] for i in range(n)) == n_cells) model.addConstr(gp.quicksum(S[j] for j in range(m)) == n_sites) model.update() scp.logg.info("3. adding objective", time=True) model.setObjective( gp.quicksum(A[i][j] * M[i, j] for i in range(n) for j in range(m)), gp.GRB.MAXIMIZE, ) scp.logg.info("4. start solving", time=True) model.optimize() a = np.array([i for i in range(n) if C[i].X > 0]) b = np.array([j for j in range(m) if S[j].X > 0]) cells = df.index[a] sites = df.columns[b] return df.loc[cells, sites].copy()