Source code for micom.stats

"""Data warngling and statistics for MICOM."""

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, kruskal, spearmanr, pearsonr
from .workflows import workflow



[docs]
def fdr_adjust(p):
    """Get FDR cutoffs for p-values with Benjamini-Hochberg.

    Arguments
    ---------
    p : list[float]
        The original p-values. Can not contain naNs.

    Returns
    -------
    A numpy array of FDR cutoffs (q-values). This is commonly known as "adjusted"
    p-values.
    """
    p = np.array(p)
    if np.isnan(p).any():
        raise ValueError("`p` can not contain NAs. Please filter those before.")
    order = np.argsort(-p)
    reverse_order = np.argsort(order)
    n = len(p)
    i = np.arange(n, 0, -1)
    q = np.minimum.accumulate(n / i * p[order])[reverse_order]
    q[q > 1.0] = 1.0
    return q




[docs]
def _run_test(args):
    df, col, groups = args
    met = df.metabolite.iloc[0]
    if len(groups) == 2:
        lfc = np.log2(df.flux[df[col] == groups[1]].mean() + 1e-6) - np.log2(
            df.flux[df[col] == groups[0]].mean() + 1e-6
        )
        try:
            p = mannwhitneyu(
                df.flux[df[col] == groups[0]], df.flux[df[col] == groups[1]]
            )[1]
        except Exception:
            p = np.nan

        return pd.DataFrame(
            {
                "metabolite": met,
                "log_fold_change": lfc,
                "p": p,
                "n": df.shape[0],
            },
            index=[0],
        )
    elif len(groups) > 2:
        lstd = df.groupby(col).flux.apply(lambda x: np.log2(x.mean() + 1e-6)).std()
        try:
            gs = [df.flux[df[col] == c] for c in groups]
            p = kruskal(*gs)[1]
        except Exception:
            p = np.nan

        return pd.DataFrame(
            {
                "metabolite": met,
                "log_mean_std": lstd,
                "p": p,
                "n": df.shape[0],
            },
            index=[0],
        )




[docs]
def _run_corr(args):
    df, col = args
    met = df.metabolite.iloc[0]
    try:
        rho, p = spearmanr(df["flux"], df[col])
    except Exception:
        rho, p = np.nan, np.nan
    return pd.DataFrame(
        {
            "metabolite": met,
            "spearman_rho": rho,
            "p": p,
            "n": df.shape[0],
        },
        index=[0],
    )




[docs]
def compare_groups(fluxes, metadata_column, groups=None, threads=1, progress=True):
    """Compare fluxes form different sample groups.

    Note
    ----
    This uses a non-parametric test by default. By default it will use a
    Mann-Whitney test for two groups and a Kruskal-Wallis test for >2 groups.
    `q` are the FDR-corrected p-values (Benjamini-Hochberg correction).

    Arguments
    ---------
    fluxes : pandas.DataFrame
        A frame with net fluxes as returned by `production_rates` or
        `consumption_rates`.
    metatdata_column : str
        The column of the DataFrame denoting the groups.
    groups : list[str] or None
        Specify a subset of groups you want to compare or define the order (1st will
        be the reference group). If None will use the groups as they appear in
        the DataFrame.
    threads : int
        How many threads to use to run tests in parallel.
    progress : bool
        Whether to show a progress bar.

    Returns
    -------
    Returns the metabolite with their respective test statistics.
    """
    if groups is not None:
        fluxes = fluxes[fluxes[metadata_column].isin(groups)]
    else:
        groups = np.unique(fluxes[metadata_column])

    met_data = [
        (fluxes[fluxes.metabolite == m], metadata_column, groups)
        for m in fluxes.metabolite.unique()
    ]
    res = workflow(_run_test, met_data, threads=threads, progress=progress)
    res = pd.concat(res)
    if len(groups) == 2:
        res["comparison"] = f"{groups[1]} vs. {groups[0]}"
    else:
        res["covariate"] = metadata_column
    res["q"] = np.nan
    res.loc[res.p.notnull(), "q"] = fdr_adjust(res.loc[res.p.notnull(), "p"])
    return res




[docs]
def correlate_fluxes(fluxes, metadata_column, threads=1, progress=True):
    """Correlate fluxes with a continuous covariate.

    Note
    ----
    This uses a non-parametric test by default (Spearman rank correlation). `q`
    are the FDR-corrected p-values (Benjamini-Hochberg correction).

    Arguments
    ---------
    fluxes : pandas.DataFrame
        A frame with net fluxes as returned by `production_rates` or
        `consumption_rates`.
    metatdata_column : str
        The column of the DataFrame denoting the covariate.
    threads : int
        How many threads to use to run tests in parallel.
    progress : bool
        Whether to show a progress bar.

    Returns
    -------
    Returns the metabolite with their respective test statistics.
    """
    met_data = [
        (fluxes[fluxes.metabolite == m], metadata_column)
        for m in fluxes.metabolite.unique()
    ]
    res = workflow(_run_corr, met_data, threads=threads, progress=progress)
    res = pd.concat(res)
    res["covariate"] = metadata_column
    res["q"] = np.nan
    res.loc[res.p.notnull(), "q"] = fdr_adjust(res.loc[res.p.notnull(), "p"])
    return res