"""Data warngling and statistics for MICOM."""
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, kruskal, spearmanr, pearsonr
from .workflows import workflow
[docs]def fdr_adjust(p):
"""Get FDR cutoffs for p-values with Benjamini-Hochberg.
Arguments
---------
p : list[float]
The original p-values. Can not contain naNs.
Returns
-------
A numpy array of FDR cutoffs (q-values). This is commonly known as "adjusted"
p-values.
"""
p = np.array(p)
if np.isnan(p).any():
raise ValueError("`p` can not contain NAs. Please filter those before.")
order = np.argsort(-p)
reverse_order = np.argsort(order)
n = len(p)
i = np.arange(n, 0, -1)
q = np.minimum.accumulate(n / i * p[order])[reverse_order]
q[q > 1.0] = 1.0
return q
[docs]def _run_test(args):
df, col, groups = args
met = df.metabolite.iloc[0]
if len(groups) == 2:
lfc = np.log2(df.flux[df[col] == groups[1]].mean() + 1e-6) - np.log2(
df.flux[df[col] == groups[0]].mean() + 1e-6
)
try:
p = mannwhitneyu(
df.flux[df[col] == groups[0]], df.flux[df[col] == groups[1]]
)[1]
except Exception:
p = np.nan
return pd.DataFrame(
{
"metabolite": met,
"log_fold_change": lfc,
"p": p,
"n": df.shape[0],
},
index=[0],
)
elif len(groups) > 2:
lstd = df.groupby(col).flux.apply(lambda x: np.log2(x.mean() + 1e-6)).std()
try:
gs = [df.flux[df[col] == c] for c in groups]
p = kruskal(*gs)[1]
except Exception:
p = np.nan
return pd.DataFrame(
{
"metabolite": met,
"log_mean_std": lstd,
"p": p,
"n": df.shape[0],
},
index=[0],
)
[docs]def _run_corr(args):
df, col = args
met = df.metabolite.iloc[0]
try:
rho, p = spearmanr(df["flux"], df[col])
except Exception:
rho, p = np.nan, np.nan
return pd.DataFrame(
{
"metabolite": met,
"spearman_rho": rho,
"p": p,
"n": df.shape[0],
},
index=[0],
)
[docs]def compare_groups(fluxes, metadata_column, groups=None, threads=1, progress=True):
"""Compare fluxes form different sample groups.
Note
----
This uses a non-parametric test by default. By default it will use a
Mann-Whitney test for two groups and a Kruskal-Wallis test for >2 groups.
`q` are the FDR-corrected p-values (Benjamini-Hochberg correction).
Arguments
---------
fluxes : pandas.DataFrame
A frame with net fluxes as returned by `production_rates` or
`consumption_rates`.
metatdata_column : str
The column of the DataFrame denoting the groups.
groups : list[str] or None
Specify a subset of groups you want to compare or define the order (1st will
be the reference group). If None will use the groups as they appear in
the DataFrame.
threads : int
How many threads to use to run tests in parallel.
progress : bool
Whether to show a progress bar.
Returns
-------
Returns the metabolite with their respective test statistics.
"""
if groups is not None:
fluxes = fluxes[fluxes[metadata_column].isin(groups)]
else:
groups = np.unique(fluxes[metadata_column])
met_data = [
(fluxes[fluxes.metabolite == m], metadata_column, groups)
for m in fluxes.metabolite.unique()
]
res = workflow(_run_test, met_data, threads=threads, progress=progress)
res = pd.concat(res)
if len(groups) == 2:
res["comparison"] = f"{groups[1]} vs. {groups[0]}"
else:
res["covariate"] = metadata_column
res["q"] = np.nan
res.loc[res.p.notnull(), "q"] = fdr_adjust(res.loc[res.p.notnull(), "p"])
return res
[docs]def correlate_fluxes(fluxes, metadata_column, threads=1, progress=True):
"""Correlate fluxes with a continuous covariate.
Note
----
This uses a non-parametric test by default (Spearman rank correlation). `q`
are the FDR-corrected p-values (Benjamini-Hochberg correction).
Arguments
---------
fluxes : pandas.DataFrame
A frame with net fluxes as returned by `production_rates` or
`consumption_rates`.
metatdata_column : str
The column of the DataFrame denoting the covariate.
threads : int
How many threads to use to run tests in parallel.
progress : bool
Whether to show a progress bar.
Returns
-------
Returns the metabolite with their respective test statistics.
"""
met_data = [
(fluxes[fluxes.metabolite == m], metadata_column)
for m in fluxes.metabolite.unique()
]
res = workflow(_run_corr, met_data, threads=threads, progress=progress)
res = pd.concat(res)
res["covariate"] = metadata_column
res["q"] = np.nan
res.loc[res.p.notnull(), "q"] = fdr_adjust(res.loc[res.p.notnull(), "p"])
return res