Source code for pymodulon.enrichment

"""
Contains functions for gene set enrichment analysis
"""

import itertools
import logging

import numpy as np
import pandas as pd
from scipy import special, stats
from statsmodels.stats.multitest import fdrcorrection


[docs]def contingency(set1, set2, all_genes):
    """
    Creates contingency table for gene enrichment

    Parameters
    ----------
    set1 : set
        Set of genes (e.g. iModulon)
    set2 : set
        Set of genes (e.g. regulon)
    all_genes : set
        Set of all genes

    Returns
    -------
    np.ndarray
        Contingency table

    """

    set1 = set(set1)
    set2 = set(set2)
    all_genes = set(all_genes)
    if len(set1 - all_genes) > 0 or len(set2 - all_genes) > 0:
        raise ValueError("Gene sets contain genes not in all_genes")

    tp = len(set1 & set2)
    fp = len(set1 - set2)
    tn = len(all_genes - set1 - set2)
    fn = len(set2 - set1)
    return np.array([[tp, fp], [fn, tn]])


[docs]def compute_enrichment(gene_set, target_genes, all_genes, label=None):
    """
    Computes enrichment statistic for gene_set in target_genes.

    Parameters
    ----------
    gene_set : list
        Gene set for enrichment (e.g. genes in iModulon)
    target_genes : list
        Genes to be enriched against (e.g. genes in regulon or
            GO term)
    all_genes : list
        Set of all genes
    label : list
        Label for target_genes (e.g. regulator name or GO term)

    Returns
    -------
    pd.Series
        Table containing statistically significant enrichments
    """

    # Create contingency table
    ((tp, fp), (fn, tn)) = contingency(gene_set, target_genes, all_genes)

    # Handle edge cases
    if tp == 0:
        res = [1, 0, 0, 0, 0, len(target_genes), len(gene_set)]
    elif fp == 0 and fn == 0:
        res = [0, 1, 1, 1, len(gene_set), len(target_genes), len(gene_set)]
    else:
        odds, pval = stats.fisher_exact([[tp, fp], [fn, tn]], alternative="greater")
        recall = np.true_divide(tp, tp + fn)
        precision = np.true_divide(tp, tp + fp)
        f1score = (2 * precision * recall) / (precision + recall)
        res = [pval, precision, recall, f1score, tp, len(target_genes), len(gene_set)]

    return pd.Series(
        res,
        index=[
            "pvalue",
            "precision",
            "recall",
            "f1score",
            "TP",
            "target_set_size",
            "gene_set_size",
        ],
        name=label,
    )


[docs]def FDR(p_values, fdr, total=None):
    """
    Runs false detection correction for a table of statistics

    Parameters
    ----------
    p_values : ~pandas.DataFrame
        DataFrame with a 'pvalue' column
    fdr : float
        False detection rate
    total : int
        Total number of tests (for multi-enrichment)

    Returns
    -------
    ~pandas.DataFrame
        Table containing entries that passed multiple hypothesis correction
    """

    if total is not None:
        pvals = p_values.pvalue.values.tolist() + [1] * (total - len(p_values))
    else:
        pvals = p_values.pvalue.values

    keep, qvals = fdrcorrection(pvals, alpha=fdr)

    result = p_values.copy()
    result["qvalue"] = qvals[: len(p_values)]
    result = result[keep[: len(p_values)]]
    return result.sort_values("qvalue")


[docs]def parse_regulon_str(regulon_str, trn):
    """
    Converts a complex regulon (regulon_str) into a list of genes

    Parameters
    ----------
    regulon_str : str
        Complex regulon, where "/" uses genes in any regulon and "+" uses
        genes in all regulons
    trn : ~pandas.DataFrame
        Table containing transcriptional regulatory network

    Returns
    -------
    reg_genes : set
        Set of genes regulated by regulon_str
    """

    if regulon_str == "":
        return set()
    if "+" in regulon_str and "/" in regulon_str:
        raise NotImplementedError(
            'Complex regulons cannot contain both "+" ' '(AND) and "/" (OR) operators'
        )
    elif "+" in regulon_str:
        join = set.intersection
        regs = regulon_str.split("+")
    elif "/" in regulon_str:
        join = set.union
        regs = regulon_str.split("/")
    else:
        join = set.union
        regs = [regulon_str]

    # Combine regulon
    reg_genes = join(*[set(trn[trn.regulator == reg].gene_id) for reg in regs])
    return reg_genes


[docs]def compute_regulon_enrichment(gene_set, regulon_str, all_genes, trn):
    """
    Computes enrichment statistics for a gene_set in a regulon

    Parameters
    ----------
    gene_set : set
        Gene set for enrichment (e.g. genes in iModulon)
    regulon_str : str
        Complex regulon, where "/" uses genes in any regulon and "+" uses
        genes in all regulons
    all_genes : set
        Set of all genes
    trn : ~pandas.DataFrame
        Table containing transcriptional regulatory network

    Returns
    -------
    result : ~pandas.DataFrame
        Table containing statistically significant enrichments
    """

    regulon = parse_regulon_str(regulon_str, trn)
    # Remove genes in regulon that are not in all_genes
    if len(regulon - set(all_genes)) > 0:
        logging.warning(
            "Some genes are in the regulon but not in all_genes. "
            "These genes are removed before enrichment analysis.",
            category=UserWarning,
        )
        regulon = regulon & set(all_genes)
    result = compute_enrichment(gene_set, regulon, all_genes, regulon_str)
    result.rename({"target_set_size": "regulon_size"}, inplace=True)
    n_regs = 1 + regulon_str.count("+") + regulon_str.count("/")
    result["n_regs"] = n_regs
    return result


[docs]def compute_trn_enrichment(
    gene_set, all_genes, trn, max_regs=1, fdr=0.01, method="both", force=False
):
    """
    Compare a gene set against an entire TRN

    Parameters
    ----------
    gene_set : set
        Gene set for enrichment (e.g. genes in iModulon)
    all_genes : set
        Set of all genes
    trn : ~pandas.DataFrame
        Table containing transcriptional regulatory network
    max_regs : int
        Maximum number of regulators to include in complex regulon (default: 1)
    fdr : float
        False detection rate (default = .01)
    method : str
        How to combine complex regulons. (default: 'both')
        "or" computes enrichment against union of regulons
        "and" computes enrichment against intersection of regulons
        "both" performs both tests
    force : bool
        Allows computation of >2 regulators (default = False)

    Returns
    -------
    ~pandas.DataFrame
        Table containing statistically significant enrichments
    """

    # Warning if max_regs is too high
    if max_regs > 2 and not force:
        raise ValueError(
            "Using >2 maximum regulators may take time to compute. "
            "To perform analysis, use force=True"
        )

    # Only search for regulators known to regulate a gene in gene_set
    # This reduces the total runtime by skipping unnecessary tests
    # However, this needs to be taken into account for FDR
    imod_regs = trn[trn.gene_id.isin(gene_set)].regulator.unique()

    # # Account for imodulons with no regulators
    # if len(imod_regs) == 0:
    #     return compute_regulon_enrichment(gene_set, '', all_genes, trn)

    # Prepare complex regulon names
    enrich_list = []
    total = 0

    # Perform enrichments for 1 regulator
    for reg in imod_regs:
        enrich_list.append(compute_regulon_enrichment(gene_set, reg, all_genes, trn))
    total += len(imod_regs)

    # Perform enrichments for >1 regulator
    for n_regs in range(2, max_regs + 1):
        group1, group2 = itertools.tee(itertools.combinations(imod_regs, n_regs))
        num_tests = int(special.comb(len(trn.regulator.unique()), n_regs))

        if method == "and":
            reg_list = ["+".join(regs) for regs in group1]
            total += num_tests
        elif method == "or":
            reg_list = ["/".join(regs) for regs in group1]
            total += num_tests
        elif method == "both":
            reg_list = ["+".join(regs) for regs in group1] + [
                "/".join(regs) for regs in group2
            ]
            total += 2 * num_tests
        else:
            raise ValueError("'method' must be either 'and', 'or', or 'both'")

        # Perform enrichments
        for reg in reg_list:
            enrich_list.append(
                compute_regulon_enrichment(gene_set, reg, all_genes, trn)
            )

    if len(enrich_list) == 0:
        return pd.DataFrame()
    df_enrich = pd.concat(enrich_list, axis=1, sort=False).T
    return FDR(df_enrich, fdr=fdr, total=total)


[docs]def compute_annotation_enrichment(gene_set, all_genes, annotation, column, fdr=0.01):
    """
    Compare a gene set against a dataframe of gene annotations

    Parameters
    ----------
    gene_set : set
        Gene set for enrichment (e.g. genes in iModulon)
    all_genes : set
        Set of all genes
    annotation : ~pandas.DataFrame
        Table containing gene annotations
    column : str
        Name of column in the annotation DataFrame (default: 'annotation')
    fdr : float
        False detection rate (default: 0.01)

    Returns
    -------
    pandas.DataFrame
       Table containing statistically significant enrichments
    """

    # TODO: Create test functions
    enrich_list = []
    for name, group in annotation.groupby(column):
        target_genes = group["gene_id"]
        enrich_list.append(
            compute_enrichment(gene_set, target_genes, all_genes, label=name)
        )
    df_enrich = pd.concat(enrich_list, axis=1).T
    return FDR(df_enrich, fdr=fdr)