Source code for pymodulon.enrichment

"""
Contains functions for gene set enrichment analysis
"""

import itertools
import logging

import numpy as np
import pandas as pd
from scipy import special, stats
from statsmodels.stats.multitest import fdrcorrection


[docs]def contingency(set1, set2, all_genes): """ Creates contingency table for gene enrichment Parameters ---------- set1 : set Set of genes (e.g. iModulon) set2 : set Set of genes (e.g. regulon) all_genes : set Set of all genes Returns ------- np.ndarray Contingency table """ set1 = set(set1) set2 = set(set2) all_genes = set(all_genes) if len(set1 - all_genes) > 0 or len(set2 - all_genes) > 0: raise ValueError("Gene sets contain genes not in all_genes") tp = len(set1 & set2) fp = len(set1 - set2) tn = len(all_genes - set1 - set2) fn = len(set2 - set1) return np.array([[tp, fp], [fn, tn]])
[docs]def compute_enrichment(gene_set, target_genes, all_genes, label=None): """ Computes enrichment statistic for gene_set in target_genes. Parameters ---------- gene_set : list Gene set for enrichment (e.g. genes in iModulon) target_genes : list Genes to be enriched against (e.g. genes in regulon or GO term) all_genes : list Set of all genes label : list Label for target_genes (e.g. regulator name or GO term) Returns ------- pd.Series Table containing statistically significant enrichments """ # Create contingency table ((tp, fp), (fn, tn)) = contingency(gene_set, target_genes, all_genes) # Handle edge cases if tp == 0: res = [1, 0, 0, 0, 0, len(target_genes), len(gene_set)] elif fp == 0 and fn == 0: res = [0, 1, 1, 1, len(gene_set), len(target_genes), len(gene_set)] else: odds, pval = stats.fisher_exact([[tp, fp], [fn, tn]], alternative="greater") recall = np.true_divide(tp, tp + fn) precision = np.true_divide(tp, tp + fp) f1score = (2 * precision * recall) / (precision + recall) res = [pval, precision, recall, f1score, tp, len(target_genes), len(gene_set)] return pd.Series( res, index=[ "pvalue", "precision", "recall", "f1score", "TP", "target_set_size", "gene_set_size", ], name=label,
)
[docs]def FDR(p_values, fdr, total=None): """ Runs false detection correction for a table of statistics Parameters ---------- p_values : ~pandas.DataFrame DataFrame with a 'pvalue' column fdr : float False detection rate total : int Total number of tests (for multi-enrichment) Returns ------- ~pandas.DataFrame Table containing entries that passed multiple hypothesis correction """ if total is not None: pvals = p_values.pvalue.values.tolist() + [1] * (total - len(p_values)) else: pvals = p_values.pvalue.values keep, qvals = fdrcorrection(pvals, alpha=fdr) result = p_values.copy() result["qvalue"] = qvals[: len(p_values)] result = result[keep[: len(p_values)]] return result.sort_values("qvalue")
[docs]def parse_regulon_str(regulon_str, trn): """ Converts a complex regulon (regulon_str) into a list of genes Parameters ---------- regulon_str : str Complex regulon, where "/" uses genes in any regulon and "+" uses genes in all regulons trn : ~pandas.DataFrame Table containing transcriptional regulatory network Returns ------- reg_genes : set Set of genes regulated by regulon_str """ if regulon_str == "": return set() if "+" in regulon_str and "/" in regulon_str: raise NotImplementedError( 'Complex regulons cannot contain both "+" ' '(AND) and "/" (OR) operators' ) elif "+" in regulon_str: join = set.intersection regs = regulon_str.split("+") elif "/" in regulon_str: join = set.union regs = regulon_str.split("/") else: join = set.union regs = [regulon_str] # Combine regulon reg_genes = join(*[set(trn[trn.regulator == reg].gene_id) for reg in regs]) return reg_genes
[docs]def compute_regulon_enrichment(gene_set, regulon_str, all_genes, trn): """ Computes enrichment statistics for a gene_set in a regulon Parameters ---------- gene_set : set Gene set for enrichment (e.g. genes in iModulon) regulon_str : str Complex regulon, where "/" uses genes in any regulon and "+" uses genes in all regulons all_genes : set Set of all genes trn : ~pandas.DataFrame Table containing transcriptional regulatory network Returns ------- result : ~pandas.DataFrame Table containing statistically significant enrichments """ regulon = parse_regulon_str(regulon_str, trn) # Remove genes in regulon that are not in all_genes if len(regulon - set(all_genes)) > 0: logging.warning( "Some genes are in the regulon but not in all_genes. " "These genes are removed before enrichment analysis.", category=UserWarning, ) regulon = regulon & set(all_genes) result = compute_enrichment(gene_set, regulon, all_genes, regulon_str) result.rename({"target_set_size": "regulon_size"}, inplace=True) n_regs = 1 + regulon_str.count("+") + regulon_str.count("/") result["n_regs"] = n_regs return result
[docs]def compute_trn_enrichment( gene_set, all_genes, trn, max_regs=1, fdr=0.01, method="both", force=False ): """ Compare a gene set against an entire TRN Parameters ---------- gene_set : set Gene set for enrichment (e.g. genes in iModulon) all_genes : set Set of all genes trn : ~pandas.DataFrame Table containing transcriptional regulatory network max_regs : int Maximum number of regulators to include in complex regulon (default: 1) fdr : float False detection rate (default = .01) method : str How to combine complex regulons. (default: 'both') "or" computes enrichment against union of regulons "and" computes enrichment against intersection of regulons "both" performs both tests force : bool Allows computation of >2 regulators (default = False) Returns ------- ~pandas.DataFrame Table containing statistically significant enrichments """ # Warning if max_regs is too high if max_regs > 2 and not force: raise ValueError( "Using >2 maximum regulators may take time to compute. " "To perform analysis, use force=True" ) # Only search for regulators known to regulate a gene in gene_set # This reduces the total runtime by skipping unnecessary tests # However, this needs to be taken into account for FDR imod_regs = trn[trn.gene_id.isin(gene_set)].regulator.unique() # # Account for imodulons with no regulators # if len(imod_regs) == 0: # return compute_regulon_enrichment(gene_set, '', all_genes, trn) # Prepare complex regulon names enrich_list = [] total = 0 # Perform enrichments for 1 regulator for reg in imod_regs: enrich_list.append(compute_regulon_enrichment(gene_set, reg, all_genes, trn)) total += len(imod_regs) # Perform enrichments for >1 regulator for n_regs in range(2, max_regs + 1): group1, group2 = itertools.tee(itertools.combinations(imod_regs, n_regs)) num_tests = int(special.comb(len(trn.regulator.unique()), n_regs)) if method == "and": reg_list = ["+".join(regs) for regs in group1] total += num_tests elif method == "or": reg_list = ["/".join(regs) for regs in group1] total += num_tests elif method == "both": reg_list = ["+".join(regs) for regs in group1] + [ "/".join(regs) for regs in group2 ] total += 2 * num_tests else: raise ValueError("'method' must be either 'and', 'or', or 'both'") # Perform enrichments for reg in reg_list: enrich_list.append( compute_regulon_enrichment(gene_set, reg, all_genes, trn) ) if len(enrich_list) == 0: return pd.DataFrame() df_enrich = pd.concat(enrich_list, axis=1, sort=False).T return FDR(df_enrich, fdr=fdr, total=total)
[docs]def compute_annotation_enrichment(gene_set, all_genes, annotation, column, fdr=0.01): """ Compare a gene set against a dataframe of gene annotations Parameters ---------- gene_set : set Gene set for enrichment (e.g. genes in iModulon) all_genes : set Set of all genes annotation : ~pandas.DataFrame Table containing gene annotations column : str Name of column in the annotation DataFrame (default: 'annotation') fdr : float False detection rate (default: 0.01) Returns ------- pandas.DataFrame Table containing statistically significant enrichments """ # TODO: Create test functions enrich_list = [] for name, group in annotation.groupby(column): target_genes = group["gene_id"] enrich_list.append( compute_enrichment(gene_set, target_genes, all_genes, label=name) ) df_enrich = pd.concat(enrich_list, axis=1).T return FDR(df_enrich, fdr=fdr)