Source code for pymodulon.plotting

"""
Plotting functions for iModulons
"""
import logging
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from adjustText import adjust_text
from matplotlib.patches import Rectangle
from scipy import sparse, stats
from scipy.optimize import OptimizeWarning, curve_fit
from sklearn.base import clone
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, silhouette_samples, silhouette_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

from pymodulon.compare import convert_gene_index
from pymodulon.enrichment import parse_regulon_str
from pymodulon.util import _parse_sample, dima, explained_variance, mutual_info_distance


#############
# Bar Plots #
#############


[docs]def barplot( values, sample_table, ylabel="", projects=None, highlight=None, ax=None, legend_kwargs=None, ): """ Creates an overlaid scatter and barplot for a set of values (either gene expression levels or iModulon activities) Parameters ---------- values: ~pandas.Series List of `values` to plot sample_table: ~pandas.DataFrame Sample table from :class:`~pymodulon.core.IcaData` object ylabel: str, optional Y-axis label projects: list or str, optional Name(s) of `projects` to show (default: show all) highlight: list or str, optional Project(s) to `highlight` (default: None) ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the barplot """ # Remove extra projects if isinstance(projects, str): projects = [projects] if projects is not None and len(projects) == 1: highlight = projects if projects is not None and "project" in sample_table: sample_table = sample_table[sample_table.project.isin(projects)] values = values[sample_table.index] if ax is None: figsize = (len(values) / 15 + 0.5, 2) fig, ax = plt.subplots(figsize=figsize) # Get ymin and max ymin = values.min() ymax = values.max() yrange = ymax - ymin ymax = max(1, max(ymax * 1.1, ymax + yrange * 0.1)) ymin = min(-1, min(ymin * 1.1, ymin - yrange * 0.1)) yrange = ymax - ymin # Add project-specific information if "project" in sample_table.columns and "condition" in sample_table.columns: # Sort data by project/condition to ensure replicates are together metadata = sample_table.loc[:, ["project", "condition"]] metadata = metadata.sort_values(["project", "condition"]) metadata["name"] = metadata.project + " - " + metadata.condition.astype(str) # Coerce highlight to iterable if highlight is None: highlight = [] elif isinstance(highlight, str): highlight = [highlight] # Get X and Y values for scatter points metadata["y"] = values metadata["x"] = np.cumsum(~metadata[["name"]].duplicated()) # Get heights for barplot bar_vals = metadata.groupby("x").mean() # Add colors and names bar_vals["name"] = metadata.drop_duplicates("name").name.values bar_vals["project"] = metadata.drop_duplicates("name").project.values # Plot bars for highlighted samples color_vals = bar_vals[bar_vals.project.isin(highlight)] color_cycle = [ "tab:red", "tab:orange", "tab:green", "tab:purple", "tab:brown", "tab:pink", "tab:gray", "tab:olive", "tab:cyan", ] i = 0 for name, group in color_vals.groupby("name"): ax.bar( group.index, group.y, color=color_cycle[i], width=1, linewidth=0, align="edge", zorder=1, label=name, ) i = (i + 1) % len(color_cycle) # Plot bars for non-highlighted samples other_vals = bar_vals[~bar_vals.project.isin(highlight)] ax.bar( other_vals.index, other_vals.y, color="tab:blue", width=1, linewidth=0, align="edge", zorder=1, label=None, ) ax.scatter(metadata.x + 0.5, metadata.y, color="k", zorder=2, s=10) # Get project names and sizes projects = metadata.project.drop_duplicates() md_cond = metadata.drop_duplicates(["name"]) project_sizes = [len(md_cond[md_cond.project == proj]) for proj in projects] nbars = len(md_cond) # Draw lines to discriminate between projects proj_lines = np.cumsum([1] + project_sizes) ax.vlines(proj_lines, ymin, ymax, colors="lightgray", linewidth=1) # Add project names texts = [] start = 2 for proj, size in zip(projects, project_sizes): x = start + size / 2 texts.append( ax.text( x, ymin - yrange * 0.02, proj, ha="right", va="top", rotation=45 ) ) start += size # Add legend if not color_vals.empty: kwargs = { "bbox_to_anchor": (1, 1), "ncol": len(color_vals.name.unique()) // 6 + 1, } if legend_kwargs is not None: kwargs.update(legend_kwargs) ax.legend(**kwargs) else: logging.warning("Missing `project` and `condition` columns in `sample_table.`") ax.bar(range(len(values)), values, width=1, align="edge") nbars = len(values) # Set axis limits xmin = -0.5 xmax = nbars + 2.5 ax.set_xlim(xmin, xmax) ax.set_ylim(ymin, ymax) # Axis labels ax.set_ylabel(ylabel, fontsize=12) ax.set_xticks([]) # X-axis ax.hlines(0, xmin, xmax, color="k") return ax
[docs]def plot_expression( ica_data, gene, projects=None, highlight=None, ax=None, legend_kwargs=None ): """ Creates a barplot showing an gene's expression across the compendium Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object gene: str `Gene` locus tag or name projects: str or list, optional Name(s) of `projects` to show (default: show all) highlight: str or list, optional Name(s) of projects to `highlight` (default: None) ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the barplot """ # Check that gene exists if gene in ica_data.X.index: values = ica_data.X.loc[gene] label = "{} Expression".format(gene) else: locus = ica_data.name2num(gene) values = ica_data.X.loc[locus] label = "${}$ Expression".format(gene) return barplot( values=values, sample_table=ica_data.sample_table, ylabel=label, projects=projects, highlight=highlight, ax=ax, legend_kwargs=legend_kwargs,
)
[docs]def plot_activities( ica_data, imodulon, projects=None, highlight=None, ax=None, legend_kwargs=None ): """ Creates a barplot showing an iModulon's activity across the compendium Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon: int or str `iModulon` name projects: list or str, optional Name(s) of `projects` to show (default: show all) highlight: str or list, optional Name(s) of projects to `highlight` (default: None) ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the barplot """ # Check that iModulon exists if imodulon in ica_data.A.index: values = ica_data.A.loc[imodulon] else: raise ValueError(f"iModulon does not exist: {imodulon}") label = "{} iModulon\nActivity".format(imodulon) return barplot( values=values, sample_table=ica_data.sample_table, ylabel=label, projects=projects, highlight=highlight, ax=ax, legend_kwargs=legend_kwargs,
)
[docs]def plot_metadata( ica_data, column, projects=None, highlight=None, ax=None, legend_kwargs=None ): """ Creates a barplot for values in the sample table Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object column : str `Column` name to plot projects: list or str, optional Name(s) of `projects` to show (default: show all) highlight: str or list, optional Name(s) of projects to `highlight` (default: None) ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the barplot """ # Check that column exists if column in ica_data.sample_table.columns: # Make sure the column is filled with numbers if not pd.api.types.is_numeric_dtype(ica_data.sample_table[column]): raise ValueError("Metadata column {} is not numeric".format(column)) # Remove null values table = ica_data.sample_table[ica_data.sample_table[column].notnull()] values = ica_data.sample_table[column] else: raise ValueError("Column not in sample table: {}".format(column)) return barplot( values=values, sample_table=table, ylabel=column, projects=projects, highlight=highlight, ax=ax, legend_kwargs=legend_kwargs,
)
[docs]def plot_regulon_histogram( ica_data, imodulon, regulator=None, bins=None, kind="overlap", ax=None, hist_label=("Not regulated", "Regulon Genes"), color=("#aaaaaa", "salmon"), alpha=0.7, ax_font_kwargs=None, legend_kwargs=None, ): """ Plots a histogram of regulon vs non-regulon genes by iModulon weighting. Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon : int or str `iModulon` name regulator: str, optional Name of `regulator` to compare enrichment against. If no regulator is given, choose top enrichment from :attr:`~pymodulon.core.IcaData.imodulon_table` bins: int, str or list, optional The `bins` to use when generating the histogram. Passed on to :func:`matplotlib.pyplot.hist` kind: 'overlap' or 'side', optional Whether to plot an overlapping or side-by-side comparison histogram ( default: 'overlap') ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes hist_label: tuple, optional The label to use when plotting the regulon and non-regulon genes. Takes into a tuple of 2 values (first for non-regulon genes, second for regulon genes). Passed on to :func:`matplotlib.pyplot.hist` color: list or str, optional The colors to use for regulon and non-regulon genes. Takes a Sequence of 2 values (first for non-regulon genes, second for regulon genes). Passed on to :func:`matplotlib.pyplot.hist` alpha: float, optional Sets the opacity of the histogram (0 = transparent, 1 = opaque). Passed on to :func:`matplotlib.pyplot.hist` ax_font_kwargs: dict, optional Additional keyword arguments for axes labels legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the histogram """ # Check that iModulon exists if imodulon not in ica_data.M.columns: raise ValueError(f"iModulon does not exist: {imodulon}") # If ax is None, create ax on which to generate histogram if ax is None: fig, ax = plt.subplots() # If bins is None, generate optimal number of bins if bins is None: bin_arr = _mod_freedman_diaconis(ica_data, imodulon) else: bin_arr = bins # If no TRN in IcaData, regulon genes cannot be determined and plotted if ica_data.trn.empty: reg = None # If regulator is given, use it to find regulon genes elif regulator is not None: reg = regulator # If regulator is not given, use imodulon_table to find regulator elif not ica_data.imodulon_table.empty: reg = ica_data.imodulon_table.loc[imodulon, "regulator"] if pd.isna(reg): reg = None # If no imodulon_table in IcaData, compute trn enrichment to find the # regulator. Note that trn enrichment is computed using `max_regs` = 1 else: df_enriched = ica_data.compute_trn_enrichment(imodulons=imodulon) df_top_enrich = df_enriched.sort_values( ["imodulon", "qvalue", "n_regs"] ).drop_duplicates("imodulon") try: reg = df_top_enrich.set_index("imodulon").loc[imodulon, "regulator"] except KeyError: reg = None # Use regulator value to find regulon genes if reg is not None: reg_genes = parse_regulon_str(reg, ica_data.trn) else: reg_genes = set() # Handle custom kwargs if ax_font_kwargs is None: ax_font_kwargs = {} if legend_kwargs is None: legend_kwargs = dict({"loc": "upper right"}) # Histogram non_reg_genes = set(ica_data.gene_names) - reg_genes reg_arr = ica_data.M[imodulon].loc[reg_genes] non_reg_arr = ica_data.M[imodulon].loc[non_reg_genes] if kind == "overlap": ax.hist( non_reg_arr, bins=bin_arr, alpha=alpha, color=color[0], label=hist_label[0] ) ax.hist(reg_arr, bins=bin_arr, alpha=alpha, color=color[1], label=hist_label[1]) elif kind == "side": arr = np.array([non_reg_arr, reg_arr], dtype="object") # noinspection PyTypeChecker ax.hist(arr, bins=bin_arr, alpha=alpha, color=color, label=hist_label) else: raise ValueError( f"{kind} is not a valid option. `kind` must be " 'either "overlap" or "side"' ) # Set y-axis to log-scale ax.set_yscale("log") # Add thresholds to scatterplot (dashed lines) ymin, ymax = ax.get_ylim() thresh = abs(ica_data.thresholds[imodulon]) if thresh != 0: ax.vlines( [-thresh, thresh], ymin=ymin, ymax=ymax, colors="k", linestyles="dashed", linewidth=1, ) ax.set_ylim(ymin, ymax) # Set x and y labels ax.set_xlabel(f"{imodulon} Gene Weight", **ax_font_kwargs) ax.set_ylabel("Number of Genes", **ax_font_kwargs) # Add legend ax.legend(**legend_kwargs) return ax
################ # Scatterplots # ################
[docs]def scatterplot( x, y, groups=None, colors=None, show_labels="auto", adjust_labels=True, line45=False, line45_margin=0, fit_line=False, fit_metric="pearson", xlabel="", ylabel="", ax=None, legend=True, ax_font_kwargs=None, scatter_kwargs=None, label_font_kwargs=None, legend_kwargs=None, ): """ Generates a scatter-plot of the data given, with options for coloring by group, adding labels, adding lines, and generating correlation or determination coefficients. Parameters ---------- x: ~pandas.Series `X`-axis data y: ~pandas.Series `Y`-axis data groups: dict, optional A dictionary mapping samples to group names colors: str, list or dict, optional Color of points, list of colors for different groups, or dictionary mapping groups to colors show_labels: bool or 'auto' Show labels for data points adjust_labels: bool Auto-adjust labels for data points line45: bool Show 45-degree line of equal values line45_margin: float Show 45-degreen lines offset by a margin fit_line: bool Draw a line of best fit on the scatterplot fit_metric: 'pearson', 'spearman', 'r2', or None Metric to report in legend for line of best fit xlabel: str X-axis label ylabel: str Y-axis label ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes legend: bool Show legend ax_font_kwargs: dict, optional Additional keyword arguments for axis labels scatter_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.scatter` label_font_kwargs: dict, optional Additional keyword arguments for labels passed to :func:`matplotlib.pyplot.text` legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot """ if ax is None: fig, ax = plt.subplots() if show_labels == "auto": show_labels = len(x) <= 20 if not ( isinstance(x, pd.Series) and isinstance(y, pd.Series) and ( x.index.astype(str).sort_values() == y.index.astype(str).sort_values() ).all() ): raise TypeError("X and Y must be pandas series with the same index") # Set up data object data = pd.DataFrame({"x": x, "y": y}) # Add group information data["group"] = "" if groups is not None: for k, val in groups.items(): data.loc[k, "group"] = val # Handle custom kwargs if ax_font_kwargs is None: ax_font_kwargs = {} if label_font_kwargs is None: label_font_kwargs = {} if legend_kwargs is None: legend_kwargs = {} if scatter_kwargs is None: scatter_kwargs = {} # Get x and y limits margin = 0.1 xrange = x.max() - x.min() yrange = y.max() - y.min() xmin = x.min() - xrange * margin xmax = x.max() + xrange * margin ymin = y.min() - yrange * margin ymax = y.max() + yrange * margin allmin = max(xmin, ymin) allmax = min(xmax, ymax) # Add 45 degree line if line45: # Plot diagonal lines ax.plot( [allmin, allmax], [allmin, allmax], color="k", linestyle="dashed", linewidth=0.5, zorder=0, ) if line45_margin > 0: diff = pd.DataFrame(abs(data.x - data.y), index=data.index) hidden = diff.loc[diff[0] < line45_margin] data.loc[hidden.index, "group"] = "hidden" ax.plot( [max(xmin, ymin + line45_margin), min(xmax, ymax + line45_margin)], [max(ymin, xmin - line45_margin), min(ymax, xmax - line45_margin)], color="gray", linestyle="dashed", linewidth=0.5, zorder=0, ) ax.plot( [max(xmin, ymin - line45_margin), min(xmax, ymax - line45_margin)], [max(ymin, xmin + line45_margin), min(ymax, xmax + line45_margin)], color="gray", linestyle="dashed", linewidth=0.5, zorder=0, ) # Add colors to the data # If colors is already a dict, just update the hidden color try: if "hidden" not in colors.keys(): colors.update({"hidden": "gray"}) if "" not in colors.keys(): colors.update({"": "tab:blue"}) except AttributeError: groups = [item for item in data["group"].unique() if item != "hidden"] if colors is None: colorlist = plt.rcParams["axes.prop_cycle"].by_key()["color"] elif isinstance(colors, str): colorlist = [colors] * len(groups) else: colorlist = list(colors) # Deal with short colorlists if len(colorlist) < len(groups): colorlist = colorlist * (len(groups) // len(colorlist) + 1) colors = dict(zip(groups, colorlist)) colors.update({"hidden": "gray"}) for name, group in data.groupby("group"): kwargs = scatter_kwargs.copy() kwargs["color"] = colors[name] # Override defaults for hidden points if name == "hidden": kwargs.update({"alpha": 0.7, "linewidth": 0, "label": None}) elif name == "": kwargs.update({"label": None}) else: kwargs.update({"label": name}) ax.scatter(group.x, group.y, **kwargs, zorder=1) # Add regression if fit_line: _fit_line(x, y, ax, fit_metric) # Add lines at 0 if xmin < 0 < xmax: ax.hlines(0, xmin, xmax, linewidth=0.5, color="gray", zorder=2) if ymin < 0 < ymax: ax.vlines(0, ymin, ymax, linewidth=0.5, color="gray", zorder=2) # Add labels if show_labels: texts = [] for idx in x.index: texts.append(ax.text(x[idx], y[idx], idx, **label_font_kwargs)) if adjust_labels: adjust_text( texts, ax=ax, arrowprops=dict(arrowstyle="-", color="k", lw=0.5), only_move={"objects": "y"}, expand_objects=(1.2, 1.4), expand_points=(1.3, 1.3), ) if np.allclose(xmin, xmax): ax.set_xlim(xmin - 1, xmax + 1) else: ax.set_xlim(xmin, xmax) if np.allclose(ymin, ymax): ax.set_ylim(ymin - 1, ymax + 1) else: ax.set_ylim(ymin, ymax) ax.set_xlabel(xlabel, **ax_font_kwargs) ax.set_ylabel(ylabel, **ax_font_kwargs) if legend or fit_line: ax.legend(**legend_kwargs) return ax
[docs]def plot_gene_weights(ica_data, imodulon, by="start", xaxis=None, xname="", **kwargs): """ Plot gene weights on a scatter plot. Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon : int or str `iModulon` name by: 'log-tpm-norm', 'length', or 'start' Property to plot on x-axis. Superceded by `xaxis` xaxis: list, dict or ~pandas.Series, optional Values on custom x-axis xname: str, optional Name of x-axis if using custom x-axis **kwargs: Additional keyword arguments passed to :func:`pymodulon.plotting.scatterplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot """ # Check that iModulon exists if imodulon in ica_data.M.columns: y = ica_data.M[imodulon] ylabel = f"{imodulon} Gene Weight" else: raise ValueError(f"iModulon does not exist: {imodulon}") # Get genes in the iModulon bin_M = ica_data.M_binarized component_genes = set(bin_M[imodulon].loc[bin_M[imodulon] == 1].index) other_genes = set(bin_M[imodulon].loc[bin_M[imodulon] == 0].index) # If experimental `xaxis` parameter is used, use custom values for x-axis if xaxis is not None: x = _set_xaxis(xaxis=xaxis, y=y) xlabel = xname else: # Ensure 'by' has a valid input and assign x, xlabel accordingly if by == "log-tpm": x = ica_data.log_tpm.mean(axis=1) xlabel = "Mean Expression" elif by == "log-tpm-norm": x = ica_data.X.mean(axis=1) xlabel = "Mean Centered Expression" elif by == "length": x = np.log10(ica_data.gene_table.length) xlabel = "Gene Length (log10-scale)" elif by == "start": x = ica_data.gene_table.start xlabel = "Gene Start" else: raise ValueError( '"by" must be "log-tpm", "log-tpm-norm", "length", ' 'or "start"' ) # Override specific kwargs (their implementation is different # in this function) show_labels_pgw = kwargs.pop("show_labels", "auto") adjust_labels_pgw = kwargs.pop("adjust_labels", True) legend_kwargs_pgw = kwargs.pop("legend_kwargs", {}) label_font_kwargs_pgw = kwargs.pop("label_font_kwargs", {}) kwargs["show_labels"] = kwargs["adjust_labels"] = False # Remove xlabel and ylabel kwargs if provided kwargs.pop("xlabel", None) kwargs.pop("ylabel", None) # Default legend should be on the side of the plot if ( "bbox_to_anchor" not in legend_kwargs_pgw.keys() and "loc" not in legend_kwargs_pgw.keys() ): legend_kwargs_pgw.update({"bbox_to_anchor": (1, 1), "loc": 2}) kwargs["legend_kwargs"] = legend_kwargs_pgw # Update colors for COG groups if "COG" in ica_data.gene_table.columns and "groups" not in kwargs: mod_cogs = ica_data.gene_table.loc[component_genes].COG hidden_cogs = pd.Series("hidden", index=other_genes) all_cogs = pd.concat([mod_cogs, hidden_cogs]) # colors = {cog:ica_data.cog_colors[cog] for cog in sorted(mod_cogs.unique())} kwargs.update({"groups": all_cogs, "colors": ica_data.cog_colors}) # Scatter Plot ax = scatterplot(x, y, xlabel=xlabel, ylabel=ylabel, **kwargs) # Add thresholds to scatter-plot (dashed lines) xmin, xmax = ax.get_xlim() ymin, ymax = ax.get_ylim() thresh = ica_data.thresholds[imodulon] if thresh != 0: ax.hlines( [thresh, -thresh], xmin=xmin, xmax=xmax, colors="k", linestyles="dashed", linewidth=1, ) ax.set_xlim(xmin, xmax) ax.set_ylim(ymin, ymax) texts = [] expand_kwargs = {"expand_objects": (1.2, 1.4), "expand_points": (1.3, 1.3)} # Add labels: Put gene name if components contain under 20 genes if show_labels_pgw is True or ( show_labels_pgw is not False and len(component_genes) <= 20 ): for gene in component_genes: # Add labels text_kwargs = label_font_kwargs_pgw.copy() if "fontstyle" not in text_kwargs: text_kwargs.update({"fontstyle": "normal"}) # Italicize gene if there is a defined name (not locus tag) try: gene_name = ica_data.gene_table.loc[gene, "gene_name"] if gene_name != gene: text_kwargs.update({"fontstyle": "italic"}) except KeyError: gene_name = gene # Set default fontsize if "fontsize" not in text_kwargs: text_kwargs.update({"fontsize": 12}) texts.append( ax.text( x[gene], ica_data.M.loc[gene, imodulon], gene_name, **text_kwargs, ) ) expand_kwargs["expand_text"] = (1.4, 1.4) # Add labels: Repel texts from other text and points rect = ax.add_patch( Rectangle( xy=(xmin, -abs(thresh)), width=xmax - xmin, height=2 * abs(thresh), fill=False, linewidth=0, ) ) if adjust_labels_pgw: adjust_text( texts=texts, add_objects=[rect], ax=ax, arrowprops=dict(arrowstyle="-", color="k", lw=0.5), only_move={"objects": "y"}, **expand_kwargs, ) return ax
[docs]def compare_gene_weights( ica_data, imodulon1, imodulon2, ica_data2=None, ortho_file=None, use_org1_names=True, **kwargs, ): """ Create a scatterplot comparing the gene weights between two iModulons Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon1 : int or str Name of `iModulon` on X-axis imodulon2 : int or str Name of `iModulon` on X-axis ica_data2: ~pymodulon.core.IcaData, optional :class:`~pymodulon.core.IcaData` object for second iModulon (if comparing iModulons across objects) ortho_file: str, optional Path to orthology file between organisms use_org1_names: bool If true, use gene names from first organism. If false, use gene names from second organism (default: True) **kwargs: Additional keyword arguments passed to :func:`pymodulon.plotting.scatterplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot """ if ica_data2 is None: ica_data2 = ica_data.copy() M1, M2 = convert_gene_index(ica_data.M, ica_data2.M, ortho_file) bin_M1, bin_M2 = convert_gene_index( ica_data.M_binarized, ica_data2.M_binarized, ortho_file ) # Convert gene table gene_table1, gene_table2 = convert_gene_index( ica_data.gene_table, ica_data2.gene_table, ortho_file, keep_locus=True ) if use_org1_names: gene_table = gene_table1 gene_table["locus_tag"] = gene_table.index else: gene_table = gene_table2 x = M1[imodulon1] y = M2[imodulon2] xlabel = f"{imodulon1} Gene Weight" ylabel = f"{imodulon2} Gene Weight" # Override specific kwargs (their implementation is different # in this function) show_labels_cgw = kwargs.pop("show_labels", "auto") adjust_labels_cgw = kwargs.pop("adjust_labels", True) legend_cgw = kwargs.pop("legend", False) legend_kwargs_cgw = kwargs.pop("legend_kwargs", {}) label_font_kwargs_cgw = kwargs.pop("label_font_kwargs", {}) kwargs["show_labels"] = kwargs["adjust_labels"] = kwargs["legend"] = False kwargs["legend_kwargs"] = None # Remove xlabel and ylabel kwargs if provided kwargs.pop("xlabel", None) kwargs.pop("ylabel", None) # Scatter Plot ax = scatterplot(x, y, xlabel=xlabel, ylabel=ylabel, **kwargs) # Add thresholds to scatterplot (dashed lines) xmin, xmax = ax.get_xlim() ymin, ymax = ax.get_ylim() thresh1 = ica_data.thresholds[imodulon1] thresh2 = ica_data2.thresholds[imodulon2] if thresh1 != 0: ax.vlines( [thresh1, -thresh1], ymin=ymin, ymax=ymax, colors="k", linestyles="dashed", linewidth=1, ) if thresh2 != 0: ax.hlines( [thresh2, -thresh2], xmin=xmin, xmax=xmax, colors="k", linestyles="dashed", linewidth=1, ) ax.set_xlim(xmin, xmax) ax.set_ylim(ymin, ymax) # Add labels on data-points component_genes_x = bin_M1[bin_M1[imodulon1] == 1].index component_genes_y = bin_M2[bin_M2[imodulon2] == 1].index component_genes = component_genes_x.intersection(component_genes_y) texts = [] expand_kwargs = {"expand_objects": (1.2, 1.4), "expand_points": (1.3, 1.3)} # Add labels: Put gene name if components contain under 20 genes auto = None if show_labels_cgw == "auto": auto = ( bin_M1[imodulon1].astype(bool) & bin_M2[imodulon2].astype(bool) ).sum() <= 20 if show_labels_cgw or auto: for gene in component_genes: ax.scatter(M1.loc[gene, imodulon1], M2.loc[gene, imodulon2], color="r") # Add labels text_kwargs = label_font_kwargs_cgw.copy() if "fontstyle" not in text_kwargs: text_kwargs.update({"fontstyle": "normal"}) # Italicize gene if there is a defined name (not locus tag) try: gene_name = gene_table.loc[gene, "gene_name"] if gene_name != gene_table.loc[gene, "locus_tag"]: text_kwargs.update({"fontstyle": "italic"}) except KeyError: gene_name = gene # Set default fontsize if "fontsize" not in text_kwargs: text_kwargs.update({"fontsize": 12}) texts.append( ax.text( M1.loc[gene, imodulon1], M2.loc[gene, imodulon2], gene_name, **text_kwargs, ) ) expand_kwargs["expand_text"] = (1.4, 1.4) # Add labels: Repel texts from other text and points rectx = ax.add_patch( Rectangle( xy=(xmin, -abs(thresh2)), width=xmax - xmin, height=2 * abs(thresh2), fill=False, linewidth=0, ) ) recty = ax.add_patch( Rectangle( xy=(-abs(thresh1), ymin), width=2 * abs(thresh1), height=ymax - ymin, fill=False, linewidth=0, ) ) if adjust_labels_cgw: adjust_text( texts=texts, add_objects=[rectx, recty], ax=ax, arrowprops=dict(arrowstyle="-", color="k", lw=0.5), only_move={"objects": "y"}, **expand_kwargs, ) # Add legend if legend_cgw: ax.legend(**legend_kwargs_cgw) return ax
[docs]def compare_expression(ica_data, gene1, gene2, **kwargs): """ Create a scatterplot comparing the compendium-wide expression profiles of two genes Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object gene1: str Gene to plot on the x-axis gene2: str Gene to plot on the y-axis **kwargs: Additional keyword arguments passed to :func:`pymodulon.plotting.scatterplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot """ # Check that gene1 exists if gene1 in ica_data.X.index: x = ica_data.X.loc[gene1] xlabel = f"{gene1} Expression" else: locus = ica_data.name2num(gene1) x = ica_data.X.loc[locus] xlabel = f"${gene1}$ Expression" # Check that gene2 exists if gene2 in ica_data.X.index: y = ica_data.X.loc[gene2] ylabel = f"{gene2} Expression" else: locus = ica_data.name2num(gene2) y = ica_data.X.loc[locus] ylabel = f"${gene2}$ Expression" # Remove xlabel, ylabel, and fit_line kwargs if provided kwargs.pop("xlabel", None) kwargs.pop("ylabel", None) kwargs.pop("fit_line", None) # Scatter Plot ax = scatterplot(x, y, xlabel=xlabel, ylabel=ylabel, fit_line=True, **kwargs) return ax
[docs]def compare_activities(ica_data, imodulon1, imodulon2, **kwargs): """ Create a scatterplot comparing the compendium-wide activities of two iModulons Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon1: int or str Name of the iModulon to plot on the x-axis imodulon2: int or str Name of the iModulon to plot on the y-axis **kwargs: Additional keyword arguments passed to :func:`pymodulon.plotting.scatterplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot """ x = ica_data.A.loc[imodulon1] y = ica_data.A.loc[imodulon2] xlabel = f"{imodulon1} iModulon Activity" ylabel = f"{imodulon2} iModulon Activity" # Remove xlabel, ylabel, and fit_line kwargs if provided kwargs.pop("xlabel", None) kwargs.pop("ylabel", None) kwargs.pop("fit_line", None) # Scatter Plot ax = scatterplot(x, y, xlabel=xlabel, ylabel=ylabel, fit_line=True, **kwargs) return ax
[docs]def plot_dima( ica_data, sample1, sample2, threshold=5, fdr=0.1, label=True, adjust=True, table=False, **kwargs, ): """ Plots a DiMA plot between two projects or two sets of samples Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object sample1: list or str List of sample IDs or name of "project:condition" for x-axis sample2: list or str List of sample IDs or name of "project:condition" for y-axis threshold: float Minimum activity difference to determine DiMAs (default: 5) fdr: float False detection rate (default: 0.1) label: bool Label differentially activated iModulons (default: True) adjust: bool Automatically adjust labels (default: True) table: bool Return differential iModulon activity table (default: False) **kwargs: Additional keyword arguments passed to :func:`pymodulon.plotting.scatterplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the scatterplot df_diff: ~pandas.DataFrame, optional Table reporting differentially activated iModulons """ # use secret option to enable passing of clustered activity matrix try: A_to_use = kwargs.pop("alternate_A") except KeyError: A_to_use = ica_data.A # Override specific kwargs (their implementation is different # in this function) legend_cgw = kwargs.pop("legend", False) legend_kwargs_cgw = kwargs.pop("legend_kwargs", {}) kwargs["legend"] = False kwargs["legend_kwargs"] = None # Get x and y coordinates sample1_list = _parse_sample(ica_data, sample1) sample2_list = _parse_sample(ica_data, sample2) if isinstance(sample1, str): xlabel = sample1 else: xlabel = "\n".join(sample1) if isinstance(sample2, str): ylabel = sample2 else: ylabel = "\n".join(sample2) a1 = A_to_use[sample1_list].mean(axis=1) a2 = A_to_use[sample2_list].mean(axis=1) df_diff = dima( ica_data, sample1_list, sample2_list, threshold=threshold, fdr=fdr, alternate_A=A_to_use, ) groups = {} for i in A_to_use.index: if i not in df_diff.index: groups.update({i: "hidden"}) else: groups.update({i: ""}) ax = scatterplot( a1, a2, groups=groups, line45=True, xlabel=xlabel, ylabel=ylabel, **kwargs, ) if label: df_diff = pd.concat([df_diff, a1, a2], join="inner", axis=1) texts = [] for k in df_diff.index: texts.append(ax.text(df_diff.loc[k, 0], df_diff.loc[k, 1], k, fontsize=10)) if adjust: expand_args = { "expand_objects": (1.2, 1.4), "expand_points": (1.3, 1.3), "expand_text": (1.4, 1.4), } adjust_text( texts, ax=ax, arrowprops=dict(arrowstyle="-", color="k", lw=0.5), only_move={"objects": "y"}, **expand_args, ) # Add legend if requested if legend_cgw: ax.legend(**legend_kwargs_cgw) if table: return ax, df_diff else: return ax
############### # Other plots # ############### # TODO: Add kind=bar to plot top explained variance
[docs]def plot_explained_variance(ica_data, pc=True, ax=None): """ Plots the cumulative explained variance for independent components and, optionally, principal components Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object pc: bool If True, plot cumulative explained variance of independent components ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the line plot """ if not ax: fig, ax = plt.subplots() # Get IC explained variance ic_var = [] for imodulon in ica_data.imodulon_names: ic_var.append(explained_variance(ica_data, imodulons=imodulon)) ic_var = np.insert(np.cumsum(sorted(ic_var, reverse=True)), 0, 0) ax.plot(range(len(ic_var)), ic_var, label="Independent Components") if pc: # Get PC explained variance pca = PCA().fit(ica_data.X.T) pc_var = np.insert(np.cumsum(pca.explained_variance_ratio_), 0, 0) # Only keep number of PCs as ICs pc_var = pc_var[: len(ic_var)] # Plot PCs ax.plot(range(len(ic_var)), pc_var, label="Principal Components") ax.legend() ax.set_xlabel("Components") ax.set_ylabel("Cumulative Explained Varaince") ax.set_ylim([0, 1]) ax.set_xlim([0, len(ic_var)]) return ax
[docs]def compare_imodulons_vs_regulons( ica_data, imodulons=None, cat_column=None, size_column=None, scale=1, reg_only=True, xlabel=None, ylabel=None, vline=0.6, hline=0.6, ax=None, scatter_kwargs=None, ax_font_kwargs=None, legend_kwargs=None, ): """ Compare the overlaps between iModulons and their linked regulons Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulons: list, optional List of iModulons to plot cat_column: str, optional Column in the `imodulon_table` that stores the category of each iModulon size_column: str, optional Column in the `imodulon_table` that stores the size of each iModulon scale: float, optional (default: 1) Value used to scale the size of each point reg_only: bool (default: True) Only plot iModulons with an entry in the `regulator` column of the `imodulon_table` xlabel: str, optional Custom x-axis label (default: "# shared genes/Regulon size") ylabel: str, optional Custom y-axis label (default: "# shared genes/iModulon size") vline: float, optional (default: 0.6) Draw a dashed vertical line hline: float, optional (default: 0.6) Draw a dashed horizontal line ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes scatter_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.scatter` ax_font_kwargs: dict, optional Additional keyword arguments for axes labels legend_kwargs: dict, optional Additional keyword arguments passed to :func:`matplotlib.pyplot.legend` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the line plot """ # Set up axis if not ax: fig, ax = plt.subplots(figsize=(5, 5)) # Handle kwargs if scatter_kwargs is None: scatter_kwargs = {} if ax_font_kwargs is None: ax_font_kwargs = {} if legend_kwargs is None: legend_kwargs = {} # Set axes labels if xlabel is None: xlabel = "$\\frac{\\mathrm{\\#\\;Shared\\;Genes}}{\\mathrm{Regulon\\;Size}}$" xlabelscale = 2 else: xlabelscale = 1 if ylabel is None: ylabel = "$\\frac{\\mathrm{\\#\\;Shared\\;Genes}}{\\mathrm{iModulon\\;Size}}$" ylabelscale = 2 else: ylabelscale = 1 # Select iModulons if imodulons is None: reg_table = ica_data.imodulon_table.copy() if reg_only: reg_table = reg_table[reg_table.regulator.notnull()] else: reg_table = ica_data.imodulon_table.loc[imodulons] # Select category column if cat_column is None: reg_table["_category"] = "iModulons" cat_column = "_category" # Make sure all necessary columns exist if reg_table.recall.isnull().any(): logging.warning( "Some iModulons have missing `recall` entries in the " "`imodulon_table` " "and will not be shown" ) if reg_table.precision.isnull().any(): logging.warning( "Some iModulons have missing `precision` entries in the " "`imodulon_table` and will not be shown" ) if size_column is not None and reg_table[size_column].isnull().any(): logging.warning( "Some iModulons have missing {} entries in the " "`imodulon_table`".format(size_column) ) for cat, group in reg_table.groupby(cat_column): # Set point sizes if size_column is None: size_ivr = scatter_kwargs.pop("s", plt.rcParams["lines.markersize"]) else: size_ivr = scatter_kwargs.pop("s", group[size_column]) edgecolor_ivr = scatter_kwargs.pop("edgecolor", "k") ax.scatter( group.recall, group.precision, s=size_ivr * scale, edgecolor=edgecolor_ivr, label=cat, **scatter_kwargs, ) # Set plot boundaries xmin, xmax = ax.get_xlim() xmin = min(xmin, 0) xmax = max(xmax, 1) ax.set_xlim((xmin, xmax)) ymin, ymax = ax.get_ylim() ymin = min(ymin, 0) ymax = max(ymax, 1) ax.set_ylim((ymin, ymax)) # Add dashed lines if hline: ax.hlines(hline, xmin, xmax, linestyles="dashed", colors="gray", zorder=0) if vline: ax.vlines(vline, xmin, xmax, linestyles="dashed", colors="gray", zorder=0) # Set axis labels label_size = ax_font_kwargs.pop("fontsize", plt.rcParams["font.size"]) xlabel_size = label_size * xlabelscale ax.set_xlabel(xlabel, fontsize=xlabel_size, **ax_font_kwargs) ylabel_size = label_size * ylabelscale ax.set_ylabel(ylabel, fontsize=ylabel_size, **ax_font_kwargs) # Set legend bbox_to_anchor_ivr = legend_kwargs.pop("bbox_to_anchor", (1, 1)) ax.legend(bbox_to_anchor=bbox_to_anchor_ivr, **legend_kwargs) return ax
###################### # Cluster Activities # ######################
[docs]def cluster_activities( ica_data, correlation_method="spearman", distance_threshold=None, show_thresholding=False, show_clustermap=True, show_best_clusters=False, n_best_clusters="auto", cluster_names=None, return_clustermap=False, # dimca options dimca_sample1=None, dimca_sample2=None, dimca_threshold=5, dimca_fdr=0.1, dimca_label=True, dimca_adjust=True, dimca_table=False, **dima_kwargs, ): """ Cluster all iModulon activity profiles using hierarchical clustering and display the results using :func:`seaborn.clustermap` Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object correlation_method: 'pearson', 'spearman', 'kendall', 'mutual_info' or callable Method for computing correlations between iModulon activities. See :meth:`pandas.DataFrame.corr` Default is 'spearman'. distance_threshold: float, optional A distance from 0 to 1 to define flat clusters from the hierarchical clustering. Larger values yield fewer clusters. If None, automatic selection of optimal threshold will occur by maximizing the silhouette score across iModulons. (default: None) show_thresholding: bool Show the plot of distance thresholds vs. silhouette scores (default: False) show_clustermap: bool Show the clustermap (default: True) show_best_clusters: bool Show individual clusters below complete clustermap n_best_clusters: int or str Number of best clusters to show. If 'auto', only clusters with above-average silhouette scores are shown cluster_names: dict, optional A dictionary mapping cluster indices to names, usually used after clustering has been performed previously. return_clustermap: bool Return the axis containing the clustermap dimca_sample1: list or str List of sample IDs or name of "project:condition" of reference samples for Differential iModulon Cluster Analysis (DiMCA) dimca_sample2: list or str List of sample IDs or name of "project:condition" of target samples for DiMCA dimca_threshold: float Minimum activity difference to determine DiMCAs dimca_fdr: float False detection rate for DiMCA dimca_label: bool Label differentially activated imodulon clusters (default: True) dimca_adjust: bool Auto-adjust DiMCA cluster labels (default: True) dimca_table: bool Return DiMCA table (default: False) **dima_kwargs: dict, optional Additional keyword arguments passed to :func:`pymodulon.plotting.dima` Returns ------- clusters: ~sklearn.cluster.AgglomerativeClustering :class:`sklearn.cluster.AgglomerativeClustering` of activity matrix cg: ~seaborn.matrix.ClusterGrid, optional :class:`~seaborn.matrix.ClusterGrid` containing the clusterplot dimca_ax: ~matplotlib.axes.Axes, optional :class:`~matplotlib.axes.Axes` containing the DiMCA scatterplot dimca_table: ~pandas.DataFrame, optional Table of differentially activated iModulon clusters """ # compute distance matrix; distance metric defined as 1 - correlation to # ensure that correlated iModulons are close in distance, can be clustered if correlation_method == "mutual_info": correlation_df = 1 - ica_data.A.T.corr(method=mutual_info_distance) distance_matrix = 1 - correlation_df.abs() - np.eye(len(correlation_df)) correlation_df = (correlation_df - correlation_df.min().min()) / ( correlation_df.max().max() ) + np.eye(len(correlation_df)) else: correlation_df = ica_data.A.T.corr(method=correlation_method) distance_matrix = 1 - correlation_df.abs() best_clusters = [] cluster_score_dict = {} # define a base instance of the clustering object we will use throughout; # the distance threshold here is a dummy; we will replace this with either # the user-specified value or an automatically-selected value agg_cluster_base = AgglomerativeClustering( n_clusters=None, affinity="precomputed", compute_full_tree=True, linkage="complete", distance_threshold=0.5, ) # perform automatic thresholding for default case if distance_threshold is None: auto_threshold_df = pd.DataFrame(columns=["threshold", "score", "n_clusters"]) auto_threshold_df["threshold"] = np.arange(0, 1, 0.025) for row in auto_threshold_df.itertuples(index=True): # create a copy of the base clusterer with the threshold to try; fit agg_cluster_auto_threshold = clone(agg_cluster_base).set_params( distance_threshold=row.threshold ) agg_cluster_auto_threshold.fit(distance_matrix) n_clusters = agg_cluster_auto_threshold.n_clusters_ auto_threshold_df.loc[row.Index, "n_clusters"] = n_clusters # score the clustering; handle the edge case where all clusters have # size 1; this is invalid input for silhouette_score if n_clusters == distance_matrix.shape[0] or n_clusters == 1: auto_threshold_df.loc[row.Index, "score"] = 0 else: auto_threshold_df.loc[row.Index, "score"] = silhouette_score( distance_matrix, agg_cluster_auto_threshold.labels_, metric="precomputed", ) best_threshold = auto_threshold_df.sort_values( by="score", ascending=False ).iloc[0]["threshold"] if show_thresholding: _, ax = plt.subplots() ax.yaxis.grid(False) sns.scatterplot( x="threshold", y="score", data=auto_threshold_df, ax=ax, color="blue" ) ax.axvline(best_threshold, linestyle="--", color="k") ax.tick_params(axis="both", labelsize=13) ax.tick_params(axis="y", color="blue", labelcolor="blue") ax.set_xlabel("Threshold", fontsize=14) ax.set_ylabel("Silhouette", color="blue", fontsize=14) ax2 = ax.twinx() ax2.yaxis.grid(False) sns.scatterplot( x="threshold", y="n_clusters", data=auto_threshold_df, ax=ax2, color="red", ) ax2.tick_params(axis="both", labelsize=13) ax2.tick_params(axis="y", color="red", labelcolor="red") ax2.set_ylabel("# Clusters", fontsize=14, color="red") plt.show() else: best_threshold = distance_threshold # re-perform the clustering with the best threshold from above agg_cluster_final = clone(agg_cluster_base).set_params( distance_threshold=best_threshold ) agg_cluster_final.fit(distance_matrix) labels = agg_cluster_final.labels_ returns = [agg_cluster_final] # compute silhouette scores for each cluster and determine what our best # clusters are; we only need to do this if we are displaying best clusters # OR if we're doing a DIMCA later if show_best_clusters or dimca_sample1 is not None: sample_scores = silhouette_samples( distance_matrix, labels, metric="precomputed" ) cluster_score_dict = {} for label in set(labels): cluster_score_dict[label] = np.mean( np.array(sample_scores)[np.where(labels == label)[0]] ) # calculate the best clusters based on the requested method if n_best_clusters == "auto": mean_cluster_score = np.mean(list(cluster_score_dict.values())) best_clusters = [ cluster for cluster, score in cluster_score_dict.items() if score > mean_cluster_score ] else: best_clusters = list( list( zip( *sorted( cluster_score_dict.items(), key=lambda label_score: label_score[1], reverse=True, ) ) )[0] )[: min(n_best_clusters, len(cluster_score_dict))] if show_clustermap: # prepare a linkage matrix so that we can specify the dendrogram to sns # https://stackoverflow.com/questions/29127013/ children = agg_cluster_final.children_ distance = np.arange(children.shape[0]) no_of_observations = np.arange(2, children.shape[0] + 2) linkage_matrix = np.column_stack( [children, distance, no_of_observations] ).astype(float) clustermap = sns.clustermap( correlation_df, row_linkage=linkage_matrix, col_linkage=linkage_matrix, xticklabels=False, yticklabels=False, figsize=(10, 10), center=0, ) if correlation_method == "mutual_info": clustermap.ax_cbar.set_ylabel("Normalized MI", fontsize=14) else: clustermap.ax_cbar.set_ylabel(f"{correlation_method} R", fontsize=14) # the following code draws squares on the clustermap to highlight # the cluster locations; will also number the best clusters if they # are to be called out after the fact cluster_size_dict = Counter(labels) size_counter = 1 top_left = 0 best_cluster_labels = [] best_cluster_matrices = [] best_cluster_scores = [] for i, imod_idx in enumerate(clustermap.dendrogram_row.reordered_ind): # draw a box around the cluster if we're at the end of the cluster cluster_for_imod = agg_cluster_final.labels_[imod_idx] cluster_size = cluster_size_dict[cluster_for_imod] if cluster_size == size_counter: # usually Rectangle wants bottom left, but the heatmap that sns # makes has the positive direction reversed (axes positions # increase towards the bottom right) clustermap.ax_heatmap.add_patch( Rectangle( (top_left, top_left), cluster_size, cluster_size, fill=False, color="white", lw=1, ) ) # also add a number (or name) IF we're calling out later if show_best_clusters: if cluster_for_imod in best_clusters: if cluster_names is not None: cluster_name = cluster_names.get( cluster_for_imod, cluster_for_imod ) else: cluster_name = cluster_for_imod clustermap.ax_heatmap.text( top_left + cluster_size + 0.05, top_left - 0.05, str(cluster_name), color="white", fontsize=16, ) # stash the clustermap data for the best cluster best_cluster_submatrix = clustermap.data2d.iloc[ top_left : (top_left + cluster_size), top_left : (top_left + cluster_size), ] best_cluster_labels.append(cluster_for_imod) best_cluster_matrices.append(best_cluster_submatrix) best_cluster_scores.append(cluster_score_dict[cluster_for_imod]) # reset the counters being used to establish where to draw boxes size_counter = 1 top_left = i + 1 else: size_counter += 1 if return_clustermap: returns.append(clustermap) plt.show() # now actually display the best clusters if asked to if show_best_clusters: # calculate the figure size and arrangement needed to cleanly # display the number of best clusters we have on hand subplot_rows, subplot_cols = 1, 1 while subplot_rows * subplot_cols < len(best_cluster_matrices): if subplot_rows / subplot_cols >= 1: subplot_cols += 1 else: subplot_rows += 1 if len(best_cluster_matrices) <= 4: figsize = (10, 6) elif len(best_cluster_matrices) <= 30: figsize = (14, 9) else: figsize = (20, 14) _, axs = plt.subplots(subplot_rows, subplot_cols, figsize=figsize) axs = axs.flatten() # sort the best clusters by score to display the best one first sorted_lab_mtrx_score_tups = sorted( zip(best_cluster_labels, best_cluster_matrices, best_cluster_scores), key=lambda label_matrix_score_tup: label_matrix_score_tup[2], reverse=True, ) for lab_mtrx_score_tup, ax in zip(sorted_lab_mtrx_score_tups, axs): cluster_lab, cluster_mtrx, cluster_score = lab_mtrx_score_tup sns.heatmap( cluster_mtrx, xticklabels=False, center=0, square=True, cbar=False, ax=ax, ) if cluster_names is not None: cluster_name = cluster_names.get( cluster_lab, f"Cluster {cluster_lab}" ) cluster_title = f"{cluster_name} ({cluster_score:.2f})" else: cluster_title = f"Cluster {cluster_lab} " f"({cluster_score:.2f})" ax.set_title(cluster_title, fontsize=16) ax.set_yticks(np.arange(0.5, cluster_mtrx.shape[0] + 0.5, 1)) ax.set_yticklabels(cluster_mtrx.index) ax.tick_params(axis="both", labelsize=11) ax.tick_params(axis="y", rotation=0) for i in range(1, len(axs) - len(best_cluster_matrices) + 1): axs[-i].set_visible(False) plt.tight_layout() # do the requested DIMCA comparison if asked; re-use machinery from base # DIMA as much as possible if dimca_sample1 is not None: # compute the new activity matrix with the best cluster activities # consolidated; define positive direction as direction with more + corrs cluster_A_df = pd.DataFrame(columns=ica_data.A.columns) all_clustered_ims = [] for best_cluster_label in best_clusters: cluster_im_inds = np.where(labels == best_cluster_label)[0] cluster_ims = np.array(ica_data.A.index)[cluster_im_inds] all_clustered_ims += list(cluster_ims) cluster_im_A_df = ica_data.A.loc[cluster_ims] cluster_corr_df = correlation_df.loc[cluster_ims, cluster_ims] # get the average non-self correlation for each iM in the cluster # use this to invert the row in the cluster A df if negative cluster_size = cluster_corr_df.shape[0] for i in range(cluster_size): non_self_inds = list(set(list(range(cluster_size))) - {i}) non_self_corrs = cluster_corr_df.iloc[i, non_self_inds] im_corr_mean = np.mean(non_self_corrs) if im_corr_mean < 0: cluster_im_A_df.iloc[i, :] = cluster_im_A_df.iloc[i, :] * -1 # compute the final averaged cluster activity, add to new dataframe # use the provided name for this cluster if we have one if cluster_names is not None and best_cluster_label in cluster_names: cluster_name = f"{cluster_names[best_cluster_label]} [Clst]" else: cluster_name = f"Cluster {best_cluster_label}" cluster_A_df.loc[cluster_name] = cluster_im_A_df.mean(axis=0) # now we can add in the singleton iModulons to our new A matrix singleton_ims = set(list(ica_data.A.index)) - set(all_clustered_ims) cluster_A_df = cluster_A_df.append(ica_data.A.loc[list(singleton_ims)]) # add to kwargs dima_kwargs["alternate_A"] = cluster_A_df # now we can pretty much proceed as normal with DIMCA; just have a # different activity matrix, but the procedure is the same from here dimca_return = plot_dima( ica_data, sample1=dimca_sample1, sample2=dimca_sample2, threshold=dimca_threshold, fdr=dimca_fdr, label=dimca_label, adjust=dimca_adjust, table=dimca_table, **dima_kwargs, ) if dimca_table: returns += dimca_return else: returns.append(dimca_return) return returns
#################### # Metadata Boxplot # ####################
[docs]def metadata_boxplot( ica_data, imodulon, show_points=True, n_boxes=3, samples=None, strip_conc=True, ignore_cols=None, use_cols=None, return_results=False, ax=None, box_kwargs=None, strip_kwargs=None, swarm_kwargs=None, ): """ Uses a decision tree regressor to automatically cluster iModulon activities using metadata. Displays results as a box plot. Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon: int or str `iModulon` name show_points: bool or str (default: True) Overlay individual points on top of the boxplot. By default, this uses :func:`seaborn.stripplot`. `show_points='swarm'` will use :func`seaborn.swarmplot`. n_boxes: int Number of boxes to create samples: list, optional Subset of samples to analyze strip_conc: bool Remove concentrations from metadata (e.g. "glucose(2g/L)" would be interpreted as just "glucose") ignore_cols: list, optional List of columns to ignore. If None, only "project" and "condition" are ignored use_cols: list, optional List of columns to use. This supercedes ignore_cols. return_results: bool Return a dataframe describing the classifications ax: ~matplotlib.axes.Axes, optional Axes object to plot on, otherwise use current Axes box_kwargs: dict, optional Additional keyword arguments passed to :func:`seaborn.boxplot` strip_kwargs: dict, optional Additional keyword arguments passed to :func:`seaborn.stripplot` swarm_kwargs: dict, optional Additional keyword arguments passed to :func:`seaborn.swarmplot` Returns ------- ax: ~matplotlib.axes.Axes :class:`~matplotlib.axes.Axes` containing the boxplot df_classes: ~pd.DataFrame Metadata classifications of the samples """ if not ax: fig, ax = plt.subplots() if show_points is True: show_points = "strip" if box_kwargs is None: box_kwargs = {} if strip_kwargs is None: strip_kwargs = {} if swarm_kwargs is None: swarm_kwargs = {} if samples is None: samples = ica_data.sample_names activities = ica_data.A.loc[imodulon, samples] encoding, features = _encode_metadata( ica_data, samples=samples, ignore_cols=ignore_cols, use_cols=use_cols, strip_conc=strip_conc, ) clf = _train_classifier(activities, features, max_leaf_nodes=n_boxes) labels = _get_labels_from_tree(clf, encoding) df_classes = _get_sample_leaves(clf, features, labels, activities) df_classes = df_classes.sort_values(imodulon, ascending=False) # If using either a stripplot or swarmplot, hide fliers if show_points in ["strip", "swarm"]: box_kwargs.update({"fliersize": 0}) sns.boxplot(data=df_classes, x=imodulon, y="category", **box_kwargs) if show_points == "strip": color_mdbp = strip_kwargs.pop("color", "k") jitter_mdbp = strip_kwargs.pop("color", 0.3) sns.stripplot( data=df_classes, x=imodulon, y="category", color=color_mdbp, jitter=jitter_mdbp, **strip_kwargs, ) elif show_points == "swarm": color_mdbp = swarm_kwargs.pop("color", "k") sns.swarmplot( data=df_classes, x=imodulon, y="category", color=color_mdbp, **swarm_kwargs ) ax.set_ylabel("") if return_results: return ax, df_classes else: return ax
[docs]def _encode_metadata( ica_data, samples, use_cols=None, ignore_cols=None, strip_conc=True ): # Convert values to strings metadata = ica_data.sample_table.loc[samples].copy() metadata = metadata.astype(str).fillna("") # Select columns if ignore_cols is None: ignore_cols = [] if use_cols is not None: metadata = metadata[use_cols] else: metadata.drop(set(ignore_cols) & set(metadata.columns), axis=1, inplace=True) # Remove concentrations if strip_conc: metadata = metadata.replace(r"(.*?)\(.*?\)", r"\1", regex=True) # One-hot encode features enc = OneHotEncoder() enc.fit(metadata) features = enc.fit_transform(metadata) # Get feature information list2struct = [] for cat, vals in zip(metadata.columns, enc.categories_): list2struct += [[cat, val] for val in vals] encoding = pd.DataFrame(list2struct, columns=["category", "value"]) return encoding, features
[docs]def _train_classifier(component, features, max_leaf_nodes=3): # Run Decision Tree Regressor clf = DecisionTreeRegressor( min_samples_leaf=2, criterion="mae", max_leaf_nodes=max_leaf_nodes ) clf.fit(features, component) return clf
[docs]def _get_labels_from_tree(clf, encoding): # Load tree information children_left = clf.tree_.children_left children_right = clf.tree_.children_right feature_ids = clf.tree_.feature # Initialize traversal stack = [(0, "")] # node_id, label final_labels = [""] * len(feature_ids) # Traverse Tree while len(stack) > 0: node_id, label = stack.pop() # Add feature split to dataframe feat = feature_ids[node_id] # Check for leaf if feat != -2: cat, val = encoding.loc[feat].values label_left = f"{label}\n{cat}: NOT {val}" label_right = f"{label}\n{cat}: {val}" # Add children to stack stack.append((children_left[node_id], label_left)) stack.append((children_right[node_id], label_right)) else: final_labels[node_id] = label.strip() return final_labels
[docs]def _get_sample_leaves(clf, features, labels, component): # Initialize final dataframe DF_result = pd.DataFrame(component) DF_result["category"] = "" # Get decision path node_mat = clf.decision_path(features) for i, lab in enumerate(labels): if lab != "": DF_result.iloc[sparse.find(node_mat[:, i])[0], 1] = lab return DF_result
#################### # Helper Functions # ####################
[docs]def _fit_line(x, y, ax, metric): # Get line parameters and metric of correlation/regression if metric is None: return if metric == "r2": params, r2 = _get_fit(x, y) label = "$R^2_{{adj}}$ = {:.2f}".format(r2) elif metric == "pearson": params = curve_fit(_solid_line, x, y)[0] r, pval = stats.pearsonr(x, y) if pval < 1e-10: label = f"Pearson R = {r:.2f}\np-value < 1e-10" else: label = f"Pearson R = {r:.2f}\np-value = {pval:.2e}" elif metric == "spearman": params = curve_fit(_solid_line, x, y)[0] r, pval = stats.spearmanr(x, y) if pval < 1e-10: label = f"Spearman R = {r:.2f}\np-value < 1e-10" else: label = f"Spearman R = {r:.2f}\np-value = {pval:.2e}" else: raise ValueError('Metric must be "pearson", "spearman", or "r2"') # Plot line if len(params) == 2: xvals = np.array([min(x), max(x)]) ax.plot( xvals, _solid_line(xvals, *params), label=label, color="k", linestyle="dashed", linewidth=1, zorder=5, ) else: mid = params[2] xvals = np.array([x.min(), mid, x.max()]) ax.plot( xvals, _broken_line(xvals, *params), label=label, color="k", linestyle="dashed", linewidth=1, zorder=5,
)
[docs]def _get_fit(x, y): all_params = [curve_fit(_solid_line, x, y)[0]] with warnings.catch_warnings(): warnings.simplefilter("ignore", category=OptimizeWarning) for c in [min(x), np.mean(x), max(x)]: try: all_params.append( curve_fit(_broken_line, x, y, p0=[1, 1, c], maxfev=5000)[0] ) except (OptimizeWarning, RuntimeError): pass best_r2 = -np.inf best_params = all_params[0] for params in all_params: if len(params) == 2: r2 = _adj_r2(_solid_line, x, y, params) else: r2 = _adj_r2(_broken_line, x, y, params) if r2 > best_r2: best_r2 = r2 best_params = params if best_r2 < 0: return [0, np.mean(y)], 0 return best_params, best_r2
[docs]def _broken_line(x, A, B, C): y = np.zeros(len(x), dtype=float) y += (A * x + B) * (x >= C) y += (A * C + B) * (x < C) return y
[docs]def _solid_line(x, A, B): # this is your 'straight line' y=f(x) y = A * x + B return y
[docs]def _adj_r2(f, x, y, params): n = len(x) k = len(params) - 1 r2 = r2_score(y, f(x, *params)) return 1 - np.true_divide((1 - r2) * (n - 1), (n - k - 1))
[docs]def _mod_freedman_diaconis(ica_data, imodulon): """ Generates bins using optimal bin width estimate using the Freedman-Diaconis rule Parameters ---------- ica_data: ~pymodulon.core.IcaData :class:`~pymodulon.core.IcaData` object imodulon : int or str `iModulon` name Returns ------- bins: ~numpy.ndarray Numpy array of bins See Also -------- Wikipedia: {https://en.wikipedia.org/wiki/Freedman-Diaconis_rule} """ x = ica_data.M[imodulon] thresh = abs(ica_data.thresholds[imodulon]) # Modified Freedman-Diaconis opt_width = (x.max() - x.min()) / (len(x) ** (1 / 3)) # Width calculated using optimal width and iModulon threshold if thresh > opt_width: width = thresh / int(thresh / opt_width) else: width = thresh / 2 # Use width and thresh to calculate xmin, xmax if x.min() < -thresh: multiple = np.ceil(abs(x.min() / width)) xmin = -(multiple + 1) * width else: xmin = -(thresh + width) if x.max() > thresh: multiple = np.ceil(x.max() / width) xmax = (multiple + 1) * width else: xmax = thresh + width return np.arange(xmin, xmax + width, width)
########################## # Experimental Functions # ##########################
[docs]def _set_xaxis(xaxis, y): """ Implements experimental `xaxis` param from `plot_gene_weights`. This allows for users to generate a scatterplot comparing gene weight on the y-axis with any collection of numbers on the x-axis (as long as the lengths match). Parameters ---------- xaxis: list, set, tuple, dict, np.array, pd.Series Any collection or mapping of numbers (plots on x-axis) y: pd.Series pandas Series of Gene Weights to be plotted on the y-axis of `plot_gene_weights` Returns ------- x: pd.Series Returns a pd.Series to be used as the x-axis data-points for generating the plot_gene_weights scatter-plot. """ # Determine type of `xaxis` and set `x` accordingly x = xaxis if isinstance(xaxis, pd.Series) else pd.Series(xaxis) if not x.sort_index().index.equals(y.sort_index().index): raise ValueError( "Given x-values do not align with gene and their " "respective gene-weights on the y-axis" ) return x