Source code for pymodulon.compare

import logging
import os
import subprocess
from glob import glob

import numpy as np
import pandas as pd
from Bio import SeqIO
from graphviz import Digraph


[docs]def _get_orthologous_imodulons(M1, M2, method, cutoff): """ Given two M matrices, returns the dot graph and name links of the various connected ICA components Parameters ---------- M1 : ~pandas.DataFrame M matrix from the first organism M2 : ~pandas.DataFrame M matrix from the second organism method : int or str Correlation metric to use from {‘pearson’, ‘kendall’, ‘spearman’} or callable (see :meth:`~pandas.DataFrame.corr`) cutoff : float Cut off value for correlation metric Returns ------- links: list Links and distances of connected iModulons """ # Only keep genes found in both M matrices common = set(M1.index) & set(M2.index) if len(common) == 0: raise KeyError("No common genes") m1 = M1.reindex(common) m2 = M2.reindex(common) # Compute correlation matrix corr = ( pd.concat([m1, m2], axis=1, keys=["df1", "df2"]) .corr(method=method) .loc["df1", "df2"] .abs() ) DF_corr = corr.loc[m1.columns, m2.columns] # Get positions where correlation is above cutoff pos = zip(*np.where(DF_corr > cutoff)) links = [(m1.columns[i], m2.columns[j], DF_corr.iloc[i, j]) for i, j in pos] return links
[docs]def _make_dot_graph(links, show_all, names1, names2): """ Given a set of links between M matrices, generates a dot graph of the various connected iModulons Parameters ---------- links : list Names and distances of connected iModulons show_all : bool Show all iModulons regardless of their linkage (default: False) names1 : list List of names in dataset 1 names2 : list List of names in dataset 2 Returns ------- dot: Digraph Dot graph of connected iModulons """ link_names1 = [link[0] for link in links] link_names2 = [link[1] for link in links] if show_all: names1 = [str(i) for i in names1] names2 = [str(i) for i in names2] else: # Get names of nodes names1 = [str(i) for i in link_names1] names2 = [str(i) for i in link_names2] # Split names in half if necessary for dataset1 name_dict1 = {} for name in names1: val = str(name) if len(val) > 10: name_dict1[name] = val[: len(val) // 2] + "-\n" + val[len(val) // 2 :] else: name_dict1[name] = val # Split names in half if necessary for dataset2 name_dict2 = {} for name in names2: val = str(name) if len(val) > 10: name_dict2[name] = val[: len(val) // 2] + "-\n" + val[len(val) // 2 :] else: name_dict2[name] = val # Initialize Graph dot = Digraph( engine="dot", graph_attr={ "ranksep": "0.3", "nodesep": "0", "packmode": "array_u", "size": "10,10", }, node_attr={"fontsize": "14", "shape": "none"}, edge_attr={"arrowsize": "0.5"}, format="png", ) if len(links) == 0: logging.warning("No components shared across runs") return dot # Initialize Nodes for k in sorted(names2): if k in link_names2: color = "black" font = "helvetica" else: color = "red" font = "helvetica-bold" dot.node( "data2_" + str(k), label=name_dict2[k], _attributes={"fontcolor": color, "fontname": font}, ) for k in sorted(names1): if k in link_names1: color = "black" font = "helvetica" else: color = "red" font = "helvetica-bold" dot.node( "data1_" + str(k), label=name_dict1[k], _attributes={"fontcolor": color, "fontname": font}, ) # Add links between related components for k1, k2, dist in links: width = dist * 5 dot.edge( "data1_" + str(k1), "data2_" + str(k2), _attributes={"penwidth": "{:.2f}".format(width)}, ) return dot
[docs]def convert_gene_index(df1, df2, ortho_file=None, keep_locus=False): """ Reorganizes and renames genes in a dataframe to be consistent with another object/organism Parameters ---------- df1 : ~pandas.DataFrame Dataframe from the first object/organism df2 : ~pandas.DataFrame Dataframe from the second object/organism ortho_file : str or ~pandas.DataFrame, optional Path to orthology file between organisms (default: None) keep_locus : bool If True, keep old locus tags as a column (default: False) Returns ------- df1_new: ~pandas.DataFrame M matrix for organism 1 with indexes translated into orthologs df2_new: ~pandas.DataFrame M matrix for organism 2 with indexes translated into orthologs """ if ortho_file is None: common_genes = df1.index.intersection(df2.index) df1_new = df1.loc[common_genes] df2_new = df2.loc[common_genes] else: try: DF_orth = pd.read_csv(ortho_file) except (ValueError, TypeError): DF_orth = ortho_file DF_orth = DF_orth[ DF_orth.gene.isin(df1.index) & DF_orth.subject.isin(df2.index) ] subject2gene = DF_orth.set_index("subject").gene.to_dict() df1_new = df1.loc[DF_orth.gene] df2_new = df2.loc[DF_orth.subject] # Reset index of df2 to conform with df1 if keep_locus: df2_new.index.name = "locus_tag" df2_new.reset_index(inplace=True) df2_new.index = [subject2gene[idx] for idx in df2_new.locus_tag] else: df2_new.index = [subject2gene[idx] for idx in df2_new.index] if len(df1_new) == 0 or len(df2_new) == 0: raise ValueError( "No shared genes. Check that matrix 1 conforms to " "the 'gene' column of the BBH file and matrix 2 " "conforms to the 'subject' column" ) return df1_new, df2_new
[docs]def compare_ica( M1, M2, ortho_file=None, cutoff=0.25, method="pearson", plot=True, show_all=False ): """ Compares two M matrices between a single organism or across organisms and returns the connected iModulons Parameters ---------- M1 : ~pandas.DataFrame M matrix from the first organism M2 : ~pandas.DataFrame M matrix from the second organism ortho_file : str, optional Path to orthology file between organisms (default: None) cutoff : float Cut off value for correlation metric (default: .25) method : str or ~typing.Callable Correlation metric to use from {‘pearson’, ‘kendall’, ‘spearman’} or callable (see :meth:`~pandas.DataFrame.corr`) plot : bool Create dot plot of matches (default: True) show_all : bool Show all iModulons regardless of their linkage (default: False) Returns ------- matches: list Links and distances of connected iModulons dot: Digraph Dot graph of connected iModulons """ new_M1, new_M2 = convert_gene_index(M1, M2, ortho_file) matches = _get_orthologous_imodulons(new_M1, new_M2, method=method, cutoff=cutoff) if plot: dot = _make_dot_graph( matches, show_all=show_all, names1=M1.columns, names2=M2.columns ) return matches, dot else: return matches
#################### # BBH CSV Creation # ####################
[docs]def make_prots(gbk, out_path, lt_key="locus_tag"): """ Makes protein files for all the genes in the genbank file Parameters ---------- gbk : str Path to input genbank file out_path : str Path to the output FASTA file lt_key: str Key to search for locus_tag. Should be either 'locus_tag' or 'old_locus_tag' Returns ------- None : None """ if lt_key not in ["locus_tag", "old_locus_tag"]: raise ValueError("lt_key must either be 'locus_tag' or 'old_locus_tag'") with open(out_path, "w") as fa: for refseq in SeqIO.parse(gbk, "genbank"): recorded = set() for feats in [f for f in refseq.features if f.type == "CDS"]: try: lt = feats.qualifiers[lt_key][0] seq = feats.qualifiers["translation"][0] except KeyError: # newly annotated genes do not have old_locus_tags continue if lt in recorded: # clear the duplicates continue fa.write(">{}\n{}\n".format(lt, seq)) recorded.add(lt)
[docs]def make_prot_db(fasta_file, outname=None, combined="combined.fa"): """ Creates GenBank Databases from Protein FASTA of an organism Parameters ---------- fasta_file : str or list Path to protein FASTA file or list of paths to protein fasta files outname : str Name of BLAST database to be created. If None, it uses fasta_file name combined : str Path to combined fasta file; only used if multiple fasta files are passed Returns ------- None : None """ # if a list of fasta is passed, combine them if type(fasta_file) == list: cat_cmd = ["cat"] cat_cmd.extend(fasta_file) with open(combined, "w") as out: subprocess.call(cat_cmd, stdout=out) fasta_file = combined if ( os.path.isfile(fasta_file + ".phr") and os.path.isfile(fasta_file + ".pin") and os.path.isfile(fasta_file + ".psq") ): print("BLAST DB files already exist") return None cmd_line = ["makeblastdb", "-in", fasta_file, "-parse_seqids", "-dbtype", "prot"] if outname: cmd_line.extend(["-out", outname]) print("running makeblastdb with following command line...") print(" ".join(cmd_line)) try: subprocess.check_call(cmd_line) print("Protein DB files created successfully") except subprocess.CalledProcessError: print( "\nmakeblastdb run failed. Make sure makeblastdb is" " installed and working properly, and that the protein FASTA " "file contains no duplicate genes. View the output below to " "see what error occured:\n" ) status = subprocess.run(cmd_line, capture_output=True) print(status) return None
# noinspection PyTypeChecker
[docs]def get_bbh( db1, db2, outdir="bbh", outname=None, mincov=0.8, evalue=0.001, threads=1, force=False, savefiles=True, ): """ Runs Bidirectional Best Hit BLAST to find orthologs utilizing two protein FASTA files. Outputs a CSV file of all orthologous genes. Parameters ---------- db1 : str Path to protein FASTA file for organism 1 db2 : str Path to protein FASTA file for organism 2 outdir : str Path to output directory (default: "bbh") outname : str Name of output CSV file (default: <db1>_vs_<db2>_parsed.csv) mincov : float Minimum coverage to call hits in BLAST, must be between 0 and 1 (default: 0.8) evalue : float E-value threshold for BlAST hist (default: .001) threads : int Number of threads to use for BLAST (default: 1) force : bool If True, overwrite existing files (default: False) savefiles : bool If True, save files to 'outdir' (default: True) Returns ------- out: ~pandas.DataFrame Table of bi-directional BLAST hits between the two organisms """ # check if files exist, and vars are appropriate if not _all_clear(db1, db2, outdir, mincov): return None # get get the db names, will be used for outfile names on1 = ".".join(os.path.split(db1)[-1].split(".")[:-1]) on2 = ".".join(os.path.split(db2)[-1].split(".")[:-1]) # run and save BLAST results bres1 = os.path.join(outdir, "{}_vs_{}.txt".format(on2, on1)) bres2 = os.path.join(outdir, "{}_vs_{}.txt".format(on1, on2)) _run_blastp(db1, db2, bres1, evalue, threads, force) _run_blastp(db2, db1, bres2, evalue, threads, force) db1_lengths = _get_gene_lens(db1) db2_lengths = _get_gene_lens(db2) if not outname: outname = "{}_vs_{}_parsed.csv".format(on1, on2) out_file = os.path.join(outdir, outname) files = glob(os.path.join(outdir, "*_parsed.csv")) if not force and out_file in files: print("bbh already parsed for", on1, on2) out = pd.read_csv(out_file, index_col=0) return out print("parsing BBHs for", on1, on2) cols = [ "gene", "subject", "PID", "alnLength", "mismatchCount", "gapOpenCount", "queryStart", "queryEnd", "subjectStart", "subjectEnd", "eVal", "bitScore", ] bbh_file1 = os.path.join(outdir, "{}_vs_{}.txt".format(on1, on2)) bbh_file2 = os.path.join(outdir, "{}_vs_{}.txt".format(on2, on1)) bbh = pd.read_csv(bbh_file1, sep="\t", names=cols) bbh = pd.merge(bbh, db1_lengths) bbh["COV"] = bbh["alnLength"] / bbh["gene_length"] bbh2 = pd.read_csv(bbh_file2, sep="\t", names=cols) bbh2 = pd.merge(bbh2, db2_lengths) bbh2["COV"] = bbh2["alnLength"] / bbh2["gene_length"] # Strip "lcl|" from protein files taken from NCBI bbh.gene = bbh.gene.str.strip("lcl|") bbh.subject = bbh.subject.str.strip("lcl|") bbh2.gene = bbh2.gene.str.strip("lcl|") bbh2.subject = bbh2.subject.str.strip("lcl|") # FILTER GENES THAT HAVE COVERAGE < mincov bbh = bbh[bbh.COV >= mincov] bbh2 = bbh2[bbh2.COV >= mincov] list2struct = [] # find if genes are directionally best hits of each other for g in bbh.gene.unique(): res = bbh[bbh.gene == g] if len(res) == 0: continue # find BLAST hit with highest percent identity (PID) best_hit = res.loc[res.PID.idxmax()].copy() res2 = bbh2[bbh2.gene == best_hit.subject] if len(res2) == 0: # no match continue # find BLAST hit with higest PID in the reciprocal BLAST best_gene2 = res2.loc[res2.PID.idxmax(), "subject"] # if doing forward then reciprocal BLAST nets the same gene -> BBH if g == best_gene2: best_hit["BBH"] = "<=>" else: # only best hit in one direction best_hit["BBH"] = "->" list2struct.append(best_hit) out = pd.DataFrame(list2struct) out = out[out["BBH"] == "<=>"] if savefiles: print("Saving results to: " + out_file) out.to_csv(out_file) else: os.remove(bbh_file1) os.remove(bbh_file2) return out
[docs]def _get_gene_lens(file_in): """ Computes gene lengths Parameters ---------- file_in : str Input file path Returns ------- out : ~pandas.DataFrame Table of gene lengths """ handle = open(file_in) records = SeqIO.parse(handle, "fasta") out = [] for record in records: out.append({"gene": record.name, "gene_length": len(record.seq)}) out = pd.DataFrame(out) return out
[docs]def _run_blastp(db1, db2, out, evalue, threads, force): """ Runs BLASTP between two organisms Parameters ---------- db1 : str Path to protein FASTA file for organism 1 db2 : str Path to protein FASTA file for organism 2 out : str Path for BLASTP output evalue : float E-value threshold for BlAST hits threads : int Number of threads to use for BLAST force : bool If True, overwrite existing files Returns ------- out : str Path of BLASTP output """ if not force and os.path.isfile(out): print(db1, " already blasted") return out print("blasting {} vs {}".format(db1, db2)) cmd_line = [ "blastp", "-db", db1, "-query", db2, "-out", out, "-evalue", str(evalue), "-outfmt", "6", "-num_threads", str(threads), ] print("running blastp with following command line...") print(" ".join(cmd_line)) try: subprocess.check_call(cmd_line) except subprocess.CalledProcessError as err: print("BLAST run failed. Make sure BLAST is" " installed and working properly.") raise err return out
[docs]def _all_clear(db1, db2, outdir, mincov): """Checks inputs are acceptable""" if not 0 < mincov <= 1: print("Coverage must be greater than 0 and less than or equal to 1") return None if not (os.path.isfile(db1) and os.path.isfile(db2)): print("One of the fasta file is missing") return None for i in [".phr", ".pin", ".psq"]: if not os.path.isfile(db1 + i) or not os.path.isfile(db2 + i): print("Some of the BLAST db files are missing") return None if not os.path.isdir(outdir): print("Making the output directory: " + outdir) os.mkdir(outdir) return True
[docs]def _same_output(df1, df2): """Checks that inputs are the same""" df1 = df1.reset_index(drop=True) df2 = df2.reset_index(drop=True) if all(df1.eq(df2)): print("The two outputs are the same.") return True elif all(df1.eq(df2.rename(columns={"subject": "gene", "gene": "subject"}))): print( "The two outputs are the same, " "but the genes and subject are switched." ) return True else: print("The two outputs are not the same.") return False