"""
Utility functions for gene annotation
"""
import logging
import re
import urllib
import urllib.parse
import urllib.request
from io import StringIO

import pandas as pd
def cog2str(cog):
    """
    Get the full description for a COG category letter.

    Parameters
    ----------
    cog : str
        COG category letter (one of "A"-"Z")

    Returns
    -------
    str
        Description of the COG category

    Raises
    ------
    KeyError
        If ``cog`` is not a recognized COG category letter
    """
    cog_dict = {
        "A": "RNA processing and modification",
        "B": "Chromatin structure and dynamics",
        "C": "Energy production and conversion",
        "D": "Cell cycle control, cell division, chromosome partitioning",
        "E": "Amino acid transport and metabolism",
        "F": "Nucleotide transport and metabolism",
        "G": "Carbohydrate transport and metabolism",
        "H": "Coenzyme transport and metabolism",
        "I": "Lipid transport and metabolism",
        "J": "Translation, ribosomal structure and biogenesis",
        "K": "Transcription",
        "L": "Replication, recombination and repair",
        "M": "Cell wall/membrane/envelope biogenesis",
        "N": "Cell motility",
        "O": "Post-translational modification, protein turnover, and chaperones",
        "P": "Inorganic ion transport and metabolism",
        "Q": "Secondary metabolites biosynthesis, transport, and catabolism",
        "R": "General function prediction only",
        "S": "Function unknown",
        "T": "Signal transduction mechanisms",
        "U": "Intracellular trafficking, secretion, and vesicular transport",
        "V": "Defense mechanisms",
        "W": "Extracellular structures",
        "X": "No COG annotation",
        "Y": "Nuclear structure",
        "Z": "Cytoskeleton",
    }
    return cog_dict[cog]
[docs]def _get_attr(attributes, attr_id, ignore=False):
"""
Helper function for parsing GFF annotations
Parameters
----------
attributes : str
Attribute string
attr_id : str
Attribute ID
ignore : bool
If true, ignore errors if ID is not in attributes (default: False)
Returns
-------
str, optional
Value of attribute
"""
try:
return re.search(attr_id + "=(.*?)(;|$)", attributes).group(1)
except AttributeError:
if ignore:
return None
else:
raise ValueError("{} not in attributes: {}".format(attr_id, attributes))
def gff2pandas(gff_file, feature="CDS", index=None):
    """
    Convert GFF file(s) to a Pandas DataFrame.

    Parameters
    ----------
    gff_file : str or list
        Path(s) to GFF file
    feature : str or list
        Name(s) of features to keep (default = "CDS")
    index : str, optional
        Column or attribute to use as index

    Returns
    -------
    df_gff : ~pandas.DataFrame
        GFF formatted as a DataFrame
    """
    # Normalize arguments so a single path / single feature behaves like a list
    if isinstance(gff_file, str):
        gff_file = [gff_file]
    if isinstance(feature, str):
        feature = [feature]

    # Standard 9 columns of a GFF3 file
    names = [
        "accession",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "phase",
        "attributes",
    ]

    result = []
    for gff in gff_file:
        # Count comment lines so read_csv can skip the header.
        # NOTE(review): this counts "#" lines anywhere in the file; assumes
        # comments appear only in the header block (true for typical NCBI GFFs).
        with open(gff, "r") as f:
            skiprow = sum(line.startswith("#") for line in f)

        DF_gff = pd.read_csv(gff, sep="\t", skiprows=skiprow, names=names, header=None)

        # Keep only the requested feature types. copy() so the column
        # assignments below operate on an independent frame rather than a
        # view of DF_gff (avoids SettingWithCopyWarning / silent no-ops).
        DF_cds = DF_gff[DF_gff.feature.isin(feature)].copy()

        # "gene" rows carry old_locus_tag; extract it so it can be merged
        # into the feature table by locus_tag
        DF_gene = DF_gff[DF_gff.feature == "gene"].reset_index()
        DF_gene["locus_tag"] = DF_gene.attributes.apply(
            _get_attr, attr_id="locus_tag", ignore=True
        )
        DF_gene["old_locus_tag"] = DF_gene.attributes.apply(
            _get_attr, attr_id="old_locus_tag", ignore=True
        )
        DF_gene = DF_gene[["locus_tag", "old_locus_tag"]]
        DF_gene = DF_gene[DF_gene.locus_tag.notnull()]

        # Sort by start position
        DF_cds = DF_cds.sort_values("start")

        # Extract common attributes into dedicated columns
        DF_cds["locus_tag"] = DF_cds.attributes.apply(_get_attr, attr_id="locus_tag")
        DF_cds["gene_name"] = DF_cds.attributes.apply(
            _get_attr, attr_id="gene", ignore=True
        )
        DF_cds["gene_product"] = DF_cds.attributes.apply(
            _get_attr, attr_id="product", ignore=True
        )
        DF_cds["ncbi_protein"] = DF_cds.attributes.apply(
            _get_attr, attr_id="protein_id", ignore=True
        )

        # Merge in old_locus_tag from the gene rows
        DF_cds = pd.merge(DF_cds, DF_gene, how="left", on="locus_tag", sort=False)

        result.append(DF_cds)

    DF_gff = pd.concat(result)

    if index:
        # Drop duplicate index values, warning the user
        if DF_gff[index].duplicated().any():
            logging.warning("Duplicate {} detected. Dropping duplicates.".format(index))
            DF_gff = DF_gff.drop_duplicates(index)
        # Use the requested column as the index (was hard-coded to
        # "locus_tag", contradicting the documented `index` parameter)
        DF_gff.set_index(index, drop=True, inplace=True)

    return DF_gff
##############
# ID Mapping #
##############
def uniprot_id_mapping(
    prot_list,
    input_id="ACC+ID",
    output_id="P_REFSEQ_AC",
    input_name="input_id",
    output_name="output_id",
):
    """
    Python wrapper for the UniProt ID mapping tool (see
    https://www.uniprot.org/uploadlists/).

    NOTE(review): this legacy "uploadlists" endpoint was retired by UniProt
    (mid-2022) in favor of the REST ID-mapping API at
    https://rest.uniprot.org/idmapping/ -- confirm the endpoint still
    responds before relying on this function.

    Parameters
    ----------
    prot_list : list
        List of proteins to be mapped
    input_id : str
        ID type for the mapping input (default: "ACC+ID")
    output_id : str
        ID type for the mapping output (default: "P_REFSEQ_AC")
    input_name : str
        Column name for input IDs
    output_name : str
        Column name for output IDs

    Returns
    -------
    mapping : ~pandas.DataFrame
        Table containing two columns, one listing the inputs, and one listing
        the mapped outputs. Column names are defined by input_name and
        output_name.
    """
    url = "https://www.uniprot.org/uploadlists/"

    # Build the form-encoded query; the API expects a space-separated ID list
    params = {
        "from": input_id,
        "to": output_id,
        "format": "tab",
        "query": " ".join(prot_list),
    }
    data = urllib.parse.urlencode(params).encode("utf-8")

    # Send the mapping request to UniProt
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    # Parse the tab-separated response into a two-column DataFrame
    text = StringIO(response.decode("utf-8"))
    mapping = pd.read_csv(text, sep="\t", header=0, names=[input_name, output_name])

    # Keep only one (deterministically chosen) output ID per input
    mapping = mapping.sort_values(output_name).drop_duplicates(input_name)
    return mapping