# Source code for graphein.grn.parse_trrust

"""Utilities for parsing the TRRUST database."""
# %%
# Graphein
# Author: Ramon Vinas, Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging
import os
from functools import lru_cache
from pathlib import Path
from typing import Callable, List, Optional

import pandas as pd
import wget

from graphein.utils.utils import filter_dataframe

log = logging.getLogger(__name__)


def _download_TRRUST(root_dir: Optional[Path] = None) -> str:
    """
    Ensures a local copy of the human TRRUST raw data exists, downloading it from
    https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv when it is missing.

    The file is stored at ``<root_dir>/datasets/trrust/human.tsv``. Missing
    directories along that path are created. An existing file is reused as-is and
    is never re-downloaded.

    :param root_dir: Directory under which ``datasets/trrust/`` is created. Defaults to None (the directory three levels above this module file).
    :type root_dir: pathlib.Path, optional
    :returns: Path to the local TRRUST dataset file.
    :rtype: str
    """
    url = "https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv"

    if root_dir is None:
        root_dir = Path(__file__).parent.parent.parent
    trrust_dir = f"{root_dir}/datasets/trrust"
    # parents=True: the intermediate ``datasets`` directory may not exist yet
    # (e.g. first use, or a user-supplied root_dir); parents=False would raise
    # FileNotFoundError in that case.
    Path(trrust_dir).mkdir(parents=True, exist_ok=True)
    file = f"{trrust_dir}/human.tsv"

    # Download only when no local copy is present
    if not os.path.exists(file):
        log.info("Downloading TRRUST ...")
        wget.download(url, file)

    return file


@lru_cache()
def load_TRRUST(root_dir: Optional[Path] = None) -> pd.DataFrame:
    """
    Loads the TRRUST datafile. If file not found, it is downloaded.

    The result is memoised per ``root_dir`` via ``functools.lru_cache``, so
    repeated calls with the same argument return the *same* DataFrame object.
    Copy it before modifying it in place, otherwise later callers see the changes.

    :param root_dir: Root directory path to either find or download TRRUST. Must be hashable because it is used as the cache key.
    :type root_dir: pathlib.Path, optional
    :returns: TRRUST database as a dataframe with columns ``g1``, ``g2``, ``regtype`` and ``references``
    :rtype: pd.DataFrame
    """
    file = _download_TRRUST(root_dir)

    # The raw file is tab-separated and has no header row, so column names
    # are supplied explicitly.
    return pd.read_csv(
        file,
        delimiter="\t",
        header=None,
        names=["g1", "g2", "regtype", "references"],
    )


def parse_TRRUST(
    gene_list: List[str], root_dir: Optional[Path] = None
) -> pd.DataFrame:
    """
    Parser for TRRUST regulatory interactions. If the TRRUST dataset is not found in the specified root_dir, it is downloaded.

    :param gene_list: List of gene identifiers to restrict dataframe to.
    :type gene_list: List[str]
    :param root_dir: Root directory path to either find or download TRRUST. Defaults to None (the default location used by ``load_TRRUST``)
    :type root_dir: pathlib.Path, optional
    :returns: Pandas dataframe with the regulatory interactions between genes in the gene list
    :rtype: pd.DataFrame
    """
    df = load_TRRUST(root_dir=root_dir)

    # Keep a row only when BOTH of its genes (g1 and g2) are in gene_list
    keep = df["g1"].isin(gene_list) & df["g2"].isin(gene_list)

    return df[keep]


def filter_TRRUST(
    df: pd.DataFrame, funcs: Optional[List[Callable]] = None
) -> pd.DataFrame:
    """
    Filters a TRRUST dataframe with user-supplied functions.

    :param df: Source specific Pandas dataframe (TRRUST) to filter
    :type df: pd.DataFrame
    :param funcs: User functions to filter the results; they are handed to ``filter_dataframe``. Defaults to None, in which case ``df`` is returned unchanged.
    :type funcs: List[Callable], optional
    :return: Source specific Pandas dataframe with filtered results
    :rtype: pd.DataFrame
    """
    # Nothing to apply: hand back the very same object
    if funcs is None:
        return df

    return filter_dataframe(df, funcs)


def standardise_TRRUST(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises a TRRUST dataframe: keeps only the ``g1``, ``g2`` and ``regtype``
    columns, recodes the regulation type as a symbol and adds a ``source`` column.

    ``regtype`` values are mapped ``"Activation" -> "+"``, ``"Repression" -> "-"``
    and ``"Unknown" -> "?"``; any other value is left untouched. The input
    dataframe is not modified.

    :param df: pd.Dataframe to standardise. Must contain columns: ["g1", "g2", "regtype"]
    :type df: pd.DataFrame
    :return: New dataframe with columns ["g1", "g2", "regtype", "source"]
    :rtype: pd.DataFrame
    """
    # Select the relevant columns into an explicit copy, so the assignments
    # below neither touch the caller's frame nor raise chained-assignment warnings.
    df = df[["g1", "g2", "regtype"]].copy()

    # Rename type of regulatory interaction. Assign the result back instead of
    # calling ``replace(..., inplace=True)`` on a column selection: the chained
    # in-place form does not update ``df`` under pandas Copy-on-Write.
    df["regtype"] = df["regtype"].replace(
        {"Activation": "+", "Repression": "-", "Unknown": "?"}
    )

    # Add source column
    df["source"] = "TRRUST"

    return df


def TRRUST_df(
    gene_list: List[str],
    filtering_funcs: Optional[List[Callable]] = None,
    root_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Generates standardised dataframe with TRRUST regulatory interactions, filtered according to user's input.

    Runs the three steps in order: ``parse_TRRUST`` (restrict to ``gene_list``),
    ``filter_TRRUST`` (apply ``filtering_funcs``) and ``standardise_TRRUST``.

    :param gene_list: List of gene identifiers to restrict the interactions to.
    :type gene_list: List[str]
    :param filtering_funcs: Functions with which to filter the dataframe. Defaults to None (no filtering).
    :type filtering_funcs: List[Callable], optional
    :param root_dir: Root directory path to either find or download TRRUST; passed through to ``parse_TRRUST``. Defaults to None.
    :type root_dir: pathlib.Path, optional
    :return: Standardised dataframe with TRRUST interactions
    :rtype: pd.DataFrame
    """
    df = parse_TRRUST(gene_list=gene_list, root_dir=root_dir)
    df = filter_TRRUST(df, filtering_funcs)
    df = standardise_TRRUST(df)

    return df


if __name__ == "__main__":
    # Manual smoke run: build the standardised table for a handful of genes
    # and show its first rows.
    example_genes = ["AATF", "MYC", "USF1", "SP1", "TP53", "DUSP1"]
    print(TRRUST_df(example_genes).head())