# Source code for graphein.grn.parse_trrust

"""Utilities for parsing the TRRUST database."""
# %%
# Graphein
# Author: Ramon Vinas, Arian Jamasb <>
# License: MIT
# Project Website:
# Code Repository:
import logging
import os
from functools import lru_cache
from pathlib import Path
from typing import Callable, List, Optional

import pandas as pd
import wget

from graphein.utils.utils import filter_dataframe

log = logging.getLogger(__name__)

def _download_TRRUST(root_dir: Optional[Path] = None) -> str:
    """
    Downloads the human TRRUST raw data file unless it is already on disk.

    The file is stored at ``<root_dir>/datasets/trrust/human.tsv``. Missing
    directories on that path are created.

    :param root_dir: Path to desired output directory to download TRRUST to.
        Defaults to ``None``, in which case the directory three levels above
        this module is used.
    :type root_dir: pathlib.Path, optional
    :returns: Path to downloaded TRRUST Dataset.
    :rtype: str
    """
    # NOTE(review): the URL was blank in the reviewed copy of this file; this
    # is believed to be the TRRUST human raw-data location -- confirm.
    url = "https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv"

    if root_dir is None:
        root_dir = Path(__file__).parent.parent.parent
    trrust_dir = f"{root_dir}/datasets/trrust"
    # parents=True: the intermediate "datasets" directory may not exist yet,
    # e.g. on a fresh install or with a user-supplied root_dir.
    Path(trrust_dir).mkdir(parents=True, exist_ok=True)
    file = f"{trrust_dir}/human.tsv"

    # Download data only when there is no local copy.
    if not os.path.exists(file):
        log.info("Downloading TRRUST ...")
        wget.download(url, file)

    return file

@lru_cache()
def load_TRRUST(root_dir: Optional[Path] = None) -> pd.DataFrame:
    """
    Loads the TRRUST datafile. If file not found, it is downloaded.

    The result is memoised per ``root_dir`` via ``lru_cache``, so repeated
    calls return the same dataframe object. Callers should not modify the
    returned frame in place.

    :param root_dir: Root directory path to either find or download TRRUST
    :type root_dir: pathlib.Path, optional
    :returns: TRRUST database as a dataframe with columns
        ``["g1", "g2", "regtype", "references"]``
    :rtype: pd.DataFrame
    """
    file = _download_TRRUST(root_dir)
    # The file is read as tab-separated with no header row, so column names
    # are supplied here.
    return pd.read_csv(
        file,
        delimiter="\t",
        header=None,
        names=["g1", "g2", "regtype", "references"],
    )
def parse_TRRUST(
    gene_list: List[str], root_dir: Optional[Path] = None
) -> pd.DataFrame:
    """
    Parser for TRRUST regulatory interactions. If the TRRUST dataset is not
    found in the specified root_dir, it is downloaded.

    :param gene_list: List of gene identifiers to restrict dataframe to.
    :type gene_list: List[str]
    :param root_dir: Root directory path to either find or download TRRUST.
        Defaults to ``None``.
    :type root_dir: pathlib.Path, optional
    :returns: Pandas dataframe with the regulatory interactions between genes
        in the gene list
    :rtype: pd.DataFrame
    """
    df = load_TRRUST(root_dir=root_dir)

    # Select input genes: keep a row only when BOTH endpoints are requested.
    both_in_list = df["g1"].isin(gene_list) & df["g2"].isin(gene_list)
    df = df[both_in_list]

    return df
def filter_TRRUST(
    df: pd.DataFrame, funcs: Optional[List[Callable]] = None
) -> pd.DataFrame:
    """
    Filters results of TRRUST call according to user-supplied functions.

    :param df: Source specific Pandas dataframe (TRRUST) with results of the
        API call
    :type df: pd.DataFrame
    :param funcs: User functions to filter the results. When ``None`` the
        dataframe is returned unchanged.
    :type funcs: List[Callable], optional
    :return: Source specific Pandas dataframe with filtered results
    :rtype: pd.DataFrame
    """
    if funcs is None:
        return df
    return filter_dataframe(df, funcs)
def standardise_TRRUST(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises a TRRUST dataframe.

    Keeps only the ``g1``, ``g2`` and ``regtype`` columns, recodes the
    regulation type (``"Activation"`` -> ``"+"``, ``"Repression"`` -> ``"-"``,
    ``"Unknown"`` -> ``"?"``) and adds a ``source`` column set to ``"TRRUST"``.
    The input dataframe is not modified.

    :param df: pd.Dataframe to standardise. Must contain columns:
        ["g1", "g2", "regtype"]
    :type df: pd.DataFrame
    :return: processed dataframe with columns
        ["g1", "g2", "regtype", "source"]
    :rtype: pd.DataFrame
    """
    # Rename & delete columns. The explicit copy makes the writes below land
    # on a frame we own, not on a slice of the caller's data.
    df = df[["g1", "g2", "regtype"]].copy()

    # Rename type of regulatory interaction. The result is assigned back
    # because an in-place replace on a selected column is chained assignment,
    # which has no effect under pandas Copy-on-Write.
    df["regtype"] = df["regtype"].replace(
        {"Activation": "+", "Repression": "-", "Unknown": "?"}
    )

    # Add source column
    df["source"] = "TRRUST"

    return df
def TRRUST_df(
    gene_list: List[str],
    filtering_funcs: Optional[List[Callable]] = None,
    root_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Generates standardised dataframe with TRRUST regulatory interactions,
    filtered according to user's input.

    Runs the three pipeline steps in order: parse (restrict to
    ``gene_list``), filter (apply ``filtering_funcs``) and standardise.

    :param gene_list: List of gene identifiers to restrict the interactions
        to; both genes of an interaction must be in the list.
    :type gene_list: List[str]
    :param filtering_funcs: Functions with which to filter the dataframe.
        Defaults to ``None`` (no filtering).
    :type filtering_funcs: List[Callable], optional
    :param root_dir: Root directory path to either find or download TRRUST.
        Defaults to ``None``.
    :type root_dir: pathlib.Path, optional
    :return: Standardised dataframe with TRRUST interactions
    :rtype: pd.DataFrame
    """
    df = parse_TRRUST(gene_list=gene_list, root_dir=root_dir)
    df = filter_TRRUST(df, filtering_funcs)
    df = standardise_TRRUST(df)

    return df
if __name__ == "__main__":
    # Smoke run: build the standardised TRRUST frame for a handful of genes
    # and show its first rows.
    df = TRRUST_df(["AATF", "MYC", "USF1", "SP1", "TP53", "DUSP1"])
    print(df.head())