"""Utilities for parsing the TRRUST database."""# %%# Graphein# Author: Ramon Vinas, Arian Jamasb <arian@jamasb.io># License: MIT# Project Website: https://github.com/a-r-j/graphein# Code Repository: https://github.com/a-r-j/grapheinimportloggingimportosfromfunctoolsimportlru_cachefrompathlibimportPathfromtypingimportCallable,List,Optionalimportpandasaspdimportwgetfromgraphein.utils.utilsimportfilter_dataframelog=logging.getLogger(__name__)def_download_TRRUST(root_dir:Optional[Path]=None)->str:""" Downloads TRRUST from https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv :param root_dir: Path to desired output directory to download TRRUST to. Defaults to None (downloads to graphein/datasets/trrust/) :type root_dir: pathlib.Path, optional :returns: Path to downloaded TRRUST Dataset. :rtype: str """url="https://www.grnpedia.org/trrust/data/trrust_rawdata.human.tsv"ifroot_dirisNone:root_dir=Path(__file__).parent.parent.parenttrrust_dir=f"{root_dir}/datasets/trrust"Path(trrust_dir).mkdir(parents=False,exist_ok=True)file=f"{trrust_dir}/human.tsv"# Download dataifnotos.path.exists(file):log.info("Downloading TRRUST ...")wget.download(url,file)returnfile
@lru_cache()
def load_TRRUST(root_dir: Optional[Path] = None) -> pd.DataFrame:
    """
    Reads the TRRUST data file into a dataframe.

    The file location is obtained from ``_download_TRRUST``. The result is
    memoised with ``lru_cache``, so repeated calls with the same ``root_dir``
    return the same dataframe object.

    :param root_dir: Root directory path to either find or download TRRUST
    :type root_dir: pathlib.Path, optional
    :returns: TRRUST database as a dataframe with columns ``g1``, ``g2``,
        ``regtype`` and ``references``
    :rtype: pd.DataFrame
    """
    # The raw file has no header row, so column names are supplied here.
    column_names = ["g1", "g2", "regtype", "references"]
    tsv_path = _download_TRRUST(root_dir)
    trrust = pd.read_csv(
        tsv_path,
        delimiter="\t",
        header=None,
        names=column_names,
    )
    return trrust
def parse_TRRUST(
    gene_list: List[str], root_dir: Optional[Path] = None
) -> pd.DataFrame:
    """
    Parser for TRRUST regulatory interactions.

    Loads the TRRUST table via ``load_TRRUST`` and keeps only the rows whose
    ``g1`` and ``g2`` entries are both contained in ``gene_list``.

    :param gene_list: List of gene identifiers to restrict dataframe to.
    :type gene_list: List[str]
    :param root_dir: Root directory path to either find or download TRRUST.
        Defaults to None.
    :type root_dir: pathlib.Path, optional
    :returns: Pandas dataframe with the regulatory interactions between genes
        in the gene list
    :rtype: pd.DataFrame
    """
    trrust = load_TRRUST(root_dir=root_dir)

    # A row is kept only if both of its genes were requested.
    g1_selected = trrust["g1"].isin(gene_list)
    g2_selected = trrust["g2"].isin(gene_list)
    return trrust[g1_selected & g2_selected]
def filter_TRRUST(
    df: pd.DataFrame, funcs: Optional[List[Callable]]
) -> pd.DataFrame:
    """
    Filters a TRRUST dataframe with user-supplied functions.

    :param df: Source specific Pandas dataframe (TRRUST) to filter.
    :type df: pd.DataFrame
    :param funcs: User functions to filter the results. If None, ``df`` is
        returned as-is; otherwise ``df`` and ``funcs`` are handed to
        ``filter_dataframe``.
    :type funcs: List[Callable], optional
    :return: Source specific Pandas dataframe with filtered results
    :rtype: pd.DataFrame
    """
    # Nothing to apply: hand back the very same object.
    if funcs is None:
        return df
    return filter_dataframe(df, funcs)
def standardise_TRRUST(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises a TRRUST dataframe.

    Keeps the ``g1``, ``g2`` and ``regtype`` columns, recodes the regulation
    type (``"Activation"`` -> ``"+"``, ``"Repression"`` -> ``"-"``,
    ``"Unknown"`` -> ``"?"``) and adds a ``source`` column set to
    ``"TRRUST"``. The input dataframe is not modified.

    :param df: pd.Dataframe to standardise. Must contain columns:
        ["g1", "g2", "regtype"]
    :type df: pd.DataFrame
    :return: processed dataframe with columns
        ["g1", "g2", "regtype", "source"]
    :rtype: pd.DataFrame
    """
    # Select the relevant columns. The explicit copy makes the assignments
    # below operate on an independent frame (no SettingWithCopyWarning).
    df = df[["g1", "g2", "regtype"]].copy()

    # Rename type of regulatory interaction. Assign the result back instead
    # of calling replace(inplace=True) on df["regtype"]: that chained in-place
    # call acts on a temporary Series and has no effect under pandas
    # Copy-on-Write.
    df["regtype"] = df["regtype"].replace(
        {"Activation": "+", "Repression": "-", "Unknown": "?"}
    )

    # Add source column
    df["source"] = "TRRUST"
    return df
def TRRUST_df(
    gene_list: List[str],
    filtering_funcs: Optional[List[Callable]] = None,
    root_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Generates standardised dataframe with TRRUST regulatory interactions,
    filtered according to user's input.

    Runs ``parse_TRRUST``, ``filter_TRRUST`` and ``standardise_TRRUST`` in
    that order.

    :param gene_list: List of gene identifiers to restrict the interactions
        to.
    :type gene_list: List[str]
    :param filtering_funcs: Functions with which to filter the dataframe.
        Defaults to None (no filtering).
    :type filtering_funcs: List[Callable], optional
    :param root_dir: Root directory path to either find or download TRRUST;
        forwarded to ``parse_TRRUST``. Defaults to None.
    :type root_dir: pathlib.Path, optional
    :return: Standardised dataframe with TRRUST interactions
    :rtype: pd.DataFrame
    """
    df = parse_TRRUST(gene_list=gene_list, root_dir=root_dir)
    df = filter_TRRUST(df, filtering_funcs)
    return standardise_TRRUST(df)