Source code for graphein.ppi.parse_biogrid

"""Functions for making and parsing API calls to BIOGRID."""
# %%
# Graphein
# Author: Ramon Vinas, Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging
from io import StringIO
from typing import Dict, List, Union

import pandas as pd
import requests

log = logging.getLogger(__name__)


def params_BIOGRID(
    params: Dict[str, Union[str, int, List[str], List[int]]], **kwargs
) -> Dict[str, Union[str, int]]:
    """
    Updates default parameters with user parameters for the method
    "interactions" of the BIOGRID API REST.

    ``params`` is updated in place and also returned. Keyword arguments whose
    name is not ``BIOGRID_<field>`` for one of the supported fields are
    ignored. List (or tuple) values are serialised as a ``|``-separated
    string; their elements are converted with ``str`` so integer lists (e.g.
    PubMed IDs) are accepted.

    See also https://wiki.thebiogrid.org/doku.php/biogridrest

    :param params: Dictionary of default parameters
    :type params: Dict[str, Union[str, int, List[str], List[int]]]
    :param kwargs: User parameters for the method "interactions" of the
        BIOGRID API REST. The key must start with "BIOGRID_"
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Dictionary of parameters
    :rtype: Dict[str, Union[str, int]]
    """
    fields = (
        "searchNames",  # If 'true', the interactor OFFICIAL_SYMBOL will be examined for a match
        # with the geneList.
        "max",  # Number of results to fetch
        "interSpeciesExcluded",  # If 'true', interactions with interactors from different species will
        # be excluded.
        "selfInteractionsExcluded",  # If 'true', interactions with one interactor will be excluded.
        "evidenceList",  # Any interaction evidence with its Experimental System in the list will be excluded
        # from the results unless includeEvidence is set to true.
        "includeEvidence",  # If set to true, any interaction evidence with its Experimental System in the
        # evidenceList will be included in the result
        "searchIds",  # If 'true', the interactor ENTREZ_GENE, ORDERED LOCUS and SYSTEMATIC_NAME (orf) will
        # be examined for a match with the geneList.
        "searchSynonyms",  # If 'true', the interactor SYNONYMS will be examined for a match with
        # the geneList.
        "searchBiogridIds",  # If 'true', the entries in 'GENELIST' will be compared to BIOGRID internal IDS
        # which are provided in all Tab2 formatted files.
        "additionalIdentifierTypes",  # Identifier types on this list are examined for a match with
        # the geneList.
        "excludeGenes",  # If 'true', interactions containing genes in the geneList will be excluded from the
        # results.
        "includeInteractors",  # If 'true', in addition to interactions between genes on the geneList,
        # interactions will also be fetched which have only one interactor on
        # the geneList
        "includeInteractorInteractions",  # If 'true' interactions between the geneList's first order
        # interactors will be included.
        "pubmedList",  # Interactions will be fetched whose Pubmed Id is/ is not in this list, depending on
        # the value of excludePubmeds.
        "excludePubmeds",  # If 'false', interactions with Pubmed ID in pubmedList will be included in the
        # results; if 'true' they will be excluded.
        "htpThreshold",  # Interactions whose Pubmed ID has more than this number of interactions will be
        # excluded from the results. Ignored if excludePubmeds is 'false'.
        "throughputTag",  # If set to 'low or 'high', only interactions with 'Low throughput' or
        # 'High throughput' in the 'throughput' field will be returned.
    )
    for field in fields:
        kwarg_name = "BIOGRID_" + field
        if kwarg_name not in kwargs:
            continue
        value = kwargs[kwarg_name]
        if isinstance(value, (list, tuple)):
            # str() each element: "|".join would raise TypeError on ints.
            value = "|".join(str(item) for item in value)
        params[field] = value
    return params


def parse_BIOGRID(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    paginate: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Makes BIOGRID API call and returns a source specific Pandas dataframe.

    See also [1] BIOGRID: https://wiki.thebiogrid.org/doku.php/biogridrest

    :param protein_list: Proteins to include in the graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifier(s) for the organism, e.g.
        9606 for Homo Sapiens. A list is sent as a ``|``-separated string.
    :type ncbi_taxon_id: Union[int, str, List[int], List[str]]
    :param paginate: boolean indicating whether to paginate the calls (for
        BIOGRID, the maximum number of rows per call is 10000). Defaults to
        True
    :type paginate: bool
    :param kwargs: Parameters of the "interactions" method of the BIOGRID API
        REST, used to select the results. The parameter names are of the form
        BIOGRID_<param>, where <param> is the name of the parameter.
        Information about these parameters can be found at [1].
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Source specific Pandas dataframe.
    :rtype: pd.DataFrame
    :raises requests.HTTPError: if BIOGRID answers with an HTTP error status.
    """
    # Prepare call to BIOGRID API
    biogrid_api_url = "https://webservice.thebiogrid.org"
    method = "interactions"
    request_url = "/".join([biogrid_api_url, method])

    if isinstance(ncbi_taxon_id, (list, tuple)):
        ncbi_taxon_id = "|".join(str(t) for t in ncbi_taxon_id)

    params = {
        # Default parameters
        "geneList": "|".join(protein_list),
        # NOTE(review): access key is hard-coded in source; consider loading
        # it from configuration or the environment instead.
        "accesskey": "c4ab86373e0bb921a878bb6d15ee4fb4",
        "taxId": ncbi_taxon_id,  # 9606 is human
        "format": "json",
        "max": 10000,  # Number of results to fetch
        "searchNames": "true",
        "includeInteractors": "false",  # Set to true to get any interaction involving EITHER gene,
        # set to false to get interactions between genes
        "selfInteractionsExcluded": "true",  # If 'true', interactions with one interactor will be excluded
    }
    params = params_BIOGRID(params, **kwargs)

    # The user may have supplied BIOGRID_max as a string; compare row counts
    # against an int so that pagination is not silently disabled.
    page_size = int(params["max"])

    def make_call(start: int) -> pd.DataFrame:
        """
        Fetches one page of BIOGRID results.

        :param start: offset of the first result to fetch
        :type start: int
        :return: pd.DataFrame with one row per interaction of this page
        :rtype: pd.DataFrame
        """
        params["start"] = start
        response = requests.post(request_url, data=params)
        # Surface HTTP failures directly instead of as a JSON parsing error.
        response.raise_for_status()
        # Wrap in StringIO: passing literal JSON to read_json is deprecated.
        return pd.read_json(StringIO(response.text.strip())).transpose()

    # Call BIOGRID. A full page means more results may be available, so keep
    # requesting the next offset until a page comes back short.
    pages = []
    start = 0
    while True:
        page = make_call(start)
        pages.append(page)
        more_expected = page_size > 0 and page.shape[0] == page_size
        if not (paginate and more_expected):
            break
        start += page_size

    if len(pages) == 1:
        return pages[0]
    return pd.concat(pages)


def filter_BIOGRID(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Filters results of the BIOGRID API call according to user kwargs.

    Currently a placeholder: the dataframe is handed back untouched and
    ``kwargs`` are not inspected.

    :param df: Source specific Pandas dataframe (BIOGRID) with results of the
        API call
    :type df: pd.DataFrame
    :param kwargs: User thresholds used to filter the results. The parameter
        names are of the form BIOGRID_<param>, where <param> is the name of
        the parameter. All the parameters are numerical values.
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Source specific Pandas dataframe with filtered results
    :rtype: pd.DataFrame
    """
    # Note: BIOGRID interactions are meant to be filtered through the API
    # parameters, see https://wiki.thebiogrid.org/doku.php/biogridrest
    # TODO: Confirm the API parameters cover all user-side filtering needs;
    #       if they do not, implement the filtering here.
    # TODO: Candidate filters: EXPERIMENTAL_SYSTEM (e.g. Co-fractionation)
    #       and EXPERIMENTAL_SYSTEM_TYPE (e.g. physical)
    unfiltered = df
    return unfiltered


def standardise_BIOGRID(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises BIOGRID dataframe, e.g. puts everything into a common format.

    The columns ``OFFICIAL_SYMBOL_A`` and ``OFFICIAL_SYMBOL_B`` become ``p1``
    and ``p2``, every other column is dropped and a constant ``source``
    column with the value ``"BIOGRID"`` is appended. The input dataframe is
    not modified.

    :param df: Source specific Pandas dataframe
    :type df: pd.DataFrame
    :return: Standardised dataframe with columns ``p1``, ``p2``, ``source``
    :rtype: pd.DataFrame
    """
    if df.empty:
        return pd.DataFrame({"p1": [], "p2": [], "source": []})

    # Rename & delete columns
    renamed = df.rename(
        columns={"OFFICIAL_SYMBOL_A": "p1", "OFFICIAL_SYMBOL_B": "p2"}
    )

    # Add source column. assign() builds a new frame, so no value is written
    # into a column slice (which would raise SettingWithCopyWarning).
    return renamed[["p1", "p2"]].assign(source="BIOGRID")


def BIOGRID_df(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    **kwargs,
) -> pd.DataFrame:
    """
    Generates standardised dataframe with BIOGRID protein-protein
    interactions, filtered according to user's input.

    Runs the three pipeline stages in order: ``parse_BIOGRID`` (API call),
    ``filter_BIOGRID`` and ``standardise_BIOGRID``.

    :param protein_list: List of proteins (official symbol) that will be
        included in the PPI graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifiers for the organism. 9606
        corresponds to Homo Sapiens
    :type ncbi_taxon_id: Union[int, str, List[int], List[str]]
    :param kwargs: Additional parameters to pass to BIOGRID API calls
    :type kwargs: Union[int, str, List[int], List[str]]
    :return: Standardised dataframe with BIOGRID interactions
    :rtype: pd.DataFrame
    """
    raw_interactions = parse_BIOGRID(
        protein_list=protein_list, ncbi_taxon_id=ncbi_taxon_id, **kwargs
    )
    return standardise_BIOGRID(filter_BIOGRID(raw_interactions, **kwargs))


if __name__ == "__main__":
    # Small manual smoke test: fetch BIOGRID interactions for a few human
    # proteins and print the first rows.
    protein_list = [
        "CDC42",
        "CDK1",
        "KIF23",
        "PLK1",
        "RAC2",
        "RACGAP1",
        "RHOA",
        "RHOB",
    ]
    kwargs = {
        "STRING_escore": 0.2,  # Keeps STRING interactions with an experimental score >= 0.2
        "BIOGRID_throughputTag": "high",  # Keeps high throughput BIOGRID interactions
    }
    # Unpack the options: passing ``kwargs=kwargs`` would hand over a single
    # keyword named "kwargs" and the BIOGRID_* settings would be ignored.
    df = BIOGRID_df(protein_list=protein_list, ncbi_taxon_id=9606, **kwargs)
    print(df.head())