# Source code for graphein.ppi.parse_biogrid

"""Functions for making and parsing API calls to BIOGRID."""
# %%
# Graphein
# Author: Ramon Vinas, Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import io
import logging
from typing import Dict, List, Union

import pandas as pd
import requests

log = logging.getLogger(__name__)


def params_BIOGRID(
    params: Dict[str, Union[str, int, List[str], List[int]]], **kwargs
) -> Dict[str, Union[str, int]]:
    """
    Updates default parameters with user parameters for the method "interactions" of the BIOGRID API REST.

    Only keyword arguments named ``BIOGRID_<field>``, where ``<field>`` is one of the recognised API fields
    listed in the function body, are used; every other keyword argument is ignored. List (or tuple) values
    are joined into a single ``|``-separated string, converting each element with ``str`` so that lists of
    integers are accepted as well as lists of strings.

    ``params`` is updated in place and the same dictionary object is returned.

    See also https://wiki.thebiogrid.org/doku.php/biogridrest
    :param params: Dictionary of default parameters
    :type params: Dict[str, Union[str, int, List[str], List[int]]]
    :param kwargs: User parameters for the method "interactions" of the BIOGRID API REST. The key must start
        with "BIOGRID_"
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Dictionary of parameters
    :rtype: Dict[str, Union[str, int]]
    """
    fields = (
        "searchNames",  # If 'true', the interactor OFFICIAL_SYMBOL will be examined for a match
        # with the geneList.
        "max",  # Number of results to fetch
        "interSpeciesExcluded",  # If 'true', interactions with interactors from different species will
        # be excluded.
        "selfInteractionsExcluded",  # If 'true', interactions with one interactor will be excluded.
        "evidenceList",  # Any interaction evidence with its Experimental System in the list will be excluded
        # from the results unless includeEvidence is set to true.
        "includeEvidence",  # If set to true, any interaction evidence with its Experimental System in the
        # evidenceList will be included in the result
        "searchIds",  # If 'true', the interactor ENTREZ_GENE, ORDERED LOCUS and SYSTEMATIC_NAME (orf) will
        # be examined for a match with the geneList.
        "searchSynonyms",  # If 'true', the interactor SYNONYMS will be examined for a match with
        # the geneList.
        "searchBiogridIds",  # If 'true', the entries in 'GENELIST' will be compared to BIOGRID internal IDS
        # which are provided in all Tab2 formatted files.
        "additionalIdentifierTypes",  # Identifier types on this list are examined for a match with
        # the geneList.
        "excludeGenes",  # If 'true', interactions containing genes in the geneList will be excluded from the
        # results.
        "includeInteractors",  # If 'true', in addition to interactions between genes on the geneList,
        # interactions will also be fetched which have only one interactor on
        # the geneList
        "includeInteractorInteractions",  # If 'true' interactions between the geneList's first order
        # interactors will be included.
        "pubmedList",  # Interactions will be fetched whose Pubmed Id is/ is not in this list, depending on
        # the value of excludePubmeds.
        "excludePubmeds",  # If 'false', interactions with Pubmed ID in pubmedList will be included in the
        # results; if 'true' they will be excluded.
        "htpThreshold",  # Interactions whose Pubmed ID has more than this number of interactions will be
        # excluded from the results. Ignored if excludePubmeds is 'false'.
        "throughputTag",  # If set to 'low or 'high', only interactions with 'Low throughput' or
        # 'High throughput' in the 'throughput' field will be returned.
    )
    for field in fields:
        kwarg_name = "BIOGRID_" + field
        if kwarg_name not in kwargs:
            continue
        value = kwargs[kwarg_name]
        if isinstance(value, (list, tuple)):
            # Elements may be ints (e.g. Pubmed IDs); str.join only accepts strings.
            value = "|".join(str(item) for item in value)
        params[field] = value
    return params


def parse_BIOGRID(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    paginate: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Makes BIOGRID API call and returns a source specific Pandas dataframe.

    The request is POSTed to the "interactions" method. When ``paginate`` is true and a page comes back with
    exactly ``max`` rows, the next page is requested (``start`` advanced by ``max``) until a shorter or empty
    page is returned; all pages are concatenated in order.

    See also [1] BIOGRID: https://wiki.thebiogrid.org/doku.php/biogridrest
    :param protein_list: Proteins to include in the graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifier(s) for the organism, e.g. 9606 for Homo Sapiens. A list is
        joined into a ``|``-separated string.
    :type ncbi_taxon_id: Union[int, str, List[int], List[str]]
    :param paginate: boolean indicating whether to paginate the calls (for BIOGRID, the maximum number of rows per
        call is 10000). Defaults to True
    :type paginate: bool
    :param kwargs: Parameters of the "interactions" method of the BIOGRID API REST, used to select the results.
        The parameter names are of the form BIOGRID_<param>, where <param> is the name of the parameter.
        Information about these parameters can be found at [1].
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Source specific Pandas dataframe.
    :rtype: pd.DataFrame
    """
    # Prepare call to BIOGRID API
    biogrid_api_url = "https://webservice.thebiogrid.org"
    method = "interactions"
    request_url = "/".join([biogrid_api_url, method])
    if isinstance(ncbi_taxon_id, (list, tuple)):
        ncbi_taxon_id = "|".join(str(t) for t in ncbi_taxon_id)
    params = {  # Default parameters
        "geneList": "|".join(protein_list),
        # NOTE(review): access key is embedded in source; consider letting callers supply their own.
        "accesskey": "c4ab86373e0bb921a878bb6d15ee4fb4",
        "taxId": ncbi_taxon_id,  # 9606 is human
        "format": "json",
        "max": 10000,  # Number of results to fetch
        "searchNames": "true",
        "includeInteractors": "false",  # Set to true to get any interaction involving EITHER gene,
        # set to false to get interactions between genes
        "selfInteractionsExcluded": "true",  # If 'true', interactions with one interactor will be excluded
    }
    params = params_BIOGRID(params, **kwargs)

    # A user-supplied BIOGRID_max may be a string; compare and step with an int so
    # that pagination is not silently skipped.
    page_size = int(params["max"])

    # Call BIOGRID, one page per iteration
    pages = []
    start = 0
    while True:
        params["start"] = start
        response = requests.post(request_url, data=params)
        # Wrap the payload in a buffer: passing literal JSON text to read_json is deprecated.
        page = pd.read_json(io.StringIO(response.text.strip())).transpose()
        pages.append(page)

        # Results per call are capped; a full page means there may be more to fetch.
        # An empty page always ends the loop (guards against max == 0).
        if not paginate or page.empty or page.shape[0] != page_size:
            break
        start += page_size

    if len(pages) == 1:
        return pages[0]
    return pd.concat(pages)


def filter_BIOGRID(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Placeholder for filtering the results of the BIOGRID API call according to user kwargs.

    No filtering is implemented here yet: the dataframe is returned unchanged and ``kwargs`` are ignored.

    :param df: Source specific Pandas dataframe (BIOGRID) with results of the API call
    :type df: pd.DataFrame
    :param kwargs: User thresholds intended for filtering the results. The parameter names are of the form
        BIOGRID_<param>, where <param> is the name of the parameter. Currently unused.
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: The input dataframe, unmodified
    :rtype: pd.DataFrame
    """
    # Note: To filter BIOGRID interactions, use parameters from https://wiki.thebiogrid.org/doku.php/biogridrest
    # TODO: Make sure that user can filter results of API call via the parameters.
    #       Otherwise implement filtering here.
    # TODO: Perhaps can filter by EXPERIMENTAL_SYSTEM (e.g. Co-fractionation)
    #       and EXPERIMENTAL_SYSTEM_TYPE (e.g. physical)
    return df


def standardise_BIOGRID(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises BIOGRID dataframe, e.g. puts everything into a common format.

    The columns ``OFFICIAL_SYMBOL_A`` and ``OFFICIAL_SYMBOL_B`` are renamed to ``p1`` and ``p2``, all other
    columns are dropped, and a constant ``source`` column with the value ``"BIOGRID"`` is added. An empty
    input yields an empty dataframe with the columns ``p1``, ``p2`` and ``source``. The input dataframe is
    not modified.

    :param df: Source specific Pandas dataframe
    :type df: pd.DataFrame
    :return: Standardised dataframe
    :rtype: pd.DataFrame
    """
    if df.empty:
        return pd.DataFrame({"p1": [], "p2": [], "source": []})

    # Rename & delete columns
    interactions = df.rename(
        columns={"OFFICIAL_SYMBOL_A": "p1", "OFFICIAL_SYMBOL_B": "p2"}
    )[["p1", "p2"]]

    # Add source column; assign() builds a new frame instead of writing into a
    # column subset, which avoids pandas' chained-assignment warning.
    return interactions.assign(source="BIOGRID")


def BIOGRID_df(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    **kwargs,
) -> pd.DataFrame:
    """
    Generates standardised dataframe with BIOGRID protein-protein interactions, filtered according to user's input.

    Runs the three pipeline steps in order: ``parse_BIOGRID`` (API call), ``filter_BIOGRID`` and
    ``standardise_BIOGRID``. The same ``kwargs`` are forwarded to both the parse and filter steps.

    :param protein_list: List of proteins (official symbol) that will be included in the PPI graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifiers for the organism. 9606 corresponds to Homo Sapiens
    :type ncbi_taxon_id: Union[int, str, List[int], List[str]]
    :param kwargs: Additional parameters to pass to BIOGRID API calls
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Standardised dataframe with BIOGRID interactions
    :rtype: pd.DataFrame
    """
    df = parse_BIOGRID(
        protein_list=protein_list, ncbi_taxon_id=ncbi_taxon_id, **kwargs
    )
    df = filter_BIOGRID(df, **kwargs)
    return standardise_BIOGRID(df)


if __name__ == "__main__":
    # Manual smoke test: fetches interactions for a handful of proteins and prints the first rows.
    protein_list = [
        "CDC42",
        "CDK1",
        "KIF23",
        "PLK1",
        "RAC2",
        "RACGAP1",
        "RHOA",
        "RHOB",
    ]
    kwargs = {
        "STRING_escore": 0.2,  # Keeps STRING interactions with an experimental score >= 0.2
        "BIOGRID_throughputTag": "high",  # Keeps high throughput BIOGRID interactions
    }

    # Unpack the dict so each entry arrives as its own keyword argument; passing
    # kwargs=kwargs would hand over a single keyword named "kwargs" and the
    # BIOGRID_ options would never be picked up.
    df = BIOGRID_df(protein_list=protein_list, ncbi_taxon_id=9606, **kwargs)

    print(df.head())