# Source code for graphein.ppi.parse_stringdb

"""Functions for making and parsing API calls to STRINGdb."""
# %%
# Graphein
# Author: Ramon Vinas, Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging
from io import StringIO
from typing import Dict, List, Union

import pandas as pd
import requests

log = logging.getLogger(__name__)


def params_STRING(
    params: Dict[str, Union[str, int, List[str], List[int]]], **kwargs
) -> Dict[str, Union[str, int]]:
    """
    Updates default parameters with user parameters for the method "network"
    of the STRING API REST.

    Only keyword arguments named ``STRING_<field>`` for one of the supported
    fields are read; every other keyword argument is ignored. ``params`` is
    updated in place and the same dictionary is returned.

    See also https://string-db.org/help/api/

    :param params: Dictionary of default parameters
    :type params: Dict[str, Union[str, int, List[str], List[int]]]
    :param kwargs: User parameters for the method "network" of the STRING API
        REST. The key must start with "STRING"
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Dictionary of parameters
    :rtype: Dict[str, Union[str, int]]
    """
    # TODO: Might be possible to generalise this function for all sources
    fields = [
        "species",  # NCBI taxon identifiers
        "required_score",  # threshold of significance to include a interaction,
        # a number between 0 and 1000 (default depends on the network)
        "network_type",  # network type: functional (default), physical
        "add_nodes",  # adds a number of proteins to the network based on their
        # confidence score, e.g., extends the interaction neighborhood of
        # selected proteins to desired value
        "show_query_node_labels",  # when available use submitted names in the
        # preferredName column when (0 or 1) (default:0)
    ]
    for p in fields:
        kwarg_name = f"STRING_{p}"
        if kwarg_name in kwargs:
            value = kwargs[kwarg_name]
            if isinstance(value, list):
                # Lists are collapsed into a single "%0d"-separated string.
                # Elements are stringified first so that lists of ints
                # (e.g. several taxon ids) do not make str.join raise.
                value = "%0d".join(str(v) for v in value)
            params[p] = value
    return params
def parse_STRING(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    **kwargs,
) -> pd.DataFrame:
    """
    Makes STRING API call and returns a source specific Pandas dataframe.

    See also [1] STRING: https://string-db.org/help/api/

    :param protein_list: Proteins to include in the graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifier(s) for the organism,
        e.g. 9606 (Homo Sapiens). A list is joined into a single
        "%0d"-separated string.
    :type ncbi_taxon_id: Union[int, str, List[int], List[str]]
    :param kwargs: Parameters of the "network" method of the STRING API REST,
        used to select the results. The parameter names are of the form
        STRING_<param>, where <param> is the name of the parameter.
        Information about these parameters can be found at [1].
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :raises requests.HTTPError: If STRING answers with an HTTP error status.
    :return: Source specific Pandas dataframe.
    :rtype: pd.DataFrame
    """
    # Prepare call to STRING API
    string_api_url = "https://string-db.org/api"
    output_format = "json"  # "tsv-no-header"
    method = "network"
    request_url = "/".join([string_api_url, output_format, method])

    if isinstance(ncbi_taxon_id, list):
        # Stringify first: str.join raises TypeError on a list of ints.
        ncbi_taxon_id = "%0d".join(str(taxon) for taxon in ncbi_taxon_id)

    params = {
        "identifiers": "%0d".join(protein_list),
        "species": ncbi_taxon_id,  # 9606 is human
        "caller_identity": "graphein",
    }
    # User supplied STRING_<param> kwargs override / extend the defaults
    params = params_STRING(params, **kwargs)

    # Call STRING
    response = requests.post(request_url, data=params)
    # Surface HTTP failures explicitly instead of handing an error payload
    # to the JSON parser.
    response.raise_for_status()

    # Wrap the payload in a file-like object: passing a literal JSON string
    # to pd.read_json is deprecated in recent pandas releases.
    return pd.read_json(StringIO(response.text.strip()))
def filter_STRING(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Filters results of the STRING API call according to user kwargs,
    keeping rows where the input parameters are greater or equal than the
    input thresholds.

    Only keyword arguments named ``STRING_<score>`` for one of the score
    columns listed below are read; every other keyword argument is ignored.
    An empty dataframe is returned unchanged.

    :param df: Source specific Pandas dataframe (STRING) with results of the
        API call
    :type df: pd.DataFrame
    :param kwargs: User thresholds used to filter the results. The parameter
        names are of the form STRING_<param>, where <param> is the name of the
        parameter. All the parameters are numerical values.
    :type kwargs: Dict[str, Union[str, int, List[str], List[int]]]
    :return: Source specific Pandas dataframe with filtered results
    :rtype: pd.DataFrame
    """
    scores = [
        "score",  # combined score
        "nscore",  # gene neighborhood score
        "fscore",  # gene fusion score
        "pscore",  # phylogenetic profile score
        "ascore",  # coexpression score
        "escore",  # experimental score
        "dscore",  # database score
        "tscore",  # textmining score
    ]

    # A dataframe without rows may also lack the score columns altogether
    # (e.g. pd.DataFrame()); indexing df[s] would then raise KeyError even
    # though there is nothing to filter.
    if df.empty:
        return df

    for s in scores:
        kwarg_name = f"STRING_{s}"
        if kwarg_name in kwargs:
            threshold = kwargs[kwarg_name]
            df = df[df[s] >= threshold]
    return df
def standardise_STRING(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardises STRING dataframe, e.g. puts everything into a common format.

    The result has exactly three columns: ``p1`` and ``p2`` (taken from
    ``preferredName_A`` and ``preferredName_B``) and ``source``, which is set
    to ``"STRING"`` on every row. The input dataframe is not modified.

    :param df: Source specific Pandas dataframe
    :type df: pd.DataFrame
    :return: Standardised dataframe
    :rtype: pd.DataFrame
    """
    if df.empty:
        return pd.DataFrame({"p1": [], "p2": [], "source": []})

    # Rename & delete columns
    renamed = df.rename(
        columns={"preferredName_A": "p1", "preferredName_B": "p2"}
    )

    # Add source column. assign() builds a new dataframe, so no column is
    # set on a column-subset selection (which can trigger pandas'
    # SettingWithCopyWarning).
    return renamed[["p1", "p2"]].assign(source="STRING")
def STRING_df(
    protein_list: List[str],
    ncbi_taxon_id: Union[int, str, List[int], List[str]],
    **kwargs,
) -> pd.DataFrame:
    """
    Builds the standardised STRING protein-protein interaction dataframe.

    Runs the three steps of the STRING pipeline in order: query the API,
    filter the returned interactions with the user's thresholds, and convert
    the result to the common format.

    :param protein_list: List of proteins (official symbol) that will be
        included in the PPI graph
    :type protein_list: List[str]
    :param ncbi_taxon_id: NCBI taxonomy identifiers for the organism. 9606
        corresponds to Homo Sapiens
    :type ncbi_taxon_id: int
    :param kwargs: Additional parameters to pass to STRING API calls. The
        same keyword arguments are forwarded to both the query and the
        filtering step.
    :return: Standardised dataframe with STRING interactions
    :rtype: pd.DataFrame
    """
    # Step 1: query STRING
    raw_interactions = parse_STRING(
        protein_list=protein_list,
        ncbi_taxon_id=ncbi_taxon_id,
        **kwargs,
    )
    # Step 2: apply the user's score thresholds
    filtered_interactions = filter_STRING(raw_interactions, **kwargs)
    # Step 3: convert to the common format
    return standardise_STRING(filtered_interactions)
if __name__ == "__main__":
    # Small manual example: query STRING for a handful of human proteins.
    protein_list = [
        "CDC42",
        "CDK1",
        "KIF23",
        "PLK1",
        "RAC2",
        "RACGAP1",
        "RHOA",
        "RHOB",
    ]

    kwargs = {
        # Keeps STRING interactions with an experimental score >= 0.2
        "STRING_escore": 0.2,
        # Keeps high throughput BIOGRID interactions (not read by the
        # STRING functions in this module)
        "BIOGRID_throughputTag": "high",
    }

    # The dictionary must be unpacked: passing ``kwargs=kwargs`` would hand
    # STRING_df a single keyword argument called "kwargs", so the
    # STRING_escore threshold would never be applied.
    df = STRING_df(protein_list=protein_list, ncbi_taxon_id=9606, **kwargs)
    print(df)