Source code for graphein.protein.subgraphs

"""Provides functions for extracting subgraphs from protein graphs."""
import logging

# Graphein
# Author: Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
from typing import Dict, List, Optional, Tuple, Union

import networkx as nx
import numpy as np

from graphein.protein.edges.distance import compute_distmat
from graphein.protein.utils import ProteinGraphConfigurationError

log = logging.getLogger(__name__)


def extract_subgraph_from_node_list(
    g,
    node_list: Optional[List[str]],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on a list of nodes.

    The returned graph is an independent copy of the induced subgraph, so the
    graph-level attributes written here (``pdb_df``, ``coords``, ``distmat``)
    are not written to ``g``.

    If ``node_list`` is ``None`` or empty, nothing is extracted: ``g`` itself
    (or ``node_list`` when ``return_node_list`` is True) is returned unchanged,
    irrespective of ``inverse``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param node_list: The list of nodes to extract.
    :type node_list: List[str]
    :param filter_dataframe: Whether to filter the ``pdb_df`` dataframe of the
        graph down to rows whose ``node_id`` is selected. Defaults to True.
    :type filter_dataframe: bool
    :param update_coords: Whether to rebuild ``g.graph["coords"]`` from the
        ``coords`` attribute of the selected nodes. Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix from
        ``pdb_df``. A warning is logged if ``filter_dataframe`` is False.
        Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection, i.e. keep the nodes of
        ``g`` that are *not* in ``node_list``. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the (possibly inverted) node
        list instead of a graph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    if node_list:
        # Get all nodes not in the node list if inverting the selection.
        if inverse:
            excluded = set(node_list)  # O(1) membership instead of list scan
            node_list = [n for n in g.nodes() if n not in excluded]

        # If only the node list is wanted, return before building a subgraph.
        if return_node_list:
            return node_list

        log.debug(f"Creating subgraph from nodes: {node_list}.")

        # ``Graph.subgraph`` returns a read-only view that shares its
        # ``.graph`` attribute dict with the parent graph. Copy it so the
        # assignments below do not overwrite the caller's graph attributes.
        g = g.subgraph(node_list).copy()

        # Filter the PDB dataframe accordingly.
        if filter_dataframe:
            pdb_df = g.graph["pdb_df"]
            g.graph["pdb_df"] = pdb_df.loc[pdb_df["node_id"].isin(node_list)]

        if update_coords:
            g.graph["coords"] = np.array(
                [d["coords"] for _, d in g.nodes(data=True)]
            )

        if recompute_distmat:
            if not filter_dataframe:
                log.warning("Recomputing distmat without filtering dataframe.")
            g.graph["distmat"] = compute_distmat(g.graph["pdb_df"])

    if return_node_list:
        return node_list

    return g
def extract_subgraph_from_point(
    g: nx.Graph,
    centre_point: Union[np.ndarray, Tuple[float, float, float]],
    radius: float,
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on a centre point and radius.

    A node is selected when ``np.linalg.norm`` of the difference between its
    ``coords`` attribute and ``centre_point`` is strictly less than
    ``radius``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param centre_point: The centre point of the subgraph.
    :type centre_point: Union[np.ndarray, Tuple[float, float, float]]
    :param radius: The radius of the subgraph.
    :type radius: float
    :param filter_dataframe: Whether to filter the pdb_df dataframe of the
        graph. Defaults to True.
    :type filter_dataframe: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    within_radius: List = [
        node
        for node, attrs in g.nodes(data=True)
        if np.linalg.norm(attrs["coords"] - centre_point) < radius
    ]
    node_list = list(set(within_radius))

    log.debug(
        f"Found {len(node_list)} nodes in the spatial point-radius subgraph."
    )

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_subgraph_from_atom_types(
    g: nx.Graph,
    atom_types: List[str],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on a list of atom types.

    A node is selected when its ``atom_type`` attribute is in ``atom_types``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param atom_types: The list of atom types to extract.
    :type atom_types: List[str]
    :param filter_dataframe: Whether to filter the pdb_df dataframe of the
        graph. Defaults to True.
    :type filter_dataframe: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    matching: List = []
    for node, attrs in g.nodes(data=True):
        if attrs["atom_type"] in atom_types:
            matching.append(node)
    node_list = list(set(matching))

    log.debug(f"Found {len(node_list)} nodes in the atom type subgraph.")

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_subgraph_from_residue_types(
    g: nx.Graph,
    residue_types: List[str],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on allowable residue types.

    A node is selected when its ``residue_name`` attribute is in
    ``residue_types``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param residue_types: List of allowable residue types (3 letter residue
        names).
    :type residue_types: List[str]
    :param filter_dataframe: Whether to filter the pdb_df of the graph.
        Defaults to True.
    :type filter_dataframe: bool, optional
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    matching: List = []
    for node, attrs in g.nodes(data=True):
        if attrs["residue_name"] in residue_types:
            matching.append(node)
    node_list = list(set(matching))

    log.debug(f"Found {len(node_list)} nodes in the residue type subgraph.")

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_subgraph_from_chains(
    g: nx.Graph,
    chains: List[str],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on one or more chains.

    A node is selected when its ``chain_id`` attribute is in ``chains``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param chains: The chain(s) to extract.
    :type chains: List[str]
    :param filter_dataframe: Whether to filter the pdb_df dataframe of the
        graph. Defaults to True.
    :type filter_dataframe: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    matching: List = []
    for node, attrs in g.nodes(data=True):
        if attrs["chain_id"] in chains:
            matching.append(node)
    node_list = list(set(matching))

    log.debug(f"Found {len(node_list)} nodes in the chain subgraph.")

    return extract_subgraph_from_node_list(
        g,
        node_list,
        inverse=inverse,
        return_node_list=return_node_list,
        filter_dataframe=filter_dataframe,
        recompute_distmat=recompute_distmat,
        update_coords=update_coords,
    )
def extract_subgraph_by_sequence_position(
    g: nx.Graph,
    sequence_positions: List[int],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on sequence positions.

    A node is selected when its ``residue_number`` attribute is in
    ``sequence_positions``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param sequence_positions: The sequence positions to extract.
    :type sequence_positions: List[int]
    :param filter_dataframe: Whether to filter the pdb_df dataframe of the
        graph. Defaults to True.
    :type filter_dataframe: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    matching: List = []
    for node, attrs in g.nodes(data=True):
        if attrs["residue_number"] in sequence_positions:
            matching.append(node)
    node_list = list(set(matching))

    log.debug(
        f"Found {len(node_list)} nodes in the sequence position subgraph."
    )

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_subgraph_by_bond_type(
    g: nx.Graph,
    bond_types: List[str],
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on allowable bond types.

    Nodes are selected when they are an endpoint of at least one edge whose
    ``kind`` set contains a bond type in ``bond_types``. In the returned
    subgraph the ``kind`` set of every edge is pruned: entries not in
    ``bond_types`` are dropped (or, if ``inverse`` is True, entries that *are*
    in ``bond_types``). Edges are not removed, so an edge may end up with an
    empty ``kind`` set.

    The input graph ``g`` is never modified by the pruning: it is applied to
    a copy, and skipped entirely when only the node list is requested.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param bond_types: List of allowable bond types.
    :type bond_types: List[str]
    :param filter_dataframe: Whether to filter the pdb_df of the graph.
        Defaults to True.
    :type filter_dataframe: bool, optional
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool, optional
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool, optional
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    # Collect both endpoints of every edge carrying a requested bond type.
    matched = set()
    for u, v, d in g.edges(data=True):
        if any(bond in bond_types for bond in d["kind"]):
            matched.update((u, v))
    node_list: List = list(matched)
    log.debug(f"Found {len(node_list)} nodes in the bond type subgraph.")

    if return_node_list:
        # A node-list query must be free of side effects on the graph.
        return extract_subgraph_from_node_list(
            g,
            node_list,
            filter_dataframe=filter_dataframe,
            inverse=inverse,
            return_node_list=True,
            recompute_distmat=recompute_distmat,
            update_coords=update_coords,
        )

    subgraph = extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        inverse=inverse,
        return_node_list=False,
        recompute_distmat=recompute_distmat,
        update_coords=update_coords,
    )

    # The extracted graph may be ``g`` itself or share edge attribute dicts
    # with it. Copy so each edge gets its own attribute dict, then rebind
    # ``kind`` to a new set rather than discarding from the (possibly shared)
    # original set object.
    subgraph = subgraph.copy()
    for _, _, d in subgraph.edges(data=True):
        if inverse:
            d["kind"] = {bond for bond in d["kind"] if bond not in bond_types}
        else:
            d["kind"] = {bond for bond in d["kind"] if bond in bond_types}

    return subgraph
def extract_subgraph_from_secondary_structure(
    g: nx.Graph,
    ss_elements: List[str],
    inverse: bool = False,
    filter_dataframe: bool = True,
    recompute_distmat: bool = False,
    update_coords: bool = True,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts the subgraph of nodes with a listed secondary structure.

    A node is selected when its ``ss`` attribute is in ``ss_elements``.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param ss_elements: List of secondary structure elements to extract.
    :type ss_elements: List[str]
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool
    :param filter_dataframe: Whether to filter the pdb_df of the graph.
        Defaults to True.
    :type filter_dataframe: bool, optional
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :raises ProteinGraphConfigurationError: If a node without an ``ss``
        attribute is encountered while iterating over the graph.
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    matching: List[str] = []
    for n, attrs in g.nodes(data=True):
        # Fail fast on the first node lacking the annotation.
        if "ss" not in attrs:
            raise ProteinGraphConfigurationError(
                f"Secondary structure not defined for all nodes ({n}). "
                "Please ensure you have used "
                "graphein.protein.nodes.features.dssp.secondary_structure "
                "as a graph annotation function."
            )
        if attrs["ss"] in ss_elements:
            matching.append(n)
    node_list = list(set(matching))

    log.debug(
        f"Found {len(node_list)} nodes in the secondary structure subgraph."
    )

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_surface_subgraph(
    g: nx.Graph,
    rsa_threshold: float = 0.2,
    inverse: bool = False,
    filter_dataframe: bool = True,
    recompute_distmat: bool = False,
    update_coords: bool = True,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph by thresholding Relative Solvent Accessibility.

    A node is selected when its ``rsa`` attribute is greater than or equal to
    ``rsa_threshold``. This can be used for extracting a surface graph.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param rsa_threshold: The threshold to use for the RSA. Defaults to 0.2
        (20%).
    :type rsa_threshold: float
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool, optional
    :param filter_dataframe: Whether to filter the pdb_df of the graph.
        Defaults to True.
    :type filter_dataframe: bool, optional
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :raises ProteinGraphConfigurationError: If a node without an ``rsa``
        attribute is encountered while iterating over the graph.
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    exposed: List[str] = []
    for n, attrs in g.nodes(data=True):
        # Fail fast on the first node lacking the annotation.
        if "rsa" not in attrs:
            raise ProteinGraphConfigurationError(
                f"RSA not defined for all nodes ({n}). "
                "Please ensure you have used "
                "graphein.protein.nodes.features.dssp.rsa "
                "as a graph annotation function."
            )
        if attrs["rsa"] >= rsa_threshold:
            exposed.append(n)
    node_list = list(set(exposed))

    log.debug(f"Found {len(node_list)} nodes in the surface subgraph.")

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_k_hop_subgraph(
    g: nx.Graph,
    central_node: str,
    k: int,
    k_only: bool = False,
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a k-hop subgraph.

    Hop 0 is ``[central_node]``; hop ``i`` is the set of all neighbours of
    the nodes in hop ``i - 1``. Nodes already seen in earlier hops are not
    filtered out, so they can reappear in later hops.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param central_node: The central node to extract the subgraph from.
    :type central_node: str
    :param k: The number of hops to extract.
    :type k: int
    :param k_only: If True, select only the nodes of hop ``k``; otherwise
        select the union of hops ``0`` to ``k``. Defaults to False.
    :type k_only: bool
    :param filter_dataframe: Whether to filter the pdb_df of the graph.
        Defaults to True.
    :type filter_dataframe: bool, optional
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool, optional
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    hops: Dict[int, Union[List[str], set]] = {0: [central_node]}
    for depth in range(1, k + 1):
        reached: set = set()
        for node in hops[depth - 1]:
            reached.update(g.neighbors(node))
        hops[depth] = list(reached)

    if k_only:
        node_list = hops[k]
    else:
        # Union of every hop, including the central node itself.
        node_list = list(set().union(*hops.values()))

    log.debug(f"Found {len(node_list)} nodes in the k-hop subgraph.")

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        update_coords=update_coords,
        recompute_distmat=recompute_distmat,
        inverse=inverse,
        return_node_list=return_node_list,
    )
def extract_subgraph(
    g: nx.Graph,
    node_list: Optional[List[str]] = None,
    sequence_positions: Optional[List[int]] = None,
    chains: Optional[List[str]] = None,
    residue_types: Optional[List[str]] = None,
    atom_types: Optional[List[str]] = None,
    bond_types: Optional[List[str]] = None,
    centre_point: Optional[
        Union[np.ndarray, Tuple[float, float, float]]
    ] = None,
    radius: Optional[float] = None,
    ss_elements: Optional[List[str]] = None,
    rsa_threshold: Optional[float] = None,
    k_hop_central_node: Optional[str] = None,
    k_hops: Optional[int] = None,
    k_only: Optional[bool] = None,
    filter_dataframe: bool = True,
    update_coords: bool = True,
    recompute_distmat: bool = False,
    inverse: bool = False,
    return_node_list: bool = False,
) -> Union[nx.Graph, List[str]]:
    """Extracts a subgraph from a graph based on a combination of selectors.

    Each selector that is supplied contributes a list of nodes; the final
    selection is the *union* of those lists and ``node_list``. ``inverse`` is
    applied once, to that union. The caller's ``node_list`` is not modified.

    :param g: The graph to extract the subgraph from.
    :type g: nx.Graph
    :param node_list: List of nodes to extract specified by their node_id.
        Defaults to None.
    :type node_list: List[str], optional
    :param sequence_positions: The sequence positions to extract. Defaults to
        None.
    :type sequence_positions: List[int], optional
    :param chains: The chain(s) to extract. Defaults to None.
    :type chains: List[str], optional
    :param residue_types: List of allowable residue types (3 letter residue
        names). Defaults to None.
    :type residue_types: List[str], optional
    :param atom_types: List of allowable atom types. Defaults to None.
    :type atom_types: List[str], optional
    :param bond_types: List of allowable bond types. Defaults to None.
    :type bond_types: List[str], optional
    :param centre_point: The centre point to extract the subgraph from. Only
        used when ``radius`` is also given. Defaults to None.
    :type centre_point: Union[np.ndarray, Tuple[float, float, float]], optional
    :param radius: The radius to extract the subgraph from. Only used when
        ``centre_point`` is also given. Defaults to None.
    :type radius: float, optional
    :param ss_elements: List of secondary structure elements to extract.
        ``["H", "B", "E", "G", "I", "T", "S", "-"]`` corresponding to Alpha
        helix, Beta bridge, Strand, Helix-3, Helix-5, Turn, Bend, None.
        Defaults to None.
    :type ss_elements: List[str], optional
    :param rsa_threshold: The threshold to use for the RSA. Defaults to None.
    :type rsa_threshold: float, optional
    :param k_hop_central_node: The central node of the k-hop selection. Only
        used when ``k_hops`` is also given. Defaults to None.
    :type k_hop_central_node: str, optional
    :param k_hops: The number of hops to extract. Only used when
        ``k_hop_central_node`` is also given. Defaults to None.
    :type k_hops: int, optional
    :param k_only: Forwarded to :func:`extract_k_hop_subgraph` as its
        ``k_only`` flag. ``None`` is treated as False. Defaults to None.
    :type k_only: bool, optional
    :param filter_dataframe: Whether to filter the pdb_df dataframe of the
        graph. Defaults to True.
    :type filter_dataframe: bool, optional
    :param update_coords: Whether to update the coordinates of the graph.
        Defaults to True.
    :type update_coords: bool
    :param recompute_distmat: Whether to recompute the distance matrix of the
        graph. Defaults to False.
    :type recompute_distmat: bool
    :param inverse: Whether to inverse the selection. Defaults to False.
    :type inverse: bool, optional
    :param return_node_list: Whether to return the node list instead of the
        subgraph. Defaults to False.
    :type return_node_list: bool, optional
    :return: The subgraph or node list if return_node_list is True.
    :rtype: Union[nx.Graph, List[str]]
    """
    # Work on a private copy: ``+=`` on the caller's list would extend it in
    # place and leak the accumulated selection back to the caller.
    node_list = [] if node_list is None else list(node_list)

    if sequence_positions is not None:
        node_list += extract_subgraph_by_sequence_position(
            g, sequence_positions, return_node_list=True
        )

    if chains is not None:
        node_list += extract_subgraph_from_chains(
            g, chains, return_node_list=True
        )

    if residue_types is not None:
        node_list += extract_subgraph_from_residue_types(
            g, residue_types, return_node_list=True
        )

    if atom_types is not None:
        node_list += extract_subgraph_from_atom_types(
            g, atom_types, return_node_list=True
        )

    if bond_types is not None:
        node_list += extract_subgraph_by_bond_type(
            g, bond_types, return_node_list=True
        )

    if centre_point is not None and radius is not None:
        node_list += extract_subgraph_from_point(
            g, centre_point, radius, return_node_list=True
        )

    if ss_elements is not None:
        node_list += extract_subgraph_from_secondary_structure(
            g, ss_elements, return_node_list=True
        )

    if rsa_threshold is not None:
        node_list += extract_surface_subgraph(
            g, rsa_threshold, return_node_list=True
        )

    # ``k_only`` is optional (None means False), and ``k_hops == 0`` is a
    # valid request, so test against None rather than truthiness.
    if k_hop_central_node is not None and k_hops is not None:
        node_list += extract_k_hop_subgraph(
            g,
            k_hop_central_node,
            k_hops,
            bool(k_only),
            return_node_list=True,
        )

    # De-duplicate nodes matched by more than one selector.
    node_list = list(set(node_list))

    return extract_subgraph_from_node_list(
        g,
        node_list,
        filter_dataframe=filter_dataframe,
        inverse=inverse,
        return_node_list=return_node_list,
        recompute_distmat=recompute_distmat,
        update_coords=update_coords,
    )