Source code for graphein.protein.edges.distance

"""Functions for computing biochemical edges of graphs."""
# Graphein
# Author: Eric Ma, Arian Jamasb <>
# License: MIT
# Project Website:
# Code Repository:
from __future__ import annotations

import logging
from itertools import combinations
from typing import Dict, List, Optional, Tuple, Union

import networkx as nx
import numpy as np
import pandas as pd
from scipy.spatial import Delaunay
from scipy.spatial.distance import euclidean, pdist, rogerstanimoto, squareform
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import kneighbors_graph

from graphein.protein.resi_atoms import (
from graphein.protein.utils import filter_dataframe

log = logging.getLogger(__name__)

[docs]def compute_distmat(pdb_df: pd.DataFrame) -> pd.DataFrame: """ Compute pairwise euclidean distances between every atom. Design choice: passed in a DataFrame to enable easier testing on dummy data. :param pdb_df: pd.Dataframe containing protein structure. Must contain columns ["x_coord", "y_coord", "z_coord"] :type pdb_df: pd.DataFrame :return: pd.Dataframe of euclidean distance matrix :rtype: pd.DataFrame """ eucl_dists = pdist( pdb_df[["x_coord", "y_coord", "z_coord"]], metric="euclidean" ) eucl_dists = pd.DataFrame(squareform(eucl_dists)) eucl_dists.index = pdb_df.index eucl_dists.columns = pdb_df.index return eucl_dists
[docs]def add_sequence_distance_edges( G: nx.Graph, d: int, name: str = "sequence_edge" ) -> nx.Graph: """ Adds edges based on sequence distance to residues in each chain. Eg. if ``d=6`` then we join: nodes ``(1,7), (2,8), (3,9)..`` based on their sequence number. :param G: networkx protein graph. :type G: nx.Graph :param d: Sequence separation to add edges on. :param name: Name of the edge type. Defaults to ``"sequence_edge"``. :type name: str :return G: networkx protein graph with added peptide bonds. :rtype: nx.Graph """ print(len(G)) # Iterate over every chain for chain_id in G.graph["chain_ids"]: # Find chain residues chain_residues = [ (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id ] # Iterate over every residue in chain for i, residue in enumerate(chain_residues): try: # Checks not at chain terminus - is this versatile enough? if i == len(chain_residues) - d: continue # Asserts residues are on the same chain cond_1 = ( residue[1]["chain_id"] == chain_residues[i + d][1]["chain_id"] ) # Asserts residue numbers are adjacent cond_2 = ( abs( residue[1]["residue_number"] - chain_residues[i + d][1]["residue_number"] ) == d ) # If this checks out, we add a peptide bond if (cond_1) and (cond_2): # Adds "peptide bond" between current residue and the next if G.has_edge(i, i + d): G.edges[i, i + d]["kind"].add(name) else: G.add_edge( residue[0], chain_residues[i + d][0], kind={name}, ) except IndexError: continue return G
[docs]def add_peptide_bonds(G: nx.Graph) -> nx.Graph: """ Adds peptide backbone as edges to residues in each chain. :param G: networkx protein graph. :type G: nx.Graph :return G: networkx protein graph with added peptide bonds. :rtype: nx.Graph """ return add_sequence_distance_edges(G, d=1, name="peptide_bond")
[docs]def add_hydrophobic_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """ Find all hydrophobic interactions. Performs searches between the following residues: ``[ALA, VAL, LEU, ILE, MET, PHE, TRP, PRO, TYR]`` (:const:`~graphein.protein.resi_atoms.HYDROPHOBIC_RESIS`). Criteria: R-group residues are within 5A distance. :param G: nx.Graph to add hydrophobic interactions to. :type G: nx.Graph :param rgroup_df: Optional dataframe of R-group atoms. :type rgroup_df: pd.DataFrame, optional """ if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] hydrophobics_df = filter_dataframe( rgroup_df, "residue_name", HYDROPHOBIC_RESIS, True ) hydrophobics_df = filter_dataframe( hydrophobics_df, "node_id", list(G.nodes()), True ) distmat = compute_distmat(hydrophobics_df) interacting_atoms = get_interacting_atoms(5, distmat) add_interacting_resis( G, interacting_atoms, hydrophobics_df, ["hydrophobic"] )
[docs]def add_disulfide_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """ Find all disulfide interactions between CYS residues (:const:`~graphein.protein.resi_atoms.DISULFIDE_RESIS`, :const:`~graphein.protein.resi_atoms.DISULFIDE_ATOMS`). Criteria: sulfur atom pairs are within 2.2A of each other. :param G: networkx protein graph :type G: nx.Graph :param rgroup_df: pd.DataFrame containing rgroup data, defaults to None, which retrieves the df from the provided nx graph. :type rgroup_df: pd.DataFrame, optional """ # Check for existence of at least two Cysteine residues residues = [d["residue_name"] for _, d in G.nodes(data=True)] if residues.count("CYS") < 2: log.debug( f"{residues.count('CYS')} CYS residues found. Cannot add disulfide interactions with fewer than two CYS residues." ) return if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] disulfide_df = filter_dataframe( rgroup_df, "residue_name", DISULFIDE_RESIS, True ) disulfide_df = filter_dataframe( disulfide_df, "atom_name", DISULFIDE_ATOMS, True ) distmat = compute_distmat(disulfide_df) interacting_atoms = get_interacting_atoms(2.2, distmat) add_interacting_resis(G, interacting_atoms, disulfide_df, ["disulfide"])
[docs]def add_hydrogen_bond_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """Add all hydrogen-bond interactions.""" # For these atoms, find those that are within 3.5A of one another. if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] rgroup_df = filter_dataframe(rgroup_df, "node_id", list(G.nodes()), True) HBOND_ATOMS = [ "ND", # histidine and asparagine "NE", # glutamate, tryptophan, arginine, histidine "NH", # arginine "NZ", # lysine "OD1", "OD2", "OE", "OG", "OH", "SD", # cysteine "SG", # methionine "N", "O", ] hbond_df = filter_dataframe(rgroup_df, "atom_name", HBOND_ATOMS, True) if len(hbond_df.index) > 0: distmat = compute_distmat(hbond_df) interacting_atoms = get_interacting_atoms(3.5, distmat) add_interacting_resis(G, interacting_atoms, hbond_df, ["hbond"]) # For these atoms, find those that are within 4.0A of one another. HBOND_ATOMS_SULPHUR = ["SD", "SG"] hbond_df = filter_dataframe( rgroup_df, "atom_name", HBOND_ATOMS_SULPHUR, True ) if len(hbond_df.index) > 0: distmat = compute_distmat(hbond_df) interacting_atoms = get_interacting_atoms(4.0, distmat) add_interacting_resis(G, interacting_atoms, hbond_df, ["hbond"])
[docs]def add_ionic_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """ Find all ionic interactions. Criteria: ``[ARG, LYS, HIS, ASP, and GLU]`` (:const:`~graphein.protein.resi_atoms.IONIC_RESIS`) residues are within 6A. We also check for opposing charges (:const:`~graphein.protein.resi_atoms.POS_AA`, :const:`~graphein.protein.resi_atoms.NEG_AA`) """ if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] ionic_df = filter_dataframe(rgroup_df, "residue_name", IONIC_RESIS, True) ionic_df = filter_dataframe(rgroup_df, "node_id", list(G.nodes()), True) distmat = compute_distmat(ionic_df) interacting_atoms = get_interacting_atoms(6, distmat) add_interacting_resis(G, interacting_atoms, ionic_df, ["ionic"]) # Check that the interacting residues are of opposite charges for r1, r2 in get_edges_by_bond_type(G, "ionic"): condition1 = ( G.nodes[r1]["residue_name"] in POS_AA and G.nodes[r2]["residue_name"] in NEG_AA ) condition2 = ( G.nodes[r2]["residue_name"] in POS_AA and G.nodes[r1]["residue_name"] in NEG_AA ) is_ionic = condition1 or condition2 if not is_ionic: G.edges[r1, r2]["kind"].remove("ionic") if len(G.edges[r1, r2]["kind"]) == 0: G.remove_edge(r1, r2)
[docs]def add_aromatic_interactions( G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None ): """ Find all aromatic-aromatic interaction. Criteria: phenyl ring centroids separated between 4.5A to 7A. Phenyl rings are present on ``PHE, TRP, HIS, TYR`` (:const:`~graphein.protein.resi_atoms.AROMATIC_RESIS`). Phenyl ring atoms on these amino acids are defined by the following atoms: - PHE: CG, CD, CE, CZ - TRP: CD, CE, CH, CZ - HIS: CG, CD, ND, NE, CE - TYR: CG, CD, CE, CZ Centroids of these atoms are taken by taking: (mean x), (mean y), (mean z) for each of the ring atoms. Notes for future self/developers: - Because of the requirement to pre-compute ring centroids, we do not use the functions written above (filter_dataframe, compute_distmat, get_interacting_atoms), as they do not return centroid atom euclidean coordinates. """ if pdb_df is None: pdb_df = G.graph["raw_pdb_df"] dfs = [] for resi in AROMATIC_RESIS: resi_rings_df = get_ring_atoms(pdb_df, resi) resi_rings_df = filter_dataframe( resi_rings_df, "node_id", list(G.nodes()), True ) resi_centroid_df = get_ring_centroids(resi_rings_df) dfs.append(resi_centroid_df) aromatic_df = ( pd.concat(dfs).sort_values(by="node_id").reset_index(drop=True) ) distmat = compute_distmat(aromatic_df) distmat.set_index(aromatic_df["node_id"], inplace=True) distmat.columns = aromatic_df["node_id"] distmat = distmat[(distmat >= 4.5) & (distmat <= 7)].fillna(0) indices = np.where(distmat > 0) interacting_resis = [ (distmat.index[r], distmat.index[c]) for r, c in zip(indices[0], indices[1]) ]"Found: {len(interacting_resis)} aromatic-aromatic interactions") for n1, n2 in interacting_resis: assert G.nodes[n1]["residue_name"] in AROMATIC_RESIS assert G.nodes[n2]["residue_name"] in AROMATIC_RESIS if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("aromatic") else: G.add_edge(n1, n2, kind={"aromatic"})
[docs]def add_aromatic_sulphur_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """Find all aromatic-sulphur interactions.""" if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] RESIDUES = ["MET", "CYS", "PHE", "TYR", "TRP"] SULPHUR_RESIS = ["MET", "CYS"] AROMATIC_RESIS = ["PHE", "TYR", "TRP"] aromatic_sulphur_df = filter_dataframe( rgroup_df, "residue_name", RESIDUES, True ) aromatic_sulphur_df = filter_dataframe( aromatic_sulphur_df, "node_id", list(G.nodes()), True ) distmat = compute_distmat(aromatic_sulphur_df) interacting_atoms = get_interacting_atoms(5.3, distmat) interacting_atoms = list(zip(interacting_atoms[0], interacting_atoms[1])) for (a1, a2) in interacting_atoms: resi1 = aromatic_sulphur_df.loc[a1, "node_id"] resi2 = aromatic_sulphur_df.loc[a2, "node_id"] condition1 = resi1 in SULPHUR_RESIS and resi2 in AROMATIC_RESIS condition2 = resi1 in AROMATIC_RESIS and resi2 in SULPHUR_RESIS if (condition1 or condition2) and resi1 != resi2: if G.has_edge(resi1, resi2): G.edges[resi1, resi2]["kind"].add("aromatic_sulphur") else: G.add_edge(resi1, resi2, kind={"aromatic_sulphur"})
[docs]def add_cation_pi_interactions( G: nx.Graph, rgroup_df: Optional[pd.DataFrame] = None ): """Add cation-pi interactions.""" if rgroup_df is None: rgroup_df = G.graph["rgroup_df"] cation_pi_df = filter_dataframe( rgroup_df, "residue_name", CATION_PI_RESIS, True ) cation_pi_df = filter_dataframe( cation_pi_df, "node_id", list(G.nodes()), True ) distmat = compute_distmat(cation_pi_df) interacting_atoms = get_interacting_atoms(6, distmat) interacting_atoms = list(zip(interacting_atoms[0], interacting_atoms[1])) for (a1, a2) in interacting_atoms: resi1 = cation_pi_df.loc[a1, "node_id"] resi2 = cation_pi_df.loc[a2, "node_id"] condition1 = resi1 in CATION_RESIS and resi2 in PI_RESIS condition2 = resi1 in PI_RESIS and resi2 in CATION_RESIS if (condition1 or condition2) and resi1 != resi2: if G.has_edge(resi1, resi2): G.edges[resi1, resi2]["kind"].add("cation_pi") else: G.add_edge(resi1, resi2, kind={"cation_pi"})
[docs]def get_interacting_atoms(angstroms: float, distmat: pd.DataFrame): """Find the atoms that are within a particular radius of one another.""" return np.where(distmat <= angstroms)
[docs]def add_delaunay_triangulation( G: nx.Graph, allowable_nodes: Optional[List[str]] = None ): """ Compute the Delaunay triangulation of the protein structure. This has been used in prior work. References: Harrison, R. W., Yu, X. & Weber, I. T. Using triangulation to include target structure improves drug resistance prediction accuracy. in 1–1 (IEEE, 2013). doi:10.1109/ICCABS.2013.6629236 Yu, X., Weber, I. T. & Harrison, R. W. Prediction of HIV drug resistance from genotype with encoded three-dimensional protein structure. BMC Genomics 15 Suppl 5, S1 (2014). Notes: 1. We do not use the add_interacting_resis function, because this interaction is computed on the ``CA`` atoms. Therefore, there is code duplication. For now, I have chosen to leave this code duplication in. :param G: The networkx graph to add the triangulation to. :type G: nx.Graph :param allowable_nodes: The nodes to include in the triangulation. If ``None`` (default), no filtering is done. This parameter is used to filter out nodes that are not desired in the triangulation. Eg if you wanted to construct a delaunay triangulation of the CA atoms of an atomic graph. :type allowable_nodes: List[str], optional """ if allowable_nodes is None: coords = np.array([d["coords"] for _, d in G.nodes(data=True)]) node_map: Dict[int, str] = dict(enumerate(G.nodes())) else: coords = np.array( [ d["coords"] for _, d in G.nodes(data=True) if d["atom_type"] in allowable_nodes ] ) node_map: Dict[int, str] = { i: n for i, (n, d) in enumerate(G.nodes(data=True)) if d["atom_type"] in allowable_nodes } node_map: Dict[int, str] = dict(enumerate(node_map.values())) tri = Delaunay(coords) # this is the triangulation log.debug( f"Detected {len(tri.simplices)} simplices in the Delaunay Triangulation." ) for simplex in tri.simplices: nodes = [node_map[s] for s in simplex] for n1, n2 in combinations(nodes, 2): if n1 not in G.nodes or n2 not in G.nodes: continue if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("delaunay") else: G.add_edge(n1, n2, kind={"delaunay"})
[docs]def add_distance_threshold( G: nx.Graph, long_interaction_threshold: int, threshold: float = 5.0 ): """ Adds edges to any nodes within a given distance of each other. Long interaction threshold is used to specify minimum separation in sequence to add an edge between networkx nodes within the distance threshold :param G: Protein Structure graph to add distance edges to :type G: nx.Graph :param long_interaction_threshold: minimum distance in sequence for two nodes to be connected :type long_interaction_threshold: int :param threshold: Distance in angstroms, below which two nodes are connected :type threshold: float :return: Graph with distance-based edges added """ pdb_df = filter_dataframe( G.graph["pdb_df"], "node_id", list(G.nodes()), True ) dist_mat = compute_distmat(pdb_df) interacting_nodes = get_interacting_atoms(threshold, distmat=dist_mat) interacting_nodes = list(zip(interacting_nodes[0], interacting_nodes[1]))"Found: {len(interacting_nodes)} distance edges") count = 0 for a1, a2 in interacting_nodes: n1 = G.graph["pdb_df"].loc[a1, "node_id"] n2 = G.graph["pdb_df"].loc[a2, "node_id"] n1_chain = G.graph["pdb_df"].loc[a1, "chain_id"] n2_chain = G.graph["pdb_df"].loc[a2, "chain_id"] n1_position = G.graph["pdb_df"].loc[a1, "residue_number"] n2_position = G.graph["pdb_df"].loc[a2, "residue_number"] condition_1 = n1_chain == n2_chain condition_2 = ( abs(n1_position - n2_position) < long_interaction_threshold ) if not (condition_1 and condition_2): count += 1 if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("distance_threshold") else: G.add_edge(n1, n2, kind={"distance_threshold"}) f"Added {count} distance edges. ({len(list(interacting_nodes)) - count} removed by LIN)" )
[docs]def add_k_nn_edges( G: nx.Graph, long_interaction_threshold: int, k: int = 5, mode: str = "connectivity", metric: str = "minkowski", p: int = 2, include_self: Union[bool, str] = False, ): """ Adds edges to nodes based on K nearest neighbours. Long interaction threshold is used to specify minimum separation in sequence to add an edge between networkx nodes within the distance threshold :param G: Protein Structure graph to add distance edges to :type G: nx.Graph :param long_interaction_threshold: minimum distance in sequence for two nodes to be connected :type long_interaction_threshold: int :param k: Number of neighbors for each sample. :type k: int :param mode: Type of returned matrix: ``"connectivity"`` will return the connectivity matrix with ones and zeros, and ``"distance"`` will return the distances between neighbors according to the given metric. :type mode: str :param metric: The distance metric used to calculate the k-Neighbors for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is ``"euclidean"`` (``"minkowski"`` metric with the ``p`` param equal to ``2``). :type metric: str :param p: Power parameter for the Minkowski metric. When ``p = 1``, this is equivalent to using ``manhattan_distance`` (l1), and ``euclidean_distance`` (l2) for ``p = 2``. For arbitrary ``p``, ``minkowski_distance`` (l_p) is used. Default is ``2`` (euclidean). :type p: int :param include_self: Whether or not to mark each sample as the first nearest neighbor to itself. If ``"auto"``, then ``True`` is used for ``mode="connectivity"`` and ``False`` for ``mode="distance"``. Default is ``False``. :type include_self: Union[bool, str] :return: Graph with knn-based edges added :rtype: nx.Graph """ pdb_df = filter_dataframe( G.graph["pdb_df"], "node_id", list(G.nodes()), True ) dist_mat = compute_distmat(pdb_df) nn = kneighbors_graph( X=dist_mat, n_neighbors=k, mode=mode, metric=metric, p=p, include_self=include_self, ) # Create iterable of node indices outgoing = np.repeat(np.array(range(len(G.graph["pdb_df"]))), k) incoming = nn.indices interacting_nodes = list(zip(outgoing, incoming))"Found: {len(interacting_nodes)} KNN edges") for a1, a2 in interacting_nodes: # Get nodes IDs from indices n1 = G.graph["pdb_df"].loc[a1, "node_id"] n2 = G.graph["pdb_df"].loc[a2, "node_id"] # Get chains n1_chain = G.graph["pdb_df"].loc[a1, "chain_id"] n2_chain = G.graph["pdb_df"].loc[a2, "chain_id"] # Get sequence position n1_position = G.graph["pdb_df"].loc[a1, "residue_number"] n2_position = G.graph["pdb_df"].loc[a2, "residue_number"] # Check residues are not on same chain condition_1 = n1_chain != n2_chain # Check residues are separated by long_interaction_threshold condition_2 = ( abs(n1_position - n2_position) > long_interaction_threshold ) # If not on same chain add edge or # If on same chain and separation is sufficient add edge if condition_1 or condition_2: if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("k_nn") else: G.add_edge(n1, n2, kind={"k_nn"})
[docs]def get_ring_atoms(dataframe: pd.DataFrame, aa: str) -> pd.DataFrame: """ Return ring atoms from a dataframe. A helper function for add_aromatic_interactions. Gets the ring atoms from the particular aromatic amino acid. Parameters: =========== - dataframe: the dataframe containing the atom records. - aa: the amino acid of interest, passed in as 3-letter string. Returns: ======== - dataframe: a filtered dataframe containing just those atoms from the particular amino acid selected. e.g. equivalent to selecting just the ring atoms from a particular amino acid. """ ring_atom_df = filter_dataframe(dataframe, "residue_name", [aa], True) ring_atom_df = filter_dataframe( ring_atom_df, "atom_name", AA_RING_ATOMS[aa], True ) return ring_atom_df
[docs]def get_ring_centroids(ring_atom_df: pd.DataFrame) -> pd.DataFrame: """ Return aromatic ring centrods. A helper function for add_aromatic_interactions. Computes the ring centroids for each a particular amino acid's ring atoms. Ring centroids are computed by taking the mean of the x, y, and z coordinates. Parameters: =========== - ring_atom_df: a dataframe computed using get_ring_atoms. - aa: the amino acid under study Returns: ======== - centroid_df: a dataframe containing just the centroid coordinates of the ring atoms of each residue. """ return ( ring_atom_df.groupby("node_id") .mean()[["x_coord", "y_coord", "z_coord"]] .reset_index() )
[docs]def get_edges_by_bond_type( G: nx.Graph, bond_type: str ) -> List[Tuple[str, str]]: """ Return edges of a particular bond type. Parameters: =========== - bond_type: (str) one of the elements in the variable BOND_TYPES Returns: ======== - resis: (list) a list of tuples, where each tuple is an edge. """ return [ (n1, n2) for n1, n2, d in G.edges(data=True) if bond_type in d["kind"] ]
[docs]def node_coords(G: nx.Graph, n: str) -> Tuple[float, float, float]: """ Return the ``x, y, z`` coordinates of a node. This is a helper function. Simplifies the code. :param G: nx.Graph protein structure graph to extract coordinates from :type G: nx.Graph :param n: str node ID in graph to extract coordinates from :type n: str :return: Tuple of coordinates ``(x, y, z)`` :rtype: Tuple[float, float, float] """ x = G.nodes[n]["x_coord"] y = G.nodes[n]["y_coord"] z = G.nodes[n]["z_coord"] return x, y, z
[docs]def add_interacting_resis( G: nx.Graph, interacting_atoms: np.ndarray, dataframe: pd.DataFrame, kind: List[str], ): """ Add interacting residues to graph. Returns a list of 2-tuples indicating the interacting residues based on the interacting atoms. This is most typically called after the get_interacting_atoms function above. Also filters out the list such that the residues have to be at least two apart. ### Parameters - interacting_atoms: (numpy array) result from get_interacting_atoms function. - dataframe: (pandas dataframe) a pandas dataframe that houses the euclidean locations of each atom. - kind: (list) the kind of interaction. Contains one of : - hydrophobic - disulfide - hbond - ionic - aromatic - aromatic_sulphur - cation_pi - delaunay Returns: ======== - filtered_interacting_resis: (set of tuples) the residues that are in an interaction, with the interaction kind specified """ # This assertion/check is present for defensive programming! for k in kind: assert k in BOND_TYPES resi1 = dataframe.loc[interacting_atoms[0]]["node_id"].values resi2 = dataframe.loc[interacting_atoms[1]]["node_id"].values interacting_resis = set(list(zip(resi1, resi2)))"Found {len(interacting_resis)} {k} interactions.") for i1, i2 in interacting_resis: if i1 != i2: if G.has_edge(i1, i2): for k in kind: G.edges[i1, i2]["kind"].add(k) else: G.add_edge(i1, i2, kind=set(kind))