Source code for graphein.protein.features.nodes.geometry

"""Provides geometry-based featurisation functions."""
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging

import networkx as nx
import numpy as np
import pandas as pd

from graphein.protein.utils import filter_dataframe


def add_sidechain_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False
):
    """Adds vector from node to average position of sidechain atoms.

    We compute the mean position of the sidechain atoms for each node. For
    this we use the ``rgroup_df`` dataframe. If the graph does not contain the
    ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df`` and cache it
    in ``g.graph["rgroup_df"]``.

    If ``reverse`` is ``False`` (default) we compute ``sidechain - node``,
    i.e. the vector pointing from the node to the sidechain centroid. If
    ``reverse`` is ``True`` we compute ``node - sidechain``. If ``scale``, the
    vector is scaled to unit length; a zero-length vector is left as zeros
    rather than being turned into NaNs.

    Glycine residues, and residues that have no rows in ``rgroup_df``, are
    assigned ``np.array([0, 0, 0])``. The result is written in place to each
    node under the ``"sidechain_vector"`` key; nothing is returned.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    # Get or compute R-Group DF
    if "rgroup_df" not in g.graph:
        # NOTE(review): compute_rgroup_dataframe is not imported in the part
        # of this module visible here - confirm it is in scope.
        g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])

    # Average only the coordinate columns: taking the mean over every column
    # fails on the non-numeric PDB columns with recent pandas versions.
    sc_centroid = g.graph["rgroup_df"].groupby("node_id")[coord_cols].mean()

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            # Glycine has no sidechain, set vector to 0
            vec = np.array([0, 0, 0])
        elif n not in sc_centroid.index:
            # No sidechain atoms recorded for this residue (e.g. incomplete
            # structure): fall back to the zero vector instead of a KeyError.
            logging.getLogger(__name__).warning(
                "Non-glycine residue %s has no side-chain atoms; "
                "setting sidechain vector to zero.",
                n,
            )
            vec = np.array([0, 0, 0])
        else:
            centroid = sc_centroid.loc[n].to_numpy(dtype=float)
            vec = d["coords"] - centroid if reverse else centroid - d["coords"]
            if scale:
                norm = np.linalg.norm(vec)
                # Guard against division by zero when the centroid coincides
                # with the node position.
                if norm > 0:
                    vec = vec / norm

        d["sidechain_vector"] = vec
def add_beta_carbon_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False
):
    """Adds vector from node (typically alpha carbon) to position of beta carbon.

    Glycine does not have a beta carbon, so we set it to
    ``np.array([0, 0, 0])``. The beta carbon positions are taken from the
    ``CB`` rows of the ``rgroup_df`` dataframe. If the graph does not contain
    the ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df`` and
    cache it in ``g.graph["rgroup_df"]``.

    If ``reverse`` is ``False`` (default) we compute ``C beta - node``, i.e.
    the vector pointing from the node to the beta carbon. If ``reverse`` is
    ``True`` we compute ``node - C beta``. If ``scale``, the vector is scaled
    to unit length; a zero-length vector is left as zeros rather than being
    turned into NaNs.

    Non-glycine residues without a ``CB`` row are also assigned the zero
    vector. If a node has several ``CB`` rows, the first one is used. The
    result is written in place to each node under the ``"c_beta_vector"``
    key; nothing is returned.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    # Get or compute R-Group DF
    if "rgroup_df" not in g.graph:
        # NOTE(review): compute_rgroup_dataframe is not imported in the part
        # of this module visible here - confirm it is in scope.
        g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])

    c_beta_coords = filter_dataframe(
        g.graph["rgroup_df"], "atom_name", ["CB"], boolean=True
    )
    # Keep a single CB row per node so that ``.loc[n]`` always yields one
    # 3-vector, then index by node ID for direct lookup.
    c_beta_coords = c_beta_coords.drop_duplicates(subset="node_id").set_index(
        "node_id", drop=False
    )

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            vec = np.array([0, 0, 0])
        elif n not in c_beta_coords.index:
            # Beta carbon missing from the structure: fall back to the zero
            # vector instead of raising a KeyError.
            logging.getLogger(__name__).warning(
                "Non-glycine residue %s does not have a beta-carbon; "
                "setting beta-carbon vector to zero.",
                n,
            )
            vec = np.array([0, 0, 0])
        else:
            c_beta = c_beta_coords.loc[n, coord_cols].to_numpy(dtype=float)
            vec = d["coords"] - c_beta if reverse else c_beta - d["coords"]
            if scale:
                norm = np.linalg.norm(vec)
                # Guard against division by zero for coincident positions.
                if norm > 0:
                    vec = vec / norm

        d["c_beta_vector"] = vec
def add_sequence_neighbour_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False, n_to_c: bool = True
):
    """Computes vector from node to adjacent node in sequence.

    Typically used with ``CA`` (alpha carbon) graphs.

    If ``n_to_c`` is ``True`` (default), each chain is traversed in node
    order, i.e. from the N terminus to the C terminus (canonical direction);
    otherwise it is traversed in the opposite order. Writing ``Node_{i+1}``
    for the next residue in the traversal direction: if ``reverse`` is
    ``False`` (default), we compute ``Node_{i+1} - Node_i``. If ``reverse`` is
    ``True``, we compute ``Node_i - Node_{i+1}``.

    The last residue of each chain, and any residue whose next residue is not
    numbered consecutively (a chain break), is assigned
    ``np.array([0, 0, 0])``. The result is written in place to each node under
    ``"sequence_neighbour_vector_n_to_c"`` or
    ``"sequence_neighbour_vector_c_to_n"``; nothing is returned.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    :param n_to_c: Compute vector from N to C or C to N. Defaults to ``True``.
    :type n_to_c: bool
    """
    suffix = "n_to_c" if n_to_c else "c_to_n"
    feature = f"sequence_neighbour_vector_{suffix}"

    # Iterate over every chain
    for chain_id in g.graph["chain_ids"]:
        # Find chain residues (node attribute dicts, in graph node order)
        chain_residues = [
            d for _, d in g.nodes(data=True) if d["chain_id"] == chain_id
        ]
        if not n_to_c:
            chain_residues.reverse()

        last = len(chain_residues) - 1
        # Iterate over every residue in chain
        for i, residue in enumerate(chain_residues):
            # Default for chain termini and chain breaks, so every residue
            # ends up carrying the feature.
            vec = np.array([0, 0, 0])

            if i < last:
                neighbour = chain_residues[i + 1]
                # Both residues are on the same chain by construction, so we
                # only need to check that the residue numbers are adjacent.
                adjacent = (
                    abs(
                        residue["residue_number"]
                        - neighbour["residue_number"]
                    )
                    == 1
                )
                if adjacent:
                    vec = neighbour["coords"] - residue["coords"]
                    if reverse:
                        vec = -vec
                    if scale:
                        norm = np.linalg.norm(vec)
                        # Guard against division by zero for coincident
                        # positions.
                        if norm > 0:
                            vec = vec / norm

            residue[feature] = vec