Source code for graphein.molecule.edges.distance

"""Functions for computing biochemical edges of graphs."""

# Graphein
# Author: Eric Ma, Arian Jamasb <>
# License: MIT
# Project Website:
# Code Repository:
from __future__ import annotations

import itertools
import logging
from typing import Union

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import kneighbors_graph

log = logging.getLogger(__name__)

[docs]def compute_distmat(coords: np.ndarray) -> np.ndarray: """ Compute pairwise euclidean distances between every atom. Design choice: passed in a DataFrame to enable easier testing on dummy data. :param coords: pd.Dataframe containing molecule structure. Must contain columns ``["x_coord", "y_coord", "z_coord"]``. :type coords: pd.DataFrame :return: np.ndarray of euclidean distance matrix. :rtype: np.ndarray """ return pairwise_distances(coords, metric="euclidean")
[docs]def get_interacting_atoms(angstroms: float, distmat: np.ndarray) -> np.ndarray: """Find the atoms that are within a particular radius of one another. :param angstroms: Radius in angstroms. :type angstroms: float :param distmat: Distance matrix. :type distmat: np.ndarray :returns: Array of interacting atoms :rtype: np.ndarray """ return np.where(distmat <= angstroms)
[docs]def add_distance_threshold(G: nx.Graph, threshold: float = 5.0): """ Adds edges to any nodes within a given distance of each other. :param G: molecule structure graph to add distance edges to :type G: nx.Graph :param threshold: Distance in angstroms, below which two nodes are connected. :type threshold: float :return: Graph with distance-based edges added """ dist_mat = compute_distmat(G.graph["coords"]) interacting_nodes = get_interacting_atoms(threshold, distmat=dist_mat) outgoing = [list(G.nodes())[i] for i in interacting_nodes[0]] incoming = [list(G.nodes())[i] for i in interacting_nodes[1]] interacting_nodes = list(zip(outgoing, incoming)) f"Found: {len(interacting_nodes)} distance edges for radius {threshold}" ) for n1, n2 in interacting_nodes: if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("distance_threshold") else: G.add_edge(n1, n2, kind={"distance_threshold"})
[docs]def add_fully_connected_edges( G: nx.Graph, ): """ Adds fully connected edges to nodes. :param G: Molecule structure graph to add distance edges to. :type G: nx.Graph """ length = len(G.graph["coords"]) for n1, n2 in itertools.product(G.nodes(), G.nodes()): if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add("fully_connected") else: G.add_edge(n1, n2, kind={"fully_connected"})
[docs]def add_k_nn_edges( G: nx.Graph, k: int = 1, mode: str = "connectivity", metric: str = "minkowski", p: int = 2, include_self: Union[bool, str] = False, ): """ Adds edges to nodes based on K nearest neighbours. :param G: Molecule structure graph to add distance edges to. :type G: nx.Graph :param k: Number of neighbors for each sample. :type k: int :param mode: Type of returned matrix: ``"connectivity"`` will return the connectivity matrix with ones and zeros, and ``"distance"`` will return the distances between neighbors according to the given metric. :type mode: str :param metric: The distance metric used to calculate the k-Neighbors for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is ``"euclidean"`` (``"minkowski"`` metric with the ``p`` param equal to ``2``). :type metric: str :param p: Power parameter for the Minkowski metric. When ``p = 1``, this is equivalent to using ``manhattan_distance`` (l1), and ``euclidean_distance`` (l2) for ``p = 2``. For arbitrary ``p``, ``minkowski_distance`` (l_p) is used. Default is ``2`` (euclidean). :type p: int :param include_self: Whether or not to mark each sample as the first nearest neighbor to itself. If ``"auto"``, then ``True`` is used for ``mode="connectivity"`` and ``False`` for ``mode="distance"``. Default is ``False``. :type include_self: Union[bool, str] :return: Graph with knn-based edges added. :rtype: nx.Graph """ dist_mat = compute_distmat(G.graph["coords"]) nn = kneighbors_graph( X=dist_mat, n_neighbors=k, mode=mode, metric=metric, p=p, include_self=include_self, ) # Create iterable of node indices outgoing = np.repeat(np.array(range(len(G.graph["coords"]))), k) outgoing = [list(G.nodes())[i] for i in outgoing] incoming = [list(G.nodes())[i] for i in nn.indices] interacting_nodes = list(zip(outgoing, incoming))"Found: {len(interacting_nodes)} KNN edges") for n1, n2 in interacting_nodes: if G.has_edge(n1, n2): G.edges[n1, n2]["kind"].add(f"k_nn_{k}") else: G.add_edge(n1, n2, kind={f"k_nn_{k}"})