Source code for graphein.protein.features.nodes.amino_acid

"""Featurization functions for amino acids."""
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>, Eric Ma
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein

import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd

from graphein.protein.resi_atoms import (
    BASE_AMINO_ACIDS,
    HYDROGEN_BOND_ACCEPTORS,
    HYDROGEN_BOND_DONORS,
    RESI_THREE_TO_1,
)
from graphein.utils.utils import onek_encoding_unk

log = logging.getLogger(__name__)


@lru_cache()
def load_expasy_scales() -> pd.DataFrame:
    """
    Read the bundled EXPASY amino acid scales table.

    The table is used for node featurization. The call is memoised with
    ``functools.lru_cache``: the CSV is parsed on the first call only and the
    very same DataFrame object is handed back on every later call.

    :returns: DataFrame parsed from ``amino_acid_properties.csv`` (stored next
        to this module), with the first CSV column used as the index.
    :rtype: pd.DataFrame
    """
    module_dir = Path(__file__).parent
    fpath = module_dir / "amino_acid_properties.csv"
    log.debug(f"Reading Expasy protein scales from: {fpath}")
    scales = pd.read_csv(fpath, index_col=0)
    return scales
@lru_cache()
def load_meiler_embeddings() -> pd.DataFrame:
    """
    Read the bundled Meiler embeddings table.

    The table is used for node featurization. The call is memoised with
    ``functools.lru_cache``: the CSV is parsed on the first call only and the
    very same DataFrame object is handed back on every later call.

    :returns: DataFrame parsed from ``meiler_embeddings.csv`` (stored next to
        this module), with the first CSV column used as the index. The
        embeddings originate from Meiler et al. 2001.
    :rtype: pd.DataFrame
    """
    module_dir = Path(__file__).parent
    fpath = module_dir / "meiler_embeddings.csv"
    log.debug(f"Reading meiler embeddings from: {fpath}")
    embeddings = pd.read_csv(fpath, index_col=0)
    return embeddings
def expasy_protein_scale(
    n,
    d,
    selection: Optional[List[str]] = None,
    add_separate: bool = False,
    return_array: bool = False,
) -> Union[pd.Series, np.ndarray]:
    """
    Return amino acid features that come from the EXPASY protein scale.

    Source: https://web.expasy.org/protscale/

    The features are also written into the node attribute dict ``d``: either
    as one entry per scale (``add_separate=True``) or as a single ``"expasy"``
    entry.

    :param n: Node in a NetworkX graph. Unused; kept so the signature matches
        the other node featurization functions.
    :param d: NetworkX node attributes. Must contain ``"residue_name"``; it is
        used as the column label looked up in the EXPASY scales table.
    :param selection: List of scale names (row labels of the table returned by
        ``load_expasy_scales``) to keep. ``None`` keeps every scale.
    :type selection: List[str], optional
    :param add_separate: If ``True``, each scale is stored in ``d`` under its
        own name. If ``False``, the whole feature vector is stored under
        ``d["expasy"]``.
    :type add_separate: bool
    :param return_array: If ``True``, the features are returned (and, when
        ``add_separate`` is ``False``, stored) as a ``np.ndarray`` instead of
        a ``pd.Series``.
    :type return_array: bool
    :returns: The amino acid features. Residues that are not a column of the
        scales table yield zeros for every (selected) scale.
    :rtype: Union[pd.Series, np.ndarray]
    :raises KeyError: If ``d`` has no ``"residue_name"`` entry.
    """
    df = load_expasy_scales()
    amino_acid = d["residue_name"]

    try:
        features = df[amino_acid]
    except KeyError:
        # Unknown residue: fall back to zeros, labelled like a real column so
        # that `selection` and `add_separate` behave the same as for known
        # residues (same length, same keys).
        features = pd.Series(
            np.zeros(len(df)), index=df.index, name=amino_acid
        )

    if selection is not None:
        features = features.filter(selection)

    if add_separate:
        # Must happen while `features` is still a Series; an ndarray has no
        # labels to use as attribute names.
        for scale_name, value in features.to_dict().items():
            d[scale_name] = value

    if return_array:
        features = np.array(features)

    if not add_separate:
        d["expasy"] = features

    return features
def meiler_embedding(
    n, d, return_array: bool = False
) -> Union[pd.Series, np.ndarray]:
    """
    Return amino acid features from reduced dimensional embeddings of amino
    acid physicochemical properties.

    Source: https://link.springer.com/article/10.1007/s008940100038
    doi: https://doi.org/10.1007/s008940100038

    The features are also stored in the node attribute dict under
    ``d["meiler"]``.

    :param n: Node in a NetworkX graph. Unused; kept so the signature matches
        the other node featurization functions.
    :param d: NetworkX node attributes. Must contain ``"residue_name"``; it is
        used as the column label looked up in the Meiler embeddings table.
    :param return_array: If ``True``, the features are returned and stored as
        a ``np.ndarray`` instead of a ``pd.Series``.
    :type return_array: bool
    :returns: The amino acid features. Residues that are not a column of the
        embeddings table yield an all-zero vector of the same length.
    :rtype: Union[pd.Series, np.ndarray]
    :raises KeyError: If ``d`` has no ``"residue_name"`` entry.
    """
    df = load_meiler_embeddings()
    amino_acid = d["residue_name"]

    try:
        features = df[amino_acid]
    except KeyError:
        # Unknown residue: zeros labelled like a real embedding column.
        features = pd.Series(
            np.zeros(len(df)), index=df.index, name=amino_acid
        )

    if return_array:
        features = np.array(features)

    d["meiler"] = features
    return features
def amino_acid_one_hot(
    n,
    d: Dict[str, Any],
    return_array: bool = True,
    allowable_set: Optional[List[str]] = None,
) -> Union[pd.Series, np.ndarray]:
    """Adds a one-hot encoding of amino acid types as a node attribute.

    The three-letter residue name in ``d["residue_name"]`` is translated with
    ``RESI_THREE_TO_1`` and then encoded against the vocabulary by
    ``onek_encoding_unk``. The result is stored in
    ``d["amino_acid_one_hot"]`` and returned.

    :param n: node name, this is unused and only included for compatibility
        with the other functions
    :type n: str
    :param d: Node data. Must contain ``"residue_name"``.
    :type d: Dict[str, Any]
    :param return_array: If True, returns a numpy array of one-hot encoding,
        otherwise returns a pd.Series indexed by the vocabulary. Default is
        True.
    :type return_array: bool
    :param allowable_set: Specifies vocabulary of amino acids. Default is None
        (which uses `graphein.protein.resi_atoms.BASE_AMINO_ACIDS`).
    :return: One-hot encoding of amino acid types
    :rtype: Union[pd.Series, np.ndarray]
    """
    vocabulary = BASE_AMINO_ACIDS if allowable_set is None else allowable_set

    # NOTE(review): plain subscript lookup, so a residue name missing from
    # RESI_THREE_TO_1 propagates whatever error that mapping raises.
    one_letter_code = RESI_THREE_TO_1[d["residue_name"]]
    encoding = onek_encoding_unk(one_letter_code, vocabulary)

    if return_array:
        one_hot = np.array(encoding).astype(int)
    else:
        one_hot = pd.Series(encoding).astype(int)
        one_hot.index = vocabulary

    d["amino_acid_one_hot"] = one_hot
    return one_hot
def hydrogen_bond_donor(
    n: str,
    d: Dict[str, Any],
    sum_features: bool = True,
    return_array: bool = False,
) -> Union[pd.Series, np.ndarray]:
    """Adds Hydrogen Bond Donor status to nodes as a feature.

    The node id is split on ``":"``. The second field is taken as the residue
    name; for four-field ids the last field is taken as the atom name. The
    feature is stored in ``d["hbond_donors"]`` and returned.

    :param n: node id. Three ``":"``-separated fields are treated as a residue
        graph node, four fields as an atomic graph node.
    :type n: str
    :param d: Dict of node attributes
    :type d: Dict[str, Any]
    :param sum_features: If ``True``, the feature is the number of hydrogen
        bond donors per node. If ``False``, the feature is a 0/1 flag
        indicating whether or not the node has a hydrogen bond donor.
        Default is ``True``.
    :type sum_features: bool
    :param return_array: If ``True``, the feature is a ``np.ndarray``,
        otherwise it is a ``pd.Series``. Default is ``False``.
    :type return_array: bool
    :returns: The hydrogen bond donor feature (integer dtype).
    :rtype: Union[pd.Series, np.ndarray]
    :raises ValueError: If the node id has neither three nor four fields.
    """
    node_id = n.split(":")
    res = node_id[1]

    if len(node_id) == 4:  # Atomic graph
        atom = node_id[-1]
        try:
            features = HYDROGEN_BOND_DONORS[res][atom]
        except KeyError:
            # Unknown residue or an atom that is not a donor.
            features = 0
    elif len(node_id) == 3:  # Residue graph
        if res in HYDROGEN_BOND_DONORS:
            features = sum(HYDROGEN_BOND_DONORS[res].values())
        else:
            features = 0
    else:
        raise ValueError(
            f"Cannot parse node id {n!r}: expected 3 (residue graph) or 4 "
            f"(atomic graph) ':'-separated fields, got {len(node_id)}."
        )

    if not sum_features:
        # Collapse the count to a 0/1 flag *before* wrapping it, so the
        # container type requested via `return_array` is respected.
        features = int(features > 0)

    if return_array:
        features = np.array(features).astype(int)
    else:
        features = pd.Series(features).astype(int)

    d["hbond_donors"] = features
    return features
def hydrogen_bond_acceptor(
    n: str,
    d: Dict[str, Any],
    sum_features: bool = True,
    return_array: bool = False,
) -> Union[pd.Series, np.ndarray]:
    """Adds Hydrogen Bond Acceptor status to nodes as a feature.

    The node id is split on ``":"``. The second field is taken as the residue
    name; for four-field ids the last field is taken as the atom name. The
    feature is stored in ``d["hbond_acceptors"]`` and returned.

    :param n: node id. Three ``":"``-separated fields are treated as a residue
        graph node, four fields as an atomic graph node.
    :type n: str
    :param d: Dict of node attributes
    :type d: Dict[str, Any]
    :param sum_features: If ``True``, the feature is the number of hydrogen
        bond acceptors per node. If ``False``, the feature is a 0/1 flag
        indicating whether or not the node has a hydrogen bond acceptor.
        Default is ``True``.
    :type sum_features: bool
    :param return_array: If ``True``, the feature is a ``np.ndarray``,
        otherwise it is a ``pd.Series``. Default is ``False``.
    :type return_array: bool
    :returns: The hydrogen bond acceptor feature (integer dtype).
    :rtype: Union[pd.Series, np.ndarray]
    :raises ValueError: If the node id has neither three nor four fields.
    """
    node_id = n.split(":")
    res = node_id[1]

    if len(node_id) == 4:  # Atomic graph
        atom = node_id[-1]
        try:
            features = HYDROGEN_BOND_ACCEPTORS[res][atom]
        except KeyError:
            # Unknown residue or an atom that is not an acceptor.
            features = 0
    elif len(node_id) == 3:  # Residue graph
        if res in HYDROGEN_BOND_ACCEPTORS:
            features = sum(HYDROGEN_BOND_ACCEPTORS[res].values())
        else:
            features = 0
    else:
        raise ValueError(
            f"Cannot parse node id {n!r}: expected 3 (residue graph) or 4 "
            f"(atomic graph) ':'-separated fields, got {len(node_id)}."
        )

    if not sum_features:
        # Collapse the count to a 0/1 flag *before* wrapping it, so the
        # container type requested via `return_array` is respected.
        features = int(features > 0)

    if return_array:
        features = np.array(features).astype(int)
    else:
        features = pd.Series(features).astype(int)

    d["hbond_acceptors"] = features
    return features