"""Featurization functions for graph nodes using DSSP predicted features."""
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>, Charlie Harris
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
from __future__ import annotations
import os
from typing import Any, Dict, Optional
import networkx as nx
import pandas as pd
from Bio.Data.IUPACData import protein_letters_1to3
from Bio.PDB.DSSP import dssp_dict_from_pdb_file, residue_max_acc
from graphein.protein.utils import download_pdb, is_tool
# Column names for the DataFrame built by ``parse_dssp_df``: the three residue
# key fields (chain, residue number, insertion code) come first, followed by
# the per-residue DSSP values in the order they appear in each record.
DSSP_COLS = [
    "chain",
    "resnum",
    "icode",
    "aa",
    "ss",
    "asa",
    "phi",
    "psi",
    "dssp_index",
    # The remaining eight columns follow a regular naming pattern:
    # <bond>_<slot>_<field>, slot 1 before slot 2, "NH_O" before "O_NH",
    # and "relidx" before "energy".
    *(
        f"{bond}_{slot}_{field}"
        for slot in (1, 2)
        for bond in ("NH_O", "O_NH")
        for field in ("relidx", "energy")
    ),
]
# Single-letter codes, presumably the secondary structure classes found in the
# "ss" column (not referenced elsewhere in the visible part of this module).
DSSP_SS = list("HBEGITS")
def parse_dssp_df(dssp: Dict[str, Any]) -> pd.DataFrame:
    """
    Flatten raw DSSP output into a DataFrame with one row per residue.

    ``dssp`` is indexed positionally: ``dssp[1]`` supplies the residue keys in
    the order rows are emitted, and ``dssp[0]`` maps each key to that
    residue's DSSP values. A key has the form ``(chain, residue_id)`` where
    ``residue_id[1]`` is the residue number and ``residue_id[2]`` the
    insertion code (``residue_id[0]`` is ignored).

    :param dssp: Container holding the DSSP output as described above
    :type dssp: Dict[str, Any]
    :return: DataFrame whose columns are ``DSSP_COLS``
    :rtype: pd.DataFrame
    """
    # Each row is: chain, resnum, icode, then the DSSP values for that residue.
    rows = [
        [key[0], key[1][1], key[1][2], *dssp[0][key]] for key in dssp[1]
    ]
    return pd.DataFrame.from_records(rows, columns=DSSP_COLS)
def process_dssp_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a DSSP DataFrame to make indexes align with node IDs.

    The ``aa`` column is converted from one-letter to upper-case three-letter
    residue codes, and the frame is re-indexed by node IDs of the form
    ``"<chain>:<aa>:<resnum>"``. The frame is modified in place and the same
    object is returned. A one-letter code that is missing from
    ``protein_letters_1to3`` makes the lookup fail; no fallback is applied.

    :param df: pd.DataFrame containing the parsed output from DSSP. Must
        provide the ``chain``, ``aa`` and ``resnum`` columns.
    :type df: pd.DataFrame
    :return: pd.DataFrame indexed by node ID
    :rtype: pd.DataFrame
    """
    # Convert 1 letter aa code to 3 letter
    df["aa"] = [protein_letters_1to3[code].upper() for code in df["aa"]]

    # Construct node IDs column-wise; iterating rows with ``iterrows`` would
    # build a throwaway Series per residue for no benefit.
    df["node_id"] = [
        chain + ":" + aa + ":" + str(resnum)
        for chain, aa, resnum in zip(df["chain"], df["aa"], df["resnum"])
    ]
    df.set_index("node_id", inplace=True)
    return df
def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
    """
    Construct DSSP dataframe and add as graph level variable to protein graph.

    The PDB file ``<pdb_dir>/<pdb_id>.pdb`` is used if it exists; otherwise
    the structure is fetched with ``download_pdb``. DSSP is then run on that
    file and the parsed, node-ID-indexed result is stored under
    ``G.graph["dssp_df"]``.

    :param G: Input protein graph. ``G.graph`` must provide ``"config"`` and
        ``"pdb_id"``.
    :type G: nx.Graph
    :param dssp_config: DSSPConfig object. Specifies which executable to run.
        Located in graphein.protein.config
    :type dssp_config: DSSPConfig, optional
    :raises ValueError: If ``dssp_config`` is ``None``
    :raises AssertionError: If the DSSP executable is not found by ``is_tool``
    :return: Protein graph with DSSP dataframe added
    :rtype: nx.Graph
    """
    if dssp_config is None:
        raise ValueError(
            "dssp_config is None; a DSSPConfig specifying the DSSP executable "
            "is required"
        )

    config = G.graph["config"]
    pdb_id = G.graph["pdb_id"]

    # Extract DSSP executable
    executable = dssp_config.executable

    # Ensure that DSSP is on PATH and is marked as an executable.
    # NOTE(review): this check is skipped when Python runs with -O.
    assert is_tool(
        executable
    ), "DSSP must be on PATH and marked as an executable"

    # Build the local path once so the existence check and the path handed to
    # DSSP always agree. os.path.join accepts both str and path-like pdb_dir.
    local_pdb = os.path.join(config.pdb_dir, pdb_id + ".pdb")
    if os.path.isfile(local_pdb):
        pdb_file = local_pdb
    else:
        pdb_file = download_pdb(config, pdb_id)

    if config.verbose:
        print(f"Using DSSP executable '{executable}'")

    # Run DSSP, then reshape its output into a node-ID-indexed DataFrame
    dssp_raw = dssp_dict_from_pdb_file(pdb_file, DSSP=executable)
    dssp_df = process_dssp_df(parse_dssp_df(dssp_raw))

    if config.verbose:
        print(dssp_df)

    # Assign DSSP DataFrame
    G.graph["dssp_df"] = dssp_df
    return G
def add_dssp_feature(G: nx.Graph, feature: str) -> nx.Graph:
    """
    Adds a specified amino acid feature as calculated by DSSP to every node
    in a protein graph.

    If the graph has no ``"dssp_df"`` entry yet, it is computed first via
    ``add_dssp_df`` using ``G.graph["config"].dssp_config``.

    :param G: Protein structure graph to add dssp feature to
    :type G: nx.Graph
    :param feature: string specifying name of DSSP feature to add, i.e. a
        column of ``G.graph["dssp_df"]``:
        "chain",
        "resnum",
        "icode",
        "aa",
        "ss",
        "asa",
        "phi",
        "psi",
        "dssp_index",
        "NH_O_1_relidx",
        "NH_O_1_energy",
        "O_NH_1_relidx",
        "O_NH_1_energy",
        "NH_O_2_relidx",
        "NH_O_2_energy",
        "O_NH_2_relidx",
        "O_NH_2_energy",
        These names are accessible in the DSSP_COLS list
    :type feature: str
    :raises NameError: If the graph's granularity is ``"atom"``
    :return: Protein structure graph with DSSP feature added to nodes
    :rtype: nx.Graph
    """
    config = G.graph["config"]

    # DSSP features are per residue, so reject atom-level graphs up front,
    # before any DSSP computation is triggered.
    # NOTE(review): NameError is kept for backward compatibility with callers.
    if config.granularity == "atom":
        raise NameError(
            f"DSSP residue features ({feature}) "
            "cannot be added to atom granularity graph"
        )

    if "dssp_df" not in G.graph:
        G = add_dssp_df(G, config.dssp_config)
    dssp_df = G.graph["dssp_df"]

    # The DataFrame is indexed by node ID, so the column maps straight onto
    # the graph's nodes.
    nx.set_node_attributes(G, dict(dssp_df[feature]), feature)

    if config.verbose:
        print("Added " + feature + " features to graph nodes")
    return G
def rsa(G: nx.Graph) -> nx.Graph:
    """
    Annotate every residue node with its relative solvent accessibility (RSA).

    RSA is the DSSP ``asa`` value divided by the residue's maximum
    accessibility taken from the ``"Sander"`` table of ``residue_max_acc``.
    The DSSP dataframe is computed first if the graph does not carry one, and
    it gains ``max_acc`` and ``rsa`` columns as a side effect.

    :param G: Input protein graph
    :type G: nx.Graph
    :return: Protein graph with rsa values added
    :rtype: nx.Graph
    """
    # Make sure DSSP output is available on the graph
    if "dssp_df" not in G.graph:
        G = add_dssp_df(G, G.graph["config"].dssp_config)
    dssp_table = G.graph["dssp_df"]

    # Look up the maximum accessibility for each residue type
    sander_max_acc = residue_max_acc["Sander"]
    dssp_table["max_acc"] = dssp_table["aa"].map(sander_max_acc.get)

    # Both operands must be numeric before dividing
    accessibility_cols = ["asa", "max_acc"]
    dssp_table[accessibility_cols] = dssp_table[accessibility_cols].astype(
        float
    )
    dssp_table["rsa"] = dssp_table["asa"] / dssp_table["max_acc"]

    G.graph["dssp_df"] = dssp_table
    return add_dssp_feature(G, "rsa")
def asa(G: nx.Graph) -> nx.Graph:
    """
    Annotate every residue node with the DSSP ``asa`` column.

    Thin wrapper around ``add_dssp_feature`` for the ``"asa"`` feature.

    :param G: Input protein graph
    :type G: nx.Graph
    :return: Protein graph with asa values added
    :rtype: nx.Graph
    """
    feature_name = "asa"
    return add_dssp_feature(G, feature_name)
def phi(G: nx.Graph) -> nx.Graph:
    """
    Annotate every residue node with the DSSP ``phi`` column.

    Thin wrapper around ``add_dssp_feature`` for the ``"phi"`` feature.

    :param G: Input protein graph
    :type G: nx.Graph
    :return: Protein graph with phi-angles values added
    :rtype: nx.Graph
    """
    feature_name = "phi"
    return add_dssp_feature(G, feature_name)
def psi(G: nx.Graph) -> nx.Graph:
    """
    Annotate every residue node with the DSSP ``psi`` column.

    Thin wrapper around ``add_dssp_feature`` for the ``"psi"`` feature.

    :param G: Input protein graph
    :type G: nx.Graph
    :return: Protein graph with psi-angles values added
    :rtype: nx.Graph
    """
    feature_name = "psi"
    return add_dssp_feature(G, feature_name)
def secondary_structure(G: nx.Graph) -> nx.Graph:
    """
    Annotate every residue node with the DSSP ``ss`` column, i.e. the
    secondary structure assignment stored as a string.

    Thin wrapper around ``add_dssp_feature`` for the ``"ss"`` feature.

    :param G: Input protein graph
    :type G: nx.Graph
    :return: Protein graph with secondary structure added
    :rtype: nx.Graph
    """
    feature_name = "ss"
    return add_dssp_feature(G, feature_name)