Source code for graphein.rna.graphs

"""Functions for working with RNA Secondary Structure Graphs."""
# %%
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>, Emmanuele Rossi, Eric Ma
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
# This submodule is heavily inspired by: https://github.com/emalgorithm/rna-design/blob/aec77a18abe4850958d6736ec185a6f8cbfdf20c/src/util.py#L9
import logging
from typing import Callable, Dict, List, Optional

import networkx as nx

from graphein.rna.constants import (
    RNA_BASE_COLORS,
    RNA_BASES,
    SUPPORTED_DOTBRACKET_NOTATION,
)
from graphein.utils.utils import (
    annotate_edge_metadata,
    annotate_graph_metadata,
    annotate_node_metadata,
    compute_edges,
)

log = logging.getLogger(__name__)


def validate_rna_sequence(s: str) -> None:
    """
    Validate RNA sequence. This ensures that it only contains supported bases.

    Supported bases are: ``"A", "U", "G", "C", "I"``
    Supported bases can be accessed in
    :const:`~graphein.rna.constants.RNA_BASES`

    :param s: Sequence to validate
    :type s: str
    :raises ValueError: Raises ValueError if the sequence contains an
        unsupported base character. The message reports the first offending
        character and its (0-based) position.
    """
    unsupported = set(s).difference(RNA_BASES)
    if not unsupported:
        return

    # ``str.index`` needs a string, not the set of offending characters, so
    # scan the sequence for the first unsupported character instead.
    position, offending_letter = next(
        (i, letter) for i, letter in enumerate(s) if letter in unsupported
    )
    raise ValueError(
        f"Invalid letter {offending_letter} found at position {position} in the sequence {s}."
    )
def validate_lengths(db: str, seq: str) -> None:
    """
    Ensure a dotbracket string and an RNA sequence have the same length.

    :param db: Dotbracket string to check
    :type db: str
    :param seq: RNA nucleotide sequence to check.
    :type seq: str
    :raises ValueError: Raises ValueError if lengths of dotbracket and
        sequence do not match.
    """
    db_length, seq_length = len(db), len(seq)
    if db_length == seq_length:
        return
    raise ValueError(
        f"Length of dotbracket ({db_length}) does not match length of sequence ({seq_length})."
    )
def validate_dotbracket(db: str) -> None:
    """
    Sanitize dotbracket string. This ensures that it only has supported
    symbols.

    See: :const:`~graphein.rna.constants.SUPPORTED_DOTBRACKET_NOTATION`

    :param db: Dotbracket notation string
    :type db: str
    :raises ValueError: Raises ValueError if dotbracket notation contains
        unsupported symbols. The message reports the first offending symbol
        and its (0-based) position.
    """
    unsupported = set(db).difference(SUPPORTED_DOTBRACKET_NOTATION)
    if not unsupported:
        return

    # ``str.index`` needs a string, not the set of offending symbols, so scan
    # the dotbracket string for the first unsupported symbol instead.
    position, offending_letter = next(
        (i, symbol) for i, symbol in enumerate(db) if symbol in unsupported
    )
    raise ValueError(
        f"Invalid letter {offending_letter} found at position {position} in the sequence {db}."
    )
def construct_rna_graph(
    dotbracket: Optional[str],
    sequence: Optional[str],
    edge_construction_funcs: List[Callable],
    edge_annotation_funcs: Optional[List[Callable]] = None,
    node_annotation_funcs: Optional[List[Callable]] = None,
    graph_annotation_funcs: Optional[List[Callable]] = None,
) -> nx.Graph:
    """
    Constructs an RNA secondary structure graph from dotbracket notation
    and/or a nucleotide sequence.

    Nodes are integer positions ``0 .. n-1``. When a dotbracket string is
    given, each node gets a ``"dotbracket_symbol"`` attribute and the graph
    gets a ``"dotbracket"`` attribute. When a sequence is given, each node
    gets ``"nucleotide"`` and ``"color"`` attributes and the graph gets a
    ``"sequence"`` attribute.

    :param dotbracket: Dotbracket notation representation of secondary
        structure
    :type dotbracket: str, optional
    :param sequence: Corresponding sequence RNA bases
    :type sequence: str, optional
    :param edge_construction_funcs: List of edge construction functions.
        Required.
    :type edge_construction_funcs: List[Callable]
    :param edge_annotation_funcs: List of edge metadata annotation functions.
        Defaults to ``None``.
    :type edge_annotation_funcs: List[Callable], optional
    :param node_annotation_funcs: List of node metadata annotation functions.
        Defaults to ``None``.
    :type node_annotation_funcs: List[Callable], optional
    :param graph_annotation_funcs: List of graph metadata annotation
        functions. Defaults to ``None``.
    :type graph_annotation_funcs: List[Callable], optional
    :raises ValueError: If there is no non-empty ``sequence`` and
        ``dotbracket`` is ``None``, if the lengths of ``dotbracket`` and
        ``sequence`` differ, or if either contains unsupported characters.
    :return: nx.Graph of RNA secondary structure
    :rtype: nx.Graph
    """
    # The node count comes from the sequence when it is non-empty, otherwise
    # from the dotbracket string. Fail clearly instead of with ``len(None)``.
    length_source = sequence if sequence else dotbracket
    if length_source is None:
        raise ValueError(
            "At least one of dotbracket or sequence must be provided to construct an RNA graph."
        )

    G = nx.Graph()

    # Build node IDs first.
    node_ids = list(range(len(length_source)))

    # Check sequence and dotbracket lengths match
    if dotbracket and sequence:
        validate_lengths(dotbracket, sequence)

    # Add nodes
    G.add_nodes_from(node_ids)
    log.debug(f"Added {len(node_ids)} nodes")

    # Add dotbracket symbol if dotbracket is provided
    if dotbracket:
        validate_dotbracket(dotbracket)
        G.graph["dotbracket"] = dotbracket
        nx.set_node_attributes(
            G,
            dict(zip(node_ids, dotbracket)),
            "dotbracket_symbol",
        )

    # Add nucleotide base info if sequence is provided
    if sequence:
        validate_rna_sequence(sequence)
        G.graph["sequence"] = sequence
        nx.set_node_attributes(G, dict(zip(node_ids, sequence)), "nucleotide")
        colors = [RNA_BASE_COLORS[base] for base in sequence]
        nx.set_node_attributes(G, dict(zip(node_ids, colors)), "color")

    # Annotate additional graph metadata
    if graph_annotation_funcs is not None:
        G = annotate_graph_metadata(G, graph_annotation_funcs)

    # Annotate additional node metadata
    if node_annotation_funcs is not None:
        G = annotate_node_metadata(G, node_annotation_funcs)

    # Add edges
    G = compute_edges(G, edge_construction_funcs)

    # Annotate additional edge metadata
    if edge_annotation_funcs is not None:
        G = annotate_edge_metadata(G, edge_annotation_funcs)

    return G
if __name__ == "__main__":
    # Demo: build a graph from a dotbracket string containing a pseudoknot
    # and draw it with matplotlib.
    import matplotlib.pyplot as plt

    from graphein.rna.edges import (
        add_all_dotbracket_edges,
        add_base_pairing_interactions,
        add_phosphodiester_bonds,
        add_pseudoknots,
    )

    edge_funcs_1 = [
        add_base_pairing_interactions,
        add_phosphodiester_bonds,
        add_pseudoknots,
    ]
    # Alternative edge function list; not used by the demo below.
    edge_funcs_2 = [add_all_dotbracket_edges]

    g = construct_rna_graph(
        "......((((((......[[[))))))......]]]....",
        sequence=None,
        edge_construction_funcs=edge_funcs_1,
    )

    # ``nx.info`` was removed in NetworkX 3.0 and its return value was never
    # shown, so print a version-independent summary instead.
    print(
        f"Graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges"
    )

    # Node colours are only set when a sequence is supplied, so only edges
    # are coloured here. Edge "color" attributes are presumably set by the
    # edge construction functions -- verify against graphein.rna.edges.
    edge_colors = list(nx.get_edge_attributes(g, "color").values())
    nx.draw(g, edge_color=edge_colors)
    plt.show()