Graphein Protein Structure Dataloaders#
PyTorch Geometric Datasets#
Graphein provides three dataset classes for working with PyTorch Geometric:
- `ProteinGraphDataset` - For processing large datasets that can't be kept in memory.
- `InMemoryProteinGraphDataset` - For smaller datasets that can be kept in memory.
- `ProteinGraphListDataset` - For creating a dataset from a list of pre-computed PyTorch Geometric graphs.
Both `ProteinGraphDataset` and `InMemoryProteinGraphDataset` will take care of downloading structures from either the RCSB PDB, the EBI AlphaFold database, or both! `ProteinGraphListDataset` is a lightweight alternative for creating a dataset from a collection of graphs you have pre-computed.
[1]:
# Install graphein if necessary
# !pip install graphein
# Install torch if necessary. See https://pytorch.org/get-started/locally/
# !pip install torch==1.11.0
# Install torch geometric if necessary. See: https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cpu.html
ProteinGraphDataset#
`ProteinGraphDataset` will download structures from the PDB and/or the AlphaFold database and process them into graphs according to a `ProteinGraphConfig`.
Parameters#
ProteinGraphDataset(
root: str,
# Root directory where the dataset should be saved.
name: str,
# Name of the dataset. Will be saved to ``data_$name.pt``.
pdb_codes: Optional[List[str]] = None,
# List of PDB codes to download and parse from the PDB.
uniprot_ids: Optional[List[str]] = None,
# List of Uniprot IDs to download and parse from Alphafold Database
graph_label_map: Optional[Dict[str, torch.Tensor]] = None,
# Dictionary mapping PDB/Uniprot IDs to graph-level labels.
node_label_map: Optional[Dict[str, torch.Tensor]] = None,
# Dictionary mapping PDB/Uniprot IDs to node-level labels.
chain_selection_map: Optional[Dict[str, List[str]]] = None,
# Dictionary mapping PDB/Uniprot IDs to the desired chains in the PDB files
graphein_config: ProteinGraphConfig = ProteinGraphConfig(),
# Protein graph construction config
graph_format_convertor: GraphFormatConvertor = GraphFormatConvertor(
src_format="nx", dst_format="pyg"
),
# Conversion handler for graphs
graph_transformation_funcs: Optional[List[Callable]] = None,
# List of functions that consume a nx.Graph and return a nx.Graph. Applied to graphs after construction but before conversion to pyg
transform: Optional[Callable] = None,
# A function/transform that takes in a torch_geometric.data.Data object and returns a transformed version. The data object will be transformed before every access.
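pdb_transform: Optional[List[Callable]] = None,
# List of functions that receive a list of paths to the downloaded structures. Applied to the raw PDB files before graphs are computed.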
pre_transform: Optional[Callable] = None,
# A function/transform that takes in a torch_geometric.data.Data object and returns a transformed version. The data object will be transformed before being saved to disk
pre_filter: Optional[Callable] = None,
# A function that takes in a torch_geometric.data.Data object and returns a boolean value, indicating whether the data object should be included in the final dataset
num_cores: int = 16,
# Number of cores to use for multiprocessing of graph construction
af_version: int = 2,
# Version of AlphaFoldDB structures to use,
)
Directory Structure#
Creating a `ProteinGraphDataset` will create two directories under `root`:
- `root/raw` - Contains raw PDB files
- `root/processed` - Contains processed graphs (in `torch_geometric.data.Data` format) saved as `$PDB.pt` / `$UNIPROT_ID.pt`
[2]:
import torch
from graphein.ml import ProteinGraphDataset
import graphein.protein as gp
# Create some labels
# g_labels holds one random scalar per structure; n_labels holds one random 10-element tensor per structure.
g_labels = torch.randn([5])
n_labels = torch.randn([5, 10])
# Both maps are keyed by the PDB code or UniProt ID of the structures requested below.
g_lab_map = {"3eiy": g_labels[0], "4hhb": g_labels[1], "Q5VSL9": g_labels[2], "1lds": g_labels[3], "Q8W3K0": g_labels[4]}
node_lab_map = {"3eiy": n_labels[0], "4hhb": n_labels[1], "Q5VSL9": n_labels[2], "1lds": n_labels[3], "Q8W3K0": n_labels[4]}
# Select some chains
# Only 4hhb gets an explicit chain selection; the other structures have no entry in the map.
chain_selection_map = {"4hhb": "A"}
# Create the dataset
# pdb_codes are downloaded from the PDB and uniprot_ids from the AlphaFold database (see the parameter listing above).
ds = ProteinGraphDataset(
root = "../graphein/ml/datasets/test",
pdb_codes=["3eiy", "4hhb", "1lds"],
uniprot_ids=["Q5VSL9", "Q8W3K0"],
graph_label_map=g_lab_map,
node_label_map=node_lab_map,
chain_selection_map=chain_selection_map,
graphein_config=gp.ProteinGraphConfig()
)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/1241113219.py in <module>
1 import torch
----> 2 from graphein.ml import ProteinGraphDataset
3 import graphein.protein as gp
4
5 # Create some labels
~/github/graphein/graphein/ml/__init__.py in <module>
----> 1 from .conversion import GraphFormatConvertor
2
3 try:
4 from .datasets import (
5 InMemoryProteinGraphDataset,
~/github/graphein/graphein/ml/conversion.py in <module>
18
19 try:
---> 20 from torch_geometric.data import Data
21 except ImportError:
22 import_message(
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib
[3]:
# Create a dataloader from dataset and inspect a batch
from torch_geometric.loader import DataLoader
dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=True)
for i in dl:
    print(i)
    print("Graph labels: ", i.graph_y)
    print("Node labels: ", i.node_y)
    break  # stop after the first batch; one is enough to inspect
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/2096137191.py in <module>
1 # Create a dataloader from dataset and inspect a batch
----> 2 from torch_geometric.loader import DataLoader
3
4 dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=True)
5 for i in dl:
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib
InMemoryProteinGraphDataset#
Parameters#
InMemoryProteinGraphDataset(
root: str,
# Root directory where the dataset should be saved.
name: str,
# Name of the dataset. Will be saved to ``data_$name.pt``.
pdb_codes: Optional[List[str]] = None,
# List of PDB codes to download and parse from the PDB.
uniprot_ids: Optional[List[str]] = None,
# List of Uniprot IDs to download and parse from Alphafold Database
graph_label_map: Optional[Dict[str, torch.Tensor]] = None,
# Dictionary mapping PDB/Uniprot IDs to graph-level labels.
node_label_map: Optional[Dict[str, torch.Tensor]] = None,
# Dictionary mapping PDB/Uniprot IDs to node-level labels.
chain_selection_map: Optional[Dict[str, List[str]]] = None,
# Dictionary mapping PDB/Uniprot IDs to the desired chains in the PDB files
graphein_config: ProteinGraphConfig = ProteinGraphConfig(),
# Protein graph construction config
graph_format_convertor: GraphFormatConvertor = GraphFormatConvertor(
src_format="nx", dst_format="pyg"
),
# Conversion handler for graphs
graph_transformation_funcs: Optional[List[Callable]] = None,
# List of functions that consume a nx.Graph and return a nx.Graph. Applied to graphs after construction but before conversion to pyg
transform: Optional[Callable] = None,
# A function/transform that takes in a torch_geometric.data.Data object and returns a transformed version. The data object will be transformed before every access.
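pdb_transform: Optional[List[Callable]] = None,
# List of functions that receive a list of paths to the downloaded structures. Applied to the raw PDB files before graphs are computed.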
pre_transform: Optional[Callable] = None,
# A function/transform that takes in a torch_geometric.data.Data object and returns a transformed version. The data object will be transformed before being saved to disk
pre_filter: Optional[Callable] = None,
# A function that takes in a torch_geometric.data.Data object and returns a boolean value, indicating whether the data object should be included in the final dataset
num_cores: int = 16,
# Number of cores to use for multiprocessing of graph construction
af_version: int = 2,
# Version of AlphaFoldDB structures to use,
)
Directory Structure#
Creating an `InMemoryProteinGraphDataset` will create two directories under `root`:
- `root/raw` - Contains raw PDB files
- `root/processed` - Contains processed datasets saved as `data_{name}.pt`
[4]:
from graphein.ml import InMemoryProteinGraphDataset
g_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
node_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
chain_selection_map = {"4hhb": "A"}
# The label maps above are keyed by PDB code / UniProt ID; plain integers are used as dummy labels here.
# Build the in-memory dataset from four PDB entries and one UniProt entry.
# With name="test", the processed dataset is expected under root/processed as data_test.pt (see Directory Structure).
ds = InMemoryProteinGraphDataset(
root = "../graphein/ml/datasets/test",
name="test",
pdb_codes=["3eiy", "4hhb", "1lds", "2ll6"],
uniprot_ids=["Q5VSL9"],
graph_label_map=g_lab_map,
node_label_map=node_lab_map,
chain_selection_map=chain_selection_map
)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/1244945871.py in <module>
----> 1 from graphein.ml import InMemoryProteinGraphDataset
2
3 g_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
4 node_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
5 chain_selection_map = {"4hhb": "A"}
~/github/graphein/graphein/ml/__init__.py in <module>
----> 1 from .conversion import GraphFormatConvertor
2
3 try:
4 from .datasets import (
5 InMemoryProteinGraphDataset,
~/github/graphein/graphein/ml/conversion.py in <module>
18
19 try:
---> 20 from torch_geometric.data import Data
21 except ImportError:
22 import_message(
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib
[5]:
# Create a dataloader from dataset and inspect a batch
dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=True)
for i in dl:
    print(i)
    break  # stop after the first batch
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/5460877.py in <module>
1 # Create a dataloader from dataset and inspect a batch
----> 2 dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=True)
3 for i in dl:
4 print(i)
5 break
NameError: name 'DataLoader' is not defined
ProteinGraphListDataset#
The `ProteinGraphListDataset` class is a lightweight wrapper around a list of pre-computed `torch_geometric.data.Data` graphs.
Parameters#
ProteinGraphListDataset(
root: str, # Root directory where the dataset is stored.
data_list: List[Data], # List of protein graphs as PyTorch Geometric Data objects.
name: str, # Name of dataset. Data will be saved as ``data_{name}.pt``.
transform: Optional[Callable]=None # A function/transform that takes in a torch_geometric.data.Data object and returns a transformed version. The data object will be transformed before every access.
)
[6]:
from graphein.ml import ProteinGraphListDataset, GraphFormatConvertor
import graphein.protein as gp
# Construct graphs
graphs = gp.construct_graphs_mp(
pdb_code_it=["3eiy", "4hhb", "1lds", "2ll6"],
return_dict=False  # presumably yields a list of graphs rather than a dict keyed by PDB code - confirm against the graphein docs
)
# do some transformation
# Here: reduce every graph to the subgraph of chain "A".
graphs = [gp.extract_subgraph_from_chains(g, ["A"]) for g in graphs]
# Convert to PyG Data format
# The convertor maps networkx ("nx") graphs to PyTorch Geometric ("pyg") objects.
convertor = GraphFormatConvertor(src_format="nx", dst_format="pyg")
graphs = [convertor(g) for g in graphs]
# Create dataset
# Wrap the pre-computed PyG graphs; with name="list_test" the data is saved as data_list_test.pt (see Parameters above).
ds = ProteinGraphListDataset(root=".", data_list=graphs, name="list_test")
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/1137209903.py in <module>
----> 1 from graphein.ml import ProteinGraphListDataset, GraphFormatConvertor
2 import graphein.protein as gp
3
4 # Construct graphs
5 graphs = gp.construct_graphs_mp(
~/github/graphein/graphein/ml/__init__.py in <module>
----> 1 from .conversion import GraphFormatConvertor
2
3 try:
4 from .datasets import (
5 InMemoryProteinGraphDataset,
~/github/graphein/graphein/ml/conversion.py in <module>
18
19 try:
---> 20 from torch_geometric.data import Data
21 except ImportError:
22 import_message(
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib
[7]:
for i in ds:
    print(i)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/2548680123.py in <module>
----> 1 for i in ds:
2 print(i)
NameError: name 'ds' is not defined
[8]:
# Create a dataloader from dataset and inspect a few batches
dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=False)
for i in dl:
    print(i)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/3456704568.py in <module>
1 # Create a dataloader from dataset and inspect a few batches
----> 2 dl = DataLoader(ds, batch_size=2, shuffle=True, drop_last=False)
3 for i in dl:
4 print(i)
NameError: name 'DataLoader' is not defined
Transforms#
We can supply various functions to ProteinGraphDataset
and InMemoryProteinGraphDataset
to alter the composition of the dataset.
- `pdb_transform` (`list(callable)`, optional) - A list of functions, each of which receives a list of paths to the downloaded structures. This provides an entry point to apply pre-processing from bioinformatics tools of your choosing. (default: `None`)
- `graph_transformation_funcs` (`List[Callable]`, optional) - List of functions that consume a `nx.Graph` and return a `nx.Graph`. Applied to graphs after construction but before conversion to `torch_geometric.data.Data`. (default: `None`)
- `transform` (`callable`, optional) - A function/transform that takes in a `torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before every access. (default: `None`)
- `pre_transform` (`callable`, optional) - A function/transform that takes in a `torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before being saved to disk. (default: `None`)
- `pre_filter` (`callable`, optional) - A function that takes in a `torch_geometric.data.Data` object and returns a boolean value, indicating whether the data object should be included in the final dataset. (default: `None`)
[9]:
from typing import List
import networkx as nx
from torch_geometric.data import Data
# Create dummy transforms
def pdb_transform_fn(files: List[str]) -> None:
    """Transforms raw pdbs prior to computing graphs.

    Dummy hook: receives the list of paths to the downloaded structure
    files and does nothing with them.

    :param files: Paths to the downloaded structure files.
    :return: ``None``.
    """
    return None
def graph_transform_fn(graph: nx.Graph) -> nx.Graph:
    """Transforms graphein nx.Graph prior to conversion to torch_geometric.data.Data.

    Dummy hook: returns the graph it was given, unchanged.

    :param graph: Graph to transform.
    :return: The same ``graph`` object.
    """
    return graph
def transform_fn(data: Data) -> Data:
    """Transforms torch_geometric.data.Data prior to every access.

    Dummy hook: returns the data object it was given, unchanged.

    :param data: Data object to transform.
    :return: The same ``data`` object.
    """
    return data
def pre_transform_fn(data: Data) -> Data:
    """Transforms torch_geometric.data.Data prior to saving to disk.

    Dummy hook: returns the data object it was given, unchanged.

    :param data: Data object to transform.
    :return: The same ``data`` object.
    """
    return data
def pre_filter_fn(data: Data) -> bool:
    """Takes in a torch_geometric.data.Data and returns True if the data should be included in the dataset.

    Dummy hook: accepts every data object.

    :param data: Data object to test.
    :return: Always ``True``.
    """
    return True
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/4159870238.py in <module>
1 from typing import List
2 import networkx as nx
----> 3 from torch_geometric.data import Data
4
5 # Create dummy transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib
[10]:
from graphein.ml.datasets.torch_geometric_dataset import InMemoryProteinGraphDataset
g_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
node_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
chain_selection_map = {"4hhb": "A"}
# Same dataset arguments as the earlier InMemoryProteinGraphDataset example, now with every
# transform/filter hook supplied. The *_fn callables are the dummy functions defined in the previous cell.
ds = InMemoryProteinGraphDataset(
root = "../graphein/ml/datasets/test",
name="test",
pdb_codes=["3eiy", "4hhb", "1lds", "2ll6"],
uniprot_ids=["Q5VSL9"],
graph_label_map=g_lab_map,
node_label_map=node_lab_map,
chain_selection_map=chain_selection_map,
pdb_transform=[pdb_transform_fn],  # list of callables, each given the downloaded structure paths
graph_transformation_funcs=[graph_transform_fn],  # list of nx.Graph -> nx.Graph callables
transform=transform_fn,  # applied before every access
pre_transform=pre_transform_fn,  # applied before saving to disk
pre_filter=pre_filter_fn  # returns True for data objects to keep
)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/var/folders/p1/0qtk6t_520n7k4bvk1p26l_w0000gn/T/ipykernel_73928/1112474063.py in <module>
----> 1 from graphein.ml.datasets.torch_geometric_dataset import InMemoryProteinGraphDataset
2
3 g_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
4 node_lab_map = {"3eiy": 1, "4hhb": 2, "Q5VSL9": 3, "1lds": 10, "2ll6": 4}
5 chain_selection_map = {"4hhb": "A"}
~/github/graphein/graphein/ml/__init__.py in <module>
----> 1 from .conversion import GraphFormatConvertor
2
3 try:
4 from .datasets import (
5 InMemoryProteinGraphDataset,
~/github/graphein/graphein/ml/conversion.py in <module>
18
19 try:
---> 20 from torch_geometric.data import Data
21 except ImportError:
22 import_message(
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/__init__.py in <module>
2 from importlib import import_module
3
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/__init__.py in <module>
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
4 from .batch import Batch
5 from .dataset import Dataset
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_geometric/data/data.py in <module>
7 import torch
8 from torch import Tensor
----> 9 from torch_sparse import SparseTensor
10
11 from torch_geometric.data.storage import (BaseStorage, EdgeStorage,
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/__init__.py in <module>
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/_ops.py in load_library(self, path)
102 # static (global) initialization code in order to register custom
103 # operators with the JIT.
--> 104 ctypes.CDLL(path)
105 self.loaded_libraries.add(path)
106
~/opt/anaconda3/envs/graphein-wip/lib/python3.8/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
371
372 if handle is None:
--> 373 self._handle = _dlopen(self._name, mode)
374 else:
375 self._handle = handle
OSError: dlopen(/Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so, 0x0006): Symbol not found: __ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefIxEENS2_8optionalINS2_10ScalarTypeEEENS5_INS2_6LayoutEEENS5_INS2_6DeviceEEENS5_IbEENS5_INS2_12MemoryFormatEEE
Referenced from: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch_sparse/_convert_cpu.so
Expected in: /Users/arianjamasb/opt/anaconda3/envs/graphein-wip/lib/python3.8/site-packages/torch/lib/libtorch_cpu.dylib