Module hetmatpy.hetmat
None
None
View Source
import functools
import gc
import itertools
import logging
import pathlib
import shutil
import hetnetpy.hetnet
import hetnetpy.matrix
import hetnetpy.permute
import hetnetpy.readwrite
import numpy
import pandas
import scipy.sparse
import hetmatpy.degree_weight
import hetmatpy.matrix
def hetmat_from_graph(
    graph, path, save_metagraph=True, save_nodes=True, save_edges=True
):
    """
    Create a hetmat.HetMat from a hetnetpy.hetnet.Graph.

    Parameters
    ----------
    graph : hetnetpy.hetnet.Graph
        Graph to convert to an on-disk HetMat.
    path : str or pathlib.Path
        Directory for the new HetMat.
    save_metagraph : bool
        Whether to write metagraph.json. Permutations pass False because the
        metagraph is inherited via a symlink to the parent HetMat.
    save_nodes : bool
        Whether to write the per-metanode node tables.
    save_edges : bool
        Whether to write the per-metaedge adjacency matrices.

    Returns
    -------
    HetMat
    """
    assert isinstance(graph, hetnetpy.hetnet.Graph)
    hetmat = HetMat(path, initialize=True)
    # NOTE(review): the save_* flags were previously ignored, so permutations
    # (which pass save_metagraph=False, save_nodes=False) rewrote inherited
    # assets through their symlinks. Honor the flags.
    if save_metagraph:
        hetmat.metagraph = graph.metagraph
    if save_nodes:
        # Save metanodes
        metanodes = list(graph.metagraph.get_nodes())
        for metanode in metanodes:
            rows = list()
            node_to_position = hetnetpy.matrix.get_node_to_position(graph, metanode)
            for node, position in node_to_position.items():
                rows.append((position, node.identifier, node.name))
            node_df = pandas.DataFrame(rows, columns=["position", "identifier", "name"])
            path = hetmat.get_nodes_path(metanode)
            node_df.to_csv(path, index=False, sep="\t")
    if save_edges:
        # Save metaedges
        metaedges = list(graph.metagraph.get_edges(exclude_inverts=True))
        for metaedge in metaedges:
            rows, cols, matrix = hetnetpy.matrix.metaedge_to_adjacency_matrix(
                graph, metaedge, dense_threshold=1
            )
            path = hetmat.get_edges_path(metaedge, file_format=None)
            save_matrix(matrix, path)
    return hetmat
def hetmat_from_permuted_graph(hetmat, permutation_id, permuted_graph):
    """
    Assumes subdirectory structure and that permutations inherit nodes but not
    edges.
    """
    # Set up the permutation directory (metagraph and nodes are inherited via
    # symlinks), then write only the permuted edge matrices into it.
    new_hetmat = initialize_permutation_directory(hetmat, permutation_id)
    return hetmat_from_graph(
        permuted_graph,
        new_hetmat.directory,
        save_metagraph=False,
        save_nodes=False,
    )
def initialize_permutation_directory(hetmat, permutation_id):
    """
    Initializes the directory structure of a HetMat permutation.
    Parameters
    ----------
    hetmat : HetMat
    permutation_id : str
    Returns
    -------
    HetMat
    """
    hetmat.permutations_directory.mkdir(exist_ok=True)
    directory = hetmat.permutations_directory / f"{permutation_id}.hetmat"
    if directory.is_dir():
        # An existing permutation of the same name is moved aside with a
        # .bak extension (replacing any previous backup).
        backup = directory.with_name(directory.name + ".bak")
        if backup.is_dir():
            shutil.rmtree(backup)
        shutil.move(directory, backup)
    permuted = HetMat(directory, initialize=True)
    permuted.is_permutation = True
    # Inherit the metagraph and node tables from the parent HetMat via
    # relative symlinks rather than copying them.
    permuted.metagraph_path.symlink_to("../../metagraph.json")
    permuted.nodes_directory.rmdir()
    permuted.nodes_directory.symlink_to("../../nodes", target_is_directory=True)
    return permuted
def read_matrix(path, file_format="infer"):
    """
    Read a matrix from disk.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the matrix file.
    file_format : str
        One of 'sparse.npz', 'npy', or 'infer' (the default) to deduce the
        format from the path's extension.

    Returns
    -------
    numpy.ndarray or scipy.sparse matrix

    Raises
    ------
    ValueError
        If the format cannot be inferred or is not supported.
    """
    path = str(path)
    if file_format == "infer":
        if path.endswith(".sparse.npz"):
            file_format = "sparse.npz"
        if path.endswith(".npy"):
            file_format = "npy"
        if file_format == "infer":
            # Fixed: the f-prefix was missing, so the message printed the
            # literal text "{path}" instead of the actual path.
            raise ValueError(f"Could not infer file_format for {path}")
    if file_format == "sparse.npz":
        # https://docs.scipy.org/doc/scipy-1.0.0/reference/generated/scipy.sparse.load_npz.html
        return scipy.sparse.load_npz(path)
    if file_format == "npy":
        # https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html
        return numpy.load(path)
    raise ValueError(f"file_format={file_format} is not supported.")
def save_matrix(matrix, path):
    """
    Save a matrix to the file specified by path.
    Path should not include its extension, which is inferred from the matrix
    type: '.npy' for dense numpy arrays, '.sparse.npz' for scipy sparse.

    Parameters
    ----------
    matrix : numpy.ndarray or scipy.sparse matrix
    path : str or pathlib.Path

    Raises
    ------
    ValueError
        If matrix is neither a numpy array nor a scipy sparse matrix.
    """
    path = pathlib.Path(path)
    # Create intermediate directories as needed. The previous single-level
    # mkdir() failed for nested paths such as path-counts/<metric>/<metapath>
    # when the parent directory did not yet exist.
    path.parent.mkdir(parents=True, exist_ok=True)
    path = str(path)
    if isinstance(matrix, numpy.ndarray):
        if not path.endswith(".npy"):
            path += ".npy"
        numpy.save(path, matrix)
    elif scipy.sparse.issparse(matrix):
        if not path.endswith(".sparse.npz"):
            path += ".sparse.npz"
        scipy.sparse.save_npz(path, matrix, compressed=True)
    else:
        # Previously unsupported types were silently discarded.
        raise ValueError(f"Unsupported matrix type: {type(matrix)}")
def read_first_matrix(specs, delete_failures=False):
    """
    Return the first matrix that can be read from the candidate files in
    specs. If none of the specs point to an existing, readable path, raise
    a FileNotFoundError.

    specs is a list of dictionaries, each describing one candidate:
    - path: path to the file
    - transpose: whether to transpose the matrix after reading it
      (defaults to False)
    - file_format: format of the matrix (defaults to 'infer')

    Setting delete_failures=True removes files that exist but fail to read.
    """
    attempted = []
    for spec in specs:
        candidate = pathlib.Path(spec["path"])
        attempted.append(str(candidate))
        if not candidate.is_file():
            continue
        try:
            matrix = read_matrix(
                candidate, file_format=spec.get("file_format", "infer")
            )
        except Exception as error:
            logging.warning(f"Error reading matrix at {candidate}:\n{error}")
            if delete_failures:
                candidate.unlink()
                logging.warning(f"Deleting file at {candidate}")
            continue
        return matrix.transpose() if spec.get("transpose", False) else matrix
    raise FileNotFoundError(
        "No matrix files found at the specified paths:\n" + "\n".join(attempted)
    )
# Map a compression method name to its filename extension.
# None maps to the empty string, i.e. no compression suffix.
compression_extension = {
    "gzip": ".gz",
    "bz2": ".bz2",
    "zip": ".zip",
    "xz": ".xz",
    None: "",
}
class HetMat:
    """
    Interface to a hetnet stored on disk as a directory of node tables
    (nodes/), adjacency matrices (edges/), path-count matrices, and optional
    permuted copies (permutations/).
    """

    # Supported formats for nodes files
    nodes_formats = {
        "tsv",
        # 'feather',
        # 'pickle',
        # 'json',
    }
    # Supported formats for edges files
    edges_formats = {
        "npy",
        "sparse.npz",
        # 'tsv',
    }

    def __init__(self, directory, initialize=False):
        """
        Initialize a HetMat with its MetaGraph.

        Parameters
        ----------
        directory : str or pathlib.Path
            Root directory of the HetMat on disk.
        initialize : bool
            If True, create the on-disk directory structure.
        """
        self.directory = pathlib.Path(directory)
        self.metagraph_path = self.directory.joinpath("metagraph.json")
        self.nodes_directory = self.directory.joinpath("nodes")
        self.edges_directory = self.directory.joinpath("edges")
        self.path_counts_directory = self.directory.joinpath("path-counts")
        self.path_counts_cache = None
        # Permutations should set is_permutation=True
        self.is_permutation = False
        self.permutations_directory = self.directory.joinpath("permutations")
        if initialize:
            self.initialize()

    def initialize(self):
        """
        Initialize the directory structure. This function is intended to be
        called when creating new HetMat instance on disk.
        """
        # Create directories
        directories = [
            self.directory,
            self.nodes_directory,
            self.edges_directory,
        ]
        for directory in directories:
            if not directory.is_dir():
                directory.mkdir()

    @property
    @functools.lru_cache()
    def permutations(self):
        """
        Return a dictionary of permutation name to permutation directory.
        Assumes permutation name is the directory name minus its .hetmat
        extension.
        """
        permutations = {}
        for directory in sorted(self.permutations_directory.glob("*.hetmat")):
            if not directory.is_dir():
                continue
            permutation = HetMat(directory)
            permutation.is_permutation = True
            name, _ = directory.name.rsplit(".", 1)
            permutations[name] = permutation
        return permutations

    def permute_graph(
        self,
        num_new_permutations=None,
        namer=None,
        start_from=None,
        multiplier=10,
        seed=0,
    ):
        """
        Generate and save permutations of the HetMat adjacency matrices.

        Parameters
        ----------
        num_new_permutations : int
            The number of new, permuted HetMats to generate
        namer : generator
            Yields the names of new permutations. Cannot pass names of existing permutations
        start_from : str
            Name of permutation to use as starting point. For multiple permutations,
            the first permutation starts from start_from, and future permutations
            continue from the previous one.
        multiplier : int
            How many attempts to make when cross-swapping edges.
        seed : int
            Random seed for generating new permutations

        Returns
        -------
        pandas.DataFrame
            Concatenated permutation statistics for all metaedges.
        """
        if namer is None:
            # If no namer given, continue increasing names by one for new permutations
            namer = (f"{x:03}" for x in itertools.count(start=1))
        stat_dfs = list()
        for _ in range(num_new_permutations):
            permutation_name = next(namer)
            new_hetmat = initialize_permutation_directory(self, permutation_name)
            if start_from is None:
                start_from = self
            elif isinstance(start_from, str):
                start_from = self.permutations[start_from]
            assert isinstance(start_from, HetMat)
            metaedges = list(self.metagraph.get_edges(exclude_inverts=True))
            for metaedge in metaedges:
                rows, cols, original_matrix = start_from.metaedge_to_adjacency_matrix(
                    metaedge, dense_threshold=1
                )
                is_directed = metaedge.direction != "both"
                permuted_matrix, stats = hetmatpy.matrix.permute_matrix(
                    original_matrix,
                    directed=is_directed,
                    multiplier=multiplier,
                    seed=seed,
                )
                path = new_hetmat.get_edges_path(metaedge, file_format=None)
                save_matrix(permuted_matrix, path)
                stat_df = pandas.DataFrame(stats)
                stat_df["metaedge"] = metaedge
                stat_df["abbrev"] = metaedge.get_abbrev()
                stat_df["permutation"] = permutation_name
                stat_dfs.append(stat_df)
            # Each subsequent permutation continues from the one just created.
            start_from = permutation_name
            seed += 1
            self.permutations[permutation_name] = new_hetmat
        return pandas.concat(stat_dfs)

    @property
    @functools.lru_cache()
    def metagraph(self):
        """
        HetMat.metagraph is a cached property. Hence reading the metagraph from
        disk should only occur once, the first time the metagraph property is
        accessed. See https://stackoverflow.com/a/19979379/4651668. If this
        method has issues, consider using cached_property from
        https://github.com/pydanny/cached-property.
        """
        return hetnetpy.readwrite.read_metagraph(self.metagraph_path)

    @metagraph.setter
    def metagraph(self, metagraph):
        """
        Set the metagraph property by writing the metagraph to disk.
        """
        hetnetpy.readwrite.write_metagraph(metagraph, self.metagraph_path)

    def get_nodes_path(self, metanode, file_format="tsv"):
        """
        Get the path for the nodes file for the specified metanode. Setting
        file_format=None returns the path without any extension suffix.
        """
        metanode = self.metagraph.get_metanode(metanode)
        path = self.nodes_directory.joinpath(f"{metanode}")
        if file_format is not None:
            path = path.with_name(f"{path.name}.{file_format}")
        return path

    def get_edges_path(self, metaedge, file_format="npy"):
        """
        Get the path for the edges file for the specified metaedge. Setting
        file_format=None returns the path without any extension suffix.
        """
        metaedge_abbrev = self.metagraph.get_metaedge(metaedge).get_abbrev()
        path = self.edges_directory.joinpath(f"{metaedge_abbrev}")
        if file_format is not None:
            path = path.with_name(f"{path.name}.{file_format}")
        return path

    def get_path_counts_path(self, metapath, metric, damping, file_format):
        """
        Setting file_format=None returns the path without any extension suffix.
        Supported metrics are 'dwpc' and 'dwwc'.
        """
        damping = float(damping)
        path = self.path_counts_directory.joinpath(f"{metric}-{damping}/{metapath}")
        if file_format is not None:
            path = path.with_name(f"{path.name}.{file_format}")
        return path

    def get_running_degree_group_path(
        self, metapath, metric, damping, extension=".tsv.gz"
    ):
        """
        Get path for degree-grouped permutation running metrics.
        Must specify extension.
        """
        damping = float(damping)
        path = self.directory.joinpath(
            "adjusted-path-counts",
            f"{metric}-{damping}",
            "degree-grouped-permutations",
            f"{metapath}{extension}",
        )
        return path

    def get_metapath_summary_path(self, metapath, metric, damping, compression=None):
        """
        Get the path for a metapath's adjusted path-count summary table.
        compression must be a key of compression_extension (default None
        yields an uncompressed .tsv path).
        """
        damping = float(damping)
        compr = compression_extension[compression]
        path = self.directory.joinpath(
            "adjusted-path-counts",
            f"{metric}-{damping}",
            "adjusted-dwpcs",
            f"{metapath}.tsv{compr}",
        )
        return path

    @functools.lru_cache()
    def get_node_identifiers(self, metanode):
        """
        Returns a list of node identifiers for a metanode
        """
        path = self.get_nodes_path(metanode, file_format="tsv")
        node_df = pandas.read_csv(path, sep="\t")
        return list(node_df["identifier"])

    @functools.lru_cache()
    def count_nodes(self, metanode):
        """
        Return the number of nodes for the specified metanode.
        """
        nodes = self.get_node_identifiers(metanode)
        return len(nodes)

    def metaedge_to_adjacency_matrix(
        self,
        metaedge,
        dtype=None,
        dense_threshold=None,
        # Tuple default avoids the mutable-default-argument pitfall.
        file_formats=("sparse.npz", "npy"),
    ):
        """
        file_formats sets the precedence of which file to read in
        """
        metaedge = self.metagraph.get_metaedge(metaedge)
        specs = list()
        configurations = itertools.product(file_formats, (True, False))
        for file_format, invert in configurations:
            path = self.get_edges_path(
                metaedge=metaedge.inverse if invert else metaedge,
                file_format=file_format,
            )
            spec = {"path": path, "transpose": invert, "file_format": file_format}
            specs.append(spec)
        matrix = read_first_matrix(specs)
        if dense_threshold is not None:
            matrix = hetnetpy.matrix.sparsify_or_densify(
                matrix, dense_threshold=dense_threshold
            )
        if dtype is not None:
            matrix = matrix.astype(dtype)
        row_ids = self.get_node_identifiers(metaedge.source)
        col_ids = self.get_node_identifiers(metaedge.target)
        return row_ids, col_ids, matrix

    def read_path_counts(
        self, metapath, metric, damping, file_formats=("sparse.npz", "npy")
    ):
        """
        Read matrix with values of a path-count-based metric. Attempts to
        locate any files with the matrix (or with trivial transformations).
        """
        category = hetmatpy.degree_weight.categorize(metapath)
        metrics = [metric]
        # For metapaths without repeated metanodes, DWPC and DWWC are
        # equivalent, so either file satisfies the request.
        if metric == "dwpc" and category == "no_repeats":
            metrics.append("dwwc")
        if metric == "dwwc" and category == "no_repeats":
            metrics.append("dwpc")
        specs = list()
        configurations = itertools.product(
            file_formats,
            metrics,
            (True, False),
        )
        # Loop variable renamed so it no longer shadows the metric parameter.
        for file_format, metric_choice, invert in configurations:
            path = self.get_path_counts_path(
                metapath=metapath.inverse if invert else metapath,
                metric=metric_choice,
                damping=damping,
                file_format=file_format,
            )
            spec = {"path": path, "transpose": invert, "file_format": file_format}
            specs.append(spec)
        row_ids = self.get_node_identifiers(metapath.source())
        col_ids = self.get_node_identifiers(metapath.target())
        matrix = read_first_matrix(specs)
        return row_ids, col_ids, matrix

    def clear_caches(self):
        """
        Clear cached assets of this HetMat and force garbage collection.
        """
        # See workaround for methods with @property and @lru_cache decoration
        # https://stackoverflow.com/a/45283290/4651668
        for lru_cached_function in [
            type(self).permutations.fget,
            type(self).metagraph.fget,
            self.get_node_identifiers,
            self.count_nodes,
        ]:
            lru_cached_function.cache_clear()
        self.path_counts_cache = None
        gc.collect()
Sub-modules
Variables
compression_extension
Functions
hetmat_from_graph
def hetmat_from_graph(
graph,
path,
save_metagraph=True,
save_nodes=True,
save_edges=True
)
Create a hetmat.HetMat from a hetnetpy.hetnet.Graph.
View Source
def hetmat_from_graph(
graph, path, save_metagraph=True, save_nodes=True, save_edges=True
):
"""
Create a hetmat.HetMat from a hetnetpy.hetnet.Graph.
"""
assert isinstance(graph, hetnetpy.hetnet.Graph)
hetmat = HetMat(path, initialize=True)
hetmat.metagraph = graph.metagraph
# Save metanodes
metanodes = list(graph.metagraph.get_nodes())
for metanode in metanodes:
path = hetmat.get_nodes_path(metanode)
rows = list()
node_to_position = hetnetpy.matrix.get_node_to_position(graph, metanode)
for node, position in node_to_position.items():
rows.append((position, node.identifier, node.name))
node_df = pandas.DataFrame(rows, columns=["position", "identifier", "name"])
path = hetmat.get_nodes_path(metanode)
node_df.to_csv(path, index=False, sep="\t")
# Save metaedges
metaedges = list(graph.metagraph.get_edges(exclude_inverts=True))
for metaedge in metaedges:
rows, cols, matrix = hetnetpy.matrix.metaedge_to_adjacency_matrix(
graph, metaedge, dense_threshold=1
)
path = hetmat.get_edges_path(metaedge, file_format=None)
save_matrix(matrix, path)
return hetmat
hetmat_from_permuted_graph
def hetmat_from_permuted_graph(
hetmat,
permutation_id,
permuted_graph
)
Assumes subdirectory structure and that permutations inherit nodes but not
edges.
View Source
def hetmat_from_permuted_graph(hetmat, permutation_id, permuted_graph):
"""
Assumes subdirectory structure and that permutations inherit nodes but not
edges.
"""
permuted_hetmat = initialize_permutation_directory(hetmat, permutation_id)
permuted_hetmat = hetmat_from_graph(
permuted_graph,
permuted_hetmat.directory,
save_metagraph=False,
save_nodes=False,
)
return permuted_hetmat
initialize_permutation_directory
def initialize_permutation_directory(
hetmat,
permutation_id
)
Initializes the directory structure of a HetMat permutation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
hetmat | HetMat | None | None |
permutation_id | str | None | None |
Returns:
Type | Description |
---|---|
HetMat | None |
View Source
def initialize_permutation_directory(hetmat, permutation_id):
"""
Initializes the directory structure of a HetMat permutation.
Parameters
----------
hetmat : HetMat
permutation_id : str
Returns
-------
HetMat
"""
if not hetmat.permutations_directory.is_dir():
hetmat.permutations_directory.mkdir()
directory = hetmat.permutations_directory.joinpath(f"{permutation_id}.hetmat")
if directory.is_dir():
# If directory exists, back it up using a .bak extension
backup_directory = directory.with_name(directory.name + ".bak")
if backup_directory.is_dir():
shutil.rmtree(backup_directory)
shutil.move(directory, backup_directory)
permuted_hetmat = HetMat(directory, initialize=True)
permuted_hetmat.is_permutation = True
permuted_hetmat.metagraph_path.symlink_to("../../metagraph.json")
permuted_hetmat.nodes_directory.rmdir()
permuted_hetmat.nodes_directory.symlink_to("../../nodes", target_is_directory=True)
return permuted_hetmat
read_first_matrix
def read_first_matrix(
specs,
delete_failures=False
)
Attempt to read each path provided by specs, until one exists. If none of
the specs point to an existing path, raise a FileNotFoundError. specs should be a list where each element is a dictionary specifying a potential path from which to read a matrix. Currently, the spec dictionary supports the following keys: - path: path to the file - transpose: whether to transpose the file after reading it. If omitted, then False. - file_format: format of the matrix. If omitted, then infer.
View Source
def read_first_matrix(specs, delete_failures=False):
"""
Attempt to read each path provided by specs, until one exists. If none of
the specs point to an existing path, raise a FileNotFoundError.
specs should be a list where each element is a dictionary specifying a
potential path from which to read a matrix. Currently, the spec dictionary
supports the following keys:
- path: path to the file
- transpose: whether to transpose the file after reading it. If omitted,
then False.
- file_format: format of the matrix. If omitted, then infer.
"""
paths = list()
for spec in specs:
path = pathlib.Path(spec["path"])
paths.append(str(path))
if not path.is_file():
continue
transpose = spec.get("transpose", False)
file_format = spec.get("file_format", "infer")
try:
matrix = read_matrix(path, file_format=file_format)
except Exception as error:
logging.warning(f"Error reading matrix at {path}:\n{error}")
if delete_failures:
path.unlink()
logging.warning(f"Deleting file at {path}")
continue
if transpose:
matrix = matrix.transpose()
return matrix
raise FileNotFoundError(
"No matrix files found at the specified paths:\n" + "\n".join(paths)
)
read_matrix
def read_matrix(
path,
file_format='infer'
)
View Source
def read_matrix(path, file_format="infer"):
path = str(path)
if file_format == "infer":
if path.endswith(".sparse.npz"):
file_format = "sparse.npz"
if path.endswith(".npy"):
file_format = "npy"
if file_format == "infer":
raise ValueError(f"Could not infer file_format for {path}")
if file_format == "sparse.npz":
# https://docs.scipy.org/doc/scipy-1.0.0/reference/generated/scipy.sparse.load_npz.html
return scipy.sparse.load_npz(path)
if file_format == "npy":
# https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html
return numpy.load(path)
raise ValueError(f"file_format={file_format} is not supported.")
save_matrix
def save_matrix(
matrix,
path
)
Save a matrix to the file specified by path.
Path should not include its extension, which is inferred.
View Source
def save_matrix(matrix, path):
"""
Save a matrix to the file specified by path.
Path should not include its extension, which is inferred.
"""
path = pathlib.Path(path)
if not path.parent.exists():
path.parent.mkdir()
path = str(path)
if isinstance(matrix, numpy.ndarray):
if not path.endswith(".npy"):
path += ".npy"
numpy.save(path, matrix)
elif scipy.sparse.issparse(matrix):
if not path.endswith(".sparse.npz"):
path += ".sparse.npz"
scipy.sparse.save_npz(path, matrix, compressed=True)
Classes
HetMat
class HetMat(
directory,
initialize=False
)
View Source
class HetMat:
# Supported formats for nodes files
nodes_formats = {
"tsv",
# 'feather',
# 'pickle',
# 'json',
}
# Supported formats for edges files
edges_formats = {
"npy",
"sparse.npz",
# 'tsv',
}
def __init__(self, directory, initialize=False):
"""
Initialize a HetMat with its MetaGraph.
"""
self.directory = pathlib.Path(directory)
self.metagraph_path = self.directory.joinpath("metagraph.json")
self.nodes_directory = self.directory.joinpath("nodes")
self.edges_directory = self.directory.joinpath("edges")
self.path_counts_directory = self.directory.joinpath("path-counts")
self.path_counts_cache = None
# Permutations should set is_permutation=True
self.is_permutation = False
self.permutations_directory = self.directory.joinpath("permutations")
if initialize:
self.initialize()
def initialize(self):
"""
Initialize the directory structure. This function is intended to be
called when creating new HetMat instance on disk.
"""
# Create directories
directories = [
self.directory,
self.nodes_directory,
self.edges_directory,
]
for directory in directories:
if not directory.is_dir():
directory.mkdir()
@property
@functools.lru_cache()
def permutations(self):
"""
Return a dictionary of permutation name to permutation directory.
Assumes permutation name is the directory name minus its .hetmat
extension.
"""
permutations = {}
for directory in sorted(self.permutations_directory.glob("*.hetmat")):
if not directory.is_dir():
continue
permutation = HetMat(directory)
permutation.is_permutation = True
name, _ = directory.name.rsplit(".", 1)
permutations[name] = permutation
return permutations
def permute_graph(
self,
num_new_permutations=None,
namer=None,
start_from=None,
multiplier=10,
seed=0,
):
"""
Generate and save permutations of the HetMat adjacency matrices.
Parameters
----------
num_new_permutations : int
The number of new, permuted HetMats to generate
namer : generator
Yields the names of new permutations. Cannot pass names of existing permutations
start_from : str
Name of permutation to use as starting point. For multiple permutations,
the first permutation starts from start_from, and future permutations
continue from the previous one.
multiplier : int
How many attempts to make when cross-swapping edges.
seed : int
Random seed for generating new permutations
"""
if namer is None:
# If no namer given, continue increasing names by one for new permutations
namer = (f"{x:03}" for x in itertools.count(start=1))
stat_dfs = list()
for _ in range(num_new_permutations):
permutation_name = next(namer)
new_hetmat = initialize_permutation_directory(self, permutation_name)
if start_from is None:
start_from = self
elif isinstance(start_from, str):
start_from = self.permutations[start_from]
assert isinstance(start_from, HetMat)
metaedges = list(self.metagraph.get_edges(exclude_inverts=True))
for metaedge in metaedges:
rows, cols, original_matrix = start_from.metaedge_to_adjacency_matrix(
metaedge, dense_threshold=1
)
is_directed = metaedge.direction != "both"
permuted_matrix, stats = hetmatpy.matrix.permute_matrix(
original_matrix,
directed=is_directed,
multiplier=multiplier,
seed=seed,
)
path = new_hetmat.get_edges_path(metaedge, file_format=None)
save_matrix(permuted_matrix, path)
stat_df = pandas.DataFrame(stats)
stat_df["metaedge"] = metaedge
stat_df["abbrev"] = metaedge.get_abbrev()
stat_df["permutation"] = permutation_name
stat_dfs.append(stat_df)
start_from = permutation_name
seed += 1
self.permutations[permutation_name] = new_hetmat
return pandas.concat(stat_dfs)
@property
@functools.lru_cache()
def metagraph(self):
"""
HetMat.metagraph is a cached property. Hence reading the metagraph from
disk should only occur once, the first time the metagraph property is
accessed. See https://stackoverflow.com/a/19979379/4651668. If this
method has issues, consider using cached_property from
https://github.com/pydanny/cached-property.
"""
return hetnetpy.readwrite.read_metagraph(self.metagraph_path)
@metagraph.setter
def metagraph(self, metagraph):
"""
Set the metagraph property by writing the metagraph to disk.
"""
hetnetpy.readwrite.write_metagraph(metagraph, self.metagraph_path)
def get_nodes_path(self, metanode, file_format="tsv"):
"""
Get the path for the nodes file for the specified metanode. Setting
file_format=None returns the path without any extension suffix.
"""
metanode = self.metagraph.get_metanode(metanode)
path = self.nodes_directory.joinpath(f"{metanode}")
if file_format is not None:
path = path.with_name(f"{path.name}.{file_format}")
return path
def get_edges_path(self, metaedge, file_format="npy"):
"""
Get the path for the edges file for the specified metaedge. Setting
file_format=None returns the path without any extension suffix.
"""
metaedge_abbrev = self.metagraph.get_metaedge(metaedge).get_abbrev()
path = self.edges_directory.joinpath(f"{metaedge_abbrev}")
if file_format is not None:
path = path.with_name(f"{path.name}.{file_format}")
return path
def get_path_counts_path(self, metapath, metric, damping, file_format):
"""
Setting file_format=None returns the path without any extension suffix.
Supported metrics are 'dwpc' and 'dwwc'.
"""
damping = float(damping)
path = self.path_counts_directory.joinpath(f"{metric}-{damping}/{metapath}")
if file_format is not None:
path = path.with_name(f"{path.name}.{file_format}")
return path
def get_running_degree_group_path(
self, metapath, metric, damping, extension=".tsv.gz"
):
"""
Get path for degree-grouped permutation running metrics.
Must specify extension.
"""
damping = float(damping)
path = self.directory.joinpath(
"adjusted-path-counts",
f"{metric}-{damping}",
"degree-grouped-permutations",
f"{metapath}{extension}",
)
return path
def get_metapath_summary_path(self, metapath, metric, damping, compression=None):
damping = float(damping)
compr = compression_extension[compression]
path = self.directory.joinpath(
"adjusted-path-counts",
f"{metric}-{damping}",
"adjusted-dwpcs",
f"{metapath}.tsv{compr}",
)
return path
@functools.lru_cache()
def get_node_identifiers(self, metanode):
"""
Returns a list of node identifiers for a metanode
"""
path = self.get_nodes_path(metanode, file_format="tsv")
node_df = pandas.read_csv(path, sep="\t")
return list(node_df["identifier"])
@functools.lru_cache()
def count_nodes(self, metanode):
nodes = self.get_node_identifiers(metanode)
return len(nodes)
def metaedge_to_adjacency_matrix(
self,
metaedge,
dtype=None,
dense_threshold=None,
file_formats=["sparse.npz", "npy"],
):
"""
file_formats sets the precedence of which file to read in
"""
metaedge = self.metagraph.get_metaedge(metaedge)
specs = list()
configurations = itertools.product(file_formats, (True, False))
for file_format, invert in configurations:
path = self.get_edges_path(
metaedge=metaedge.inverse if invert else metaedge,
file_format=file_format,
)
spec = {"path": path, "transpose": invert, "file_format": file_format}
specs.append(spec)
matrix = read_first_matrix(specs)
if dense_threshold is not None:
matrix = hetnetpy.matrix.sparsify_or_densify(
matrix, dense_threshold=dense_threshold
)
if dtype is not None:
matrix = matrix.astype(dtype)
row_ids = self.get_node_identifiers(metaedge.source)
col_ids = self.get_node_identifiers(metaedge.target)
return row_ids, col_ids, matrix
def read_path_counts(
self, metapath, metric, damping, file_formats=["sparse.npz", "npy"]
):
"""
Read matrix with values of a path-count-based metric. Attempts to
locate any files with the matrix (or with trivial transformations).
"""
category = hetmatpy.degree_weight.categorize(metapath)
metrics = [metric]
if metric == "dwpc" and category == "no_repeats":
metrics.append("dwwc")
if metric == "dwwc" and category == "no_repeats":
metrics.append("dwpc")
specs = list()
configurations = itertools.product(
file_formats,
metrics,
(True, False),
)
for file_format, metric, invert in configurations:
path = self.get_path_counts_path(
metapath=metapath.inverse if invert else metapath,
metric=metric,
damping=damping,
file_format=file_format,
)
spec = {"path": path, "transpose": invert, "file_format": file_format}
specs.append(spec)
row_ids = self.get_node_identifiers(metapath.source())
col_ids = self.get_node_identifiers(metapath.target())
matrix = read_first_matrix(specs)
return row_ids, col_ids, matrix
def clear_caches(self):
"""
Clear cached assets of this HetMat and force garbage collection.
"""
# See workaround for methods with @property and @lru_cache decoration
# https://stackoverflow.com/a/45283290/4651668
for lru_cached_function in [
type(self).permutations.fget,
type(self).metagraph.fget,
self.get_node_identifiers,
self.count_nodes,
]:
lru_cached_function.cache_clear()
self.path_counts_cache = None
gc.collect()
Class variables
edges_formats
nodes_formats
Instance variables
metagraph
HetMat.metagraph is a cached property. Hence reading the metagraph from
disk should only occur once, the first time the metagraph property is accessed. See https://stackoverflow.com/a/19979379/4651668. If this method has issues, consider using cached_property from https://github.com/pydanny/cached-property.
permutations
Return a dictionary of permutation name to permutation directory.
Assumes permutation name is the directory name minus its .hetmat extension.
Methods
clear_caches
def clear_caches(
self
)
Clear cached assets of this HetMat and force garbage collection.
View Source
def clear_caches(self):
"""
Clear cached assets of this HetMat and force garbage collection.
"""
# See workaround for methods with @property and @lru_cache decoration
# https://stackoverflow.com/a/45283290/4651668
for lru_cached_function in [
type(self).permutations.fget,
type(self).metagraph.fget,
self.get_node_identifiers,
self.count_nodes,
]:
lru_cached_function.cache_clear()
self.path_counts_cache = None
gc.collect()
count_nodes
def count_nodes(
self,
metanode
)
View Source
@functools.lru_cache()
def count_nodes(self, metanode):
nodes = self.get_node_identifiers(metanode)
return len(nodes)
get_edges_path
def get_edges_path(
self,
metaedge,
file_format='npy'
)
Get the path for the edges file for the specified metaedge. Setting
file_format=None returns the path without any extension suffix.
View Source
def get_edges_path(self, metaedge, file_format="npy"):
"""
Get the path for the edges file for the specified metaedge. Setting
file_format=None returns the path without any extension suffix.
"""
metaedge_abbrev = self.metagraph.get_metaedge(metaedge).get_abbrev()
path = self.edges_directory.joinpath(f"{metaedge_abbrev}")
if file_format is not None:
path = path.with_name(f"{path.name}.{file_format}")
return path
get_metapath_summary_path
def get_metapath_summary_path(
self,
metapath,
metric,
damping,
compression=None
)
View Source
def get_metapath_summary_path(self, metapath, metric, damping, compression=None):
damping = float(damping)
compr = compression_extension[compression]
path = self.directory.joinpath(
"adjusted-path-counts",
f"{metric}-{damping}",
"adjusted-dwpcs",
f"{metapath}.tsv{compr}",
)
return path
get_node_identifiers
def get_node_identifiers(
self,
metanode
)
Returns a list of node identifiers for a metanode
View Source
@functools.lru_cache()
def get_node_identifiers(self, metanode):
"""
Returns a list of node identifiers for a metanode
"""
path = self.get_nodes_path(metanode, file_format="tsv")
node_df = pandas.read_csv(path, sep="\t")
return list(node_df["identifier"])
get_nodes_path
def get_nodes_path(
self,
metanode,
file_format='tsv'
)
Get the path for the nodes file for the specified metanode. Setting
file_format=None returns the path without any extension suffix.
View Source
def get_nodes_path(self, metanode, file_format="tsv"):
"""
Get the path for the nodes file for the specified metanode. Setting
file_format=None returns the path without any extension suffix.
"""
metanode = self.metagraph.get_metanode(metanode)
path = self.nodes_directory.joinpath(f"{metanode}")
if file_format is not None:
path = path.with_name(f"{path.name}.{file_format}")
return path
get_path_counts_path
def get_path_counts_path(
self,
metapath,
metric,
damping,
file_format
)
Setting file_format=None returns the path without any extension suffix.
Supported metrics are 'dwpc' and 'dwwc'.
View Source
def get_path_counts_path(self, metapath, metric, damping, file_format):
    """
    Return the path-counts file path for a metapath / metric / damping
    combination. Passing file_format=None yields the path without any
    extension suffix. Supported metrics are 'dwpc' and 'dwwc'.
    """
    # float() normalizes damping so e.g. 0 and 0.0 share a directory.
    subpath = f"{metric}-{float(damping)}/{metapath}"
    path = self.path_counts_directory.joinpath(subpath)
    if file_format is None:
        return path
    return path.with_name(f"{path.name}.{file_format}")
get_running_degree_group_path
def get_running_degree_group_path(
self,
metapath,
metric,
damping,
extension='.tsv.gz'
)
Get path for degree-grouped permutation running metrics.
Must specify extension.
View Source
def get_running_degree_group_path(
    self, metapath, metric, damping, extension=".tsv.gz"
):
    """
    Return the path for degree-grouped permutation running metrics.
    An extension must be specified (defaults to '.tsv.gz').
    """
    # float() normalizes damping so e.g. 0 and 0.0 share a directory.
    filename = f"{metapath}{extension}"
    return self.directory.joinpath(
        "adjusted-path-counts",
        f"{metric}-{float(damping)}",
        "degree-grouped-permutations",
        filename,
    )
initialize
def initialize(
self
)
Initialize the directory structure. This function is intended to be
called when creating new HetMat instance on disk.
View Source
def initialize(self):
    """
    Initialize the directory structure. This function is intended to be
    called when creating a new HetMat instance on disk.
    """
    # mkdir(exist_ok=True) is idempotent and avoids the check-then-create
    # race of the previous is_dir()/mkdir() sequence (the directory could
    # appear between the check and the creation).
    directories = [
        self.directory,
        self.nodes_directory,
        self.edges_directory,
    ]
    for directory in directories:
        directory.mkdir(exist_ok=True)
metaedge_to_adjacency_matrix
def metaedge_to_adjacency_matrix(
self,
metaedge,
dtype=None,
dense_threshold=None,
file_formats=['sparse.npz', 'npy']
)
file_formats sets the precedence of which file to read in
View Source
def metaedge_to_adjacency_matrix(
    self,
    metaedge,
    dtype=None,
    dense_threshold=None,
    file_formats=["sparse.npz", "npy"],
):
    """
    file_formats sets the precedence of which file to read in
    """
    metaedge = self.metagraph.get_metaedge(metaedge)
    # For each format, try the metaedge's own file first, then the
    # inverse metaedge's file transposed.
    specs = [
        {
            "path": self.get_edges_path(
                metaedge=metaedge.inverse if invert else metaedge,
                file_format=file_format,
            ),
            "transpose": invert,
            "file_format": file_format,
        }
        for file_format, invert in itertools.product(file_formats, (True, False))
    ]
    matrix = read_first_matrix(specs)
    if dense_threshold is not None:
        matrix = hetnetpy.matrix.sparsify_or_densify(
            matrix, dense_threshold=dense_threshold
        )
    if dtype is not None:
        matrix = matrix.astype(dtype)
    row_ids = self.get_node_identifiers(metaedge.source)
    col_ids = self.get_node_identifiers(metaedge.target)
    return row_ids, col_ids, matrix
permute_graph
def permute_graph(
self,
num_new_permutations=None,
namer=None,
start_from=None,
multiplier=10,
seed=0
)
Generate and save permutations of the HetMat adjacency matrices.
Parameters:
Name | Type | Description | Default
---|---|---|---
num_new_permutations | int | The number of new, permuted HetMats to generate | None
namer | generator | Yields the names of new permutations. Cannot pass names of existing permutations | None
start_from | str | Name of permutation to use as starting point. For multiple permutations, the first permutation starts from start_from, and future permutations continue from the previous one. | None
multiplier | int | How many attempts to make when cross-swapping edges. | 10
seed | int | Random seed for generating new permutations | 0
View Source
def permute_graph(
    self,
    num_new_permutations=None,
    namer=None,
    start_from=None,
    multiplier=10,
    seed=0,
):
    """
    Generate and save permutations of the HetMat adjacency matrices.
    Parameters
    ----------
    num_new_permutations : int
        The number of new, permuted HetMats to generate
    namer : generator
        Yields the names of new permutations. Cannot pass names of existing permutations
    start_from : str
        Name of permutation to use as starting point. For multiple permutations,
        the first permutation starts from start_from, and future permutations
        continue from the previous one.
    multiplier : int
        How many attempts to make when cross-swapping edges.
    seed : int
        Random seed for generating new permutations
    """
    if namer is None:
        # If no namer given, continue increasing names by one for new permutations
        namer = (f"{x:03}" for x in itertools.count(start=1))
    stat_dfs = list()
    # NOTE(review): range(None) raises TypeError when num_new_permutations is
    # left at its default — callers appear expected to pass an int; likewise
    # pandas.concat([]) below raises if zero permutations were requested.
    for _ in range(num_new_permutations):
        permutation_name = next(namer)
        new_hetmat = initialize_permutation_directory(self, permutation_name)
        # Resolve start_from to a HetMat: the current HetMat when None, or a
        # previously saved permutation when given (or left from the prior
        # iteration) as a name.
        if start_from is None:
            start_from = self
        elif isinstance(start_from, str):
            start_from = self.permutations[start_from]
        assert isinstance(start_from, HetMat)
        metaedges = list(self.metagraph.get_edges(exclude_inverts=True))
        for metaedge in metaedges:
            rows, cols, original_matrix = start_from.metaedge_to_adjacency_matrix(
                metaedge, dense_threshold=1
            )
            # Undirected metaedges must preserve symmetry when permuted.
            is_directed = metaedge.direction != "both"
            permuted_matrix, stats = hetmatpy.matrix.permute_matrix(
                original_matrix,
                directed=is_directed,
                multiplier=multiplier,
                seed=seed,
            )
            # file_format=None: save_matrix chooses the extension itself.
            path = new_hetmat.get_edges_path(metaedge, file_format=None)
            save_matrix(permuted_matrix, path)
            # Accumulate per-metaedge permutation statistics for the return value.
            stat_df = pandas.DataFrame(stats)
            stat_df["metaedge"] = metaedge
            stat_df["abbrev"] = metaedge.get_abbrev()
            stat_df["permutation"] = permutation_name
            stat_dfs.append(stat_df)
        # Chain permutations: the next one starts from the one just created,
        # with a fresh seed so successive permutations differ.
        start_from = permutation_name
        seed += 1
        self.permutations[permutation_name] = new_hetmat
    return pandas.concat(stat_dfs)
read_path_counts
def read_path_counts(
self,
metapath,
metric,
damping,
file_formats=['sparse.npz', 'npy']
)
Read matrix with values of a path-count-based metric. Attempts to
locate any files with the matrix (or with trivial transformations).
View Source
def read_path_counts(
    self, metapath, metric, damping, file_formats=["sparse.npz", "npy"]
):
    """
    Read matrix with values of a path-count-based metric. Attempts to
    locate any files with the matrix (or with trivial transformations).

    Returns (row_ids, col_ids, matrix) for the requested metapath.
    """
    category = hetmatpy.degree_weight.categorize(metapath)
    # For metapaths without repeated metanodes, DWPC and DWWC coincide,
    # so a file computed under either metric is acceptable.
    metrics = [metric]
    if metric == "dwpc" and category == "no_repeats":
        metrics.append("dwwc")
    if metric == "dwwc" and category == "no_repeats":
        metrics.append("dwpc")
    specs = list()
    configurations = itertools.product(
        file_formats,
        metrics,
        (True, False),
    )
    # Use a distinct loop name (was `metric`) to avoid shadowing the
    # `metric` parameter, which previously leaked the last product value
    # past the loop.
    for file_format, metric_option, invert in configurations:
        path = self.get_path_counts_path(
            metapath=metapath.inverse if invert else metapath,
            metric=metric_option,
            damping=damping,
            file_format=file_format,
        )
        spec = {"path": path, "transpose": invert, "file_format": file_format}
        specs.append(spec)
    row_ids = self.get_node_identifiers(metapath.source())
    col_ids = self.get_node_identifiers(metapath.target())
    matrix = read_first_matrix(specs)
    return row_ids, col_ids, matrix