Skip to content

Module hetmatpy.degree_group

None

None

View Source
import collections

import itertools

import numpy

import pandas

import scipy.sparse

import hetmatpy.degree_weight

from hetmatpy.matrix import metaedge_to_adjacency_matrix

def degrees_to_degree_to_ind(degrees):
    """
    Group the positions of ``degrees`` by degree value.

    Returns a dict mapping each distinct degree to the list of positions at
    which it occurs. Keys are inserted in ascending degree order, and each
    list of positions is ascending because the sort is stable.
    """
    indexed = list(enumerate(degrees))
    indexed.sort(key=lambda pair: pair[1])
    grouped = {}
    for position, value in indexed:
        if value not in grouped:
            grouped[value] = []
        grouped[value].append(position)
    return grouped

def metapath_to_degree_dicts(graph, metapath):
    """
    Build degree -> node-index lookups for both ends of a metapath.

    The metapath is resolved through ``graph.metagraph``. Source degrees are
    the row sums of the adjacency matrix of the first metaedge; target degrees
    are the column sums of the adjacency matrix of the last metaedge.

    Returns a ``(source_degree_to_ind, target_degree_to_ind)`` tuple of dicts
    as produced by ``degrees_to_degree_to_ind``.
    """
    metapath = graph.metagraph.get_metapath(metapath)
    _, _, first_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7
    )
    _, _, last_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7
    )
    # Row sums give the degree of each source node, column sums the degree
    # of each target node.
    per_source = first_adjacency.sum(axis=1).flat
    per_target = last_adjacency.sum(axis=0).flat
    return (
        degrees_to_degree_to_ind(per_source),
        degrees_to_degree_to_ind(per_target),
    )

def generate_degree_group_stats(
    source_degree_to_ind, target_degree_to_ind, matrix, scale=False, scaler=1
):
    """
    Yield dictionaries with degree grouped stats

    One dictionary is yielded for every (source degree, target degree)
    combination, with source degrees iterated in the outer loop.

    Parameters
    ----------
    source_degree_to_ind : dict
        Maps a source degree to the row indices of ``matrix`` with that degree.
    target_degree_to_ind : dict
        Maps a target degree to the column indices of ``matrix`` with that
        degree.
    matrix : numpy.ndarray or scipy sparse matrix
        Values to summarize. Sparse input is converted to CSR for row
        selection and each selected row block is densified.
    scale : bool
        When true, values are transformed with ``arcsinh(value / scaler)``
        before being summed.
    scaler : number
        Divisor used by the ``scale`` transform.

    Yields
    ------
    dict
        Keys ``source_degree``, ``target_degree``, ``n`` (number of cells in
        the group), ``sum``, ``sum_of_squares`` and ``nnz`` (number of nonzero
        cells, counted before any scaling). Groups in which either degree is 0
        are reported with zero ``sum``, ``nnz`` and ``sum_of_squares`` without
        reading the matrix.
    """
    if scipy.sparse.issparse(matrix):
        # CSR makes row selection cheap; this is a no-op for CSR input.
        matrix = matrix.tocsr()
    for source_degree, row_inds in source_degree_to_ind.items():
        if source_degree > 0:
            # Densify the row block once per source degree so that every
            # column selection below works on a plain array.
            row_matrix = matrix[row_inds, :]
            if scipy.sparse.issparse(row_matrix):
                row_matrix = row_matrix.toarray()
        for target_degree, col_inds in target_degree_to_ind.items():
            row = {
                "source_degree": source_degree,
                "target_degree": target_degree,
                "n": len(row_inds) * len(col_inds),
            }
            if source_degree == 0 or target_degree == 0:
                # Key order is kept as-is so downstream column order is stable.
                row["sum"] = 0
                row["nnz"] = 0
                row["sum_of_squares"] = 0
                yield row
                continue
            slice_matrix = row_matrix[:, col_inds]
            values = numpy.arcsinh(slice_matrix / scaler) if scale else slice_matrix
            row["sum"] = values.sum()
            row["sum_of_squares"] = (values**2).sum()
            row["nnz"] = numpy.count_nonzero(slice_matrix)
            yield row

def dwpc_to_degrees(
    graph, metapath, damping=0.5, ignore_zeros=False, ignore_redundant=True
):
    """
    Yield one ordered record per cell of a DWPC matrix, annotated with the
    source and target node degrees and the corresponding path count.

    Each record has the keys ``source_id``, ``target_id``, ``source_name``,
    ``target_name``, ``source_degree``, ``target_degree``, ``path_count`` and
    ``dwpc``. The ``dwpc`` value is ``arcsinh(dwpc / mean(dwpc))`` and the
    ``path_count`` value is read from the graph with a damping of 0.0.

    Parameters
    ----------
    damping: float
        Damping exponent passed to ``graph.read_path_counts`` for the DWPC.
    ignore_zeros: bool
        When true, cells whose transformed DWPC equals zero are skipped.
    ignore_redundant: bool
        When metapath is symmetric, only return a single orientation of a
        node pair. For example, yield source-target but not also
        target-source, which should have the same DWPC.
    """
    metapath = graph.metagraph.get_metapath(metapath)

    # Node degrees come from the first and last metaedge of the metapath.
    _, _, first_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7
    )
    _, _, last_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7
    )
    source_degrees = first_adjacency.sum(axis=1).flat
    target_degrees = last_adjacency.sum(axis=0).flat
    del first_adjacency, last_adjacency

    # Node names are read from the per-metanode TSV files.
    source_table = pandas.read_csv(
        graph.get_nodes_path(metapath.source(), file_format="tsv"), sep="\t"
    )
    source_node_names = list(source_table["name"])
    target_table = pandas.read_csv(
        graph.get_nodes_path(metapath.target(), file_format="tsv"), sep="\t"
    )
    target_node_names = list(target_table["name"])

    row_names, col_names, dwpc_matrix = graph.read_path_counts(
        metapath, "dwpc", damping
    )
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    if scipy.sparse.issparse(dwpc_matrix):
        dwpc_matrix = dwpc_matrix.toarray()

    # A damping of 0.0 yields the path count.
    _, _, path_count = graph.read_path_counts(metapath, "dwpc", 0.0)
    if scipy.sparse.issparse(path_count):
        path_count = path_count.toarray()

    n_rows = len(row_names)
    if not (ignore_redundant and metapath.is_symmetric()):
        cell_indices = itertools.product(range(n_rows), range(len(col_names)))
    else:
        # One orientation per node pair, including the diagonal.
        cell_indices = itertools.combinations_with_replacement(range(n_rows), 2)

    for i, j in cell_indices:
        cell_dwpc = dwpc_matrix[i, j]
        if ignore_zeros and cell_dwpc == 0:
            continue
        yield collections.OrderedDict(
            [
                ("source_id", row_names[i]),
                ("target_id", col_names[j]),
                ("source_name", source_node_names[i]),
                ("target_name", target_node_names[j]),
                ("source_degree", source_degrees[i]),
                ("target_degree", target_degrees[j]),
                ("path_count", path_count[i, j]),
                ("dwpc", cell_dwpc),
            ]
        )

def single_permutation_degree_group(permuted_hetmat, metapath, dwpc_mean, damping):
    """
    Compute the degree-grouped statistics of one metapath for a single
    permuted hetmat.

    The DWPC matrix of the permuted hetmat is summarized per
    (source degree, target degree) group, with values scaled by ``dwpc_mean``.
    Returns a DataFrame indexed by ``source_degree`` and ``target_degree``
    with an extra ``n_perms`` column set to 1.
    """
    _, _, dwpc_matrix = hetmatpy.degree_weight.dwpc(
        permuted_hetmat, metapath, damping=damping, dense_threshold=0.7
    )
    degree_dicts = hetmatpy.degree_group.metapath_to_degree_dicts(
        permuted_hetmat, metapath
    )
    source_deg_to_ind, target_deg_to_ind = degree_dicts
    stats = hetmatpy.degree_group.generate_degree_group_stats(
        source_deg_to_ind,
        target_deg_to_ind,
        dwpc_matrix,
        scale=True,
        scaler=dwpc_mean,
    )
    frame = pandas.DataFrame(stats)
    frame = frame.set_index(["source_degree", "target_degree"])
    return frame.assign(n_perms=1)

Functions

degrees_to_degree_to_ind

def degrees_to_degree_to_ind(
    degrees
)
View Source
def degrees_to_degree_to_ind(degrees):
    """
    Group the positions of ``degrees`` by degree value.

    Returns a dict mapping each distinct degree to the list of positions at
    which it occurs. Keys are inserted in ascending degree order, and each
    list of positions is ascending because the sort is stable.
    """
    indexed = list(enumerate(degrees))
    indexed.sort(key=lambda pair: pair[1])
    grouped = {}
    for position, value in indexed:
        if value not in grouped:
            grouped[value] = []
        grouped[value].append(position)
    return grouped

dwpc_to_degrees

def dwpc_to_degrees(
    graph,
    metapath,
    damping=0.5,
    ignore_zeros=False,
    ignore_redundant=True
)

Yield a description of each cell in a DWPC matrix adding source and target

node degree info as well as the corresponding path count.

Parameters:

Name Type Description Default
ignore_redundant bool When metapath is symmetric, only return a single orientation of a node pair.
For example, yield source-target but not also target-source, which should have
the same DWPC. True
View Source
def dwpc_to_degrees(
    graph, metapath, damping=0.5, ignore_zeros=False, ignore_redundant=True
):
    """
    Yield one ordered record per cell of a DWPC matrix, annotated with the
    source and target node degrees and the corresponding path count.

    Each record has the keys ``source_id``, ``target_id``, ``source_name``,
    ``target_name``, ``source_degree``, ``target_degree``, ``path_count`` and
    ``dwpc``. The ``dwpc`` value is ``arcsinh(dwpc / mean(dwpc))`` and the
    ``path_count`` value is read from the graph with a damping of 0.0.

    Parameters
    ----------
    damping: float
        Damping exponent passed to ``graph.read_path_counts`` for the DWPC.
    ignore_zeros: bool
        When true, cells whose transformed DWPC equals zero are skipped.
    ignore_redundant: bool
        When metapath is symmetric, only return a single orientation of a
        node pair. For example, yield source-target but not also
        target-source, which should have the same DWPC.
    """
    metapath = graph.metagraph.get_metapath(metapath)

    # Node degrees come from the first and last metaedge of the metapath.
    _, _, first_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7
    )
    _, _, last_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7
    )
    source_degrees = first_adjacency.sum(axis=1).flat
    target_degrees = last_adjacency.sum(axis=0).flat
    del first_adjacency, last_adjacency

    # Node names are read from the per-metanode TSV files.
    source_table = pandas.read_csv(
        graph.get_nodes_path(metapath.source(), file_format="tsv"), sep="\t"
    )
    source_node_names = list(source_table["name"])
    target_table = pandas.read_csv(
        graph.get_nodes_path(metapath.target(), file_format="tsv"), sep="\t"
    )
    target_node_names = list(target_table["name"])

    row_names, col_names, dwpc_matrix = graph.read_path_counts(
        metapath, "dwpc", damping
    )
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    if scipy.sparse.issparse(dwpc_matrix):
        dwpc_matrix = dwpc_matrix.toarray()

    # A damping of 0.0 yields the path count.
    _, _, path_count = graph.read_path_counts(metapath, "dwpc", 0.0)
    if scipy.sparse.issparse(path_count):
        path_count = path_count.toarray()

    n_rows = len(row_names)
    if not (ignore_redundant and metapath.is_symmetric()):
        cell_indices = itertools.product(range(n_rows), range(len(col_names)))
    else:
        # One orientation per node pair, including the diagonal.
        cell_indices = itertools.combinations_with_replacement(range(n_rows), 2)

    for i, j in cell_indices:
        cell_dwpc = dwpc_matrix[i, j]
        if ignore_zeros and cell_dwpc == 0:
            continue
        yield collections.OrderedDict(
            [
                ("source_id", row_names[i]),
                ("target_id", col_names[j]),
                ("source_name", source_node_names[i]),
                ("target_name", target_node_names[j]),
                ("source_degree", source_degrees[i]),
                ("target_degree", target_degrees[j]),
                ("path_count", path_count[i, j]),
                ("dwpc", cell_dwpc),
            ]
        )

generate_degree_group_stats

def generate_degree_group_stats(
    source_degree_to_ind,
    target_degree_to_ind,
    matrix,
    scale=False,
    scaler=1
)

Yield dictionaries with degree grouped stats

View Source
def generate_degree_group_stats(
    source_degree_to_ind, target_degree_to_ind, matrix, scale=False, scaler=1
):
    """
    Yield dictionaries with degree grouped stats

    One dictionary is yielded for every (source degree, target degree)
    combination, with source degrees iterated in the outer loop.

    Parameters
    ----------
    source_degree_to_ind : dict
        Maps a source degree to the row indices of ``matrix`` with that degree.
    target_degree_to_ind : dict
        Maps a target degree to the column indices of ``matrix`` with that
        degree.
    matrix : numpy.ndarray or scipy sparse matrix
        Values to summarize. Sparse input is converted to CSR for row
        selection and each selected row block is densified.
    scale : bool
        When true, values are transformed with ``arcsinh(value / scaler)``
        before being summed.
    scaler : number
        Divisor used by the ``scale`` transform.

    Yields
    ------
    dict
        Keys ``source_degree``, ``target_degree``, ``n`` (number of cells in
        the group), ``sum``, ``sum_of_squares`` and ``nnz`` (number of nonzero
        cells, counted before any scaling). Groups in which either degree is 0
        are reported with zero ``sum``, ``nnz`` and ``sum_of_squares`` without
        reading the matrix.
    """
    if scipy.sparse.issparse(matrix):
        # CSR makes row selection cheap; this is a no-op for CSR input.
        matrix = matrix.tocsr()
    for source_degree, row_inds in source_degree_to_ind.items():
        if source_degree > 0:
            # Densify the row block once per source degree so that every
            # column selection below works on a plain array.
            row_matrix = matrix[row_inds, :]
            if scipy.sparse.issparse(row_matrix):
                row_matrix = row_matrix.toarray()
        for target_degree, col_inds in target_degree_to_ind.items():
            row = {
                "source_degree": source_degree,
                "target_degree": target_degree,
                "n": len(row_inds) * len(col_inds),
            }
            if source_degree == 0 or target_degree == 0:
                # Key order is kept as-is so downstream column order is stable.
                row["sum"] = 0
                row["nnz"] = 0
                row["sum_of_squares"] = 0
                yield row
                continue
            slice_matrix = row_matrix[:, col_inds]
            values = numpy.arcsinh(slice_matrix / scaler) if scale else slice_matrix
            row["sum"] = values.sum()
            row["sum_of_squares"] = (values**2).sum()
            row["nnz"] = numpy.count_nonzero(slice_matrix)
            yield row

metapath_to_degree_dicts

def metapath_to_degree_dicts(
    graph,
    metapath
)
View Source
def metapath_to_degree_dicts(graph, metapath):
    """
    Build degree -> node-index lookups for both ends of a metapath.

    The metapath is resolved through ``graph.metagraph``. Source degrees are
    the row sums of the adjacency matrix of the first metaedge; target degrees
    are the column sums of the adjacency matrix of the last metaedge.

    Returns a ``(source_degree_to_ind, target_degree_to_ind)`` tuple of dicts
    as produced by ``degrees_to_degree_to_ind``.
    """
    metapath = graph.metagraph.get_metapath(metapath)
    _, _, first_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7
    )
    _, _, last_adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7
    )
    # Row sums give the degree of each source node, column sums the degree
    # of each target node.
    per_source = first_adjacency.sum(axis=1).flat
    per_target = last_adjacency.sum(axis=0).flat
    return (
        degrees_to_degree_to_ind(per_source),
        degrees_to_degree_to_ind(per_target),
    )

single_permutation_degree_group

def single_permutation_degree_group(
    permuted_hetmat,
    metapath,
    dwpc_mean,
    damping
)

Compute degree-grouped permutations for a single permuted_hetmat,

for one metapath.

View Source
def single_permutation_degree_group(permuted_hetmat, metapath, dwpc_mean, damping):
    """
    Compute the degree-grouped statistics of one metapath for a single
    permuted hetmat.

    The DWPC matrix of the permuted hetmat is summarized per
    (source degree, target degree) group, with values scaled by ``dwpc_mean``.
    Returns a DataFrame indexed by ``source_degree`` and ``target_degree``
    with an extra ``n_perms`` column set to 1.
    """
    _, _, dwpc_matrix = hetmatpy.degree_weight.dwpc(
        permuted_hetmat, metapath, damping=damping, dense_threshold=0.7
    )
    degree_dicts = hetmatpy.degree_group.metapath_to_degree_dicts(
        permuted_hetmat, metapath
    )
    source_deg_to_ind, target_deg_to_ind = degree_dicts
    stats = hetmatpy.degree_group.generate_degree_group_stats(
        source_deg_to_ind,
        target_deg_to_ind,
        dwpc_matrix,
        scale=True,
        scaler=dwpc_mean,
    )
    frame = pandas.DataFrame(stats)
    frame = frame.set_index(["source_degree", "target_degree"])
    return frame.assign(n_perms=1)