Source code for CelLink.metrics

"""
Evaluation metrics for alignment quality, imputation accuracy, and graph connectivity.
"""

import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.neighbors import kneighbors_graph
from scipy.spatial import distance
import networkx as nx

[docs]def cell_type_matching_accuracy(m1_source_ct, m1_predict_ct, m2_source_ct, m2_predict_ct):
    """
    Calculate the cell-type prediction accuracy of cell-cell alignment.

    Parameters
    ----------
    m1_source_ct : list
        The original cell type labels of modality 1.
    m1_predict_ct : list
        The predicted cell type labels of modality 1.
    m2_source_ct : list
        The original cell type labels of modality 2.
    m2_predict_ct : list
        The predicted cell type labels of modality 2.

    Returns
    -------
    float
        The overall cell-type prediction accuracy (rounded to 4 decimals).
    """
    n1 = len(m1_source_ct)
    n2 = len(m2_source_ct)
    r1 = sum(m1_source_ct == m1_predict_ct)
    r2 = sum(m2_source_ct == m2_predict_ct)
    acc = (r1 + r2) / (n1 + n2)
    acc = round(acc, 4)
    return acc

[docs]def average_sihouette_width(embedding, cell_type_label):
    """
    Calculate the average silhouette score of integration performance.

    Parameters
    ----------
    embedding : np.ndarray of shape (n_samples, n_dims)
        The 2D (or low-dim) embedding array used for silhouette calculation.
    cell_type_label : array-like of shape (n_samples,)
        The cell type labels corresponding to each embedding point.

    Returns
    -------
    float
        The average silhouette score (rounded to 4 decimals).
    """
    sihouette_avg = silhouette_score(embedding, cell_type_label) 
    sihouette_avg = round(sihouette_avg, 4)
    return sihouette_avg

[docs]def feature_imputation_accuracy_corr(m1_feature, m2_aligned_feature1):
    """
    Calculate the feature imputation accuracy of the aligned feature profile using Pearson correlation.

    Parameters
    ----------
    m1_feature : np.ndarray of shape (n_samples, n_features)
        The original modality-1 feature matrix.
    m2_aligned_feature1 : np.ndarray of shape (n_samples, n_features)
        The imputed (aligned) modality-1 feature matrix obtained for modality 2.

    Returns
    -------
    float
        The average per-sample Pearson correlation (imputation accuracy).
    """
    assert m1_feature.shape == m2_aligned_feature1.shape
    n_samples = m1_feature.shape[0]
    corr_vec = np.zeros(n_samples)
    for i in range(n_samples):
        corr = np.corrcoef(m1_feature[i, :], m2_aligned_feature1[i, :])[0, 1]
        corr_vec[i] = round(corr, 4)
    impute_acc = np.mean(corr_vec)
    return impute_acc

[docs]def feature_imputation_rmse(m1_feature, m2_aligned_feature1):
    """
    Calculate the RMSE of the aligned feature profile.

    Parameters
    ----------
    m1_feature : np.ndarray of shape (n_samples, n_features)
        The original modality-1 feature matrix.
    m2_aligned_feature1 : np.ndarray of shape (n_samples, n_features)
        The imputed (aligned) modality-1 feature matrix obtained for modality 2.

    Returns
    -------
    float
        The root mean squared error between the original and imputed features.
    """
    assert m1_feature.shape == m2_aligned_feature1.shape
    error = m1_feature - m2_aligned_feature1
    squared_error = np.square(error)
    mean_squared_error = np.mean(squared_error)
    impute_rmse = np.sqrt(mean_squared_error)
    return impute_rmse

[docs]def uniFOSCTTM(m1_embedding, m2_embedding, true_matches_for_m2):
    """
    Calculate the proportion of samples closer than the true paired sample (uniFOSCTTM).

    Parameters
    ----------
    m1_embedding : np.ndarray of shape (n, d)
        Embedding of modality 1.
    m2_embedding : np.ndarray of shape (n, d)
        Embedding of modality 2.
    true_matches_for_m2 : array-like of length n
        Indices of the true matched cells in modality 1 for each cell in modality 2.

    Returns
    -------
    float
        The uniFOSCTTM score (rounded to 4 decimals).
    """
    distance_matrix = distance.cdist(m2_embedding, m1_embedding, metric = 'euclidean')
    n = len(true_matches_for_m2)
    vec = np.zeros(n)
    for idx, true_match in enumerate(true_matches_for_m2):
        true_distance = distance_matrix[idx, true_match]
        # Count how many cells in modality 1 are closer to cell idx in modality 2 than the true match
        closer_samples = np.sum(distance_matrix[idx, :] < true_distance)
        vec[idx] = closer_samples / distance_matrix.shape[1]
        prop = np.mean(vec)

    return round(prop, 4)


[docs]def calculate_graph_connectivity(data, labels, k=15):
    """
    Calculate the Graph Connectivity for each cell type in the dataset.

    Parameters
    ----------
    data : np.ndarray of shape (n_samples, n_features)
        The dataset where rows are samples and columns are features.
    labels : array-like of shape (n_samples,)
        The cell type labels for each sample.
    k : int, default=15
        Number of nearest neighbors to consider for each cell.

    Returns
    -------
    float
        The graph connectivity score averaged across cell types.
    """
    kng = kneighbors_graph(data, n_neighbors=k, mode='connectivity', include_self=False)
    G = nx.from_scipy_sparse_array(kng)

    unique_labels = np.unique(labels)
    M = len(unique_labels)
    sum_lcc_ratio = 0

    for label in unique_labels:
        indices = np.where(labels == label)[0]
        subG = G.subgraph(indices)
        largest_cc = max(nx.connected_components(subG), key=len)
        LCC_j = len(largest_cc)
        N_j = len(indices)
        lcc_ratio = LCC_j / N_j
        sum_lcc_ratio += lcc_ratio

    return sum_lcc_ratio / M