import numpy as np
import pandas as pd
import pickle
import logging
from typing import Literal
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from transbrain.config import Config
import copy
# NOTE(review): this configures the root logger as a side effect of importing
# the module, which also affects the importing application's logging setup.
# Confirm this is intended for a library module.
logging.basicConfig(level=logging.INFO)
# Allowed values for the ``region_type`` argument of the translation methods.
RegionType = Literal['cortex', 'subcortex', 'all']
class SpeciesTrans:
    """
    Transfer phenotypes between species using graph embeddings.

    Parameters
    ----------
    atlas_type : {'bn', 'dk', 'aal'}, optional
        The type of human atlas used for initialization.

        - 'bn' : Brainnetome Atlas
        - 'dk' : Desikan-Killiany Atlas
        - 'aal' : Automated Anatomical Labeling (AAL) Atlas

        Default is 'bn'.

    Attributes
    ----------
    atlas_type : str
        The selected atlas type.
    regions : dict
        Dictionary containing human and mouse brain regions
        (cortex, subcortex, all).
    embeddings : np.ndarray
        Loaded graph embeddings used for phenotype translation. The object is
        iterated, and every item is sliced by row into a human part and a
        mouse part.
    """

    # Mouse regions removed during the graph stage due to clearly incorrect
    # similarity; they are excluded from the mouse side of every translation.
    _DROPPED_MOUSE_REGIONS = ('MOs', 'VISam')

    # For each dropped mouse region, the regions whose mean value is used to
    # fill it back in after a human-to-mouse translation.
    _FILL_SOURCES = {
        'MOs': ['MOp', 'SSp-ll'],
        'VISam': ['VISal', 'VISpm'],
    }

    def __init__(self, atlas_type: str = 'bn'):
        self.atlas_type = atlas_type
        self.regions = self._load_region_data()
        self.embeddings = self._load_embeddings()
        logging.info('Initialized for %s atlas.', atlas_type)

    def _load_region_data(self) -> dict:
        """
        Build the region lookup for both species.

        Returns
        -------
        dict
            ``{'human': {...}, 'mouse': {...}}`` where each inner dictionary
            has the keys 'cortex', 'subcortex' and 'all' ('all' being cortex
            followed by subcortex).
        """
        h_cortex, h_subcortex = Config.region_resources[self.atlas_type]
        m_cortex, m_subcortex = Config.region_resources['mouse']
        return {
            'human': {
                'cortex': h_cortex,
                'subcortex': h_subcortex,
                'all': h_cortex + h_subcortex,
            },
            'mouse': {
                'cortex': m_cortex,
                'subcortex': m_subcortex,
                'all': m_cortex + m_subcortex,
            },
        }

    def _usable_regions(self, species: str) -> dict:
        """
        Return the region lists of ``species`` that take part in the regression.

        For 'mouse' the dropped regions are filtered out of every list (new
        lists are built, ``self.regions`` is left untouched). For any other
        species the stored dictionary is returned as-is and must not be
        mutated by the caller.
        """
        regions = self.regions[species]
        if species != 'mouse':
            return regions
        dropped = self._DROPPED_MOUSE_REGIONS
        return {
            kind: [name for name in names if name not in dropped]
            for kind, names in regions.items()
        }

    def _fill_region_values(self, df):
        """
        Fill dropped regions using the mean value of regions from the same hierarchical level.

        The rows are written into ``df`` itself (the frame is modified in
        place) and the same object is returned.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame to fill. Its index must contain the regions that are
            averaged ('MOp', 'SSp-ll', 'VISal', 'VISpm').

        Returns
        -------
        pd.DataFrame
            DataFrame with values filled.
        """
        for region_name, neighbours in self._FILL_SOURCES.items():
            df.loc[region_name] = df.loc[neighbours].mean(axis=0)
        return df

    def _load_embeddings(self) -> np.ndarray:
        """
        Load graph embeddings for phenotype translation.

        The function loads precomputed embeddings from a binary file (pickle
        format) based on the selected atlas type. These embeddings are used to
        map phenotypes between species.

        Returns
        -------
        np.ndarray
            The loaded embeddings corresponding to the specified atlas type.
        """
        # SECURITY: unpickling executes arbitrary code. The path comes from the
        # package configuration; never point it at an untrusted file.
        with open(Config.embeddings_resources[self.atlas_type], 'rb') as f:
            return pickle.load(f)

    def _restore_values(self, prediction, phenotype_name, phenotype_data):
        """
        Restore the values to the original scale.

        The prediction is min-max normalized to [0, 1] and then stretched to
        the [min, max] range of the original phenotype column.

        Parameters
        ----------
        prediction : np.ndarray
            The predicted phenotype values after translation.
        phenotype_name : str
            The name of the phenotype type.
        phenotype_data : pd.DataFrame
            The original phenotype data used for scaling.

        Returns
        -------
        np.ndarray
            The restored phenotype values in the original scale. A constant
            prediction (zero range) is mapped to the original minimum instead
            of producing NaN.
        """
        original_min = phenotype_data[phenotype_name].min()
        original_max = phenotype_data[phenotype_name].max()

        prediction_min = prediction.min()
        prediction_range = prediction.max() - prediction_min
        if prediction_range == 0:
            # A flat prediction carries no ordering to rescale; avoid 0/0 and
            # follow the MinMaxScaler convention of mapping it to the lower
            # bound.
            normalized_prediction = np.zeros_like(prediction, dtype=float)
        else:
            normalized_prediction = (prediction - prediction_min) / prediction_range

        return normalized_prediction * (original_max - original_min) + original_min

    def _dual_mapping(self, pheno_data: np.ndarray, source_matrix: np.ndarray,
                      target_matrix: np.ndarray, normalize: bool = False) -> np.ndarray:
        """
        Map phenotype data from source to target space using dual regression.

        A linear regression is fitted from the source embedding rows to the
        phenotype values and then evaluated on the target embedding rows.

        Parameters
        ----------
        pheno_data : np.ndarray
            An array of phenotype values in the source species; it is
            flattened to one value per row of ``source_matrix``.
        source_matrix : np.ndarray
            The embedding matrix for the source species.
        target_matrix : np.ndarray
            The embedding matrix for the target species.
        normalize : bool, optional
            Whether to min-max normalize the phenotype values before
            regression. Default is False.

        Returns
        -------
        np.ndarray
            An array of predicted phenotype values in the target species.
        """
        y = np.asarray(pheno_data).reshape(-1, 1)
        if normalize:
            y = MinMaxScaler().fit_transform(y)
        model = LinearRegression().fit(source_matrix, y.ravel())
        return model.predict(target_matrix)

    def _translate(self, phenotype: pd.DataFrame, direction: str,
                   region_type: "RegionType" = 'cortex',
                   normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Unified translation method for both directions.

        Parameters
        ----------
        phenotype : pd.DataFrame
            A DataFrame where rows are brain regions and columns are phenotype types.
        direction : {'human_to_mouse', 'mouse_to_human'}
            The translation direction.
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The region subset to use for translation. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize phenotype values before translation. Default is True.
        restore : bool, optional
            Whether to rescale the result to the value range of the input
            phenotype. Requires ``normalize`` to be True.

        Returns
        -------
        pd.DataFrame
            Translated phenotype values in the target species, indexed by brain region name.

        Raises
        ------
        ValueError
            If ``restore`` is requested without ``normalize``, or if
            ``direction`` is not one of the two supported values.
        """
        if restore and not normalize:
            raise ValueError("Restore requires normalized input.")
        if direction not in ('human_to_mouse', 'mouse_to_human'):
            raise ValueError("Invalid translation direction.")

        source_species, target_species = direction.split('_to_')
        source_regions = self._usable_regions(source_species)
        target_regions = self._usable_regions(target_species)

        # Keep only the requested source regions, in the canonical order.
        phenotype = phenotype.loc[source_regions[region_type]]

        # The regression always runs over all source regions (cortex first,
        # then subcortex), so a partial input is zero-padded on the other side.
        n_cortex = len(source_regions['cortex'])
        n_subcortex = len(source_regions['subcortex'])
        if region_type == 'cortex':
            pad_width = (0, n_subcortex)
        elif region_type == 'subcortex':
            pad_width = (n_cortex, 0)
        else:
            pad_width = None

        # Each embedding is split by row: the first n_human rows are taken as
        # the human part, the remaining rows as the mouse part.
        n_human = len(self.regions['human']['all'])
        human_rows, mouse_rows = slice(0, n_human), slice(n_human, None)
        if direction == 'mouse_to_human':
            src_slice, tgt_slice = mouse_rows, human_rows
        else:
            src_slice, tgt_slice = human_rows, mouse_rows
        # The slices do not depend on the phenotype column; compute them once.
        matrices = [(emb[src_slice], emb[tgt_slice]) for emb in self.embeddings]

        results = {}
        for phenotype_name, values in phenotype.items():
            arr = values.to_numpy()
            if pad_width is not None:
                arr = np.pad(arr, pad_width)
            predictions = [
                self._dual_mapping(arr, src_mat, tgt_mat, normalize)
                for src_mat, tgt_mat in matrices
            ]
            # Average the predictions obtained from the individual embeddings.
            averaged = np.mean(predictions, axis=0)
            if restore:
                averaged = self._restore_values(averaged, phenotype_name, phenotype)
            results[phenotype_name] = averaged

        # Predictions cover all target regions; trim to the requested subset.
        index = target_regions[region_type]
        if region_type == 'cortex':
            results = {k: v[:len(index)] for k, v in results.items()}
        elif region_type == 'subcortex':
            results = {k: v[-len(index):] for k, v in results.items()}

        logging.info('Successfully translated %s %s phenotypes to %s.',
                     source_species, region_type, target_species)
        return pd.DataFrame(results, index=index)

    def mouse_to_human(self, phenotype: pd.DataFrame, region_type: "RegionType" = 'cortex',
                       normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Translate mouse phenotype to human.

        Parameters
        ----------
        phenotype : pd.DataFrame
            Mouse phenotype DataFrame (regions × phenotypes).
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The brain region type to translate. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize data before translation. Default is True.
        restore : bool, optional
            Whether to restore values back to original scale after translation.
            Requires ``normalize`` to be True.

        Returns
        -------
        pd.DataFrame
            Translated human phenotype DataFrame (regions × phenotypes).
        """
        return self._translate(phenotype, 'mouse_to_human', region_type, normalize, restore)

    def human_to_mouse(self, phenotype: pd.DataFrame, region_type: "RegionType" = 'cortex',
                       normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Translate human phenotype to mouse.

        For 'cortex' and 'all', the mouse regions that are dropped from the
        regression are added back and filled with the mean of regions from the
        same hierarchical level.

        Parameters
        ----------
        phenotype : pd.DataFrame
            Human phenotype DataFrame (regions × phenotypes).
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The brain region type to translate. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize data before translation. Default is True.
        restore : bool, optional
            Whether to restore values back to original scale after translation.
            Requires ``normalize`` to be True.

        Returns
        -------
        pd.DataFrame
            Translated mouse phenotype DataFrame (regions × phenotypes).
        """
        result_df = self._translate(phenotype, 'human_to_mouse', region_type, normalize, restore)
        if region_type != 'subcortex':
            # Re-insert the dropped regions as placeholder rows in the full
            # mouse ordering, then overwrite them with the neighbour means.
            full_index = self.regions['mouse'][region_type]
            result_df = result_df.reindex(full_index, fill_value=0)
            result_df = self._fill_region_values(result_df)
        return result_df