# Source code for transbrain.trans

import numpy as np
import pandas as pd
import pickle
import logging
from typing import Literal
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from transbrain.config import Config
import copy

# Configure the root logger at import time with level INFO; SpeciesTrans
# reports its progress through ``logging.info``.
# NOTE(review): this is a module-level side effect that affects the importing
# application's logging setup — confirm it is intended for a library module.
logging.basicConfig(level=logging.INFO)
# Accepted values for the ``region_type`` argument of the translation methods.
RegionType = Literal['cortex', 'subcortex', 'all']

class SpeciesTrans:
    """
    Transfer phenotypes between species using graph embeddings.

    Parameters
    ----------
    atlas_type : {'bn', 'dk', 'aal'}, optional
        The type of human atlas used for initialization.

        - 'bn'  : Brainnetome Atlas
        - 'dk'  : Desikan-Killiany Atlas
        - 'aal' : Automated Anatomical Labeling (AAL) Atlas

        Default is 'bn'.

    Attributes
    ----------
    atlas_type : str
        The selected atlas type.
    regions : dict
        Dictionary containing human and mouse brain regions (cortex, subcortex, all).
    embeddings : np.ndarray
        Loaded graph embeddings used for phenotype translation.
    """

    # Mouse regions dropped during the graph stage due to clearly incorrect
    # similarity; they are removed from the mouse region lists before any
    # embedding-based mapping.
    _DROPPED_MOUSE_REGIONS = ('MOs', 'VISam')

    # For each dropped mouse region, the regions from the same hierarchical
    # level whose mean is used to fill it back in after translation.
    _FILL_SOURCES = {
        'MOs': ['MOp', 'SSp-ll'],
        'VISam': ['VISal', 'VISpm'],
    }

    def __init__(self, atlas_type: str = 'bn'):
        self.atlas_type = atlas_type
        self.regions = self._load_region_data()
        self.embeddings = self._load_embeddings()
        logging.info(f'Initialized for {atlas_type} atlas.')

    def _load_region_data(self) -> dict:
        """
        Build the region lookup for both species.

        Returns
        -------
        dict
            ``{'human': {...}, 'mouse': {...}}`` where each inner dict has the
            keys 'cortex', 'subcortex' and 'all' ('all' is cortex followed by
            subcortex).
        """
        h_cortex, h_subcortex = Config.region_resources[self.atlas_type]
        m_cortex, m_subcortex = Config.region_resources['mouse']
        return {
            'human': {'cortex': h_cortex, 'subcortex': h_subcortex, 'all': h_cortex + h_subcortex},
            'mouse': {'cortex': m_cortex, 'subcortex': m_subcortex, 'all': m_cortex + m_subcortex}
        }

    def _species_regions(self, species: str) -> dict:
        """
        Return fresh per-subset region lists for one species.

        The lists are new objects, so callers can use them without touching
        ``self.regions``. For the mouse, the dropped regions are left out.
        """
        subsets = self.regions[species]
        if species != 'mouse':
            return {subset: list(names) for subset, names in subsets.items()}
        return {
            subset: [name for name in names if name not in self._DROPPED_MOUSE_REGIONS]
            for subset, names in subsets.items()
        }

    def _fill_region_values(self, df):
        """
        Fill dropped regions using the mean value of regions from the same hierarchical level.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame to fill (indexed by region name). It is modified in place.

        Returns
        -------
        pd.DataFrame
            The same DataFrame with values filled.
        """
        for region_name, siblings in self._FILL_SOURCES.items():
            df.loc[region_name] = df.loc[siblings].mean(axis=0)

        return df

    def _load_embeddings(self) -> np.ndarray:
        """
        Load graph embeddings for phenotype translation.

        The function loads precomputed embeddings from a binary file (pickle format)
        based on the selected atlas type. These embeddings are used to map phenotypes
        between species.

        Returns
        -------
        np.ndarray
            The loaded embeddings corresponding to the specified atlas type.
        """
        # NOTE: pickle can execute arbitrary code while loading. The path comes
        # from Config.embeddings_resources; only point it at trusted files.
        with open(Config.embeddings_resources[self.atlas_type], 'rb') as f:
            return pickle.load(f)

    def _restore_values(self, prediction, phenotype_name, phenotype_data):
        """
        Restore the values to the original scale.

        The prediction is min-max rescaled so that its smallest value maps to
        the original minimum and its largest value to the original maximum.

        Parameters
        ----------
        prediction : np.ndarray
            The predicted phenotype values after translation.
        phenotype_name : str
            The name of the phenotype type.
        phenotype_data : pd.DataFrame
            The original phenotype data used for scaling.

        Returns
        -------
        np.ndarray
            The restored phenotype values in the original scale. If the
            prediction is constant, every value maps to the original minimum.
        """
        original_min = phenotype_data[phenotype_name].min()
        original_max = phenotype_data[phenotype_name].max()

        prediction = np.asarray(prediction)
        prediction_min = prediction.min()
        prediction_range = prediction.max() - prediction_min
        if prediction_range == 0:
            # A constant prediction has no spread to rescale. Use a unit range
            # (the MinMaxScaler convention) so the result is finite instead of
            # NaN from a 0/0 division.
            prediction_range = 1.0

        normalized_prediction = (prediction - prediction_min) / prediction_range
        return normalized_prediction * (original_max - original_min) + original_min

    def _dual_mapping(self, pheno_data: np.ndarray, source_matrix: np.ndarray,
                     target_matrix: np.ndarray, normalize: bool = False) -> np.ndarray:
        """
        Map phenotype data from source to target space using dual regression.

        A linear regression is fitted from the source embedding rows to the
        phenotype values and then evaluated on the target embedding rows.

        Parameters
        ----------
        pheno_data : np.ndarray
            An array of phenotype values (regions,) in the source species.
        source_matrix : np.ndarray
            The embedding matrix for the source species.
        target_matrix : np.ndarray
            The embedding matrix for the target species.
        normalize : bool, optional
            Whether to normalize the phenotype values before regression. Default is False.

        Returns
        -------
        np.ndarray
            An array of predicted phenotype values in the target species.
        """
        y = np.asarray(pheno_data).reshape(-1, 1)
        if normalize:
            y = MinMaxScaler().fit_transform(y)

        model = LinearRegression().fit(source_matrix, y.ravel())
        return model.predict(target_matrix)

    def _translate(self, phenotype: pd.DataFrame, direction: str, region_type: "RegionType" = 'cortex',
                  normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Unified translation method for both directions.

        Parameters
        ----------
        phenotype : pd.DataFrame
            A DataFrame where rows are brain regions and columns are phenotype types.
        direction : {'human_to_mouse', 'mouse_to_human'}
            The translation direction.
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The region subset to use for translation. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize phenotype values before translation. Default is True.
        restore : bool, optional
            Whether to rescale the translated values back to the original
            value range. Requires ``normalize=True``.

        Returns
        -------
        pd.DataFrame
            Translated phenotype values in the target species, indexed by brain region name.

        Raises
        ------
        ValueError
            If ``restore`` is requested without ``normalize``, or if
            ``direction`` is not one of the two supported values.
        KeyError
            If ``region_type`` is unknown or ``phenotype`` lacks any of the
            required source regions.
        """
        if restore and not normalize:
            raise ValueError("Restore requires normalized input.")

        if direction not in ('human_to_mouse', 'mouse_to_human'):
            raise ValueError("Invalid translation direction.")

        source_species, target_species = direction.split('_to_')
        # Whichever side is the mouse has its dropped regions removed.
        source_regions = self._species_regions(source_species)
        target_regions = self._species_regions(target_species)

        region_data = source_regions[region_type]
        missing = [region for region in region_data if region not in phenotype.index]
        if missing:
            raise KeyError(
                f"Phenotype is missing {len(missing)} {source_species} "
                f"{region_type} region(s): {missing}"
            )
        # Select and order the rows by the expected source region list.
        phenotype = phenotype.loc[region_data]

        # The regression runs over all source regions (cortex first, then
        # subcortex); zero-pad the half that was not provided.
        n_cortex = len(source_regions['cortex'])
        n_subcortex = len(source_regions['subcortex'])
        if region_type == 'cortex':
            pad_width = (0, n_subcortex)
        elif region_type == 'subcortex':
            pad_width = (n_cortex, 0)
        else:
            pad_width = (0, 0)

        # Each embedding stacks human rows first, then mouse rows.
        n_human = len(self.regions['human']['all'])
        if direction == 'mouse_to_human':
            src_slice, tgt_slice = slice(n_human, None), slice(0, n_human)
        else:
            src_slice, tgt_slice = slice(0, n_human), slice(n_human, None)

        results = {}
        for phenotype_name, values in phenotype.items():
            padded = np.pad(values.to_numpy(), pad_width)
            predictions = [
                self._dual_mapping(padded, emb[src_slice], emb[tgt_slice], normalize)
                for emb in self.embeddings
            ]
            # Average the predictions obtained from the individual embeddings.
            averaged = np.mean(predictions, axis=0)
            if restore:
                averaged = self._restore_values(averaged, phenotype_name, phenotype)
            results[phenotype_name] = averaged

        index = target_regions[region_type]
        if region_type == 'cortex':
            results = {name: pred[:len(index)] for name, pred in results.items()}
        elif region_type == 'subcortex':
            # Slice from an explicit start: ``pred[-0:]`` would return the
            # whole vector if the subcortex list were empty.
            results = {name: pred[len(pred) - len(index):] for name, pred in results.items()}

        logging.info(f'Successfully translated {source_species} {region_type} phenotypes to {target_species}.')

        return pd.DataFrame(results, index=index)

    def mouse_to_human(self, phenotype: pd.DataFrame, region_type: "RegionType" = 'cortex',
                      normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Translate mouse phenotype to human.

        Parameters
        ----------
        phenotype : pd.DataFrame
            Mouse phenotype DataFrame (regions × phenotypes).
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The brain region type to translate. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize data before translation. Default is True.
        restore : bool, optional
            Whether to restore values back to the original scale after
            translation. Requires ``normalize=True``.

        Returns
        -------
        pd.DataFrame
            Translated human phenotype DataFrame (regions × phenotypes).

        Raises
        ------
        ValueError
            If ``restore`` is True while ``normalize`` is False.
        """
        return self._translate(phenotype, 'mouse_to_human', region_type, normalize, restore)

    def human_to_mouse(self, phenotype: pd.DataFrame, region_type: "RegionType" = 'cortex',
                      normalize: bool = True, restore: bool = False) -> pd.DataFrame:
        """
        Translate human phenotype to mouse.

        For 'cortex' and 'all', the mouse regions dropped before mapping are
        added back and filled with the mean of regions from the same
        hierarchical level.

        Parameters
        ----------
        phenotype : pd.DataFrame
            Human phenotype DataFrame (regions × phenotypes).
        region_type : {'cortex', 'subcortex', 'all'}, optional
            The brain region type to translate. Default is 'cortex'.
        normalize : bool, optional
            Whether to normalize data before translation. Default is True.
        restore : bool, optional
            Whether to restore values back to the original scale after
            translation. Requires ``normalize=True``.

        Returns
        -------
        pd.DataFrame
            Translated mouse phenotype DataFrame (regions × phenotypes).

        Raises
        ------
        ValueError
            If ``restore`` is True while ``normalize`` is False.
        """
        result_df = self._translate(phenotype, 'human_to_mouse', region_type, normalize, restore)

        if region_type != 'subcortex':
            # Re-insert the dropped regions (as zero rows) in the full mouse
            # ordering, then overwrite them with the sibling-region means.
            full_index = self.regions['mouse'][region_type]
            result_df = result_df.reindex(full_index, fill_value=0)
            result_df = self._fill_region_values(result_df)

        return result_df