Spatial perturbation tasks on mouse brain data

[1]:
import os
import sys
sys.path.append("../")
device = "cuda"
import importlib

[2]:
import scanpy as sc
import squidpy as sq
import pandas as pd
from tqdm.notebook import tqdm
import scipy as sp
import numpy as np
import multiprocessing
import pickle as pkl
import torch
import gc
import sklearn.metrics

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'dejavuserif'
matplotlib.rcParams['font.family'] = 'arial'
matplotlib.rc('pdf', fonttype=42)
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\dask\dataframe\_pyarrow_compat.py:15: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 11.0.0. Please consider upgrading.
  warnings.warn(
[3]:
import steamboat as sf
import steamboat.tools
[4]:
import torch
[5]:
adata = sc.read_h5ad("saved_h5ad/mmbrain_0.h5ad")
[6]:
model = sf.Steamboat(adata.var_names.tolist(), n_heads=50, n_scales=3)
model = model.to(device)
model.load_state_dict(torch.load('saved_models/mmbrain.pth', weights_only=True), strict=False)
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\torch\_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()
[6]:
_IncompatibleKeys(missing_keys=[], unexpected_keys=['spatial_gather.w_local._scale', 'spatial_gather.w_global._scale'])
[7]:
import importlib
importlib.reload(steamboat.tools)
[7]:
<module 'steamboat.tools' from 'G:\\Projects\\Steamboat\\examples\\..\\steamboat\\tools.py'>
[8]:
model.features = adata.var['gene_symbol']
sf.tools.plot_all_transforms(model, top=3, head_order=[2, 5, 24], figsize=(4, 5))
# # plt.savefig(fig_path + "crc_all_loadings.pdf", bbox_inches='tight')
['Gbx2' 'Trdn' 'Edaradd' 'Hoxb3' 'Barhl2' 'Sec14l5' 'Mog' 'C030029H02Rik'
 'Cldn11' 'Sox10' 'Dgkk' 'Drd2' 'Onecut2' 'Mafa' 'Gabra6' 'Cnpy1' 'Grm3'
 'Sema6a' 'Ermn' 'Nr2e1' 'Emx2os' 'Sstr1' 'Drd1' 'Grin2c' 'Mfge8' 'S1pr1']
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_8_1.png

Spatial perturbation scenario 1: transplanting cells

Designate cells to be “transplanted”

[9]:
# Find OPC-Oligo in two regions

adata.obs['class_region'] = adata.obs['class'].astype(str) + '@' + adata.obs['parcellation_division'].astype(str)
adata.obs['oligo_class_region'] = adata.obs['class_region']
adata.obs.loc[adata.obs['class_region'] != '31 OPC-Oligo', 'oligo_class_region'] = '98 Other'
[10]:
# Confirm that there are some expression differences in OPC-Oligo in two regions.

oligo_adata = adata[adata.obs['class_region'].isin(['31 OPC-Oligo@Isocortex', '31 OPC-Oligo@HPF'])]
[11]:
oligo_adata.obs['class_region'].value_counts()
[11]:
class_region
31 OPC-Oligo@HPF          980
31 OPC-Oligo@Isocortex    784
Name: count, dtype: int64
[12]:
sc.tl.rank_genes_groups(oligo_adata, groupby='class_region', reference='31 OPC-Oligo@Isocortex', method='wilcoxon')
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\anndata\_core\anndata.py:1292: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  df[key] = c
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\anndata\_core\anndata.py:1292: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  df[key] = c
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
[13]:
sc.get.rank_genes_groups_df(oligo_adata, group='31 OPC-Oligo@HPF')
[13]:
names scores logfoldchanges pvals pvals_adj
0 ENSMUSG00000020932 13.187178 2.604712 1.040073e-39 1.166962e-36
1 ENSMUSG00000090291 5.466264 4.215365 4.596197e-08 6.446167e-06
2 ENSMUSG00000023913 3.856600 0.769698 1.149752e-04 9.146835e-03
3 ENSMUSG00000031980 3.806837 2.439926 1.407553e-04 9.870467e-03
4 ENSMUSG00000053930 3.645979 2.021932 2.663758e-04 1.758080e-02
... ... ... ... ... ...
1117 ENSMUSG00000058624 -6.016663 -1.100518 1.780497e-09 3.329530e-07
1118 ENSMUSG00000033730 -6.205130 -2.224680 5.465168e-10 1.226384e-07
1119 ENSMUSG00000070570 -6.513019 -0.717460 7.365549e-11 2.066036e-08
1120 ENSMUSG00000047907 -6.625102 -2.285636 3.470090e-11 1.297814e-08
1121 ENSMUSG00000005583 -10.971990 -2.906106 5.211266e-28 2.923520e-25

1122 rows × 5 columns

[14]:
old_adata = adata.copy()
new_adata = adata.copy()
[15]:
sq.pl.spatial_scatter(old_adata, color='parcellation_division', shape=None, figsize=(5, 5), size=1., lw=0., legend_fontsize=9, title="", frameon=False)
WARNING: Please specify a valid `library_id` or set it permanently in `adata.uns['spatial']`
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\squidpy\pl\_spatial_utils.py:946: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  _cax = scatter(
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_16_2.png
[16]:
repalcement_class = '31 OPC-Oligo@Isocortex'
to_be_replaced_class = '31 OPC-Oligo@HPF'
new_name = '31 OPC-Oligo@Isocortex->HPF'

n = 200
np.random.seed(0)
replacement = np.random.choice(np.where(new_adata.obs['class_region'] == repalcement_class)[0], n, replace=False)
to_be_replaced = np.random.choice(np.where(new_adata.obs['class_region'] == to_be_replaced_class)[0], n, replace=False)

Reconstruct the gene expression

[17]:
new_adata.X[to_be_replaced, :] = new_adata.X[replacement, :]
new_adata.obs['class2'] = new_adata.obs['class_region'].tolist()
new_adata.obs.loc[new_adata.obs_names[to_be_replaced], 'class2'] = new_name

new_dataset = sf.make_dataset([new_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([new_adata], new_dataset, model, get_recon=True)
new_adata.X = new_adata.obsm['X_recon']
Using ['global'] as regional annotations.

Perform DE analysis

[18]:
highlight = [repalcement_class, new_name, to_be_replaced_class]
sc.tl.rank_genes_groups(new_adata, groupby='class2', groups=highlight, reference=repalcement_class, method='wilcoxon')
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
[19]:
a_df = pd.merge(sc.get.rank_genes_groups_df(new_adata, group=to_be_replaced_class), new_adata.var, right_index=True, left_on='names')
b_df = pd.merge(sc.get.rank_genes_groups_df(new_adata, group=new_name), new_adata.var, right_index=True, left_on='names')
ab_df = pd.merge(a_df, b_df, left_on='names', right_on='names')
[20]:
matplotlib.rcParams['mathtext.fontset'] = 'dejavuserif'
matplotlib.rcParams['font.family'] = 'arial'
matplotlib.rc('pdf', fonttype=42)

fig, ax = plt.subplots(figsize=(1.5, 1.5))
ab_df.plot(kind='scatter', x='logfoldchanges_x', y='logfoldchanges_y', s=0.5, ax=ax, alpha=.5)
ax.set_xlabel(to_be_replaced_class)
ax.set_ylabel(new_name)

for pos in ['right', 'top']:
        plt.gca().spines[pos].set_visible(False)

# plt.savefig(savefig_path + "mmbrain_perturbation_logfc.pdf", bbox_inches='tight', transparent=True)
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_23_0.png
[21]:
ab_df2 = ab_df[~ab_df['logfoldchanges_x'].isna() & ~ab_df['logfoldchanges_y'].isna()]
sp.stats.spearmanr(ab_df2['logfoldchanges_x'], ab_df2['logfoldchanges_y'])
[21]:
SignificanceResult(statistic=0.6419909066233113, pvalue=2.277434802943895e-131)
[22]:
ab_df[['logfoldchanges_x', 'logfoldchanges_y']].corr(method='spearman')
[22]:
logfoldchanges_x logfoldchanges_y
logfoldchanges_x 1.000000 0.641991
logfoldchanges_y 0.641991 1.000000
[23]:
ab_df[['logfoldchanges_x', 'logfoldchanges_y']].corr(method='pearson')
# 0.475
[23]:
logfoldchanges_x logfoldchanges_y
logfoldchanges_x 1.000000 0.680731
logfoldchanges_y 0.680731 1.000000

Spatial perturbation scenario 2: changing environment.

Overview the variability of expression of genes

[38]:
adata_rearrange = adata[np.where(~adata.obs['class'].isin(['31 OPC-Oligo']))[0].tolist() +
                        np.where(adata.obs['class'].isin(['31 OPC-Oligo']))[0].tolist()]
adata_rearrange.obs['class'] = adata_rearrange.obs['class'].astype(str)
adata_rearrange.obs.loc[adata_rearrange.obs['class'] != '31 OPC-Oligo', 'class'] = '99 Other'
sq.pl.spatial_scatter(adata_rearrange, color='class', shape=None, figsize=(4, 4), size=1., legend_fontsize=9, title="", frameon=False,
                      palette=matplotlib.colors.ListedColormap(['C0', '#f0f0f0']))
# plt.savefig(savefig_path + "mmbrain_KO_scheme.pdf", bbox_inches='tight', transparent=True)
C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\147967129.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata_rearrange.obs['class'] = adata_rearrange.obs['class'].astype(str)
WARNING: Please specify a valid `library_id` or set it permanently in `adata.uns['spatial']`
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\squidpy\pl\_spatial_utils.py:946: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  _cax = scatter(
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_28_3.png
[39]:
wt_adata = adata.copy()
ko_adata = adata.copy()
[40]:
pd.Series((wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].std(axis=0)/
           wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0)), index=adata.var['gene_symbol']).sort_values()
[40]:
gene_symbol
Sox10      0.298431
Nfix       0.443308
Cldn11     0.455745
Mog        0.514136
Pou3f3     0.569786
            ...
Lhx4      28.757689
Npffr2    28.905586
Fgf3      29.486641
Scgn      29.530397
Hoxb8     30.028955
Length: 1122, dtype: float32
[41]:
dispersion_df = pd.DataFrame([wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].var(axis=0),
             wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0)],
             columns=adata.var['gene_symbol'],
             index=['var', 'mean']).T
dispersion_df['dispersion'] = dispersion_df['var'] / dispersion_df['mean']
dispersion_df = dispersion_df[dispersion_df['mean'] > 1.]
dispersion_df.sort_values('dispersion', ascending=False)
[41]:
var mean dispersion
gene_symbol
Grm3 0.706748 1.044864 0.676402
Ermn 0.840322 1.308472 0.642216
Cldn11 1.662142 2.828864 0.587565
Sec14l5 0.932986 1.588670 0.587275
Sulf2 0.736025 1.312926 0.560599
Sema6a 0.627580 1.130927 0.554925
Mog 1.162848 2.097412 0.554421
Zfp536 0.711731 1.330900 0.534774
Gprc5b 0.698224 1.348250 0.517874
Zeb2 0.613312 1.227442 0.499667
Pou3f3 0.565051 1.319265 0.428307
Nfix 0.569929 1.702961 0.334669
Sox10 0.429683 2.196494 0.195622
[42]:
plt.scatter(wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0),
            wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].var(axis=0))
[42]:
<matplotlib.collections.PathCollection at 0x174f6c463d0>
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_32_1.png

Remove Mog from OPC-Oligo and reconstruct cells

[43]:
ko_gene = 'Mog'

plt.hist(ko_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0],
         np.where(ko_adata.var['gene_symbol'] == ko_gene)[0][0]])
ko_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0],
           np.where(ko_adata.var['gene_symbol'] == ko_gene)[0][0]] = 0
ko_dataset = sf.make_dataset([ko_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([ko_adata], ko_dataset, model, get_recon=True)

wt_dataset = sf.make_dataset([wt_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([wt_adata], wt_dataset, model, get_recon=True)
Using ['global'] as regional annotations.
Using ['global'] as regional annotations.
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_34_6.png
[44]:
gene_rmse = ((ko_adata.obsm['X_recon'] - wt_adata.obsm['X_recon']) ** 2).sum(axis=0) ** .5
[45]:
cell_rmse = ((ko_adata.obsm['X_recon'] - wt_adata.obsm['X_recon']) ** 2).sum(axis=1) ** .5
[46]:
cell_rmse
[46]:
array([0.01820022, 0.04508448, 0.04385628, ..., 0.03686631, 0.04018443,
       0.02311902], dtype=float32)
[47]:
ko_adata.obs['cell_rmse'] = cell_rmse
[48]:
fig, ax = plt.subplots(figsize=(.8, 3))
my_obs = ko_adata.obs.copy()
my_obs['class'] = my_obs['class'].tolist()
my_obs = my_obs.sort_values('class')
class_count = my_obs['class'].value_counts()
chosen_classes = class_count[class_count > (class_count.sum()) * 0.005].index.tolist()
chosen_classes.remove('31 OPC-Oligo')
sns.boxplot(my_obs[my_obs['class'].isin(chosen_classes)], y='class', x='cell_rmse',
            fliersize=.0, linewidth=.5, ax=ax)
#ax.set_yticklabels(ax.get_yticklabels(), rotation=30, ha='right', va='top', rotation_mode='anchor')
ax.set_ylabel("")
ax.set_xlabel("RMSE(WT, KO)")
for pos in ['right', 'top']:
    plt.gca().spines[pos].set_visible(False)
plt.xlim([0, .2])
# plt.savefig(savefig_path + "mmbrain_perturbation_cell_mse.pdf", transparent=True, bbox_inches='tight')
[48]:
(0.0, 0.2)
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_39_1.png
[49]:
fig, ax = plt.subplots(figsize=(4.5, .8))
my_obs = ko_adata.obs.copy()
my_obs['class'] = my_obs['class'].tolist()
my_obs = my_obs.sort_values('class')
class_count = my_obs['class'].value_counts()
chosen_classes = class_count[class_count > (class_count.sum()) * 0.005].index.tolist()
chosen_classes.remove('31 OPC-Oligo')
sns.boxplot(my_obs[my_obs['class'].isin(chosen_classes)], x='class', y='cell_rmse',
            fliersize=.0, linewidth=.5, ax=ax)
#ax.set_yticklabels(ax.get_yticklabels(), rotation=30, ha='right', va='top', rotation_mode='anchor')
ax.set_ylabel("RMSE(WT, KO)")
ax.set_xlabel("")
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right', va='center', rotation_mode='anchor')
for pos in ['right', 'top']:
    plt.gca().spines[pos].set_visible(False)
plt.ylim([0, .2])
# plt.savefig(savefig_path + "mmbrain_perturbation_cell_mse.pdf", transparent=True, bbox_inches='tight')
C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\945811404.py:13: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right', va='center', rotation_mode='anchor')
[49]:
(0.0, 0.2)
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_40_2.png
[50]:
x = wt_adata.obsm['X_recon'][wt_adata.obs['class'] == '31 OPC-Oligo', :]
y = ko_adata.obsm['X_recon'][ko_adata.obs['class'] == '31 OPC-Oligo', :]
cmp_adata = sc.AnnData(np.vstack([x, y]), var=ko_adata.var.copy())
cmp_adata.var_names = cmp_adata.var['gene_symbol']
cmp_adata.obs['grp'] = ['WT'] * x.shape[0] + ['KO'] * y.shape[0]
sc.tl.rank_genes_groups(cmp_adata, groupby='grp', method='wilcoxon')
cmp_df = sc.get.rank_genes_groups_df(cmp_adata, group="KO")
cmp_df
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
[50]:
names scores logfoldchanges pvals pvals_adj
0 Pif1 16.773153 0.083190 3.835997e-63 4.303989e-60
1 Diaph3 14.955308 0.079347 1.438155e-50 8.068052e-48
2 Clspn 14.882693 0.067352 4.269589e-50 1.596826e-47
3 Slfn9 14.656121 0.064221 1.231072e-48 3.453157e-46
4 Nfib 13.375875 0.069715 8.366315e-41 1.341001e-38
... ... ... ... ... ...
1117 Sec14l5 -11.687506 -0.152907 1.476599e-31 8.283721e-30
1118 Cldn11 -12.559337 -0.239797 3.532823e-36 3.303190e-34
1119 Zfp536 -13.351248 -0.143543 1.164816e-40 1.633655e-38
1120 Sox10 -13.862960 -0.142347 1.062029e-43 1.985995e-41
1121 4921534H16Rik -14.482137 -0.034744 1.571361e-47 3.526134e-45

1122 rows × 5 columns

[51]:
celltype = '12 HY GABA'
x = wt_adata.obsm['X_recon'][(wt_adata.obs['class'] == celltype), :]
y = ko_adata.obsm['X_recon'][(ko_adata.obs['class'] == celltype), :]
cmp_adata = sc.AnnData(np.vstack([x, y]), var=ko_adata.var.copy())
cmp_adata.var_names = cmp_adata.var['gene_symbol']
cmp_adata.obs['grp'] = ['WT'] * x.shape[0] + ['KO'] * y.shape[0]
sc.tl.rank_genes_groups(cmp_adata, groupby='grp', reference='WT', method='wilcoxon')
cmp_df = sc.get.rank_genes_groups_df(cmp_adata, group="KO")
cmp_df
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
[51]:
names scores logfoldchanges pvals pvals_adj
0 Slc35d3 0.756400 0.013487 0.449409 0.999606
1 Kirrel 0.742567 0.014942 0.457744 0.999606
2 Sfrp1 0.733180 0.011120 0.463449 0.999606
3 Satb1 0.711935 0.017550 0.476505 0.999606
4 Il1rap 0.702054 0.010552 0.482645 0.999606
... ... ... ... ... ...
1117 Tspear -0.681304 -0.015128 0.495679 0.999606
1118 Tnc -0.696125 -0.024466 0.486350 0.999606
1119 Pappa2 -0.729721 -0.018365 0.465561 0.999606
1120 Pax3 -0.729721 -0.021977 0.465561 0.999606
1121 Irx5 -0.852247 -0.019905 0.394077 0.999606

1122 rows × 5 columns

GSEA on DEs

[52]:
import gseapy
from gseapy import Msigdb

msig = Msigdb()
gmt = msig.get_gmt(category='c5.go.bp', dbver="2024.1.Hs")
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\gseapy\msigdb.py:20: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  d = pd.read_html(resp.text)[0]
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\gseapy\msigdb.py:72: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  d = pd.read_html(resp.text)[0]
[53]:
gene_df = cmp_df.copy()
gene_df.index = cmp_df['names'].str.upper()

gsea_res = gseapy.prerank(rnk=gene_df['scores'], # or rnk = rnk,
                          gene_sets=gmt,
                          threads=5,
                          min_size=5,
                          max_size=1000,
                          permutation_num=1000, # reduce number to speed up testing
                          outdir=None, # don't write to disk
                          seed=0,
                          verbose=True, # see what's going on behind the scenes
                          )

grea_res_selected = gsea_res.res2d[(gsea_res.res2d['NOM p-val'] < 0.01) & (gsea_res.res2d['FDR q-val'] < 0.2)]
2025-03-03 17:52:50,598 [WARNING] Duplicated values found in preranked stats: 36.36% of genes
The order of those genes will be arbitrary, which may produce unexpected results.
2025-03-03 17:52:50,598 [INFO] Parsing data files for GSEA.............................
2025-03-03 17:52:50,688 [INFO] 5396 gene_sets have been filtered out when max_size=1000 and min_size=5
2025-03-03 17:52:50,689 [INFO] 2212 gene_sets used for further statistical testing.....
2025-03-03 17:52:50,690 [INFO] Start to run GSEA...Might take a while..................
2025-03-03 17:52:55,533 [INFO] Congratulations. GSEApy runs successfully................

[54]:
grea_res_selected
[54]:
Name Term ES NES NOM p-val FDR q-val FWER p-val Tag % Gene % Lead_genes
0 prerank GOBP_TELENCEPHALON_DEVELOPMENT 0.482072 2.151512 0.0 0.0786 0.048 33/58 31.37% SLC32A1;ARX;ERBB4;DLX2;LHX6;BCL11B;OXTR;NR2E1;...
1 prerank GOBP_PALLIUM_DEVELOPMENT 0.546905 2.149917 0.0 0.039896 0.049 19/34 23.71% SLC32A1;ARX;DLX2;LHX6;NR2E1;PROX1;DRD1;EGFR;AL...
2 prerank GOBP_REGULATION_OF_ORGAN_GROWTH 0.725254 2.047186 0.001608 0.106388 0.16 7/11 7.13% ARX;ERBB4;COL14A1;IGF1;ZFPM2;PROX1;RBP4
3 prerank GOBP_FOREBRAIN_DEVELOPMENT 0.419623 2.040435 0.0 0.086937 0.172 45/91 29.06% SLC32A1;ARX;ERBB4;DLX2;LHX6;BCL11B;SOX2;LHX8;O...
4 prerank GOBP_CEREBRAL_CORTEX_CELL_MIGRATION 0.676108 2.031769 0.0 0.0786 0.196 8/13 22.37% ARX;LHX6;NR2E1;EGFR;RELN;FGF13;SLIT2;POU3F3
5 prerank GOBP_FOREBRAIN_GENERATION_OF_NEURONS 0.572954 1.984345 0.0 0.131596 0.355 11/21 18.81% DLX2;LHX6;BCL11B;LHX8;NR2E1;PROX1;SOX1;GATA2;N...
6 prerank GOBP_HIPPOCAMPUS_DEVELOPMENT 0.591436 1.956173 0.0 0.163665 0.457 11/19 18.63% SLC32A1;DLX2;NR2E1;PROX1;DRD1;ALK;RELN;FGF13;Z...
[55]:
def func(x):
    x = x[5:]
    return x.replace('_', ' ').lower()
grea_res_selected['Term2'] = grea_res_selected['Term'].apply(func)
C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\2450731809.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grea_res_selected['Term2'] = grea_res_selected['Term'].apply(func)
[56]:
grea_res_selected['-log(q)'] = -np.log10(grea_res_selected['FDR q-val'].astype(float))
fig = sns.relplot(grea_res_selected, y='Term2', x='NES', size='-log(q)', legend=False)
plt.gcf().set_size_inches(1.8, 1.25)
plt.grid(axis='y', zorder=0)
# plt.savefig(savefig_path + "mmbrain_perturbation_go.pdf", transparent=True, bbox_inches='tight')
C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\2706694812.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grea_res_selected['-log(q)'] = -np.log10(grea_res_selected['FDR q-val'].astype(float))
../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_48_1.png
[ ]: