Spatial perturbation tasks on mouse brain data

[1]:

import os
import sys
sys.path.append("../")
device = "cuda"
import importlib

[2]:

import scanpy as sc
import squidpy as sq
import pandas as pd
from tqdm.notebook import tqdm
import scipy as sp
import numpy as np
import multiprocessing
import pickle as pkl
import torch
import gc
import sklearn.metrics

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'dejavuserif'
matplotlib.rcParams['font.family'] = 'arial'
matplotlib.rc('pdf', fonttype=42)

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\dask\dataframe\_pyarrow_compat.py:15: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 11.0.0. Please consider upgrading.
  warnings.warn(

[3]:

import steamboat as sf
import steamboat.tools

[4]:

import torch

[5]:

adata = sc.read_h5ad("saved_h5ad/mmbrain_0.h5ad")

[6]:

model = sf.Steamboat(adata.var_names.tolist(), n_heads=50, n_scales=3)
model = model.to(device)
model.load_state_dict(torch.load('saved_models/mmbrain.pth', weights_only=True), strict=False)

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\torch\_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()

[6]:

_IncompatibleKeys(missing_keys=[], unexpected_keys=['spatial_gather.w_local._scale', 'spatial_gather.w_global._scale'])

[7]:

import importlib
importlib.reload(steamboat.tools)

[7]:

<module 'steamboat.tools' from 'G:\\Projects\\Steamboat\\examples\\..\\steamboat\\tools.py'>

[8]:

model.features = adata.var['gene_symbol']
sf.tools.plot_all_transforms(model, top=3, head_order=[2, 5, 24], figsize=(4, 5))
# # plt.savefig(fig_path + "crc_all_loadings.pdf", bbox_inches='tight')

['Gbx2' 'Trdn' 'Edaradd' 'Hoxb3' 'Barhl2' 'Sec14l5' 'Mog' 'C030029H02Rik'
 'Cldn11' 'Sox10' 'Dgkk' 'Drd2' 'Onecut2' 'Mafa' 'Gabra6' 'Cnpy1' 'Grm3'
 'Sema6a' 'Ermn' 'Nr2e1' 'Emx2os' 'Sstr1' 'Drd1' 'Grin2c' 'Mfge8' 'S1pr1']

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_8_1.png

Spatial perturbation scenario 1: transplanting cells

Designate cells to be “transplanted”

[9]:

# Find OPC-Oligo in two regions

adata.obs['class_region'] = adata.obs['class'].astype(str) + '@' + adata.obs['parcellation_division'].astype(str)
adata.obs['oligo_class_region'] = adata.obs['class_region']
adata.obs.loc[adata.obs['class_region'] != '31 OPC-Oligo', 'oligo_class_region'] = '98 Other'

[10]:

# Confirm that there are some expression differences in OPC-Oligo in two regions.

oligo_adata = adata[adata.obs['class_region'].isin(['31 OPC-Oligo@Isocortex', '31 OPC-Oligo@HPF'])]

[11]:

oligo_adata.obs['class_region'].value_counts()

[11]:

class_region
31 OPC-Oligo@HPF          980
31 OPC-Oligo@Isocortex    784
Name: count, dtype: int64

[12]:

sc.tl.rank_genes_groups(oligo_adata, groupby='class_region', reference='31 OPC-Oligo@Isocortex', method='wilcoxon')

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\anndata\_core\anndata.py:1292: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  df[key] = c
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\anndata\_core\anndata.py:1292: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  df[key] = c
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)

[13]:

sc.get.rank_genes_groups_df(oligo_adata, group='31 OPC-Oligo@HPF')

[13]:

	names	scores	logfoldchanges	pvals	pvals_adj
0	ENSMUSG00000020932	13.187178	2.604712	1.040073e-39	1.166962e-36
1	ENSMUSG00000090291	5.466264	4.215365	4.596197e-08	6.446167e-06
2	ENSMUSG00000023913	3.856600	0.769698	1.149752e-04	9.146835e-03
3	ENSMUSG00000031980	3.806837	2.439926	1.407553e-04	9.870467e-03
4	ENSMUSG00000053930	3.645979	2.021932	2.663758e-04	1.758080e-02
...	...	...	...	...	...
1117	ENSMUSG00000058624	-6.016663	-1.100518	1.780497e-09	3.329530e-07
1118	ENSMUSG00000033730	-6.205130	-2.224680	5.465168e-10	1.226384e-07
1119	ENSMUSG00000070570	-6.513019	-0.717460	7.365549e-11	2.066036e-08
1120	ENSMUSG00000047907	-6.625102	-2.285636	3.470090e-11	1.297814e-08
1121	ENSMUSG00000005583	-10.971990	-2.906106	5.211266e-28	2.923520e-25

1122 rows × 5 columns

[14]:

old_adata = adata.copy()
new_adata = adata.copy()

[15]:

sq.pl.spatial_scatter(old_adata, color='parcellation_division', shape=None, figsize=(5, 5), size=1., lw=0., legend_fontsize=9, title="", frameon=False)

WARNING: Please specify a valid `library_id` or set it permanently in `adata.uns['spatial']`

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\squidpy\pl\_spatial_utils.py:946: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  _cax = scatter(

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_16_2.png

[16]:

repalcement_class = '31 OPC-Oligo@Isocortex'
to_be_replaced_class = '31 OPC-Oligo@HPF'
new_name = '31 OPC-Oligo@Isocortex->HPF'

n = 200
np.random.seed(0)
replacement = np.random.choice(np.where(new_adata.obs['class_region'] == repalcement_class)[0], n, replace=False)
to_be_replaced = np.random.choice(np.where(new_adata.obs['class_region'] == to_be_replaced_class)[0], n, replace=False)

Reconstruct the gene expression

[17]:

new_adata.X[to_be_replaced, :] = new_adata.X[replacement, :]
new_adata.obs['class2'] = new_adata.obs['class_region'].tolist()
new_adata.obs.loc[new_adata.obs_names[to_be_replaced], 'class2'] = new_name

new_dataset = sf.make_dataset([new_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([new_adata], new_dataset, model, get_recon=True)
new_adata.X = new_adata.obsm['X_recon']

Using ['global'] as regional annotations.

Perform DE analysis

[18]:

highlight = [repalcement_class, new_name, to_be_replaced_class]
sc.tl.rank_genes_groups(new_adata, groupby='class2', groups=highlight, reference=repalcement_class, method='wilcoxon')

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)

[19]:

a_df = pd.merge(sc.get.rank_genes_groups_df(new_adata, group=to_be_replaced_class), new_adata.var, right_index=True, left_on='names')
b_df = pd.merge(sc.get.rank_genes_groups_df(new_adata, group=new_name), new_adata.var, right_index=True, left_on='names')
ab_df = pd.merge(a_df, b_df, left_on='names', right_on='names')

[20]:

matplotlib.rcParams['mathtext.fontset'] = 'dejavuserif'
matplotlib.rcParams['font.family'] = 'arial'
matplotlib.rc('pdf', fonttype=42)

fig, ax = plt.subplots(figsize=(1.5, 1.5))
ab_df.plot(kind='scatter', x='logfoldchanges_x', y='logfoldchanges_y', s=0.5, ax=ax, alpha=.5)
ax.set_xlabel(to_be_replaced_class)
ax.set_ylabel(new_name)

for pos in ['right', 'top']:
        plt.gca().spines[pos].set_visible(False)

# plt.savefig(savefig_path + "mmbrain_perturbation_logfc.pdf", bbox_inches='tight', transparent=True)

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_23_0.png

[21]:

ab_df2 = ab_df[~ab_df['logfoldchanges_x'].isna() & ~ab_df['logfoldchanges_y'].isna()]
sp.stats.spearmanr(ab_df2['logfoldchanges_x'], ab_df2['logfoldchanges_y'])

[21]:

SignificanceResult(statistic=0.6419909066233113, pvalue=2.277434802943895e-131)

[22]:

ab_df[['logfoldchanges_x', 'logfoldchanges_y']].corr(method='spearman')

[22]:

	logfoldchanges_x	logfoldchanges_y
logfoldchanges_x	1.000000	0.641991
logfoldchanges_y	0.641991	1.000000

[23]:

ab_df[['logfoldchanges_x', 'logfoldchanges_y']].corr(method='pearson')
# 0.475

[23]:

	logfoldchanges_x	logfoldchanges_y
logfoldchanges_x	1.000000	0.680731
logfoldchanges_y	0.680731	1.000000

Spatial perturbation scenario 2: changing environment.

Overview the variability of expression of genes

[38]:

adata_rearrange = adata[np.where(~adata.obs['class'].isin(['31 OPC-Oligo']))[0].tolist() +
                        np.where(adata.obs['class'].isin(['31 OPC-Oligo']))[0].tolist()]
adata_rearrange.obs['class'] = adata_rearrange.obs['class'].astype(str)
adata_rearrange.obs.loc[adata_rearrange.obs['class'] != '31 OPC-Oligo', 'class'] = '99 Other'
sq.pl.spatial_scatter(adata_rearrange, color='class', shape=None, figsize=(4, 4), size=1., legend_fontsize=9, title="", frameon=False,
                      palette=matplotlib.colors.ListedColormap(['C0', '#f0f0f0']))
# plt.savefig(savefig_path + "mmbrain_KO_scheme.pdf", bbox_inches='tight', transparent=True)

C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\147967129.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata_rearrange.obs['class'] = adata_rearrange.obs['class'].astype(str)

WARNING: Please specify a valid `library_id` or set it permanently in `adata.uns['spatial']`

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\squidpy\pl\_spatial_utils.py:946: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  _cax = scatter(

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_28_3.png

[39]:

wt_adata = adata.copy()
ko_adata = adata.copy()

[40]:

pd.Series((wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].std(axis=0)/
           wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0)), index=adata.var['gene_symbol']).sort_values()

[40]:

gene_symbol
Sox10      0.298431
Nfix       0.443308
Cldn11     0.455745
Mog        0.514136
Pou3f3     0.569786
            ...
Lhx4      28.757689
Npffr2    28.905586
Fgf3      29.486641
Scgn      29.530397
Hoxb8     30.028955
Length: 1122, dtype: float32

[41]:

dispersion_df = pd.DataFrame([wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].var(axis=0),
             wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0)],
             columns=adata.var['gene_symbol'],
             index=['var', 'mean']).T
dispersion_df['dispersion'] = dispersion_df['var'] / dispersion_df['mean']
dispersion_df = dispersion_df[dispersion_df['mean'] > 1.]
dispersion_df.sort_values('dispersion', ascending=False)

[41]:

	var	mean	dispersion
gene_symbol
Grm3	0.706748	1.044864	0.676402
Ermn	0.840322	1.308472	0.642216
Cldn11	1.662142	2.828864	0.587565
Sec14l5	0.932986	1.588670	0.587275
Sulf2	0.736025	1.312926	0.560599
Sema6a	0.627580	1.130927	0.554925
Mog	1.162848	2.097412	0.554421
Zfp536	0.711731	1.330900	0.534774
Gprc5b	0.698224	1.348250	0.517874
Zeb2	0.613312	1.227442	0.499667
Pou3f3	0.565051	1.319265	0.428307
Nfix	0.569929	1.702961	0.334669
Sox10	0.429683	2.196494	0.195622

[42]:

plt.scatter(wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].mean(axis=0),
            wt_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0]].var(axis=0))

[42]:

<matplotlib.collections.PathCollection at 0x174f6c463d0>

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_32_1.png

Remove Mog from OPC-Oligo and reconstruct cells

[43]:

ko_gene = 'Mog'

plt.hist(ko_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0],
         np.where(ko_adata.var['gene_symbol'] == ko_gene)[0][0]])
ko_adata.X[np.where(ko_adata.obs['class'] == '31 OPC-Oligo')[0],
           np.where(ko_adata.var['gene_symbol'] == ko_gene)[0][0]] = 0
ko_dataset = sf.make_dataset([ko_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([ko_adata], ko_dataset, model, get_recon=True)

wt_dataset = sf.make_dataset([wt_adata], sparse_graph=True, regional_obs=['global'])
sf.tools.calc_obs([wt_adata], wt_dataset, model, get_recon=True)

Using ['global'] as regional annotations.

Using ['global'] as regional annotations.

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_34_6.png

[44]:

gene_rmse = ((ko_adata.obsm['X_recon'] - wt_adata.obsm['X_recon']) ** 2).sum(axis=0) ** .5

[45]:

cell_rmse = ((ko_adata.obsm['X_recon'] - wt_adata.obsm['X_recon']) ** 2).sum(axis=1) ** .5

[46]:

cell_rmse

[46]:

array([0.01820022, 0.04508448, 0.04385628, ..., 0.03686631, 0.04018443,
       0.02311902], dtype=float32)

[47]:

ko_adata.obs['cell_rmse'] = cell_rmse

[48]:

fig, ax = plt.subplots(figsize=(.8, 3))
my_obs = ko_adata.obs.copy()
my_obs['class'] = my_obs['class'].tolist()
my_obs = my_obs.sort_values('class')
class_count = my_obs['class'].value_counts()
chosen_classes = class_count[class_count > (class_count.sum()) * 0.005].index.tolist()
chosen_classes.remove('31 OPC-Oligo')
sns.boxplot(my_obs[my_obs['class'].isin(chosen_classes)], y='class', x='cell_rmse',
            fliersize=.0, linewidth=.5, ax=ax)
#ax.set_yticklabels(ax.get_yticklabels(), rotation=30, ha='right', va='top', rotation_mode='anchor')
ax.set_ylabel("")
ax.set_xlabel("RMSE(WT, KO)")
for pos in ['right', 'top']:
    plt.gca().spines[pos].set_visible(False)
plt.xlim([0, .2])
# plt.savefig(savefig_path + "mmbrain_perturbation_cell_mse.pdf", transparent=True, bbox_inches='tight')

[48]:

(0.0, 0.2)

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_39_1.png

[49]:

fig, ax = plt.subplots(figsize=(4.5, .8))
my_obs = ko_adata.obs.copy()
my_obs['class'] = my_obs['class'].tolist()
my_obs = my_obs.sort_values('class')
class_count = my_obs['class'].value_counts()
chosen_classes = class_count[class_count > (class_count.sum()) * 0.005].index.tolist()
chosen_classes.remove('31 OPC-Oligo')
sns.boxplot(my_obs[my_obs['class'].isin(chosen_classes)], x='class', y='cell_rmse',
            fliersize=.0, linewidth=.5, ax=ax)
#ax.set_yticklabels(ax.get_yticklabels(), rotation=30, ha='right', va='top', rotation_mode='anchor')
ax.set_ylabel("RMSE(WT, KO)")
ax.set_xlabel("")
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right', va='center', rotation_mode='anchor')
for pos in ['right', 'top']:
    plt.gca().spines[pos].set_visible(False)
plt.ylim([0, .2])
# plt.savefig(savefig_path + "mmbrain_perturbation_cell_mse.pdf", transparent=True, bbox_inches='tight')

C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\945811404.py:13: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right', va='center', rotation_mode='anchor')

[49]:

(0.0, 0.2)

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_40_2.png

[50]:

x = wt_adata.obsm['X_recon'][wt_adata.obs['class'] == '31 OPC-Oligo', :]
y = ko_adata.obsm['X_recon'][ko_adata.obs['class'] == '31 OPC-Oligo', :]
cmp_adata = sc.AnnData(np.vstack([x, y]), var=ko_adata.var.copy())
cmp_adata.var_names = cmp_adata.var['gene_symbol']
cmp_adata.obs['grp'] = ['WT'] * x.shape[0] + ['KO'] * y.shape[0]
sc.tl.rank_genes_groups(cmp_adata, groupby='grp', method='wilcoxon')
cmp_df = sc.get.rank_genes_groups_df(cmp_adata, group="KO")
cmp_df

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)

[50]:

	names	scores	logfoldchanges	pvals	pvals_adj
0	Pif1	16.773153	0.083190	3.835997e-63	4.303989e-60
1	Diaph3	14.955308	0.079347	1.438155e-50	8.068052e-48
2	Clspn	14.882693	0.067352	4.269589e-50	1.596826e-47
3	Slfn9	14.656121	0.064221	1.231072e-48	3.453157e-46
4	Nfib	13.375875	0.069715	8.366315e-41	1.341001e-38
...	...	...	...	...	...
1117	Sec14l5	-11.687506	-0.152907	1.476599e-31	8.283721e-30
1118	Cldn11	-12.559337	-0.239797	3.532823e-36	3.303190e-34
1119	Zfp536	-13.351248	-0.143543	1.164816e-40	1.633655e-38
1120	Sox10	-13.862960	-0.142347	1.062029e-43	1.985995e-41
1121	4921534H16Rik	-14.482137	-0.034744	1.571361e-47	3.526134e-45

1122 rows × 5 columns

[51]:

celltype = '12 HY GABA'
x = wt_adata.obsm['X_recon'][(wt_adata.obs['class'] == celltype), :]
y = ko_adata.obsm['X_recon'][(ko_adata.obs['class'] == celltype), :]
cmp_adata = sc.AnnData(np.vstack([x, y]), var=ko_adata.var.copy())
cmp_adata.var_names = cmp_adata.var['gene_symbol']
cmp_adata.obs['grp'] = ['WT'] * x.shape[0] + ['KO'] * y.shape[0]
sc.tl.rank_genes_groups(cmp_adata, groupby='grp', reference='WT', method='wilcoxon')
cmp_df = sc.get.rank_genes_groups_df(cmp_adata, group="KO")
cmp_df

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\numpy\core\fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)
  return reduction(axis=axis, out=out, **passkwargs)

[51]:

	names	scores	logfoldchanges	pvals	pvals_adj
0	Slc35d3	0.756400	0.013487	0.449409	0.999606
1	Kirrel	0.742567	0.014942	0.457744	0.999606
2	Sfrp1	0.733180	0.011120	0.463449	0.999606
3	Satb1	0.711935	0.017550	0.476505	0.999606
4	Il1rap	0.702054	0.010552	0.482645	0.999606
...	...	...	...	...	...
1117	Tspear	-0.681304	-0.015128	0.495679	0.999606
1118	Tnc	-0.696125	-0.024466	0.486350	0.999606
1119	Pappa2	-0.729721	-0.018365	0.465561	0.999606
1120	Pax3	-0.729721	-0.021977	0.465561	0.999606
1121	Irx5	-0.852247	-0.019905	0.394077	0.999606

1122 rows × 5 columns

GSEA on DEs

[52]:

import gseapy
from gseapy import Msigdb

msig = Msigdb()
gmt = msig.get_gmt(category='c5.go.bp', dbver="2024.1.Hs")

C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\gseapy\msigdb.py:20: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  d = pd.read_html(resp.text)[0]
C:\Users\lshh\miniconda3\envs\py311_torch211_cuda121\Lib\site-packages\gseapy\msigdb.py:72: FutureWarning: Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
  d = pd.read_html(resp.text)[0]

[53]:

gene_df = cmp_df.copy()
gene_df.index = cmp_df['names'].str.upper()

gsea_res = gseapy.prerank(rnk=gene_df['scores'], # or rnk = rnk,
                          gene_sets=gmt,
                          threads=5,
                          min_size=5,
                          max_size=1000,
                          permutation_num=1000, # reduce number to speed up testing
                          outdir=None, # don't write to disk
                          seed=0,
                          verbose=True, # see what's going on behind the scenes
                          )

grea_res_selected = gsea_res.res2d[(gsea_res.res2d['NOM p-val'] < 0.01) & (gsea_res.res2d['FDR q-val'] < 0.2)]

2025-03-03 17:52:50,598 [WARNING] Duplicated values found in preranked stats: 36.36% of genes
The order of those genes will be arbitrary, which may produce unexpected results.
2025-03-03 17:52:50,598 [INFO] Parsing data files for GSEA.............................
2025-03-03 17:52:50,688 [INFO] 5396 gene_sets have been filtered out when max_size=1000 and min_size=5
2025-03-03 17:52:50,689 [INFO] 2212 gene_sets used for further statistical testing.....
2025-03-03 17:52:50,690 [INFO] Start to run GSEA...Might take a while..................
2025-03-03 17:52:55,533 [INFO] Congratulations. GSEApy runs successfully................

[54]:

grea_res_selected

[54]:

	Name	Term	ES	NES	NOM p-val	FDR q-val	FWER p-val	Tag %	Gene %	Lead_genes
0	prerank	GOBP_TELENCEPHALON_DEVELOPMENT	0.482072	2.151512	0.0	0.0786	0.048	33/58	31.37%	SLC32A1;ARX;ERBB4;DLX2;LHX6;BCL11B;OXTR;NR2E1;...
1	prerank	GOBP_PALLIUM_DEVELOPMENT	0.546905	2.149917	0.0	0.039896	0.049	19/34	23.71%	SLC32A1;ARX;DLX2;LHX6;NR2E1;PROX1;DRD1;EGFR;AL...
2	prerank	GOBP_REGULATION_OF_ORGAN_GROWTH	0.725254	2.047186	0.001608	0.106388	0.16	7/11	7.13%	ARX;ERBB4;COL14A1;IGF1;ZFPM2;PROX1;RBP4
3	prerank	GOBP_FOREBRAIN_DEVELOPMENT	0.419623	2.040435	0.0	0.086937	0.172	45/91	29.06%	SLC32A1;ARX;ERBB4;DLX2;LHX6;BCL11B;SOX2;LHX8;O...
4	prerank	GOBP_CEREBRAL_CORTEX_CELL_MIGRATION	0.676108	2.031769	0.0	0.0786	0.196	8/13	22.37%	ARX;LHX6;NR2E1;EGFR;RELN;FGF13;SLIT2;POU3F3
5	prerank	GOBP_FOREBRAIN_GENERATION_OF_NEURONS	0.572954	1.984345	0.0	0.131596	0.355	11/21	18.81%	DLX2;LHX6;BCL11B;LHX8;NR2E1;PROX1;SOX1;GATA2;N...
6	prerank	GOBP_HIPPOCAMPUS_DEVELOPMENT	0.591436	1.956173	0.0	0.163665	0.457	11/19	18.63%	SLC32A1;DLX2;NR2E1;PROX1;DRD1;ALK;RELN;FGF13;Z...

[55]:

def func(x):
    x = x[5:]
    return x.replace('_', ' ').lower()
grea_res_selected['Term2'] = grea_res_selected['Term'].apply(func)

C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\2450731809.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grea_res_selected['Term2'] = grea_res_selected['Term'].apply(func)

[56]:

grea_res_selected['-log(q)'] = -np.log10(grea_res_selected['FDR q-val'].astype(float))
fig = sns.relplot(grea_res_selected, y='Term2', x='NES', size='-log(q)', legend=False)
plt.gcf().set_size_inches(1.8, 1.25)
plt.grid(axis='y', zorder=0)
# plt.savefig(savefig_path + "mmbrain_perturbation_go.pdf", transparent=True, bbox_inches='tight')

C:\Users\lshh\AppData\Local\Temp\ipykernel_116268\2706694812.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grea_res_selected['-log(q)'] = -np.log10(grea_res_selected['FDR q-val'].astype(float))

../_images/tutorial_nbs_Ex2_mouse_brain_spatial_perturbation_48_1.png

[ ]: