#### API ###
# Functions used below, followed by the AnnData fields listed for them.
# sc.pp.normalize_per_cell
# scv.pp.normalize_per_cell .obs.n_counts .var.gene_count_corr
# sc.pp.log1p .uns.log1p
# scv.pp.log1p
# sc.pp.highly_variable_genes .var.highly_variable .var.means .var.dispersions .var.dispersions_norm .uns.hvg
# sc.pp.scale .var.mean .var.std
# scvi (keys set explicitly) .obs.solo .obsm.X_scVI .layers.scVI_normalized
# sc.pp.neighbors .uns.neighbors .obsp.distances .obsp.connectivities
# sc.tl.umap .uns.umap .obsm.X_umap
# sc.tl.leiden .uns.leiden .obs.leiden_scVI  # key set explicitly
import os import numpy as np import pandas as pd import scanpy as sc
# --- Load the count matrix -------------------------------------------------
# Alternative entry point: reload a previously saved object instead of
# reading the raw 10x output.
#adata = sc.read("adata.h5ad")

# NEED_TO_CHANGE: directory handed to sc.read_10x_mtx.
# The original placeholder name `10X_dir` is not a valid Python identifier
# (a name cannot start with a digit), so the placeholder now lives in a
# properly named variable instead.
tenx_dir = "10X_dir"
adata = sc.read_10x_mtx(tenx_dir, var_names="gene_symbols")
adata.var_names_make_unique()

# Keep only genes whose summed count over all cells is non-zero.
# `.A1` flattens the matrix returned by the column sum (assumes adata.X is a
# scipy sparse matrix, as produced by read_10x_mtx -- TODO confirm).
# `.copy()` turns the sliced view into an independent AnnData so the
# in-place assignments that follow do not operate on a view.
adata = adata[:, adata.X.sum(axis=0).A1 != 0].copy()

# AnnData class
#   .X    data matrix
#   .var  annotation of variables
#   .obs  annotation of observations
#   .uns  unstructured annotation
# --- Per-cell QC metrics and mitochondrial filtering -----------------------
# NEED_TO_CHANGE: prefix that identifies mitochondrial genes in var_names
# ('mt-' here; adjust to the gene naming of the dataset, e.g. 'MT-').
Mt = adata.var_names.str.startswith('mt-')

# Row sums are computed once and reused for both pMt and nUMIs.
total_counts = adata.X.sum(axis=1).A1
mito_counts = adata[:, Mt].X.sum(axis=1).A1

adata.obs['pMt'] = mito_counts / total_counts * 100          # percent of counts in Mt genes
adata.obs['nUMIs'] = total_counts                            # total counts per cell
adata.obs['nGenes'] = (adata.X > 0.0).astype(int).sum(axis=1).A1  # genes with a non-zero count

# Distribution of the three metrics before filtering.
sc.pl.violin(adata, ['nGenes', 'nUMIs', 'pMt'], multi_panel=True)

# Keep cells with less than 20 % mitochondrial counts. `.copy()` makes the
# result an independent AnnData rather than a view, because the following
# preprocessing steps modify the object in place.
adata = adata[adata.obs['pMt'] < 20, :].copy()

# Same plot after filtering, for comparison.
sc.pl.violin(adata, ['nGenes', 'nUMIs', 'pMt'], multi_panel=True)
# --- Normalization, log transform, HVG flagging, scaling -------------------
# Normalize with target_sum=1e4, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag the top 2000 highly variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# Snapshot taken before scaling, so .raw keeps the values as they are at
# this point (after normalize_total + log1p).
adata.raw = adata

sc.pp.scale(adata)
# --- PCA -------------------------------------------------------------------
# Run PCA with the ARPACK solver, then plot the variance ratio per
# component on a log scale.
sc.tl.pca(adata, svd_solver="arpack")
sc.pl.pca_variance_ratio(adata, log=True)
# --- Neighborhood graph, UMAP embedding, Leiden clustering -----------------
# The graph is built on adata.obsm["X_scVI"]; that representation is not
# computed in this section and must already be present.
#
# `n_pcs` is intentionally not passed: combined with `use_rep`, scanpy uses
# it to keep only the first `n_pcs` columns of the representation and raises
# a ValueError when the representation has fewer columns. Cutting off scVI
# latent dimensions is not meaningful, so the full representation is used.
sc.pp.neighbors(adata, n_neighbors=25, use_rep="X_scVI")
# n_neighbors : int (default: 15)
#             : Size of the local neighborhood used for manifold approximation.
#             : Larger values give a more global view of the manifold,
#             : smaller values preserve more local structure.

sc.tl.umap(adata, min_dist=0.4, spread=0.6)
# min_dist : float (default: 0.5)
#          : Smaller values lead to a more clustered/clumped embedding.
# spread   : float (default: 1.0)
#          : Effective scale of the embedded points; together with min_dist
#          : it determines how clustered/clumped the embedding is.

# Cluster labels are stored in adata.obs["leiden_scVI"].
sc.tl.leiden(adata, resolution=0.4, key_added="leiden_scVI")
# resolution : float (default: 1)
#            : Larger values result in more clusters.
# --- Plots -----------------------------------------------------------------
# Color and group by the clustering written by sc.tl.leiden
# (key_added="leiden_scVI"); no obs column named 'clusters' is created
# anywhere in this script.
sc.pl.umap(
    adata,
    color='leiden_scVI',
    frameon=False,
    size=20,
    use_raw=False,  # when .raw is set, its data takes priority unless disabled
)

# 'gene1' / 'gene2' look like placeholder gene names -- replace them with
# genes present in the dataset.
sc.pl.violin(
    adata,
    ['gene1', 'gene2'],
    groupby='leiden_scVI',
    size=1.5,
    xlabel=None,
    ylabel=None,
)
# --- Marker genes per cluster ----------------------------------------------
# Group by 'leiden_scVI', the key under which sc.tl.leiden stored the
# clustering (key_added="leiden_scVI"); there is no obs column 'leiden'.
sc.tl.rank_genes_groups(adata, 'leiden_scVI', method='wilcoxon')

# Table built from the 'names' entry of the stored result (the ranked gene
# names; presumably one column per cluster -- verify against the output).
df = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
# --- Manual cluster annotation ---------------------------------------------
# Lookup table: Leiden cluster id (as string) -> annotation label.
D = {
    '0': 'cell_1',
    '1': 'cell_2',
    '2': 'cell_2',
    '3': 'cell_1',
}

# Translate each cell's cluster id through D and store the result as a
# categorical column named 'NEW'. Ids that are not keys of D become NaN.
cluster_labels = adata.obs['leiden_scVI'].map(D)
adata.obs['NEW'] = cluster_labels.astype('category')
# --- Save results ----------------------------------------------------------
# Annotated object.
adata.write("adata.h5ad")

# Ranked-gene table.
df.to_csv("df.csv")

# UMAP figure; saved in the "figures" dir.
# NOTE(review): "sufix" looks like a typo for "suffix"; it is kept as-is
# because the string is part of the output file name.
sc.pl.umap(adata, save="_sufix.png")