CmhaDSO umap_python

使用例

#@ LOAD
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import igraph as ig
import leidenalg
from scipy.spatial.distance import pdist, squareform
from umap.umap_ import fuzzy_simplicial_set

sns.set(style='whitegrid')
matplotlib.rcParams['figure.dpi'] = 200

print('UMAP:', umap.__version__)
print('python-igraph:', ig.__version__)
#@ INPUT
homd = 'scrna'

genes = pd.read_csv(os.path.join(homd, 'genes.tsv'),
sep='\t', index_col=0)

data_HVGs = pd.read_pickle(os.path.join(homd, 'data_HVGs.gz'))
data_scaled = pd.read_pickle(os.path.join(homd, 'data_scaled.gz'))

data_scaled.iloc[:5,:5]
#@
sample_labels=['Days0-3','Days6-9', 'Days12-15', 'Days18-21', 'Days24-27']
label_to_colors= {
'Days0-3': 'red',
'Days6-9': 'orange',
'Days12-15': 'yellow',
'Days18-21': 'green',
'Days24-27': 'blue'}

def myplot(ax, coords, label_x, label_y):
for label in sample_labels:
cell_mask = data_scaled.columns.get_level_values(0) == label
ax.scatter(coords[cell_mask, 0],
coords[cell_mask, 1],
color=label_to_colors[label],
linewidth=0,
label=label,
s=6,
alpha=.5)
ax.set_xlabel(label_x)
ax.set_ylabel(label_y)
ax.set_xticks([])
ax.set_yticks([])
#@ UMAP
umap_model_nn5 = umap.UMAP(n_components=2,
n_neighbors=5,
min_dist=.3,
random_state=42,
verbose=False)
umap_coords_nn5 = umap_model_nn5.fit_transform(data_scaled.values.T)

umap_model_nn30 = umap.UMAP(n_components=2,
n_neighbors=30,
min_dist=.3,
random_state=42,
verbose=False)
umap_coords_nn30 = umap_model_nn30.fit_transform(data_scaled.values.T)

umap_model_nn500 = umap.UMAP(n_components=2,
n_neighbors=500,
min_dist=.3,
random_state=42,
verbose=False)
umap_coords_nn500 = umap_model_nn500.fit_transform(data_scaled.values.T)
#@ VISUALIZE
fig = plt.figure(figsize=(6,1.5))

ax1 = fig.add_subplot(1, 3, 1)
myplot(ax1, umap_coords_nn5, label_x='UMAP1', label_y='UMAP2')
ax1.set_title('n_neighbors=5')

ax2 = fig.add_subplot(1, 3, 2)
myplot(ax2, umap_coords_nn30, label_x='UMAP1', label_y='UMAP2')
ax2.set_title('n_neighbors=30')

ax3 = fig.add_subplot(1, 3, 3)
myplot(ax3, umap_coords_nn500, label_x='UMAP1', label_y='UMAP2')
ax3.set_title('n_neighbors=500')
ax3.legend(loc='center left', bbox_to_anchor=(1, .5))

sns.despine()
plt.show()
#@ UMAP
umap_model_md001 = umap.UMAP(n_components=2,
n_neighbors=30,
min_dist=.01,
random_state=42,
verbose=False)
umap_coords_md001 = umap_model_md001.fit_transform(data_scaled.values.T)

umap_model_md030 = umap.UMAP(n_components=2,
n_neighbors=30,
min_dist=.3,
random_state=42,
verbose=False)
umap_coords_md030 = umap_model_md030.fit_transform(data_scaled.values.T)

umap_model_md080 = umap.UMAP(n_components=2,
n_neighbors=30,
min_dist=.8,
random_state=42,
verbose=False)
umap_coords_md080 = umap_model_md080.fit_transform(data_scaled.values.T)
# VISUALIZE
fig = plt.figure(figsize=(6,1.5))

ax1 = fig.add_subplot(1, 3, 1)
myplot(ax1, umap_coords_md001, label_x='UMAP1', label_y='UMAP2')
ax1.set_title('min_dist=0.01')

ax2 = fig.add_subplot(1, 3, 2)
myplot(ax2, umap_coords_md030, label_x='UMAP1', label_y='UMAP2')
ax2.set_title('min_dist=0.30')

ax3 = fig.add_subplot(1, 3, 3)
myplot(ax3, umap_coords_md080, label_x='UMAP1', label_y='UMAP2')
ax3.set_title('min_dist_=0.80')
ax3.legend(loc='center left', bbox_to_anchor=(1, .5))

sns.despine()
plt.show()
#@
top100_gene_index = data_HVGs.var(axis=1).sort_values(ascending=False).index[:100]
data_for_hclust = data_HVGs.loc[top100_gene_index, :]
random_cell_index = np.random.choice(len(data_for_hclust.columns),
100,
replace=False)
data_for_hclust = data_for_hclust.iloc[:, random_cell_index]
#@
cell_labels = data_for_hclust.columns.get_level_values(0)
cell_colors = [label_to_colors[label] for label in cell_labels]

sns.clustermap(data_for_hclust,
method='ward',
metric='euclidean',
xticklabels=False,
yticklabels=False,
col_colors=cell_colors,
figsize=(1, .5))
#@
#data_for_hclust['Gene symbol'] = genes.loc[data_for_hclust.index, 'Symbol']
#data_for_hclust = data_for_hclust.set_index('Gene symbol')
#data_for_hclust

sns.set(font_scale=.3)
sns.clustermap(data_for_hclust,
method='ward',
metric='euclidean',
dendrogram_ratio=(.1, .05),
colors_ratio=.01,
cbar_pos = None,
xticklabels=False,
col_colors=cell_colors,
figsize=(.5, 1))
#@
D = squareform(pdist(data_scaled.values.T, metric='euclidean'))

n_neighbors = 30

knn_indices = np.argpartition(D, n_neighbors, axis=1)[:, :n_neighbors]
knn_distances = D[np.arange(D.shape[0])[:, None], knn_indices]

connectivities = fuzzy_simplicial_set(data_scaled.values.T,
n_neighbors,
random_state=42,
metric='euclidean',
knn_indices=knn_indices,
knn_dists=knn_distances)

connectivities = connectivities[0]
print(connectivities[:5,:10])
#@
sources, targets = connectivities.nonzero()
weights = connectivities[sources, targets]

g = ig.Graph()
g.add_vertices(connectivities.shape[0])
g.add_edges(list(zip(sources, targets)))
g.es['weight'] = weights

partition = leidenalg.find_partition(g, leidenalg.ModularityVertexPartition,
weights=np.array(weights)[0],
seed=42)

clusters=np.array(partition.membership)

print(clusters)
print(np.unique(clusters))
#@
colors = [matplotlib.colors.to_hex(x) for x in matplotlib.cm.tab10.colors]

def plot_clusters(coords, clusters, ax):
for cluster_id in np.unique(clusters):
ax.scatter(coords[clusters==cluster_id, 0],
coords[clusters==cluster_id, 1],
s=6, alpha=.5,
c=colors[cluster_id])
for cluster_id in np.unique(clusters):
centroid = np.mean(coords[clusters==cluster_id, :], axis=0)
ax.annotate(str(cluster_id),
xy=(centroid),
fontsize=13,
color='white',
bbox={'facecolor': colors[cluster_id], 'edgecolor':'k', 'alpha':.8})
plt.axis('off')


fig, ax = plt.subplots(figsize=(3,3))
plot_clusters(umap_coords, clusters, ax)
plt.show()
#@
sns.set(style='whitegrid')

cell_labels = data_scaled.columns.get_level_values(0)
cell_clusters = ['Cluster-'+str(cl) for cl in clusters]

data_c = pd.DataFrame(zip(cell_labels, cell_clusters),
columns=['Label', 'Cluster'])

data_c = pd.crosstab(data_c['Label'], data_c['Cluster'])
data_c = data_c.loc[['Days0-3', 'Days6-9','Days12-15','Days18-21','Days24-27'], :]

proportions = 100. * data_c.values / data_c.values.sum(axis=1)[:, None]
data_c = pd.DataFrame(proportions, index=data_c.index, columns=data_c.columns)

data_c.plot(kind='bar', stacked=True, figsize=(4,2)).legend(bbox_to_anchor=(1.25, 1.05))
#@
expressions = data_HVGs.loc['ENSG00000204531', :].values

data_c = pd.DataFrame(zip(cell_clusters, expressions),
columns=['Cluster', 'POU5F1'])

fig, ax = plt.subplots(figsize=(12,6))
sns.violinplot(x='Cluster', y='POU5F1',
data=data_c,
scale='width',
order=np.sort(np.unique(cell_clusters)),
ax=ax)

plt.show()