CmhaDSO python_RPKM

使用例

#@ %matplotlib inline import os import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy import stats #@ homd = 'data' count_file='counts.txt' gene_file='genes.csv' df = pd.read_csv(os.path.join(homd, count_file), index_col=0) df_gene = pd.read_csv(os.path.join(homd,gene_file), index_col=0) #@ print(df.shape) print(df.iloc[:5,:5]) #@ print(df_gene.shape) print(df_gene.iloc[:5,:]) #@ matched_index = pd.Index.intersection(df.index, df_gene.index) counts = np.asarray(df.loc[matched_index, :], dtype=int) gene_names = np.array(matched_index) gene_length = np.asarray(df_gene.loc[matched_index,:]['GeneLength'], dtype=int) #@ total_count = np.sum(counts, axis=0) density = stats.kde.gaussian_kde(total_count) x = np.arange(min(total_count), max(total_count), 10000) fig, ax = plt.subplots() ax.plot(x, density(x)) ax.set_xlabel("Total_count per sample") ax.set_ylabel("Density") plt.show() #@ np.random.seed(seed=7) sample_index = np.random.choice(range(counts.shape[1]), size=70, replace=False) count_subset = counts[:, sample_index] def reduce_xaxis_labels(ax, factor): plt.setp(ax.xaxis.get_ticklabels(), visible=False) for label in ax.xaxis.get_ticklabels()[factor-1::factor]: label.set_visible(True) fig, ax = plt.subplots(figsize=(6,4)) ax.boxplot(np.log(count_subset + 1)) ax.set_xlabel("samples") ax.set_ylabel("Gene expression counts") reduce_xaxis_labels(ax, 5) #@ count_lib_norm = counts / total_count * 1000000 count_subset_lib_norm=count_lib_norm[:, sample_index] fig, ax = plt.subplots(figsize=(6,4)) ax.boxplot(np.log(count_subset_lib_norm + 1)) ax.set_xlabel("samples") ax.set_ylabel("Gene expression counts") reduce_xaxis_labels(ax, 5)