#@ import os import pandas as pd import numpy as np homd='./seq' count_file='counts.txt' gene_file='gene_id_product.tsv' #@ count = pd.read_csv( os.path.join(homd, count_file), skiprows=1, index_col=0, sep='\t') gene = pd.read_csv( os.path.join(homd, gene_file), index_col=0, names=['gene_id', 'product'], sep='\t') print(count.shape) print(gene.shape) #@ df = gene.join(count) print(df.shape) df.head() #@ df['batch_1'] = df['SRR453566.sorted.bam'] del df ['SRR453566.sorted.bam'] df['batch_2'] = df['SRR453567.sorted.bam'] del df ['SRR453567.sorted.bam'] df['batch_3'] = df['SRR453568.sorted.bam'] del df ['SRR453568.sorted.bam'] df['chemostat_1'] = df['SRR453569.sorted.bam'] del df ['SRR453569.sorted.bam'] df['chemostat_2'] = df['SRR453570.sorted.bam'] del df ['SRR453570.sorted.bam'] df['chemostat_3'] = df['SRR453571.sorted.bam'] del df ['SRR453571.sorted.bam'] df.head() #@ # REMOVE mito_genes (NC_001224.1) mito_genes = "NC_001224.1" df = df[df.Chr != mito_genes] df.shape #@ # Expression data and gene_length df_exp = df.loc[:, 'batch_1':'chemostat_3'] gene_length = df['Length'] # FPMK/TPM def norm_per_million_reads(df): counts = df.sum() return 10**6 * df / counts def norm_per_kilobase(df, gene_length): df_tmp = df.copy() df_tmp = (df.T * 10**3 / gene_length).T return df_tmp def norm_fpkm(df, gene_length): df_tmp = df.copy() df_tmp = norm_per_million_reads(df_tmp) df_tmp = norm_per_kilobase(df_tmp, gene_length) return df_tmp def norm_tpm(df, gene_length): df_tmp = df.copy() df_tmp = norm_per_kilobase(df_tmp, gene_length) df_tmp = norm_per_million_reads(df_tmp) return df_tmp df_fpkm = norm_fpkm(df_exp, gene_length) df_tpm = norm_tpm(df_exp, gene_length) df_tpm.sum() #@ df_tpm