DoubletFinder

CmhaDSO DoubletFinder

使用例

INSTALL

$ conda install -c conda-forge r-remotes

> remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')

LOAD

# Attach the packages used below. library() stops with an error when a package
# is missing, whereas require() only warns and returns FALSE, which would defer
# the failure to the first DoubletFinder call.
library(Seurat)
library(DoubletFinder)

INPUT

以下のデータはprefiltering, scaling, PCA, UMAP等を解析済みのもの. Aggregateしたデータ (ex. Ctrlデータ + 介入データ) は使用しない

# Read the already-preprocessed object that every step below operates on.
so <- readRDS(file = "tmp_a.rds")

# pK identification (for the case where no ground truth is used)
# NOTE(review): recent DoubletFinder releases appear to have dropped the `_v3`
# suffix (paramSweep / doubletFinder) -- confirm against the installed version.
so_tmp_1 <- paramSweep_v3(so, PCs = 1:16)
so_tmp_2 <- summarizeSweep(so_tmp_1, GT = FALSE)
bcmvn <- find.pK(so_tmp_2)

# BCmvn ~ mean-variance normalized bimodality coefficient.
# One bar per candidate pK, labels drawn perpendicular to the axis (las = 2).
barplot(height = bcmvn$BCmetric, names.arg = bcmvn$pK, las = 2)

#@ Set the expected number of doublets
# The peak of the bar plot above (pK = 0.02 for this dataset) is the value
# used as pK in the doubletFinder call below.
# Doublets are assumed to make up 10% of the cells; the rate was chosen from
# the 10x Genomics table relating cells loaded to cells recovered.
nExp <- round(0.10 * ncol(so))

#@ Find Doublets
# Derive pK from the sweep result instead of hard-coding the value read off
# the bar plot: use the pK whose BCmetric is largest. bcmvn$pK may be stored
# as a factor, so go through character to recover the numeric value.
pK_opt <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))

# NOTE(review): recent DoubletFinder releases appear to have renamed this
# function to doubletFinder() -- confirm against the installed version.
so <- doubletFinder_v3(
  so,
  PCs = 1:16,
  pN = 0.25,
  pK = pK_opt,
  nExp = nExp
)

# PCs ~ The number of statistically-significant principal components.
#
# pN ~ The number of generated artificial doublets, expressed as a proportion
# of the merged real-artificial data. Default is 25%, based on the observation
# that DoubletFinder performance is largely pN-invariant.
#
# pK ~ The PC neighborhood size used to compute pANN, expressed as a
# proportion of the merged real-artificial data. No default is set, as pK
# should be adjusted for each scRNA-seq dataset.
#
# nExp ~ The pANN threshold used to make final doublet/singlet predictions.

#@ Locate the DoubletFinder classification column and visualize
# Match the column-name prefix literally (fixed = TRUE) so that "." is not
# interpreted as a regex wildcard.
df_cols <- grep("DF.classification", colnames(so@meta.data),
                fixed = TRUE, value = TRUE)
if (length(df_cols) == 0) {
  stop("No 'DF.classification' column found in so@meta.data; ",
       "run DoubletFinder first.", call. = FALSE)
}
if (length(df_cols) > 1) {
  # Several classification columns (e.g. from repeated runs) would make DF.name
  # a vector and break single-column indexing downstream, so keep exactly one.
  # Assumes later runs append their column after earlier ones -- TODO confirm.
  warning("Multiple DF.classification columns found; using the last one: ",
          df_cols[length(df_cols)], call. = FALSE)
}
DF.name <- df_cols[length(df_cols)]

# UMAP/embedding plot and per-class gene-count distribution, both grouped by
# the doublet/singlet classification.
DimPlot(so, group.by = DF.name, pt.size = 1.2)
VlnPlot(so, features = "nFeature_RNA", group.by = DF.name, pt.size = 1.2)

SUBSET

# Keep only the cells whose classification in column DF.name is "Singlet".
singlet_mask <- so@meta.data[, DF.name] == "Singlet"
so_filtered <- so[, singlet_mask]