Engage to Life Energy
if(!requireNamespace("BiocManager",quietly=TRUE))
install.packages("BiocManager")BiocManager::install(c("Gviz","GenomicRanges","rtracklayer"))
install.packages("devtools")devtools::install_github("cole-trapnell-lab/cicero-release",ref="monocle3")
需要载入的是10x官方软件cellranger-atac的输出目录filtered_peak_bc_matrix下这三个文件:
matrix.mtx,为cell-by-peakcount矩阵;barcodes.tsv,为cell元数据;peaks.bed,为peak元数据。
library(cicero)
#read in matrix data using the Matrix package
indata<- Matrix::readMM("filtered_peak_bc_matrix/matrix.mtx")
#binarize the matrix
indata@x[indata@x> 0] <- 1
#format cell info
cellinfo<- read.table("filtered_peak_bc_matrix/barcodes.tsv")
row.names(cellinfo)<- cellinfo$V1
names(cellinfo)<- "cells"
#format peak info
peakinfo<- read.table("filtered_peak_bc_matrix/peaks.bed")
names(peakinfo)<- c("chr", "bp1", "bp2")
peakinfo$site_name<- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_")
row.names(peakinfo)<- peakinfo$site_name
row.names(indata)<- row.names(peakinfo)
colnames(indata)<- row.names(cellinfo)
#make CDS( cell_data_set)
input_cds<- suppressWarnings(new_cell_data_set(indata,cell_metadata =cellinfo,gene_metadata = peakinfo))
input_cds<- monocle3::detect_genes(input_cds)
#Ensurethere are no peaks included with zero reads
input_cds<- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,]
#firstfind UMAP coordinates for our input_cds
set.seed(2017)
input_cds<- detect_genes(input_cds)
input_cds<- estimate_size_factors(input_cds)
input_cds<- preprocess_cds(input_cds, method = "LSI")
input_cds<- reduce_dimension(input_cds, reduction_method = 'UMAP',preprocess_method = "LSI")
umap_coords<- reducedDims(input_cds)$UMAP
cicero_cds<- make_cicero_cds(input_cds, reduced_coordinates = umap_coords)
co-accessibility分数在-1和1之间,用于评估每个peak对之间的共可及性,分数越高,越可能是共可及区。此步骤要用到基因坐标信息和基因注释信息,需要准备两个文件:gene的gtf格式注释文件(genes.gtf),UCSC,Ensembl等权威数据库都是标准的gtf格式,直接下载即可,其他格式需自己处理成标准的格式;每条染色体长度信息(chr.len),格式如下,只有两列,一列是染色体编号,另一列对应的染色体序列长度:
#根据自己的指定的genes.gtf和chr.len构建cicero需要的数据库
sample_genome<-read.table("chr.len")
gene_anno<- rtracklayer::readGFF("genes.gtf")
gene_anno$gene<- gene_anno$gene_id
gene_anno$transcript<- gene_anno$transcript_id
gene_anno$symbol<- gene_anno$gene_name
gene_anno$chromosome<-gene_anno$seqid
#peak对之间的共可及性分数
conns<- run_cicero(cicero_cds, sample_genome, sample_num = 2)
index<- order(conns[,3],decreasing=T)
conns<-conns[index,]
#保存下来,以查看具体的得分
write.table(conns,file="peak_coaccess.xls",sep="\t",row.names=F,quote=F)
CCAN_assigns<- generate_ccans(conns)
index<- order(CCAN_assigns[,2],decreasing=T)
CCAN_assigns<-CCAN_assigns[index,]
#保存下来,便于自己从中挑选区间来做图
write.table(CCAN_assigns,file="peak_CCAN.xls",sep="\t",row.names=F,quote=F)
ccan<-"./CCAN/"
dir.create(ccan)
接下来,我们可以进行图形可视化,以直观展示物理邻近的区域之间的关联性,这很可能就是潜在的增强子-启动子对。
#以top20为例可视化做图代码示例
library(stringr)
for(i in 1:20)
{
peak<- CCAN_assigns[i,]$Peak
p1<-as.numeric(str_split_fixed(as.character(conns[conns$Peak1==peak,]$Peak2),"_",3)[,2])
p2<-as.numeric(str_split_fixed(as.character(conns[conns$Peak1==peak,]$Peak2),"_",3)[,3])
start<-min(c(p1,p2))
end<-max(c(p1,p2))
chrom<-str_split_fixed(as.character(CCAN_assigns[i,]$Peak),"_",3)[1]
name<-paste(chrom,start,end,sep="_")
pdf(paste(ccan,name,".pdf",sep=""))
plot_connections(conns,chrom, start ,end ,gene_model = gene_anno,collapseTranscripts ="longest" )
dev.off()
}
地址:上海市松江区中心路1158号5幢5楼
电话:400-9200-612 传真:+86 21 6090 1207/1208-8154
dafabet手机黄金版技术(上海)有限公司 Copyright 2012 Genergy Inc. 沪ICP备10017363号
微信:genenergy