To create stack bar plot to show the proportion of the peaks grouped by
their distance to the closest TSS (transcription start site). I have
created 2 options here:
1. Dealing with the principle, it is quit long process, however, this
will make us understand what we did.
2. Using the ChIPseeker
package in R, this is faster way.
We will do in the both ways.
R.version.string # show R version
## [1] "R version 4.4.1 (2024-06-14)"
Loading the library.
# BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
library(GenomicRanges)
library(rtracklayer) # For importing GTF/BED files
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicFeatures)
library(here)
library(dplyr)
library(ggplot2)
We need TSS information. There are also other ways to define TSS
region.
### Step 1: Get TSS point #### TSS defining 1: Obtain TSS information
directly from annotation files
The annotation files like GTF (Gene Transfer Format) or BED (Browser
Extensible Data) files. These files contain genomic coordinates and
annotations for various genomic features, including TSSs.
gtf_file <- "path/to/your/hg38.gtf" # Replace with the actual path
gtf <- import(gtf_file)
tss_gr <- gtf[gtf$type == "transcript"] # Select entries annotated as transcripts
tss_gr <- resize(tss_gr, width=1, fix="start") # Resize to 1 bp at the start (TSS)
# Get transcript IDs and gene IDs
tss_gr <- transcripts(txdb, columns=c("tx_id", "gene_id"))
# Resize to 1 bp at the start (TSS)
tss_gr <- resize(tss_gr, width=1, fix="start")
We assume that a base pair downstream of promoter region is
TSS.
# 1 Extracting transcript information
hg38_transcripts <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 2 Defining promoter regions associated with transcripts.
tss_gr <- promoters(hg38_transcripts, upstream=0, downstream=1)
In this case, I will use option 3 for the next part.
### Step 2: Find distance of each peak to the nearest TSS Importing the
peak files
Note: We can not use _summit.bed
if we want to find
overlapping regions of TAZ/YAP/TEAD4. If we want to just show stack bar
of each TF,
# Using `_submit.bed` file
TAZ_peaks <- import(here("data/HPC/bowtie2/macs3/TAZ_REP1_peaks.narrowPeak"))
YAP_peaks <- import(here("data/HPC/bowtie2/macs3/YAP_REP1_peaks.narrowPeak"))
TEAD4_peaks <- import(here("data/HPC/bowtie2/macs3/TEAD4_REP1_peaks.narrowPeak"))
Finding the overlapping peaks between YAP/TAZ/TEAD4
YAP_overlap_TAZ_peaks <- subsetByOverlaps(YAP_peaks, TAZ_peaks)
YAP_overlap_TAZ_peaks_overlap_TEAD4 <- subsetByOverlaps(YAP_overlap_TAZ_peaks, TEAD4_peaks)
YAP_overlap_TAZ_peaks_overlap_TEAD4 # 5656 peaks
## GRanges object with 5656 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 1024621-1025009 * | YAP_REP1_peak_1 482
## [2] chr1 1264872-1265612 * | YAP_REP1_peak_3 237
## [3] chr1 1360601-1360960 * | YAP_REP1_peak_4 330
## [4] chr1 2061303-2061685 * | YAP_REP1_peak_6 462
## [5] chr1 2140061-2140338 * | YAP_REP1_peak_7 81
## ... ... ... ... . ... ...
## [5652] chrX 152896063-152896692 * | YAP_REP1_peak_10393 143
## [5653] chrX 154596539-154596884 * | YAP_REP1_peak_10398 291
## [5654] chrX 154600480-154600861 * | YAP_REP1_peak_10399 133
## [5655] chrX 154732664-154733351 * | YAP_REP1_peak_10401 81
## [5656] chrX 155888194-155888500 * | YAP_REP1_peak_10403 126
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <integer>
## [1] 15.16740 52.6700 48.25070 193
## [2] 11.27270 27.7295 23.72220 189
## [3] 14.42280 37.2991 33.09730 175
## [4] 17.26140 50.6053 46.21160 187
## [5] 6.45231 11.5914 8.13184 165
## ... ... ... ... ...
## [5652] 8.69052 18.1155 14.38090 481
## [5653] 13.28420 33.2336 29.10470 154
## [5654] 8.35005 17.0331 13.33410 184
## [5655] 6.45231 11.5914 8.13184 507
## [5656] 6.91847 16.3011 12.62580 158
## -------
## seqinfo: 28 sequences from an unspecified genome; no seqlengths
# Calculate the distance to the nearest TSS
distance_to_tss <- distanceToNearest(YAP_peaks, tss_gr)
# Print the distance
distance_to_tss
## Hits object with 10402 hits and 1 metadata column:
## queryHits subjectHits | distance
## <integer> <integer> | <integer>
## [1] 1 143 | 4497
## [2] 2 11752 | 4295
## [3] 3 11811 | 1302
## [4] 4 11893 | 816
## [5] 5 292 | 0
## ... ... ... . ...
## [10398] 10399 247714 | 4996
## [10399] 10400 247714 | 1444
## [10400] 10401 251718 | 17705
## [10401] 10402 251727 | 121
## [10402] 10403 247793 | 6815
## -------
## queryLength: 10403 / subjectLength: 276905
The distance
from each peak to the nearest TSS can be
accessed from metadata column of Hits
object.
mcols(distance_to_tss)
## DataFrame with 10402 rows and 1 column
## distance
## <integer>
## 1 4497
## 2 4295
## 3 1302
## 4 816
## 5 0
## ... ...
## 10398 4996
## 10399 1444
## 10400 17705
## 10401 121
## 10402 6815
get only distance’s value
head(mcols(distance_to_tss)$distance)
## [1] 4497 4295 1302 816 0 5623
Do it for all TFs.
YAP_dist <- mcols(distanceToNearest(YAP_peaks, tss_gr))$distance
TAZ_dist <- mcols(distanceToNearest(TAZ_peaks, tss_gr))$distance
TEAD4_dist <- mcols(distanceToNearest(TEAD4_peaks, tss_gr))$distance
Overlap_dist <- mcols(distanceToNearest(YAP_overlap_TAZ_peaks_overlap_TEAD4 , tss_gr))$distance
Combining it into one dataframe
tss_distance_df <- bind_rows(data.frame(factor = "YAP", distance = YAP_dist),
data.frame(factor = "TAZ", distance = TAZ_dist),
data.frame(factor = "TEAD4", distance = TEAD4_dist),
data.frame(factor = "YAP/TAZ/TEAD4", distance = Overlap_dist)
)
head(tss_distance_df)
## factor distance
## 1 YAP 4497
## 2 YAP 4295
## 3 YAP 1302
## 4 YAP 816
## 5 YAP 0
## 6 YAP 5623
Defining distance categories.
tss_distance_df %>%
mutate(category = case_when(
distance < 1000 ~ "<1kb",
distance >=1000 & distance < 10000 ~ "1-10kb",
distance >= 10000 & distance <=100000 ~ "10-100kb",
distance > 100000 ~ "100kb"
)) %>%
head()
## factor distance category
## 1 YAP 4497 1-10kb
## 2 YAP 4295 1-10kb
## 3 YAP 1302 1-10kb
## 4 YAP 816 <1kb
## 5 YAP 0 <1kb
## 6 YAP 5623 1-10kb
Count number of peaks in each category.
counts_per_category <- tss_distance_df %>%
mutate(category = case_when(
distance < 1000 ~ "<1kb",
distance >=1000 & distance < 10000 ~ "1-10kb",
distance >= 10000 & distance <=100000 ~ "10-100kb",
distance > 100000 ~ ">100kb"
)) %>%
group_by(factor, category) %>%
count()
counts_per_category
## # A tibble: 16 × 3
## # Groups: factor, category [16]
## factor category n
## <chr> <chr> <int>
## 1 TAZ 1-10kb 3436
## 2 TAZ 10-100kb 3846
## 3 TAZ <1kb 1947
## 4 TAZ >100kb 279
## 5 TEAD4 1-10kb 4011
## 6 TEAD4 10-100kb 4603
## 7 TEAD4 <1kb 1679
## 8 TEAD4 >100kb 374
## 9 YAP 1-10kb 3852
## 10 YAP 10-100kb 4369
## 11 YAP <1kb 1850
## 12 YAP >100kb 331
## 13 YAP/TAZ/TEAD4 1-10kb 2120
## 14 YAP/TAZ/TEAD4 10-100kb 2538
## 15 YAP/TAZ/TEAD4 <1kb 811
## 16 YAP/TAZ/TEAD4 >100kb 186
Calculating total peak counts of each TFs
total_counts<- tss_distance_df %>%
mutate(category = case_when(
distance < 1000 ~ "<1kb",
distance >=1000 & distance < 10000 ~ "1-10kb",
distance >= 10000 & distance <=100000 ~ "10-100kb",
distance > 100000 ~ ">100kb"
)) %>%
count(factor, name = "total")
total_counts
## factor total
## 1 TAZ 9508
## 2 TEAD4 10667
## 3 YAP 10402
## 4 YAP/TAZ/TEAD4 5655
Merging the peak count per category and total peak count of each TF.
merged_df <- left_join(counts_per_category, total_counts)
merged_df
## # A tibble: 16 × 4
## # Groups: factor, category [16]
## factor category n total
## <chr> <chr> <int> <int>
## 1 TAZ 1-10kb 3436 9508
## 2 TAZ 10-100kb 3846 9508
## 3 TAZ <1kb 1947 9508
## 4 TAZ >100kb 279 9508
## 5 TEAD4 1-10kb 4011 10667
## 6 TEAD4 10-100kb 4603 10667
## 7 TEAD4 <1kb 1679 10667
## 8 TEAD4 >100kb 374 10667
## 9 YAP 1-10kb 3852 10402
## 10 YAP 10-100kb 4369 10402
## 11 YAP <1kb 1850 10402
## 12 YAP >100kb 331 10402
## 13 YAP/TAZ/TEAD4 1-10kb 2120 5655
## 14 YAP/TAZ/TEAD4 10-100kb 2538 5655
## 15 YAP/TAZ/TEAD4 <1kb 811 5655
## 16 YAP/TAZ/TEAD4 >100kb 186 5655
merged_df$category<- factor(merged_df$category,
levels = c("<1kb", "1-10kb", "10-100kb", ">100kb"))
merged_df %>%
mutate(Percentage = n/total * 100) %>%
ggplot(aes(x= factor, y = Percentage, fill = category)) +
geom_bar(stat = "identity", position = "stack") +
labs(
title = "Distance to TSS",
x = "Group",
y = "Percentage"
) +
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
scale_fill_manual(values = c("#EF3E2B", "#F16161", "#F59595", "#FAD1C8")) +
theme_classic(base_size = 14)
ChIPseeker
Since ChIPseeker
uses lapply
, we will use
it accordingly. ### Step 1: Annotate the peak using
annotatePeak
. We have YAP/TAZ/TEAD4 overlapping peaks that
we previously analyzed, thus we will write it to .bed
file
and import it again.
# Write YAP/TAZ/TEAD4 overlapping peaks to `.bed` file
rtracklayer::export(YAP_overlap_TAZ_peaks_overlap_TEAD4, "Overlap_YAP-TAZ-TEAD4.bed")
library(ChIPseeker)
TEAD4_peak <- here("data/HPC/bowtie2/macs3/TEAD4_REP1_peaks.narrowPeak")
TAZ_peak <- here("data/HPC/bowtie2/macs3/TAZ_REP1_peaks.narrowPeak")
YAP_peak <- here("data/HPC/bowtie2/macs3/YAP_REP1_peaks.narrowPeak")
Overlap_peak <- here("result/Overlap_YAP-TAZ-TEAD4.bed")
# Combing peak file and name it
peaks <- c(TEAD4_peak, TAZ_peak, YAP_peak, Overlap_peak)
names(peaks) <- c("TEAD4", "TAZ", "YAP", "YAP/TAZ/TEAD4")
# Defining reference annotation
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
# Annoting the peaks
peakAnnoList <- lapply(peaks, annotatePeak,
TxDb=txdb,
tssRegion=c(-3000, 3000),
verbose=FALSE)
Distance to TSS profiles
plotDistToTSS(peakAnnoList,
title = "Distribution of Peaks Relative to TSS") +
scale_fill_brewer(palette = "Reds")