These are the figures we will reproduce in this section.
Figure 1f: Stack bar showed distance from YAP, TAZ, TEAD4, and YAP/TAZ/TEAD4 peaks to the nearest TSS
Figure 1f: Stack bar showed distance from YAP, TAZ, TEAD4, and YAP/TAZ/TEAD4 peaks to the nearest TSS


To create stack bar plot to show the proportion of the peaks grouped by their distance to the closest TSS (transcription start site). I have created 2 options here:
1. Dealing with the principle, it is quit long process, however, this will make us understand what we did.
2. Using the ChIPseeker package in R, this is faster way.

We will do in the both ways.

Option 1: From principle.

R.version.string # show R version 
## [1] "R version 4.4.1 (2024-06-14)"

Loading the library.

# BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")

library(GenomicRanges)
library(rtracklayer) # For importing GTF/BED files
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicFeatures)
library(here)
library(dplyr)
library(ggplot2)
We need TSS information. There are also other ways to define TSS region.
Before we retieve it, let’s see biology of TSS first.
Promoter, TSS, and transcript (Ref: 10.13140/RG.2.1.4199.1283 )
Promoter, TSS, and transcript (Ref: 10.13140/RG.2.1.4199.1283 )


### Step 1: Get TSS point #### TSS defining 1: Obtain TSS information directly from annotation files
The annotation files like GTF (Gene Transfer Format) or BED (Browser Extensible Data) files. These files contain genomic coordinates and annotations for various genomic features, including TSSs.

gtf_file <- "path/to/your/hg38.gtf"  # Replace with the actual path
gtf <- import(gtf_file)
tss_gr <- gtf[gtf$type == "transcript"]  # Select entries annotated as transcripts
tss_gr <- resize(tss_gr, width=1, fix="start")  # Resize to 1 bp at the start (TSS)

TSS defining 2: At the start of transcript

# Get transcript IDs and gene IDs 
tss_gr <- transcripts(txdb, columns=c("tx_id", "gene_id")) 

# Resize to 1 bp at the start (TSS)
tss_gr <- resize(tss_gr, width=1, fix="start") 

TSS defining 3: A base pair after promoter.

We assume that a base pair downstream of promoter region is TSS.

# 1 Extracting transcript information 
hg38_transcripts <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)

# 2 Defining promoter regions associated with transcripts.
tss_gr <- promoters(hg38_transcripts, upstream=0, downstream=1)

In this case, I will use option 3 for the next part.
### Step 2: Find distance of each peak to the nearest TSS Importing the peak files

Note: We can not use _summit.bed if we want to find overlapping regions of TAZ/YAP/TEAD4. If we want to just show stack bar of each TF,

# Using `_submit.bed` file
TAZ_peaks <- import(here("data/HPC/bowtie2/macs3/TAZ_REP1_peaks.narrowPeak"))
YAP_peaks <- import(here("data/HPC/bowtie2/macs3/YAP_REP1_peaks.narrowPeak"))
TEAD4_peaks <- import(here("data/HPC/bowtie2/macs3/TEAD4_REP1_peaks.narrowPeak"))

Finding the overlapping peaks between YAP/TAZ/TEAD4

YAP_overlap_TAZ_peaks <- subsetByOverlaps(YAP_peaks, TAZ_peaks)
YAP_overlap_TAZ_peaks_overlap_TEAD4 <- subsetByOverlaps(YAP_overlap_TAZ_peaks, TEAD4_peaks)

YAP_overlap_TAZ_peaks_overlap_TEAD4 # 5656 peaks
## GRanges object with 5656 ranges and 6 metadata columns:
##          seqnames              ranges strand |                name     score
##             <Rle>           <IRanges>  <Rle> |         <character> <numeric>
##      [1]     chr1     1024621-1025009      * |     YAP_REP1_peak_1       482
##      [2]     chr1     1264872-1265612      * |     YAP_REP1_peak_3       237
##      [3]     chr1     1360601-1360960      * |     YAP_REP1_peak_4       330
##      [4]     chr1     2061303-2061685      * |     YAP_REP1_peak_6       462
##      [5]     chr1     2140061-2140338      * |     YAP_REP1_peak_7        81
##      ...      ...                 ...    ... .                 ...       ...
##   [5652]     chrX 152896063-152896692      * | YAP_REP1_peak_10393       143
##   [5653]     chrX 154596539-154596884      * | YAP_REP1_peak_10398       291
##   [5654]     chrX 154600480-154600861      * | YAP_REP1_peak_10399       133
##   [5655]     chrX 154732664-154733351      * | YAP_REP1_peak_10401        81
##   [5656]     chrX 155888194-155888500      * | YAP_REP1_peak_10403       126
##          signalValue    pValue    qValue      peak
##            <numeric> <numeric> <numeric> <integer>
##      [1]    15.16740   52.6700  48.25070       193
##      [2]    11.27270   27.7295  23.72220       189
##      [3]    14.42280   37.2991  33.09730       175
##      [4]    17.26140   50.6053  46.21160       187
##      [5]     6.45231   11.5914   8.13184       165
##      ...         ...       ...       ...       ...
##   [5652]     8.69052   18.1155  14.38090       481
##   [5653]    13.28420   33.2336  29.10470       154
##   [5654]     8.35005   17.0331  13.33410       184
##   [5655]     6.45231   11.5914   8.13184       507
##   [5656]     6.91847   16.3011  12.62580       158
##   -------
##   seqinfo: 28 sequences from an unspecified genome; no seqlengths
# Calculate the distance to the nearest TSS
distance_to_tss <- distanceToNearest(YAP_peaks, tss_gr)

# Print the distance
distance_to_tss
## Hits object with 10402 hits and 1 metadata column:
##           queryHits subjectHits |  distance
##           <integer>   <integer> | <integer>
##       [1]         1         143 |      4497
##       [2]         2       11752 |      4295
##       [3]         3       11811 |      1302
##       [4]         4       11893 |       816
##       [5]         5         292 |         0
##       ...       ...         ... .       ...
##   [10398]     10399      247714 |      4996
##   [10399]     10400      247714 |      1444
##   [10400]     10401      251718 |     17705
##   [10401]     10402      251727 |       121
##   [10402]     10403      247793 |      6815
##   -------
##   queryLength: 10403 / subjectLength: 276905

The distance from each peak to the nearest TSS can be accessed from metadata column of Hits object.

mcols(distance_to_tss)
## DataFrame with 10402 rows and 1 column
##        distance
##       <integer>
## 1          4497
## 2          4295
## 3          1302
## 4           816
## 5             0
## ...         ...
## 10398      4996
## 10399      1444
## 10400     17705
## 10401       121
## 10402      6815

get only distance’s value

head(mcols(distance_to_tss)$distance)
## [1] 4497 4295 1302  816    0 5623

Do it for all TFs.

YAP_dist <- mcols(distanceToNearest(YAP_peaks, tss_gr))$distance
TAZ_dist <- mcols(distanceToNearest(TAZ_peaks, tss_gr))$distance
TEAD4_dist <- mcols(distanceToNearest(TEAD4_peaks, tss_gr))$distance
Overlap_dist <- mcols(distanceToNearest(YAP_overlap_TAZ_peaks_overlap_TEAD4 , tss_gr))$distance

Combining it into one dataframe

tss_distance_df <- bind_rows(data.frame(factor = "YAP", distance = YAP_dist),
                            data.frame(factor = "TAZ", distance = TAZ_dist),
                            data.frame(factor = "TEAD4", distance = TEAD4_dist),
                            data.frame(factor = "YAP/TAZ/TEAD4", distance = Overlap_dist)
                            )
                            
head(tss_distance_df)
##   factor distance
## 1    YAP     4497
## 2    YAP     4295
## 3    YAP     1302
## 4    YAP      816
## 5    YAP        0
## 6    YAP     5623

Step 3: Defining distance categories

Defining distance categories.

tss_distance_df %>%
  mutate(category = case_when(
    distance < 1000 ~ "<1kb",
    distance >=1000 & distance < 10000 ~ "1-10kb",
    distance >= 10000 & distance <=100000 ~ "10-100kb",
    distance > 100000 ~ "100kb"
  )) %>%
  head()
##   factor distance category
## 1    YAP     4497   1-10kb
## 2    YAP     4295   1-10kb
## 3    YAP     1302   1-10kb
## 4    YAP      816     <1kb
## 5    YAP        0     <1kb
## 6    YAP     5623   1-10kb

Count number of peaks in each category.

counts_per_category <- tss_distance_df %>%
                          mutate(category = case_when(
                            distance < 1000 ~ "<1kb",
                            distance >=1000 & distance < 10000 ~ "1-10kb",
                            distance >= 10000 & distance <=100000 ~ "10-100kb",
                            distance > 100000 ~ ">100kb"
                          )) %>%
                          group_by(factor, category) %>%
                          count()

counts_per_category
## # A tibble: 16 × 3
## # Groups:   factor, category [16]
##    factor        category     n
##    <chr>         <chr>    <int>
##  1 TAZ           1-10kb    3436
##  2 TAZ           10-100kb  3846
##  3 TAZ           <1kb      1947
##  4 TAZ           >100kb     279
##  5 TEAD4         1-10kb    4011
##  6 TEAD4         10-100kb  4603
##  7 TEAD4         <1kb      1679
##  8 TEAD4         >100kb     374
##  9 YAP           1-10kb    3852
## 10 YAP           10-100kb  4369
## 11 YAP           <1kb      1850
## 12 YAP           >100kb     331
## 13 YAP/TAZ/TEAD4 1-10kb    2120
## 14 YAP/TAZ/TEAD4 10-100kb  2538
## 15 YAP/TAZ/TEAD4 <1kb       811
## 16 YAP/TAZ/TEAD4 >100kb     186

Calculating total peak counts of each TFs

total_counts<- tss_distance_df %>%
                  mutate(category = case_when(
                    distance < 1000 ~ "<1kb",
                    distance >=1000 & distance < 10000 ~ "1-10kb",
                    distance >= 10000 & distance <=100000 ~ "10-100kb",
                    distance > 100000 ~ ">100kb"
                  )) %>%
                  count(factor, name = "total")

total_counts
##          factor total
## 1           TAZ  9508
## 2         TEAD4 10667
## 3           YAP 10402
## 4 YAP/TAZ/TEAD4  5655

Merging the peak count per category and total peak count of each TF.

merged_df <- left_join(counts_per_category, total_counts)
merged_df 
## # A tibble: 16 × 4
## # Groups:   factor, category [16]
##    factor        category     n total
##    <chr>         <chr>    <int> <int>
##  1 TAZ           1-10kb    3436  9508
##  2 TAZ           10-100kb  3846  9508
##  3 TAZ           <1kb      1947  9508
##  4 TAZ           >100kb     279  9508
##  5 TEAD4         1-10kb    4011 10667
##  6 TEAD4         10-100kb  4603 10667
##  7 TEAD4         <1kb      1679 10667
##  8 TEAD4         >100kb     374 10667
##  9 YAP           1-10kb    3852 10402
## 10 YAP           10-100kb  4369 10402
## 11 YAP           <1kb      1850 10402
## 12 YAP           >100kb     331 10402
## 13 YAP/TAZ/TEAD4 1-10kb    2120  5655
## 14 YAP/TAZ/TEAD4 10-100kb  2538  5655
## 15 YAP/TAZ/TEAD4 <1kb       811  5655
## 16 YAP/TAZ/TEAD4 >100kb     186  5655

Step 4: Creating the stackbar plot

merged_df$category<- factor(merged_df$category, 
                            levels = c("<1kb", "1-10kb", "10-100kb", ">100kb"))

merged_df %>%
  mutate(Percentage = n/total * 100) %>%
  ggplot(aes(x= factor, y = Percentage, fill = category)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Distance to TSS",
    x = "Group",
    y = "Percentage"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_fill_manual(values = c("#EF3E2B", "#F16161", "#F59595", "#FAD1C8")) +
  theme_classic(base_size = 14)

Option 2: Using ChIPseeker

Since ChIPseeker uses lapply, we will use it accordingly. ### Step 1: Annotate the peak using annotatePeak. We have YAP/TAZ/TEAD4 overlapping peaks that we previously analyzed, thus we will write it to .bed file and import it again.

# Write YAP/TAZ/TEAD4 overlapping peaks to `.bed` file
rtracklayer::export(YAP_overlap_TAZ_peaks_overlap_TEAD4, "Overlap_YAP-TAZ-TEAD4.bed")
library(ChIPseeker)

TEAD4_peak <- here("data/HPC/bowtie2/macs3/TEAD4_REP1_peaks.narrowPeak")
TAZ_peak <- here("data/HPC/bowtie2/macs3/TAZ_REP1_peaks.narrowPeak")
YAP_peak <- here("data/HPC/bowtie2/macs3/YAP_REP1_peaks.narrowPeak")
Overlap_peak <- here("result/Overlap_YAP-TAZ-TEAD4.bed")

# Combing peak file and name it
peaks <-  c(TEAD4_peak, TAZ_peak, YAP_peak, Overlap_peak)
names(peaks) <- c("TEAD4", "TAZ", "YAP", "YAP/TAZ/TEAD4")

# Defining reference annotation
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# Annoting the peaks
peakAnnoList <- lapply(peaks, annotatePeak, 
                       TxDb=txdb,
                       tssRegion=c(-3000, 3000), 
                       verbose=FALSE)

Distance to TSS profiles

plotDistToTSS(peakAnnoList, 
              title = "Distribution of Peaks Relative to TSS") + 
              scale_fill_brewer(palette = "Reds")