R.version.string # show R version
## [1] "R version 4.4.1 (2024-06-14)"
Loading the library.
library(GenomicRanges)
library(rtracklayer) # for reading in bed file
library(here) # for file path
Importing the peak files, which are the output from
macs3
where narrowPeak
mode was used for
ChIP-seq of transcription factors.
TAZ_peaks<- import(here("data/HPC/bowtie2/macs3/TAZ_REP1_peaks.narrowPeak"))
YAP_peaks<- import(here("data/HPC/bowtie2/macs3/YAP_REP1_peaks.narrowPeak"))
Figure described “position of TEAD4 peak summits relative to the
summits of the overlapping YAP/TAZ peaks, in a 500 bp window surrounding
the summit of YAP/TAZ peaks.”
TAZ peaks coordinates represent common peaks between YAP and TAZ
peaks (YAP/TAZ peaks) (These were used when comparing YAP/TAZ peaks with
other ChIP-seq data).
So, what we need as input data for each axis are
1. x-axis: TAZ peak file
2. y-axis: TEAD4 peak file which peak density on TAZ will further
calculate.
We will do it in a way that makes us understand the principle behind
this plot; after understanding the principle, we can use the R package
like ChIPseeker
to do it as well.
In this case, based on the distance plot centered at 0, we will use
output from macs3
(peak calling tool) the
_summit.bed
file, which identifies the single base with the
highest signal within each peak. This is better than the
narrowPeak.bed
file, which provides all positions within a
peak where the transcription factor binds.
Step 1: Importing _summit.bed
file.
TAZ_summit <- import(here("data/HPC/bowtie2/macs3/TAZ_REP1_summits.bed"))
TAZ_summit # All peaks from TAZ = 9509
## GRanges object with 9509 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 10059 * | TAZ_REP1_peak_1 9.04597
## [2] chr1 10194 * | TAZ_REP1_peak_2 10.16240
## [3] chr1 180791 * | TAZ_REP1_peak_3 4.07683
## [4] chr1 1024814 * | TAZ_REP1_peak_4 50.71850
## [5] chr1 1265028 * | TAZ_REP1_peak_5 16.16140
## ... ... ... ... . ... ...
## [9505] chrX 154733329 * | TAZ_REP1_peak_9505 2.32820
## [9506] chrX 154735831 * | TAZ_REP1_peak_9506 3.91207
## [9507] chrX 154797199 * | TAZ_REP1_peak_9507 3.91207
## [9508] chrX 155888407 * | TAZ_REP1_peak_9508 10.80710
## [9509] chrX 156030545 * | TAZ_REP1_peak_9509 31.87160
## -------
## seqinfo: 26 sequences from an unspecified genome; no seqlengths
We will include only peaks that overlapping between TAZ and YAP.
# Find overlapping TAZ and YAP
TAZ_overlap_YAP_peaks <- subsetByOverlaps(TAZ_peaks, YAP_peaks)
TAZ_summit1 <- TAZ_summit[TAZ_summit$name %in% TAZ_overlap_YAP_peaks$name]
TAZ_summit1
## GRanges object with 6909 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 1024814 * | TAZ_REP1_peak_4 50.7185
## [2] chr1 1265028 * | TAZ_REP1_peak_5 16.1614
## [3] chr1 1265429 * | TAZ_REP1_peak_6 12.7515
## [4] chr1 1360766 * | TAZ_REP1_peak_7 27.3256
## [5] chr1 2061535 * | TAZ_REP1_peak_11 31.4823
## ... ... ... ... . ... ...
## [6905] chrX 154369087 * | TAZ_REP1_peak_9502 8.09378
## [6906] chrX 154596742 * | TAZ_REP1_peak_9503 10.59780
## [6907] chrX 154600772 * | TAZ_REP1_peak_9504 11.66930
## [6908] chrX 154733329 * | TAZ_REP1_peak_9505 2.32820
## [6909] chrX 155888407 * | TAZ_REP1_peak_9508 10.80710
## -------
## seqinfo: 26 sequences from an unspecified genome; no seqlengths
CAUSION!
Take a look at TAZ_summit$name
and
TAZ_overlap_YAP_peaks$name
; the names have to be the same
to perform
TAZ_summit$name %in% TAZ_overlap_YAP_peaks$name
.
head(TAZ_summit$name)
## [1] "TAZ_REP1_peak_1" "TAZ_REP1_peak_2" "TAZ_REP1_peak_3" "TAZ_REP1_peak_4"
## [5] "TAZ_REP1_peak_5" "TAZ_REP1_peak_6"
head(TAZ_overlap_YAP_peaks$name)
## [1] "TAZ_REP1_peak_4" "TAZ_REP1_peak_5" "TAZ_REP1_peak_6" "TAZ_REP1_peak_7"
## [5] "TAZ_REP1_peak_11" "TAZ_REP1_peak_12"
So this code will resulted in a error
TAZ_summit$name %in% YAP_overlap_TAZ_peaks$name
.
Next, importing TEAD4_summit.bed
file.
TEAD4_summit <- import(here("data/HPC/bowtie2/macs3/TEAD4_REP1_summits.bed"))
TEAD4_summit # All peak from TEAD4 = 10668
## GRanges object with 10668 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 1024820 * | TEAD4_REP1_peak_1 35.34760
## [2] chr1 1265049 * | TEAD4_REP1_peak_2 43.23410
## [3] chr1 1360768 * | TEAD4_REP1_peak_3 59.26370
## [4] chr1 1659305 * | TEAD4_REP1_peak_4 6.84313
## [5] chr1 2061491 * | TEAD4_REP1_peak_5 30.11540
## ... ... ... ... . ... ...
## [10664] chrX 154733086 * | TEAD4_REP1_peak_10664 8.44084
## [10665] chrX 154735953 * | TEAD4_REP1_peak_10665 11.36370
## [10666] chrX 154967763 * | TEAD4_REP1_peak_10666 7.61277
## [10667] chrX 155057492 * | TEAD4_REP1_peak_10667 4.40812
## [10668] chrX 155888349 * | TEAD4_REP1_peak_10668 9.29092
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
Since the x-axis is a range of DNA distance around the TAZ peak, we
have to expand TAZ first.
Expand the TAZ summit to a 500 bp window by fixing the center
(remember?, we have information from _summit.bed
as single
base) –> see the details of expand
by fixing center.
TAZ_500bp_window <- resize(TAZ_summit, width = 500, fix="center")
# find overlapping between TEAD4 and TAZ _summit.bed file
hits <- findOverlaps(TEAD4_summit, TAZ_500bp_window)
# NOTE: TEAD4_summit = query, TAZ = subject --> this just to see the results in hits
# a hits object with the indices of the overlapping query and subject
hits
## Hits object with 6391 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 4
## [2] 2 5
## [3] 3 7
## [4] 5 11
## [5] 6 12
## ... ... ...
## [6387] 10661 9503
## [6388] 10662 9504
## [6389] 10664 9505
## [6390] 10665 9506
## [6391] 10668 9508
## -------
## queryLength: 10668 / subjectLength: 9509
The hits object indicates that TEAD4 peak(query) #1 overlaps with TAZ
peak (subject) #4, and TEAD4 peak #2 overlaps with TAZ peak #5, so on
and so forth.
Next, we will find the distance between each match.
summit_distance <- distance(TEAD4_summit[queryHits(hits)], TAZ_summit[subjectHits(hits)])
# Here is the distance of each match.
table(summit_distance)
## summit_distance
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 234 135 118 135 122 120 128 121 114 134 124 106 114 114 111 109 104 109 96 111
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
## 90 92 88 96 92 86 75 84 72 60 84 68 78 71 62 74 61 66 63 61
## 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 56 50 47 73 51 66 57 48 47 42 44 48 39 49 46 48 40 35 47 39
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
## 40 27 29 37 33 33 36 29 32 29 30 33 21 34 25 22 26 18 26 24
## 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
## 11 15 13 20 18 35 26 18 20 18 11 13 8 13 20 18 8 20 14 18
## 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## 14 7 12 16 11 16 8 7 9 10 11 10 12 8 10 3 12 5 5 8
## 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
## 11 6 9 6 8 10 6 7 4 4 7 9 8 2 8 5 5 7 2 5
## 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
## 2 2 4 5 7 6 6 1 3 5 3 6 3 4 2 3 2 2 1 1
## 160 161 163 164 165 166 167 168 169 170 171 173 175 176 177 179 180 182 183 184
## 2 2 1 1 5 3 2 4 5 2 4 1 3 1 2 2 4 2 2 1
## 185 186 187 191 192 193 194 195 197 199 200 201 202 203 204 205 209 211 212 213
## 1 4 4 1 4 1 1 2 4 3 2 4 1 2 1 1 2 2 1 3
## 214 216 217 218 219 220 222 223 224 225 227 228 230 232 233 234 235 236 240 241
## 1 1 2 1 1 1 3 1 1 1 2 1 1 1 2 2 1 1 2 2
## 242 245 246
## 2 1 1
length((summit_distance)) # All match points between TEAD4 and TAZ peaks
## [1] 6391
This is the TEAD4 summits that precisely co-localize (distance =0) with TAZ summits.
TEAD4_summit[queryHits(hits)][summit_distance ==0]
## GRanges object with 234 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 8212094 * | TEAD4_REP1_peak_27 12.4059
## [2] chr1 24713384 * | TEAD4_REP1_peak_117 38.7188
## [3] chr1 33099450 * | TEAD4_REP1_peak_172 45.1086
## [4] chr1 43207014 * | TEAD4_REP1_peak_214 104.8170
## [5] chr1 43822190 * | TEAD4_REP1_peak_219 61.9982
## ... ... ... ... . ... ...
## [230] chr9 120366813 * | TEAD4_REP1_peak_10436 47.08490
## [231] chr9 134544508 * | TEAD4_REP1_peak_10511 17.88210
## [232] chr9 136572251 * | TEAD4_REP1_peak_10515 114.31800
## [233] chrX 17602870 * | TEAD4_REP1_peak_10549 18.17520
## [234] chrX 101208245 * | TEAD4_REP1_peak_10603 9.77202
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
This is the TAZ summits that precisely co-localize (distance =0) with TEAD4 summits.
TAZ_summit[subjectHits(hits)][summit_distance ==0]
## GRanges object with 234 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 8212095 * | TAZ_REP1_peak_26 9.50935
## [2] chr1 24713384 * | TAZ_REP1_peak_104 14.95960
## [3] chr1 33099451 * | TAZ_REP1_peak_160 50.37360
## [4] chr1 43207015 * | TAZ_REP1_peak_208 103.03700
## [5] chr1 43822191 * | TAZ_REP1_peak_211 24.06110
## ... ... ... ... . ... ...
## [230] chr9 120366813 * | TAZ_REP1_peak_9284 19.31370
## [231] chr9 134544508 * | TAZ_REP1_peak_9372 18.39740
## [232] chr9 136572251 * | TAZ_REP1_peak_9379 100.08700
## [233] chrX 17602870 * | TAZ_REP1_peak_9406 8.07704
## [234] chrX 101208245 * | TAZ_REP1_peak_9445 5.29685
## -------
## seqinfo: 26 sequences from an unspecified genome; no seqlengths
From figure 1c, there are positive and negative values on the
x-axis.
We now have distance values, so we will turn them into positive or
negative by assuming that
1. Negative values: when TEAD4 summit precede the TAZ summit (TEAD4…then
TAZ).
2. Positive values: when TEAD4 summit follows TAZ summit. (TAZ …then
TEAD4)
# Compute signed distances
signed_distance <- function(A, B) {
# Compute unsigned distance
dist <- distance(A, B)
# Determine signs based on whether A precedes or follows B
sign <- ifelse(start(A) < start(B), -1, 1)
# Apply sign to distance
dist * sign
}
Let’s make the sign to the distance.
library(dplyr)
library(ggplot2)
summit_distance <- signed_distance(TEAD4_summit[queryHits(hits)],
TAZ_summit[subjectHits(hits)])
distance_df<- table(summit_distance) %>%
tibble::as_tibble()
distance_df
## # A tibble: 393 × 2
## summit_distance n
## <chr> <int>
## 1 -245 1
## 2 -242 2
## 3 -241 1
## 4 -240 1
## 5 -236 1
## 6 -234 1
## 7 -232 1
## 8 -230 1
## 9 -228 1
## 10 -227 1
## # ℹ 383 more rows
Making a plot!
distance_df %>%
mutate(summit_distance = as.numeric(summit_distance)) %>%
arrange(summit_distance) %>% #To arrange according to the values
ggplot(aes(x=summit_distance, y = n)) +
geom_point()
We now see how each value makes a peak in the middle; then we change to
a line plot instead to reproduce the figure.
# Smooth the data by average the number of peaks per 5 bp bin.
df_binned <- distance_df %>%
mutate(summit_distance = as.numeric(summit_distance)) %>%
arrange(summit_distance) %>%
mutate(bin = floor(summit_distance / 5) * 5) %>% # Create bins by grouping every 5 bp
group_by(bin) %>%
summarise(n = mean(n, na.rm = TRUE)) # Calculate average 'n' for each bin
head(df_binned)
## # A tibble: 6 × 2
## bin n
## <dbl> <dbl>
## 1 -245 1.33
## 2 -240 1
## 3 -235 1
## 4 -230 1
## 5 -225 1.33
## 6 -220 1
Making a plot after smooth the data.
df_binned %>%
ggplot(aes(x=bin, y = n)) +
geom_line() +
scale_x_continuous(breaks = c(-250, 0, 250)) +
xlab("distance to the summit \nof TAZ peaks (bp)") +
ylab("peak density") +
theme_classic(base_size = 14)