You could try using the GenomicRanges
package.
library(dplyr)
library(GenomicRanges)
Here we load in the example input data. (This is an inelegant way to do this, I know, but I was lazy and Sublime's multi-line edit made it easy.) Note: I don’t know what the “1” column means, but I kept it in the data.
# Example input: the regions of interest, one row per region.
# Every field is typed in as a quoted string, so no column is numeric yet.
ranges <- as.data.frame(
  rbind(
    c("2", "12", "chr1", "836780", "856723", "-5.7648", "599"),
    c("3", "116", "chr1", "1693001", "1739032", "-4.8403", "473"),
    c("4", "117", "chr1", "1750780", "1880930", "-5.3036", "536"),
    c("5", "121", "chr1", "2020123", "2108890", "-4.4165", "415")
  )
)
# Column labels; the first column is literally named "1".
colnames(ranges) <- c(
  "1", "bin", "chrom", "chromStart", "chromEnd", "name", "score"
)
# Example input: the "viable" windows to intersect the regions with.
# As with the regions, every field is typed in as a quoted string.
viable <- as.data.frame(
  rbind(
    c("chr1", "840000", "890000", "1566"),
    c("chr1", "1690000", "1740000", "1566"),
    c("chr1", "1700000", "1750000", "1566"),
    c("chr1", "1710000", "1760000", "1566"),
    c("chr1", "1720000", "1770000", "1566"),
    c("chr1", "1730000", "1780000", "1566"),
    c("chr1", "1740000", "1790000", "1566"),
    c("chr1", "1750000", "1800000", "1566"),
    c("chr1", "1760000", "1810000", "1566")
  )
)
# Column labels for the windows.
colnames(viable) <- c("chrom", "chromStart", "chromEnd", "N")
## Need columns to be integers
# The coordinates were entered as strings, so they are converted before the
# GRanges objects are built. Going through as.character() first keeps the
# conversion correct even when a column is a factor: as.integer() applied
# directly to a factor returns the internal level codes, not the values.
# as_tibble() is used in place of tbl_df(), which is deprecated in dplyr.
ranges <-
  ranges %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )
viable <-
  viable %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )
Here is where my answer begins:
- Reformat dataframe to GenomicRanges class
- Find the regions by doing an intersection
- Add in the bin, name, and score columns using `findOverlaps`. (Note: this information is removed during the intersection because there is not necessarily a 1:1 mapping.)
- Reformat the output back into a dataframe

Done
# Convert both data frames to GRanges objects. The chrom / chromStart /
# chromEnd columns supply the sequence name and coordinates; the remaining
# columns are kept as metadata (keep.extra.columns = TRUE).
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
gr.ranges <-
  makeGRangesFromDataFrame(
    ranges,
    keep.extra.columns = TRUE,
    seqnames.field = "chrom",
    start.field = "chromStart",
    end.field = "chromEnd"
  )
gr.viable <-
  makeGRangesFromDataFrame(
    viable,
    keep.extra.columns = TRUE,
    seqnames.field = "chrom",
    start.field = "chromStart",
    end.field = "chromEnd"
  )
# Step 1: the stretches shared by the regions and the viable windows.
# NOTE(review): intersect() appears to return a merged range set, so regions
# in gr.ranges that abut or overlap each other may come back fused - confirm
# this is acceptable for real data.
gr.intersect <- GenomicRanges::intersect(gr.ranges, gr.viable)

# Step 2: map each shared stretch back to the region(s) it overlaps, so the
# columns other than chrom/start/end can be re-attached.
gr.hits <- GenomicRanges::findOverlaps(
  query = gr.intersect,
  subject = gr.ranges
)

# One output row per hit: coordinates come from the intersection, metadata
# columns come from the matching original region.
output <- gr.intersect[queryHits(gr.hits)]
mcols(output) <- mcols(gr.ranges[subjectHits(gr.hits)])
output
# Reformat to dataframe
# Convert the GRanges result to a plain data frame and restore the input
# column names and order. The column selected as X1 is renamed back to "1".
# select() is namespace-qualified because other attached packages (for
# example AnnotationDbi or MASS) export their own select(), which would mask
# dplyr's version.
output %>%
  as.data.frame() %>%
  dplyr::select(
    `1` = X1,
    bin,
    chrom = seqnames,
    chromStart = start,
    chromEnd = end,
    name,
    score
  )
solved How can I find the overlapping values of these ranges in R? [duplicate]