[Solved] How can I find the overlapping values of these ranges in R? [duplicate]

Question

You could try using the GenomicRanges package.

library(dplyr)
library(GenomicRanges)

Here we load in the example input data. (This is an inelegant way to do this — I know… but I was lazy and Sublime's multiline edit made it easy.) Note: I don’t know what the “1” column means, but I kept it in the data.

# Query intervals, typed in row by row. Every value is entered as a string,
# so the coordinate columns still need converting to integers later on.
ranges <- as.data.frame(
  rbind(
    c("2", "12", "chr1", "836780", "856723", "-5.7648", "599"),
    c("3", "116", "chr1", "1693001", "1739032", "-4.8403", "473"),
    c("4", "117", "chr1", "1750780", "1880930", "-5.3036", "536"),
    c("5", "121", "chr1", "2020123", "2108890", "-4.4165", "415")
  )
)
names(ranges) <- c(
  "1", "bin", "chrom", "chromStart", "chromEnd", "name", "score"
)

# Candidate windows to intersect with `ranges`, typed in row by row. Every
# value is entered as a string, so the coordinates need converting later on.
viable <- as.data.frame(
  rbind(
    c("chr1", "840000", "890000", "1566"),
    c("chr1", "1690000", "1740000", "1566"),
    c("chr1", "1700000", "1750000", "1566"),
    c("chr1", "1710000", "1760000", "1566"),
    c("chr1", "1720000", "1770000", "1566"),
    c("chr1", "1730000", "1780000", "1566"),
    c("chr1", "1740000", "1790000", "1566"),
    c("chr1", "1750000", "1800000", "1566"),
    c("chr1", "1760000", "1810000", "1566")
  )
)
names(viable) <- c("chrom", "chromStart", "chromEnd", "N")

## The start/end columns must be integers before they can be used as range
## coordinates.
## - as.character() runs first so the conversion is also correct if the
##   columns are factors (as.data.frame() on a character matrix produces
##   factors in R < 4.0); as.integer() on a factor returns the level codes,
##   not the numbers.
## - as_tibble() replaces tbl_df(), which was deprecated in dplyr 1.0.0 and
##   now emits a deprecation warning.
ranges <-
  ranges %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )
viable <-
  viable %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )

Here is where my answer begins:

Reformat dataframe to GenomicRanges class
Find the regions by doing an intersection
Add the bin, name, and score columns back in using findOverlaps. (Note: this information is removed during the intersection because there is not necessarily a 1:1 mapping.)
Reformat output back into a dataframe

Done

# Convert both data frames to GRanges objects. The *.field arguments name the
# columns holding the sequence name and the coordinates;
# keep.extra.columns = TRUE keeps every other column as a metadata column.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
gr.ranges <-
  makeGRangesFromDataFrame(ranges,
                           keep.extra.columns = TRUE,
                           seqnames.field = "chrom",
                           start.field = "chromStart",
                           end.field = "chromEnd")
gr.viable <-
  makeGRangesFromDataFrame(viable,
                           keep.extra.columns = TRUE,
                           seqnames.field = "chrom",
                           start.field = "chromStart",
                           end.field = "chromEnd")

# Regions covered by both sets. The intersection carries only coordinates;
# the extra columns (bin, name, score, ...) are not kept.
gr.intersect <- GenomicRanges::intersect(gr.ranges, gr.viable)

# Match each intersected region back to the original range(s) it overlaps so
# the dropped columns can be restored.
gr.hits <- GenomicRanges::findOverlaps(gr.intersect, gr.ranges)

# One entry per (intersected region, original range) hit, carrying the
# metadata columns of the matching original range.
output <- gr.intersect[queryHits(gr.hits)]
mcols(output) <- mcols(gr.ranges[subjectHits(gr.hits)])
output

# Back to a plain data frame, restoring the input column names and order.
# The column originally called "1" is picked up here under the name X1.
as.data.frame(output) %>%
  select(
    `1` = X1,
    bin,
    chrom = seqnames,
    chromStart = start,
    chromEnd = end,
    name,
    score
  )

Accepted Answer

You could try using the GenomicRanges package.

library(dplyr)
library(GenomicRanges)

Here we load in the example input data. (This is an inelegant way to do this — I know… but I was lazy and Sublime's multiline edit made it easy.) Note: I don’t know what the “1” column means, but I kept it in the data.

# Query intervals, typed in row by row. Every value is entered as a string,
# so the coordinate columns still need converting to integers later on.
ranges <- as.data.frame(
  rbind(
    c("2", "12", "chr1", "836780", "856723", "-5.7648", "599"),
    c("3", "116", "chr1", "1693001", "1739032", "-4.8403", "473"),
    c("4", "117", "chr1", "1750780", "1880930", "-5.3036", "536"),
    c("5", "121", "chr1", "2020123", "2108890", "-4.4165", "415")
  )
)
names(ranges) <- c(
  "1", "bin", "chrom", "chromStart", "chromEnd", "name", "score"
)

# Candidate windows to intersect with `ranges`, typed in row by row. Every
# value is entered as a string, so the coordinates need converting later on.
viable <- as.data.frame(
  rbind(
    c("chr1", "840000", "890000", "1566"),
    c("chr1", "1690000", "1740000", "1566"),
    c("chr1", "1700000", "1750000", "1566"),
    c("chr1", "1710000", "1760000", "1566"),
    c("chr1", "1720000", "1770000", "1566"),
    c("chr1", "1730000", "1780000", "1566"),
    c("chr1", "1740000", "1790000", "1566"),
    c("chr1", "1750000", "1800000", "1566"),
    c("chr1", "1760000", "1810000", "1566")
  )
)
names(viable) <- c("chrom", "chromStart", "chromEnd", "N")

## The start/end columns must be integers before they can be used as range
## coordinates.
## - as.character() runs first so the conversion is also correct if the
##   columns are factors (as.data.frame() on a character matrix produces
##   factors in R < 4.0); as.integer() on a factor returns the level codes,
##   not the numbers.
## - as_tibble() replaces tbl_df(), which was deprecated in dplyr 1.0.0 and
##   now emits a deprecation warning.
ranges <-
  ranges %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )
viable <-
  viable %>%
  as_tibble() %>%
  mutate(
    chromStart = as.integer(as.character(chromStart)),
    chromEnd = as.integer(as.character(chromEnd))
  )

Here is where my answer begins:

Reformat dataframe to GenomicRanges class
Find the regions by doing an intersection
Add the bin, name, and score columns back in using findOverlaps. (Note: this information is removed during the intersection because there is not necessarily a 1:1 mapping.)
Reformat output back into a dataframe

Done

# Convert both data frames to GRanges objects. The *.field arguments name the
# columns holding the sequence name and the coordinates;
# keep.extra.columns = TRUE keeps every other column as a metadata column.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
gr.ranges <-
  makeGRangesFromDataFrame(ranges,
                           keep.extra.columns = TRUE,
                           seqnames.field = "chrom",
                           start.field = "chromStart",
                           end.field = "chromEnd")
gr.viable <-
  makeGRangesFromDataFrame(viable,
                           keep.extra.columns = TRUE,
                           seqnames.field = "chrom",
                           start.field = "chromStart",
                           end.field = "chromEnd")

# Regions covered by both sets. The intersection carries only coordinates;
# the extra columns (bin, name, score, ...) are not kept.
gr.intersect <- GenomicRanges::intersect(gr.ranges, gr.viable)

# Match each intersected region back to the original range(s) it overlaps so
# the dropped columns can be restored.
gr.hits <- GenomicRanges::findOverlaps(gr.intersect, gr.ranges)

# One entry per (intersected region, original range) hit, carrying the
# metadata columns of the matching original range.
output <- gr.intersect[queryHits(gr.hits)]
mcols(output) <- mcols(gr.ranges[subjectHits(gr.hits)])
output

# Back to a plain data frame, restoring the input column names and order.
# The column originally called "1" is picked up here under the name X1.
as.data.frame(output) %>%
  select(
    `1` = X1,
    bin,
    chrom = seqnames,
    chromStart = start,
    chromEnd = end,
    name,
    score
  )