2b: Determining Interval Overlap with TypedTables

Author

Douglas Bates and Claudia Solis-Lemus

Published

2022-07-30

This is a re-formulation of the functions in “2b: Determining Interval Overlap” using TypedTables.jl in place of DataFrames.jl and Tables.jl

Load packages to be used

Code
using Arrow          # Arrow storage and file format
using BenchmarkTools # tools for benchmarking code
using RangeTrees     # a bespoke implementation of interval trees
using TypedTables         # row- or column-oriented tabular data

using Base: intersect! # not exported from Base

datadir = joinpath(@__DIR__, "biofast-data-v1");
asrange(start, stop) = (start+one(start)):stop
asrange (generic function with 1 method)
function chromodict(tbl::Table)
  r1 = first(tbl)
  itype = promote_type(typeof(r1.start), typeof(r1.stop))
  vtype = Vector{UnitRange{itype}}
  dict = Dict{Symbol,vtype}()
  for (; chromo, start, stop) in tbl
    push!(get!(dict, Symbol(chromo), vtype()), asrange(start, stop))
  end
  return dict
end
function chromodict(fnm::AbstractString)
  return chromodict(Table(Arrow.Table(joinpath(datadir, fnm))))
end
tarrngvecs = chromodict("ex-rna.arrow")
refrngvecs = chromodict("ex-anno.arrow")
Dict{Symbol, Vector{UnitRange{Int32}}} with 24 entries:
  :chr21 => [5011799:5011874, 5012548:5012687, 5014386:5014471, 5016935:5017145…
  :chr15 => [19878555:19878668, 19878831:19879004, 19881201:19881307, 19882277:…
  :chr10 => [14497:14604, 14061:14299, 16502:16544, 14138:14299, 44712:44901, 4…
  :chr17 => [76723:76866, 75814:75878, 71366:71556, 65830:65887, 64099:65736, 1…
  :chr07 => [12704:12822, 26965:27199, 24314:24365, 26965:27234, 31060:31194, 2…
  :chr14 => [16057472:16057622, 18333726:18333900, 18337973:18338078, 18338243:…
  :chr08 => [64269:64320, 64091:64175, 72601:72673, 78905:79775, 72617:72701, 7…
  :chr12 => [12310:12358, 12740:12824, 13102:13201, 13370:13501, 31878:32015, 2…
  :chr18 => [11103:11595, 15617:15822, 11191:11595, 13152:13354, 15617:15928, 1…
  :chrX  => [253743:253846, 254937:255091, 276322:276394, 281482:281684, 284167…
  :chr13 => [18177555:18178465, 18176018:18176170, 18174442:18174512, 18174010:…
  :chr11 => [75780:76143, 86649:87586, 125578:125927, 121258:121426, 113116:113…
  :chr22 => [10736171:10736283, 10961283:10961338, 10959067:10959136, 10950049:…
  :chr03 => [23757:23812, 23968:24501, 54293:54346, 53348:53692, 196607:196859,…
  :chr19 => [70928:70976, 66346:66499, 60951:61894, 62113:66524, 70928:70951, 6…
  :chr05 => [58198:58915, 92151:92276, 113251:113448, 139483:140716, 143047:143…
  :chr06 => [95124:95454, 105919:106856, 144536:144885, 140211:140379, 131910:1…
  :chr20 => [87250:87359, 96005:97094, 87710:87767, 96005:96533, 142369:142686,…
  :chrY  => [2784749:2784853, 2786855:2787699, 2789827:2790328, 2827982:2828218…
  :chr04 => [49096:49956, 49554:50124, 53285:53491, 59430:59556, 60058:60153, 8…
  :chr02 => [45440:46385, 38814:41627, 42809:42952, 41220:41627, 46807:46870, 4…
  :chr01 => [11869:12227, 12613:12721, 13221:14409, 12010:12057, 12179:12227, 1…
  :chr09 => [12134:12190, 12291:12340, 12726:12834, 13088:13157, 13338:13487, 1…
  :chr16 => [11555:11908, 12294:12402, 12902:14090, 11861:11908, 12294:12378, 1…
@code_warntype chromodict(Table(Arrow.Table("biofast-data-v1/ex-anno.arrow")))
MethodInstance for chromodict(::
Table{NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}, 1, NamedTuple{(:chromo, :start, :stop), Tuple{Arrow.DictEncoded{String, Int8, Arrow.List{String, Int32, Vector{UInt8}}}, Arrow.Primitive{Int32, Vector{Int32}}, Arrow.Primitive{Int32, Vector{Int32}}}}})
  from chromodict(tbl::Table) in Main at In[4]:1
Arguments
  #self#::Core.Const(chromodict)
  tbl::Table{NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}, 1, NamedTuple{(:chromo, :start, :stop), Tuple{Arrow.DictEncoded{String, Int8, Arrow.List{String, Int32, Vector{UInt8}}}, Arrow.Primitive{Int32, Vector{Int32}}, Arrow.Primitive{Int32, Vector{Int32}}}}}
Locals
  @_3::Union{Nothing, Tuple{NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}, Tuple{Base.OneTo{Int64}, Int64}}}
  dict::Dict{Symbol, Vector{UnitRange{Int32}}}
  vtype::Type{Vector{UnitRange{Int32}}}
  itype::Type{Int32}
  r1::NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}
  stop::Int32
  start::Int32
  chromo::String
Body::Dict{Symbol, Vector{UnitRange{Int32}}}
1 ─       
(r1 = Main.first(tbl))

│   %2  = Base.getproperty(r1, :start)::Int32
│   %3  = Main.typeof(%2)::Core.Const(Int32)
│   %4  = Base.getproperty(r1, :stop)::Int32
│   %5  = Main.typeof(%4)::Core.Const(Int32)
│         (itype = Main.promote_type(%3, %5))
│   %7  = Core.apply_type(Main.UnitRange, itype
::Core.Const(Int32))::Core.Const(UnitRange{Int32})
│         (vtype = Core.apply_type(Main.Vector, %7))
│   %9  = Core.apply_type(Main.Dict, Main.Symbol, vtype::Core.Const(Vector{UnitRange{Int32}}))::Core.Const(Dict{Symbol, Vector{UnitRange{Int32}}})
│         (dict = (%9)())
│   %11 = tbl::Table{NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}, 1, NamedTuple{(:chromo, :start, :stop), Tuple{Arrow.DictEncoded{String, Int8, Arrow.List{String, Int32, Vector{UInt8}}}, Arrow.Primitive{Int32, Vector{Int32}}, Arrow.Primitive{Int32, Vector{Int32}}}}}
│         (@_3 = Base.iterate(%11))
│   %13 = (@_3 === nothing)::Bool
│   %14 = Base.not_int(%13)::Bool
└──       goto #4 if not %14
2 ┄ %16 = @_3::Tuple{NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}, Tuple{Base.OneTo{Int64}, Int64}}
│   %17 = Core.getfield(%16, 1)::NamedTuple{(:chromo, :start, :stop), Tuple{String, Int32, Int32}}
│         (chromo = Base.getproperty(%17, :chromo))
│         (start = Base.getproperty(%17, :start))
│         (stop = Base.getproperty(%17, :stop))
│   %21 = Core.getfield(%16, 2)::Tuple{Base.OneTo{Int64}, Int64}
│   %22 = dict::Dict{Symbol, Vector{UnitRange{Int32}}}
│   %23 = Main.Symbol(chromo)::Symbol
│   %24 = (vtype::Core.Const(Vector{UnitRange{Int32}}))()::Vector{UnitRange{Int32}}
│   %25 = Main.get!(%22, %23, %24)::Vector{UnitRange{Int32}}
│   %26 = Main.asrange(start, stop)::UnitRange{Int32}
│         Main.push!(%25, %26)
│         (@_3 = Base.iterate(%11, %21))
│   %29 = (@_3 === nothing)::Bool
│   %30 = Base.not_int(%29)::Bool
└──       goto #4 if not %30
3 ─       goto #2
4 ┄       return dict
let
  function lt(x::UnitRange{T}, y::UnitRange{T}) where {T}
    fx, fy = first(x), first(y)
    return fx == fy ? last(x) < last(y) : fx < fy
  end
  for v in values(refrngvecs)
    sort!(v; lt)
  end
end
refrngvecs  # note changes in refrngvecs[:chr01]
Dict{Symbol, Vector{UnitRange{Int32}}} with 24 entries:
  :chr21 => [5011799:5011874, 5012548:5012687, 5014386:5014471, 5016935:5017145…
  :chr15 => [19878555:19878668, 19878831:19879004, 19881201:19881307, 19882277:…
  :chr10 => [14061:14299, 14138:14299, 14497:14604, 16502:16544, 44712:44901, 4…
  :chr17 => [64099:65736, 65830:65887, 71366:71556, 75814:75878, 76723:76866, 8…
  :chr07 => [12704:12822, 19018:19172, 19619:19895, 20834:21029, 24314:24365, 2…
  :chr14 => [16057472:16057622, 18333726:18333900, 18333826:18333896, 18337973:…
  :chr08 => [64091:64175, 64269:64320, 72601:72673, 72617:72701, 78905:79244, 7…
  :chr12 => [12310:12358, 12740:12824, 13102:13201, 13370:13501, 14522:14944, 1…
  :chr18 => [11103:11595, 11191:11595, 13152:13354, 14195:14653, 14490:14653, 1…
  :chrX  => [253743:253846, 254937:255091, 276322:276394, 276324:276394, 276353…
  :chr13 => [18174010:18174103, 18174442:18174512, 18176018:18176170, 18177555:…
  :chr11 => [75780:76143, 86649:87586, 112967:113111, 113116:113174, 121258:121…
  :chr22 => [10736171:10736283, 10939388:10939423, 10940597:10940707, 10941691:…
  :chr03 => [23757:23812, 23968:24501, 53348:53692, 54293:54346, 195758:195914,…
  :chr19 => [60951:61894, 62113:66524, 63821:64213, 65051:65226, 65822:66047, 6…
  :chr05 => [58198:58915, 92151:92276, 113251:113448, 139483:140716, 140258:140…
  :chr06 => [95124:95454, 105919:106856, 131910:132117, 140211:140379, 142272:1…
  :chr20 => [87250:87359, 87710:87767, 96005:96533, 96005:97094, 142369:142686,…
  :chrY  => [2784749:2784853, 2786855:2787699, 2789827:2790328, 2827982:2828218…
  :chr04 => [49096:49956, 49554:50124, 53285:53491, 53286:53491, 53295:53491, 5…
  :chr02 => [38814:41627, 41220:41627, 41221:41627, 42809:42952, 45440:46385, 4…
  :chr01 => [11869:12227, 12010:12057, 12179:12227, 12613:12697, 12613:12721, 1…
  :chr09 => [12134:12190, 12291:12340, 12726:12834, 13088:13157, 13338:13487, 1…
  :chr16 => [11555:11908, 11861:11908, 12294:12378, 12294:12402, 12663:12733, 1…
refrngvec01 = refrngvecs[:chr01]
tarrngvec01 = tarrngvecs[:chr01]
target = last(tarrngvec01)
182839365:182887745

RangeTrees

refrngtrees = Dict(k => RangeNode(v) for (k, v) in refrngvecs)
rangetree01 = refrngtrees[:chr01]
treesize(rangetree01), treeheight(rangetree01), treebreadth(rangetree01)
(52789, 15, 20022)
result = similar(tarrngvec01, 0)
savedresult = intersect!(result, target, rangetree01)
@benchmark intersect!(res, tar, ref) setup =
  (res = result; tar = target; ref = rangetree01)
BenchmarkTools.Trial: 10000 samples with 200 evaluations.
 Range (minmax):  388.020 ns985.650 ns   GC (min … max): 0.00% … 0.00%
 Time  (median):     418.507 ns                GC (median):    0.00%
 Time  (mean ± σ):   434.928 ns ±  59.613 ns   GC (mean ± σ):  0.00% ± 0.00%
     █▄                                                         
  ▃▄▃██▅▄▅▇▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  388 ns           Histogram: frequency by time          751 ns <
 Memory estimate: 0 bytes, allocs estimate: 0.
function coveragecount(
  target::AbstractUnitRange,
  isects::Vector{UnitRange{T}},
) where {T}
  leftpost, rightpost = T(first(target)), T(last(target))
  coverage = 0
  for isect in isects
    coverage += length(intersect(isect, leftpost:rightpost))
    leftpost = max(leftpost, last(isect) + one(T))
  end
  return coverage
end
coveragecount(target, savedresult)
5639

Iterating over a collection of targets

function overlaps(targets::Vector{UnitRange{T}}, refs::RangeNode{T}) where {T}
  nover = similar(targets, Int)
  covcount = similar(targets, T)
  result = empty!(similar(targets, ceil(Int, sqrt(treesize(refs)))))
  @inbounds for (i, tar) in enumerate(targets)
    nover[i] = length(intersect!(result, tar, refs))
    covcount[i] = coveragecount(tar, result)
  end
  Table(targets = targets, nover = nover, covcount = covcount)
end
overlaps(tarrngvec01, rangetree01)
Table with 3 columns and 453114 rows:
      targets              nover  covcount
    ┌─────────────────────────────────────
 1  │ 165655371:165655620  3      250
 2  │ 84498368:84506172    6      676
 3  │ 44777749:44778638    11     837
 4  │ 23692611:23696418    9      1398
 5  │ 23692608:23696418    9      1401
 6  │ 23962516:23963450    1      935
 7  │ 23692686:23696419    9      1324
 8  │ 114717978:114720613  15     1127
 9  │ 116392931:116404775  19     2762
 10 │ 25567642:25568695    2      1054
 11 │ 23961575:23962496    2      922
 12 │ 8861023:8870500      14     1777
 13 │ 96678807:96679446    1      573
 14 │ 37536543:37554276    12     4139
 15 │ 155308882:155320663  56     5081
 16 │ 65230872:65232127    2      1256
 17 │ 23692609:23696399    9      1381
 18 │ 181089237:181090826  1      1590
 19 │ 185118995:185120707  4      1602
 20 │ 203861621:203870081  15     987
 21 │ 153990779:153992127  10     903
 22 │ 42658437:42676757    19     1431
 23 │ 2309508:2310113      1      606
 ⋮  │          ⋮             ⋮       ⋮
@benchmark overlaps(targets, refs) setup =
  (targets = tarrngvec01; refs = rangetree01)
BenchmarkTools.Trial: 40 samples with 1 evaluation.
 Range (minmax):  124.392 ms136.181 ms   GC (min … max): 0.00% … 8.62%
 Time  (median):     125.896 ms                GC (median):    0.00%
 Time  (mean ± σ):   126.966 ms ±   3.109 ms   GC (mean ± σ):  0.83% ± 2.15%
  ▅▅▂█     ▅                                                
  █████▁▅█▁▁▁███▁▁▅▁▅▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▁▅▁▁▁▁▅▁▁▁▁▁▁▅▁▁▅ ▁
  124 ms           Histogram: frequency by time          136 ms <
 Memory estimate: 17.58 MiB, allocs estimate: 158501.

The whole shootin’ match

  • The computations on different chromosome are independent of each other and can be assigned to different threads when Julia is started with multiple threads.
function overlaps(
  tardict::Dict{Symbol,Vector{UnitRange{T}}},
  refdict::Dict{Symbol,RangeNode{T,R}},
) where {T,R}
  result = Dict{Symbol,Table}()
  @sync for k in intersect(keys(tardict), keys(refdict))
    Threads.@spawn result[k] = overlaps(tardict[k], refdict[k])
  end
  return result
end
bigresult = overlaps(tarrngvecs, refrngtrees);
bigresult[:chrX]
Table with 3 columns and 150330 rows:
      targets              nover  covcount
    ┌─────────────────────────────────────
 1  │ 72272603:72277210    15     1955
 2  │ 119786512:119791605  7      2123
 3  │ 77826515:77829079    3      2565
 4  │ 79241981:79363480    12     2619
 5  │ 12975122:12977227    8      1691
 6  │ 119786513:119791613  7      2130
 7  │ 12975119:12977227    8      1694
 8  │ 24465328:24534418    16     2115
 9  │ 12976909:12977217    3      309
 10 │ 154400549:154400875  12     327
 11 │ 12976881:12977227    3      347
 12 │ 65738963:65741915    2      2412
 13 │ 154398500:154400899  27     1950
 14 │ 119786512:119791640  7      2158
 15 │ 24077771:24077957    1      187
 16 │ 12975119:12977224    8      1691
 17 │ 12976265:12977227    5      548
 18 │ 12975128:12977212    8      1670
 19 │ 12976915:12977223    3      309
 20 │ 12975126:12977227    8      1687
 21 │ 78049074:78049922    2      849
 22 │ 77899484:77905383    7      988
 23 │ 1386273:1386685      1      413
 ⋮  │          ⋮             ⋮       ⋮
@benchmark overlaps(targets, refs) setup =
  (targets = tarrngvecs; refs = refrngtrees)
BenchmarkTools.Trial: 4 samples with 1 evaluation.
 Range (minmax):  1.302 s 1.318 s   GC (min … max): 0.82% … 0.89%
 Time  (median):     1.307 s              GC (median):    0.77%
 Time  (mean ± σ):   1.308 s ± 6.957 ms   GC (mean ± σ):  0.78% ± 0.10%
  ▁                                                     ▁  
  █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  1.3 s         Histogram: frequency by time        1.32 s <
 Memory estimate: 188.77 MiB, allocs estimate: 1723485.

Version information

versioninfo()
Julia Version 1.8.0-rc3
Commit 33f19bcbd25 (2022-07-13 19:10 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, tigerlake)
  Threads: 1 on 8 virtual cores