implement faster innerjoin (#2612)

JuliaData · Feb 13, 2021 · 726b4e4 · 726b4e4
1 parent ecfc733
commit 726b4e4
Show file tree

Hide file tree

Showing 8 changed files with 887 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -33,6 +33,11 @@
 
 ## Other relevant changes
 
+* `innerjoin` is now much faster. It checks whether the passed data frames are sorted
+  by the `on` columns and takes into account whether the shorter of the joined data frames
+  has unique values in the `on` columns. These aspects of the input data frames might affect
+  the order of rows produced in the output
+  ([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612)).
 
 # DataFrames v0.22 Release Notes
 

diff --git a/Project.toml b/Project.toml
@@ -30,7 +30,7 @@ DataAPI = "1.4"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2"
-PooledArrays = "0.5, 1.0"
+PooledArrays = "1.1"
 PrettyTables = "0.11"
 Reexport = "0.1, 0.2, 1.0"
 SortingAlgorithms = "0.1, 0.2, 0.3"

diff --git a/benchmarks/innerjoin_performance.jl b/benchmarks/innerjoin_performance.jl
@@ -0,0 +1,96 @@
+using CategoricalArrays
+using DataFrames
+using PooledArrays
+using Random
+
+# Perform four full garbage collection passes in a row.
+# Invoked before each timed join in this script.
+function fullgc()
+    for _ in 1:4
+        GC.gc(true)
+    end
+    return nothing
+end
+
+# Command line arguments:
+#   1. llen: requested number of rows of the first key column (must exceed 1000)
+#   2. rlen: requested number of rows of the second key column (must exceed 2000)
+#   3. key column type: "int", "pool", "cat" or "str"
+#   4. key multiplicity: "uniq", "dup" or "manydup"
+#   5. key order: "sort" or "rand"
+#   6. number of `on` columns: "1" or "2"
+# Explicit errors are used instead of `@assert`, because assertions are meant
+# for internal invariants and may be disabled.
+length(ARGS) == 6 ||
+    throw(ArgumentError("expected 6 arguments, got $(length(ARGS))"))
+for (pos, allowed) in ((3, ["int", "pool", "cat", "str"]),
+                       (4, ["uniq", "dup", "manydup"]),
+                       (5, ["sort", "rand"]),
+                       (6, ["1", "2"]))
+    ARGS[pos] in allowed || throw(ArgumentError(
+        "argument $pos must be one of $allowed, got $(repr(ARGS[pos]))"))
+end
+
+@info ARGS
+
+llen = parse(Int, ARGS[1])
+rlen = parse(Int, ARGS[2])
+# the warm-up joins later in the script take the first 1000 and 2000 rows
+llen > 1000 || throw(ArgumentError("llen must be greater than 1000, got $llen"))
+rlen > 2000 || throw(ArgumentError("rlen must be greater than 2000, got $rlen"))
+
+# number of digits of the larger length; string keys are zero-padded to this width
+pad = ndigits(max(llen, rlen))
+
+# Build a join key column for a requested length `len`.
+#
+# `kind` selects the container ("int", "pool", "cat" or "str"), `nrep` is the
+# number of consecutive rows sharing each key, and `width` is the width to which
+# string keys are zero-padded. The result has `(len ÷ nrep) * nrep` rows holding
+# the keys `1:(len ÷ nrep)` in increasing order.
+function makecol(kind, len, nrep, width)
+    ids = 1:(len ÷ nrep)
+    kind == "int" && return repeat(ids, inner=nrep)
+    strs = repeat(string.(ids, pad=width), inner=nrep)
+    kind == "pool" && return PooledArray(strs)
+    kind == "cat" && return categorical(strs)
+    @assert kind == "str"
+    return strs
+end
+
+# "uniq" keeps every key once, "dup" twice and "manydup" twenty times
+nrep = Dict("uniq" => 1, "dup" => 2, "manydup" => 20)[ARGS[4]]
+
+# pooled and categorical columns are only generated with repeated keys
+if nrep == 1 && ARGS[3] in ("pool", "cat")
+    throw(ArgumentError("\"uniq\" keys are not supported for \"$(ARGS[3])\" columns"))
+end
+
+col1 = makecol(ARGS[3], llen, nrep, pad)
+col2 = makecol(ARGS[3], rlen, nrep, pad)
+
+# fixed seed, so the "rand" variant shuffles the keys identically on every run
+Random.seed!(1234)
+
+if ARGS[5] != "rand"
+    # nothing to do for "sort": the generated columns are left as they are
+    @assert ARGS[5] == "sort"
+else
+    for col in (col1, col2)
+        shuffle!(col)
+    end
+end
+
+# Build the two tables to join. With "2" each table gets two identical key
+# columns, so the join is performed on a pair of columns.
+if ARGS[6] == "1"
+    oncols = :id1
+    df1 = DataFrame(id1 = col1)
+    df2 = DataFrame(id1 = col2)
+else
+    @assert ARGS[6] == "2"
+    oncols = [:id1, :id2]
+    df1 = DataFrame(id1 = col1, id2 = col1)
+    # the second table must come from col2, so that its size follows `rlen`
+    # and the `1:2000` warm-up slice below stays in bounds
+    df2 = DataFrame(id1 = col2, id2 = col2)
+end
+
+# warm-up on small slices, so that compilation is kept out of the timings
+innerjoin(df1[1:1000, :], df2[1:2000, :], on=oncols)
+innerjoin(df2[1:2000, :], df1[1:1000, :], on=oncols)
+
+# time the join in both argument orders, collecting garbage before each run
+fullgc()
+@time innerjoin(df1, df2, on=oncols)
+fullgc()
+@time innerjoin(df2, df1, on=oncols)
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
@@ -0,0 +1,2 @@
+julia runtests.jl 100000 50000000
+julia runtests.jl 5000000 10000000
diff --git a/benchmarks/runtests.jl b/benchmarks/runtests.jl
@@ -0,0 +1,12 @@
+# Driver: runs innerjoin_performance.jl, each time in a separate Julia process,
+# for every benchmarked combination of options.
+# Usage: julia runtests.jl <llen> <rlen>
+length(ARGS) == 2 ||
+    throw(ArgumentError("usage: julia runtests.jl <llen> <rlen>"))
+file_loc = joinpath(@__DIR__, "innerjoin_performance.jl")
+llen, rlen = ARGS
+
+for a3 in ["str", "int", "pool", "cat"],
+    a4 in ["uniq", "dup", "manydup"],
+    a5 in ["sort", "rand"],
+    a6 in ["1", "2"]
+    # pooled and categorical keys are not benchmarked with unique values
+    a4 == "uniq" && a3 in ["pool", "cat"] && continue
+    # reuse the running Julia binary rather than whichever `julia` is on PATH
+    run(`$(Base.julia_cmd()) $file_loc $llen $rlen $a3 $a4 $a5 $a6`)
+end
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		julia runtests.jl 100000 50000000
		julia runtests.jl 5000000 10000000