JuliaData · bkamins · Nov 2, 2020 · Oct 29, 2020 · Oct 29, 2020 · Oct 29, 2020
diff --git a/NEWS.md b/NEWS.md
@@ -41,8 +41,8 @@
 * in `describe` the specification of custom aggregation is now `function => name`;
   old `name => function` order is now deprecated
   ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401))
-* in joins passing `NaN` in `on` column now throws an error and
-  passing `missing` thows an error unless `matchmissing=:equal` keyword argument
+* in joins passing `NaN` or real or imaginary `-0.0` in `on` column now throws an
+  error; passing `missing` throws an error unless `matchmissing=:equal` keyword argument
   is passed ([#2504](https://github.com/JuliaData/DataFrames.jl/pull/2504))
 * `unstack` now produces row and column keys in the order of their first appearance
    and has two new keyword arguments `allowmissing` and `allowduplicates`

diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -57,8 +57,13 @@ struct DataFrameJoiner
         end
 
         for df in (dfl_on, dfr_on), col in eachcol(df)
-            if any(x -> (x isa Union{Complex, Real}) & isnan(x), col)
-                throw(ArgumentError("NaN values in key columns are not allowed"))
+            # reject NaN and negative zero; `isequal` also catches -0.0 of any float type
+            if any(x -> (x isa Union{Complex, Real}) &&
+                        (isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
+                throw(ArgumentError("currently for numeric values NaN and `-0.0` in " *
+                                    "their real or imaginary components are not allowed. " *
+                                    "Use CategoricalArrays.jl to wrap these values into " *
+                                    "CategoricalVector to perform the requested join."))
             end
         end
 
@@ -508,6 +513,11 @@ The order of rows in the result is undefined and may change in the future releas
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
   matched (`isequal` is used for comparisons of rows for equality)
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -642,6 +652,11 @@ The order of rows in the result is undefined and may change in the future releas
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -766,6 +781,11 @@ The order of rows in the result is undefined and may change in the future releas
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -895,6 +915,11 @@ The order of rows in the result is undefined and may change in the future releas
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -1023,6 +1048,11 @@ The order of rows in the result is undefined and may change in the future releas
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
   matched (`isequal` is used for comparisons of rows for equality)
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -1123,6 +1153,11 @@ The order of rows in the result is undefined and may change in the future releas
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
   matched (`isequal` is used for comparisons of rows for equality)
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in the real or
+imaginary part of a number. If you need to perform a join on such values, use
+CategoricalArrays.jl to convert the column containing them into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.

diff --git a/test/join.jl b/test/join.jl
@@ -573,25 +573,57 @@ end
     @test innerjoin(name, job, on=:ID, validate=(false, false)) == inner
 
     # Make sure ok with various special values
-    for special in [missing, NaN, 0.0, -0.0]
+    for special in [missing, NaN, -0.0]
         name_w_special = DataFrame(ID = [1, 2, 3, special],
                                    Name = ["John Doe", "Jane Doe", "Joe Blogs", "Maria Tester"])
-        @test innerjoin(name_w_special, job, on=:ID, validate=(true, false)) == inner
+        @test_throws ArgumentError innerjoin(name_w_special, job, on=:ID)
+        @test_throws ArgumentError leftjoin(name_w_special, job, on=:ID)
+        @test_throws ArgumentError rightjoin(name_w_special, job, on=:ID)
+        @test_throws ArgumentError outerjoin(name_w_special, job, on=:ID)
+        @test_throws ArgumentError semijoin(name_w_special, job, on=:ID)
+        @test_throws ArgumentError antijoin(name_w_special, job, on=:ID)
+    end
+
+    for special in [missing, 0.0]
+        name_w_special = DataFrame(ID = [1, 2, 3, special],
+                                   Name = ["John Doe", "Jane Doe", "Joe Blogs", "Maria Tester"])
+        @test innerjoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal) ≅ inner
+        @test leftjoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal) ≅
+              vcat(left, DataFrame(ID=special, Name="Maria Tester", Job=missing))
+        @test rightjoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal) ≅ right
+        @test outerjoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal)[[1:4;6;5], :] ≅
+              vcat(outer, DataFrame(ID=special, Name="Maria Tester", Job=missing))
+        @test semijoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal) ≅ semi
+        @test antijoin(name_w_special, job, on=:ID, validate=(true, false), matchmissing=:equal) ≅
+              vcat(anti, DataFrame(ID=special, Name="Maria Tester"))
 
         # Make sure duplicated special values still an exception
         name_w_special_dups = DataFrame(ID = [1, 2, 3, special, special],
                                         Name = ["John Doe", "Jane Doe", "Joe Blogs",
                                                 "Maria Tester", "Jill Jillerson"])
         @test_throws ArgumentError innerjoin(name_w_special_dups, name, on=:ID,
+                                        validate=(true, false), matchmissing=:equal)
+    end
+
+    for special in [NaN, -0.0]
+        name_w_special = DataFrame(ID = categorical([1, 2, 3, special]),
+                                   Name = ["John Doe", "Jane Doe", "Joe Blogs", "Maria Tester"])
+        @test innerjoin(name_w_special, categorical(job, :ID), on=:ID, validate=(true, false)) == inner
+
+        # Make sure duplicated special values still an exception
+        name_w_special_dups = DataFrame(ID = categorical([1, 2, 3, special, special]),
+                                        Name = ["John Doe", "Jane Doe", "Joe Blogs",
+                                                "Maria Tester", "Jill Jillerson"])
+        @test_throws ArgumentError innerjoin(name_w_special_dups, categorical(name, :ID), on=:ID,
                                         validate=(true, false))
     end
 
     # Check 0.0 and -0.0 seen as different
-    name_w_zeros = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
+    name_w_zeros = DataFrame(ID = categorical([1, 2, 3, 0.0, -0.0]),
                              Name = ["John Doe", "Jane Doe",
                                      "Joe Blogs", "Maria Tester",
                                      "Jill Jillerson"])
-    name_w_zeros2 = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
+    name_w_zeros2 = DataFrame(ID = categorical([1, 2, 3, 0.0, -0.0]),
                               Name = ["John Doe", "Jane Doe",
                                       "Joe Blogs", "Maria Tester",
                                       "Jill Jillerson"],