diff --git a/docs/src/lib/driver.md b/docs/src/lib/driver.md
index b4247c1cd1..af97a0f0ba 100644
--- a/docs/src/lib/driver.md
+++ b/docs/src/lib/driver.md
@@ -143,7 +143,7 @@ CUDA.total_memory
 
 ```@docs
 CuStream
-CUDA.query(::CuStream)
+CUDA.isdone(::CuStream)
 priority_range
 priority
 synchronize(::CuStream)
@@ -153,9 +153,9 @@ CUDA.@sync
 For specific use cases, special streams are available:
 
 ```@docs
-CuDefaultStream
-CuStreamLegacy
-CuStreamPerThread
+default_stream
+legacy_stream
+per_thread_stream
 ```
 
 ## Event Management
@@ -164,7 +164,7 @@ CuStreamPerThread
 CuEvent
 record
 synchronize(::CuEvent)
-CUDA.query(::CuEvent)
+CUDA.isdone(::CuEvent)
 CUDA.wait(::CuEvent)
 elapsed
 CUDA.@elapsed
diff --git a/lib/cudadrv/devices.jl b/lib/cudadrv/devices.jl
index bc813a7806..350a7e578c 100644
--- a/lib/cudadrv/devices.jl
+++ b/lib/cudadrv/devices.jl
@@ -63,9 +63,6 @@ const DEVICE_INVALID = _CuDevice(CUdevice(-2))
 
 Base.convert(::Type{CUdevice}, dev::CuDevice) = dev.handle
 
-Base.:(==)(a::CuDevice, b::CuDevice) = a.handle == b.handle
-Base.hash(dev::CuDevice, h::UInt) = hash(dev.handle, h)
-
 function Base.show(io::IO, ::MIME"text/plain", dev::CuDevice)
   print(io, "CuDevice($(dev.handle)): ")
   if dev == DEVICE_CPU
diff --git a/lib/cudadrv/events.jl b/lib/cudadrv/events.jl
index d2d1377328..4e896a1161 100644
--- a/lib/cudadrv/events.jl
+++ b/lib/cudadrv/events.jl
@@ -50,12 +50,12 @@ Waits for an event to complete.
 synchronize(e::CuEvent) = cuEventSynchronize(e)
 
 """
-    query(e::CuEvent)
+    isdone(e::CuEvent)
 
 Return `false` if there is outstanding work preceding the most recent
 call to `record(e)` and `true` if all captured work has been completed.
 """
-function query(e::CuEvent)
+function isdone(e::CuEvent)
     res = unsafe_cuEventQuery(e)
     if res == ERROR_NOT_READY
         return false
diff --git a/lib/cudadrv/stream.jl b/lib/cudadrv/stream.jl
index dab625d0d5..bd83748a84 100644
--- a/lib/cudadrv/stream.jl
+++ b/lib/cudadrv/stream.jl
@@ -1,7 +1,7 @@
 # Stream management
 
 export
-    CuStream, CuDefaultStream, CuStreamLegacy, CuStreamPerThread,
+    CuStream, default_stream, legacy_stream, per_thread_stream,
     priority, priority_range, synchronize, device_synchronize
 
 """
@@ -29,15 +29,15 @@ mutable struct CuStream
         return obj
     end
 
-    global CuDefaultStream() = new(convert(CUstream, C_NULL), nothing)
+    global default_stream() = new(convert(CUstream, C_NULL), nothing)
 
-    global CuStreamLegacy() = new(convert(CUstream, 1), nothing)
+    global legacy_stream() = new(convert(CUstream, 1), nothing)
 
-    global CuStreamPerThread() = new(convert(CUstream, 2), nothing)
+    global per_thread_stream() = new(convert(CUstream, 2), nothing)
 end
 
 """
-    CuDefaultStream()
+    default_stream()
 
 Return the default stream.
 
@@ -46,10 +46,10 @@ Return the default stream.
     It is generally better to use `stream()` to get a stream object that's local to the
     current task. That way, operations scheduled in other tasks can overlap.
 """
-CuDefaultStream()
+default_stream()
 
 """
-    CuStreamLegacy()
+    legacy_stream()
 
 Return a special object to use use an implicit stream with legacy synchronization behavior.
 
@@ -57,10 +57,10 @@ You can use this stream to perform operations that should block on all streams (
 exception of streams created with `STREAM_NON_BLOCKING`). This matches the old pre-CUDA 7
 global stream behavior.
 """
-CuStreamLegacy()
+legacy_stream()
 
 """
-    CuStreamPerThread()
+    per_thread_stream()
 
 Return a special object to use an implicit stream with per-thread synchronization behavior.
 This stream object is normally meant to be used with APIs that do not have per-thread
@@ -72,7 +72,7 @@ versions of their APIs (i.e. without a `ptsz` or `ptds` suffix).
     gets its own non-blocking stream, and multithreading in Julia is typically
     accomplished using tasks.
 """
-CuStreamPerThread()
+per_thread_stream()
 
 Base.unsafe_convert(::Type{CUstream}, s::CuStream) = s.handle
 
@@ -92,12 +92,12 @@ function Base.show(io::IO, stream::CuStream)
 end
 
 """
-    query(s::CuStream)
+    isdone(s::CuStream)
 
 Return `false` if a stream is busy (has task running or queued)
 and `true` if that stream is free.
 """
-function query(s::CuStream)
+function isdone(s::CuStream)
     res = unsafe_cuStreamQuery(s)
     if res == ERROR_NOT_READY
         return false
@@ -119,7 +119,7 @@ See also: [`device_synchronize`](@ref)
 """
 function synchronize(stream::CuStream=stream(); blocking::Bool=true)
     # fast path
-    query(stream) && @goto(exit)
+    isdone(stream) && @goto(exit)
 
     # minimize latency of short operations by busy-waiting,
     # initially without even yielding to other tasks
@@ -132,7 +132,7 @@ function synchronize(stream::CuStream=stream(); blocking::Bool=true)
         else
             yield()
         end
-        query(stream) && @goto(exit)
+        isdone(stream) && @goto(exit)
         spins += 1
     end
 
@@ -155,7 +155,7 @@ Block for the current device's tasks to complete. This is a heavyweight operatio
 you only need to call [`synchronize`](@ref) which only synchronizes the stream associated
 with the current task.
 """
-device_synchronize() = synchronize(CuStreamLegacy())
+device_synchronize() = synchronize(legacy_stream())
 
 """
     priority_range()
diff --git a/lib/cufft/fft.jl b/lib/cufft/fft.jl
index 9b87fd5750..c355548e1a 100644
--- a/lib/cufft/fft.jl
+++ b/lib/cufft/fft.jl
@@ -27,7 +27,7 @@ function CUDA.unsafe_free!(plan::CuFFTPlan, stream::CuStream=stream())
     unsafe_free!(plan.workarea, stream)
 end
 
-unsafe_finalize!(plan::CuFFTPlan) = unsafe_free!(plan, CuDefaultStream())
+unsafe_finalize!(plan::CuFFTPlan) = unsafe_free!(plan, default_stream())
 
 mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace}
     handle::cufftHandle
diff --git a/lib/nvml/device.jl b/lib/nvml/device.jl
index 05d655fc9c..88a673dddd 100644
--- a/lib/nvml/device.jl
+++ b/lib/nvml/device.jl
@@ -23,6 +23,11 @@ end
 
 Base.unsafe_convert(::Type{nvmlDevice_t}, dev::Device) = dev.handle
 
+function Base.show(io::IO, ::MIME"text/plain", dev::Device)
+    print(io, "NVML.Device(", index(dev), "): ")  # same layout as CuDevice's show
+    print(io, name(dev))
+end
+
 
 
 # iteration
@@ -75,6 +80,12 @@ function serial(dev::Device)
     return unsafe_string(pointer(buf))
 end
 
+function index(dev::Device)
+    ref = Ref{Cuint}()  # named `ref` so the local does not shadow `index` itself
+    nvmlDeviceGetIndex(dev, ref)  # the index is read back from `ref` below
+    return Int(ref[])
+end
+
 # watt
 function power_usage(dev::Device)
     ref = Ref{Cuint}()
diff --git a/src/array.jl b/src/array.jl
index 95b2c1d708..c8ed4a3db5 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -63,7 +63,7 @@ earlier to reduce pressure on the memory allocator.
 
 By default, the operation is performed on the task-local stream. During task or process
 finalization however, that stream may be destroyed already, so be sure to specify a safe
-stream (i.e. `CuDefaultStream()`, which will ensure the operation will block on other
+stream (i.e. `default_stream()`, which will ensure the operation will block on other
 streams) when calling this function from a finalizer. For simplicity, the `unsafe_finalize!`
 function does exactly that.
 """
@@ -98,7 +98,7 @@ function unsafe_finalize!(xs::CuArray)
   # stream, it synchronizes "too much". we could do better, e.g., by keeping track of all
   # streams involved, or by refcounting uses and decrementing that refcount after the
   # operation using `cuLaunchHostFunc`. See CUDA.jl#778 and CUDA.jl#780 for details.
-  unsafe_free!(xs, CuDefaultStream())
+  unsafe_free!(xs, default_stream())
 end
 
 
diff --git a/src/deprecated.jl b/src/deprecated.jl
index 26ccb0c2fc..f59993eae4 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -5,3 +5,9 @@
 @deprecate CuCurrentContext() current_context()
 @deprecate CuContext(ptr::Union{Ptr,CuPtr}) context(ptr)
 @deprecate CuDevice(ptr::Union{Ptr,CuPtr}) device(ptr)
+
+@deprecate CuDefaultStream() default_stream()
+@deprecate CuStreamLegacy() legacy_stream()
+@deprecate CuStreamPerThread() per_thread_stream()
+@deprecate query(s::CuStream) isdone(s)
+@deprecate query(e::CuEvent) isdone(e)
diff --git a/test/cudadrv.jl b/test/cudadrv.jl
index 64a2225e1b..d3da991124 100644
--- a/test/cudadrv.jl
+++ b/test/cudadrv.jl
@@ -213,8 +213,8 @@ CuEvent(CUDA.EVENT_BLOCKING_SYNC | CUDA.EVENT_DISABLE_TIMING)
 end
 
 @testset "event query" begin
-    event  = CuEvent()
-    @test CUDA.query(event) == true
+    event = CuEvent()
+    @test CUDA.isdone(event)
 end
 
 end
@@ -835,7 +835,7 @@ end
 
 s = CuStream()
 synchronize(s)
-@test CUDA.query(s) == true
+@test CUDA.isdone(s)
 
 let s2 = CuStream()
     @test s != s2
diff --git a/test/nvml.jl b/test/nvml.jl
index 1a4329add2..bf342a8696 100644
--- a/test/nvml.jl
+++ b/test/nvml.jl
@@ -19,6 +19,10 @@ end
 @testset "devices" begin
     let dev = NVML.Device(0)
         @test dev == first(NVML.devices())
+        @test NVML.index(dev) == 0
+
+        str = sprint(io->show(io, "text/plain", dev))
+        @test occursin("NVML.Device(0)", str)
     end
 
     cuda_dev = CuDevice(0)