From 2b84047aa6393bf871c6ab9ffb87bd6160cc5294 Mon Sep 17 00:00:00 2001
From: Jan Braunwarth
Date: Thu, 7 Dec 2023 18:11:02 +0100
Subject: [PATCH] Implement an NVMe driver

The implemented driver allows OSv to boot from an emulated NVMe device
and to handle additional NVMe devices, either emulated or passed
through from the host. Booting directly from an NVMe device via PCI
passthrough still needs to be tested.

The nvme driver creates nvme_queue_pairs to interact with the device
controller. An nvme_queue_pair manages a submission queue and the
corresponding completion queue.

The nvme driver registers every namespace as a device and forwards
requests to the queues. Namespace 1 on the first NVMe drive is named
"vblk0". Further devices are named nvmeXnY, where X is the driver
instance id (starting at 0) and Y is the namespace id (starting at 1).

Read/write requests on the device file go through the block cache
layer. This can reduce performance quite a bit, since the block cache
splits every request into sequential 512-byte requests. Setting
NVME_DIRECT_RW_ENABLED in drivers/nvme.hh disables the block cache.

All queues use MSI-X; one interrupt vector is registered for every
queue. There is very noticeable interrupt overhead when using PCI
passthrough. Interrupt coalescing reduces it, but this needs to be
investigated further.

Add options to ./scripts/run.py:
--nvme to start OSv on an NVMe device emulated by QEMU
--second-nvme-image to attach an additional image as an NVMe device
--pass-pci to pass through a PCI device from the host; the device
  needs to be bound to vfio-pci

drivers/blk_ioctl.hh implements the BLKGETSIZE64 and BLKFLSBUF ioctls,
which are used by fio.

drivers/io-test.cc is a simple IOPS test that can be activated by
building with conf_drivers_io_test=1; it runs during initialization of
the NVMe device.

Signed-off-by: Jan Braunwarth
---
(Illustrative standalone sketches of the mechanisms described above
follow after the diff.)

 Makefile                  |   9 +-
 arch/x64/arch-setup.cc    |   6 +
 conf/profiles/x64/all.mk  |   1 +
 conf/profiles/x64/base.mk |   6 +
 conf/profiles/x64/nvme.mk |   3 +
 core/debug.cc             |   3 +-
 drivers/blk_ioctl.hh      |  59 ++++
 drivers/io-test.cc        | 128 +++++++
 drivers/io-test.hh        |  12 +
 drivers/nvme-queue.cc     | 464 +++++++++++++++++++++++++
 drivers/nvme-queue.hh     | 126 +++++++
 drivers/nvme-structs.h    | 647 +++++++++++++++++++++++++++++++++++
 drivers/nvme.cc           | 693 ++++++++++++++++++++++++++++++++++++++
 drivers/nvme.hh           | 118 +++++++
 drivers/virtio-blk.cc     |   4 +-
 fs/vfs/kern_physio.cc     |  11 +-
 include/osv/bio.h         |   2 +
 include/osv/buf.h         |   2 +-
 scripts/run.py            |  23 +-
 scripts/test.py           |   5 +
 20 files changed, 2309 insertions(+), 13 deletions(-)
 create mode 100644 conf/profiles/x64/nvme.mk
 create mode 100644 drivers/blk_ioctl.hh
 create mode 100644 drivers/io-test.cc
 create mode 100644 drivers/io-test.hh
 create mode 100644 drivers/nvme-queue.cc
 create mode 100644 drivers/nvme-queue.hh
 create mode 100644 drivers/nvme-structs.h
 create mode 100644 drivers/nvme.cc
 create mode 100644 drivers/nvme.hh

diff --git a/Makefile b/Makefile
index e24cb35191..bfb2291c75 100644
--- a/Makefile
+++ b/Makefile
@@ -286,7 +286,7 @@ post-includes-bsd += -isystem bsd/$(arch)
 $(out)/musl/%.o: pre-include-api = -isystem include/api/internal_musl_headers -isystem musl/src/include
 ifneq ($(werror),0)
-	CFLAGS_WERROR = -Werror
+	CFLAGS_WERROR = -Wall
 endif
 # $(call compiler-flag, -ffoo, option, file)
 # returns option if file builds with -ffoo, empty otherwise
@@ -889,6 +889,13 @@ drivers += drivers/virtio-vring.o
 ifeq ($(conf_drivers_mmio),1)
 drivers += drivers/virtio-mmio.o
 endif
+ifeq ($(conf_drivers_nvme),1)
+drivers += drivers/nvme.o
+drivers += drivers/nvme-queue.o
+endif
+ifeq ($(conf_drivers_io_test),1)
+drivers += drivers/io-test.o
+endif
 drivers += drivers/virtio-net.o
 drivers += drivers/virtio-blk.o
 drivers += drivers/virtio-scsi.o
diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index e1fb53808c..0d3ab2f59a 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -310,6 +310,9 @@ void arch_init_premain()
 #if CONF_drivers_ide
 #include "drivers/ide.hh"
 #endif
+#if CONF_drivers_nvme
+#include "drivers/nvme.hh"
+#endif
 
 extern bool opt_pci_disabled;
 void arch_init_drivers()
@@ -364,6 +367,9 @@ void arch_init_drivers()
 #endif
 #if CONF_drivers_ide
     drvman->register_driver(ide::ide_drive::probe);
+#endif
+#if CONF_drivers_nvme
+    drvman->register_driver(nvme::probe);
 #endif
     boot_time.event("drivers probe");
     drvman->load_all();
diff --git a/conf/profiles/x64/all.mk b/conf/profiles/x64/all.mk
index c13790e2be..3d69e8a74d 100644
--- a/conf/profiles/x64/all.mk
+++ b/conf/profiles/x64/all.mk
@@ -4,5 +4,6 @@ include conf/profiles/$(arch)/virtio-mmio.mk
 include conf/profiles/$(arch)/virtio-pci.mk
 include conf/profiles/$(arch)/vmware.mk
 include conf/profiles/$(arch)/xen.mk
+include conf/profiles/$(arch)/nvme.mk
 
 conf_drivers_vga?=1
diff --git a/conf/profiles/x64/base.mk b/conf/profiles/x64/base.mk
index 26dd054ed8..0ce84912fb 100644
--- a/conf/profiles/x64/base.mk
+++ b/conf/profiles/x64/base.mk
@@ -38,6 +38,11 @@ export conf_drivers_pci?=1
 export conf_drivers_scsi?=1
 endif
 
+export conf_drivers_nvme?=0
+ifeq ($(conf_drivers_nvme),1)
+export conf_drivers_pci?=1
+endif
+
 export conf_drivers_vmxnet3?=0
 ifeq ($(conf_drivers_vmxnet3),1)
 export conf_drivers_pci?=1
@@ -72,3 +77,4 @@ export conf_drivers_virtio?=0
 export conf_drivers_pci?=0
 export conf_drivers_mmio?=0
 export conf_drivers_scsi?=0
+export conf_drivers_io_test?=0
diff --git a/conf/profiles/x64/nvme.mk b/conf/profiles/x64/nvme.mk
new file mode 100644
index 0000000000..fe21d26bde
--- /dev/null
+++ b/conf/profiles/x64/nvme.mk
@@ -0,0 +1,3 @@
+conf_drivers_pci?=1
+
+conf_drivers_nvme?=1
diff --git a/core/debug.cc b/core/debug.cc
index cf0a06f4ee..1f7f3685c7 100644
--- a/core/debug.cc
+++ b/core/debug.cc
@@ -43,10 +43,11 @@ bool logger::parse_configuration(void)
     add_tag("virtio-blk", logger_warn);
     add_tag("virtio-net", logger_warn);
     add_tag("vmxnet3", logger_warn);
-    add_tag("pci", logger_info);
+    add_tag("pci", logger_debug);
     add_tag("poll", logger_info);
     add_tag("dhcp", logger_info);
     add_tag("acpi", logger_error);
+    add_tag("nvme", logger_debug);
 
     return (true);
 }
diff --git a/drivers/blk_ioctl.hh b/drivers/blk_ioctl.hh
new file mode 100644
index 0000000000..55a2e32d70
--- /dev/null
+++ b/drivers/blk_ioctl.hh
@@ -0,0 +1,59 @@
+#ifndef BLK_IOCTL_H
+#define BLK_IOCTL_H
+
+#define _IOC_NRBITS   8
+#define _IOC_TYPEBITS 8
+#define _IOC_SIZEBITS 13
+#define _IOC_DIRBITS  3
+
+#define _IOC_NRMASK   ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK  ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT   0
+#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT  (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+#define _IOC_DIR(nr)  (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYP(nr)  (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)   (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+#define BLKGETSIZE64 114
+#define BLKFLSBUF  97
+#define BLKDISCARD 119
+
+TRACEPOINT(trace_blk_ioctl,
"dev=%s type=%#x nr=%d size=%d, dir=%d", char*, int, int, int, int);
+
+void no_bio_done(bio* b) { delete b; }
+
+int
+blk_ioctl(struct device* dev, u_long io_cmd, void* buf)
+{
+    assert(dev);
+    trace_blk_ioctl(dev->name, _IOC_TYP(io_cmd), _IOC_NR(io_cmd), _IOC_SIZE(io_cmd), _IOC_DIR(io_cmd));
+
+    switch (_IOC_NR(io_cmd)) {
+    case BLKGETSIZE64:
+        //device capacity in bytes
+        *(off_t*) buf = dev->size;
+        break;
+    case BLKFLSBUF: {
+        auto* bio = alloc_bio();
+        bio->bio_dev = dev;
+        bio->bio_done = no_bio_done;
+        bio->bio_cmd = BIO_FLUSH;
+
+        dev->driver->devops->strategy(bio);
+        }
+        break;
+    default:
+        printf("ioctl not defined; type:%#x nr:%d size:%d, dir:%d\n", _IOC_TYP(io_cmd), _IOC_NR(io_cmd), _IOC_SIZE(io_cmd), _IOC_DIR(io_cmd));
+        return EINVAL;
+    }
+    return 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/drivers/io-test.cc b/drivers/io-test.cc
new file mode 100644
index 0000000000..e46f914d05
--- /dev/null
+++ b/drivers/io-test.cc
@@ -0,0 +1,128 @@
+#include "drivers/io-test.hh"
+//NOTE: the original #include targets were lost in extraction; the headers
+//below are the set this file appears to need
+#include "drivers/clock.hh"
+#include <osv/sched.hh>
+#include <osv/bio.h>
+#include <osv/mempool.hh>
+#include <atomic>
+#include <random>
+
+volatile bool running;
+volatile u64 completed_io;
+volatile u64 requested_io;
+std::atomic<u32> open_req;  //template argument reconstructed
+u32 max_open;
+u64 max_ios;
+
+void test_block_device(struct device *dev, int test_duration, int blcks_per_io, int blocksize, int blockshift)
+{
+    int report_step = 1e6;
+    int io_size = blocksize * blcks_per_io;
+    completed_io = 0;
+    requested_io = 0;
+    open_req.store(0);
+    max_open = 64;
+    max_ios = 1 << 30;
+
+    printf("Start IO test dev : %s, IO size : %d\n", dev->name, io_size);
+    sched::thread *t;
+    t = sched::thread::make([dev,io_size,blockshift] { requesting(dev,io_size,blockshift);},
+            sched::thread::attr().name("IO_Test_Request"));
+
+    sched::thread *timer;
+    timer = sched::thread::make([test_duration] { usleep(test_duration);},
+            sched::thread::attr().name("IO_Test_Timer"));
+
+    sched::thread *repo;
+    repo = sched::thread::make([test_duration,report_step,io_size] { reporting(test_duration,report_step,io_size);},
+            sched::thread::attr().name("IO_Test_Report"));
+    auto c = clock::get();
+
+    running = true;
+    u64 start = c->time();
+    timer->start();
+    t->start();
+    repo->start();
+
+    timer->join();
+    running = false;
+    u64 com = completed_io;
+    u64 end = c->time();
+    int iops = (com * 1e9) / (end - start);
+
+    t->join();
+    repo->join();
+    printf("Test results runtime: %llu, completed IO : %llu, IOPS : %d\n", end-start, com, iops);
+}
+
+void reporting(int test_duration, int report_step, int io_size) {
+    u32 prev_compl = completed_io;
+    u32 compl_diff;
+    u32 compl_tem;
+    auto c = clock::get();
+    //nanosecond timestamps; these must be 64-bit, not int
+    s64 time_diff;
+    s64 time_tem;
+    s64 prev_time = c->time();
+    while(running) {
+        usleep(report_step);
+        compl_tem = completed_io;
+        time_tem = c->time();
+
+        compl_diff = compl_tem - prev_compl;
+        prev_compl = compl_tem;
+        time_diff = time_tem - prev_time;
+        prev_time = time_tem;
+        double iops = (compl_diff * 1e9) / (double) time_diff;
+
+        printf("Timestep: %ld, completed : %d, IOPS : %lf, open : %d\n", time_diff, compl_diff, iops, open_req.load());
+    }
+}
+
+void requesting(struct device *dev, u32 io_size, int blockshift) {
+    void* buff;
+    bio* bio;
+    off_t max_blocks = dev->size >> blockshift;
+    off_t max_offset = (max_blocks - 1) - (io_size >> blockshift);
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> distrib(0, max_offset);
+
+    while(running) {
+        if(requested_io >= max_ios)
+            break;
+
+        buff = memory::alloc_phys_contiguous_aligned(io_size, 2);
+        assert(buff);
+        memset(buff, 1, io_size);
+
+        bio = alloc_bio();
+        bio->bio_dev = dev;
+        bio->bio_data = buff;
+        bio->bio_done = io_done;
+        bio->bio_length = io_size;
+        bio->bio_bcount = io_size;
+        bio->bio_cmd = BIO_READ;
+
+        bio->bio_offset = ((off_t) distrib(gen)) << blockshift;
+
+        while(max_open <= open_req) {
+            usleep(10);
+        }
+        open_req.fetch_add(1);
+        atomic_add_64(&requested_io, 1);
+        dev->driver->devops->strategy(bio);
+    }
+}
+
+void io_done(struct bio* bio) {
+    if(bio->bio_flags != BIO_DONE) {
+        printf("BIO_Error during IO Test: %x\n", bio->bio_flags);
+    }
+    atomic_fetchadd_long(&completed_io, 1);
+
+    open_req.fetch_add(-1);
+
+    //the buffer came from alloc_phys_contiguous_aligned, so release it
+    //with the matching free
+    memory::free_phys_contiguous_aligned(bio->bio_data);
+    delete bio;
+}
\ No newline at end of file
diff --git a/drivers/io-test.hh b/drivers/io-test.hh
new file mode 100644
index 0000000000..4ff5154ed9
--- /dev/null
+++ b/drivers/io-test.hh
@@ -0,0 +1,12 @@
+#ifndef IO_TEST_H
+#define IO_TEST_H
+
+#include <osv/device.h>
+#include <osv/bio.h>
+
+void requesting(struct device *dev, u32 io_size, int blockshift);
+void reporting(int test_duration, int report_step, int io_size);
+void io_done(struct bio* bio);
+void test_block_device(struct device *dev, int test_duration, int blcks_per_io, int blocksize=512, int blockshift=9);
+
+#endif
\ No newline at end of file
diff --git a/drivers/nvme-queue.cc b/drivers/nvme-queue.cc
new file mode 100644
index 0000000000..b2605271a0
--- /dev/null
+++ b/drivers/nvme-queue.cc
@@ -0,0 +1,464 @@
+//NOTE: the original #include targets were lost in extraction; the headers
+//below are the set this file appears to need
+#include <string.h>
+
+#include "nvme-queue.hh"
+#include <osv/trace.hh>
+#include <osv/mempool.hh>
+#include <osv/mmu.hh>
+#include <osv/sched.hh>
+
+extern std::unique_ptr<nvme_sq_entry_t> alloc_cmd();
+
+TRACEPOINT(trace_nvme_io_queue_wake, "nvme%d qid=%d", int, int);
+TRACEPOINT(trace_nvme_wait_for_completion_queue_entries, "nvme%d qid=%d,have_elements=%d", int, int, bool);
+TRACEPOINT(trace_nvme_completion_queue_not_empty, "nvme%d qid=%d,not_empty=%d", int, int, bool);
+TRACEPOINT(trace_nvme_enable_interrupts, "nvme%d qid=%d", int, int);
+TRACEPOINT(trace_nvme_disable_interrupts, "nvme%d qid=%d", int, int);
+
+TRACEPOINT(trace_nvme_read, "nvme%d qid=%d cid=%d, bio data=%#x, slba=%d, nlb=%d", int, int, u16, void*, u64, u32);
+TRACEPOINT(trace_nvme_write, "nvme%d qid=%d cid=%d, bio data=%#x, slba=%d, nlb=%d", int, int, u16, void*, u64, u32);
+
+TRACEPOINT(trace_nvme_req_done_error, "nvme%d qid=%d, cid=%d, status type=%#x, status code=%#x, bio=%#x", int, int, u16, u8, u8, bio*);
+TRACEPOINT(trace_nvme_req_done_success, "nvme%d qid=%d, cid=%d, bio=%#x", int, int, u16, bio*);
+
+TRACEPOINT(trace_nvme_admin_queue_wake, "nvme%d qid=%d", int, int);
+
+TRACEPOINT(trace_nvme_admin_queue_submit, "nvme%d qid=%d, cid=%d", int, int, int);
+TRACEPOINT(trace_nvme_admin_req_done_error, "nvme%d qid=%d, cid=%d, status type=%#x, status code=%#x", int, int, u16, u8, u8);
+TRACEPOINT(trace_nvme_admin_req_done_success, "nvme%d qid=%d, cid=%d", int, int, u16);
+
+TRACEPOINT(trace_advance_sq_tail_full, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int);
+TRACEPOINT(trace_nvme_wait_for_entry, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int);
+
+nvme_queue_pair::nvme_queue_pair(
+    int did,
+    u32 id,
+    int qsize,
+    pci::device &dev,
+    nvme_sq_entry_t* sq_addr,
+    u32* sq_doorbell,
+    nvme_cq_entry_t* cq_addr,
+    u32* cq_doorbell,
+    std::map<u32, nvme_ns_t*>& ns)
+    : _id(id)
+    ,_driverid(did)
+    ,_qsize(qsize)
+    ,_dev(&dev)
+    ,_sq_addr(sq_addr)
+    ,_sq_head(0)
+    ,_sq_tail(0)
+    ,_sq_doorbell(sq_doorbell)
+    ,_sq_full(false)
+    ,_cq_addr(cq_addr)
+    ,_cq_head(0)
+    ,_cq_tail(0)
+    ,_cq_doorbell(cq_doorbell)
+    ,_cq_phase_tag(1)
+    ,_ns(ns)
+{
+    auto prplists = (u64**) malloc(sizeof(u64*)*qsize);
+    memset(prplists, 0, sizeof(u64*)*qsize);
+
_prplists_in_use.push_back(prplists); + + assert(!completion_queue_not_empty()); +} + +nvme_queue_pair::~nvme_queue_pair() +{ + memory::free_phys_contiguous_aligned(_sq_addr); + memory::free_phys_contiguous_aligned(_cq_addr); + for(auto vec: _prplists_in_use) + memory::free_phys_contiguous_aligned(vec); +} + +inline void nvme_queue_pair::advance_sq_tail() +{ + _sq_tail = (_sq_tail + 1) % _qsize; + if(_sq_tail == _sq_head) { + _sq_full = true; + trace_advance_sq_tail_full(_driverid,_id,_sq_tail,_sq_head); + } +} + +u16 nvme_queue_pair::submit_cmd(std::unique_ptr cmd) +{ u16 ret; + WITH_LOCK(_lock) + { + ret = submit_cmd_without_lock(std::move(cmd)); + } + return ret; +} + +u16 nvme_queue_pair::submit_cmd_without_lock(std::unique_ptr cmd) +{ + _sq_addr[_sq_tail] = *cmd; + advance_sq_tail(); + mmio_setl(_sq_doorbell,_sq_tail); + return _sq_tail; +} + +void nvme_queue_pair::wait_for_completion_queue_entries() +{ + sched::thread::wait_until([this] { + bool have_elements = this->completion_queue_not_empty(); + if (!have_elements) { + this->enable_interrupts(); + //check if we got a new cqe between completion_queue_not_empty() + //and enable_interrupts() + have_elements = this->completion_queue_not_empty(); + if (have_elements) { + this->disable_interrupts(); + } + } + + trace_nvme_wait_for_completion_queue_entries(_driverid,_id,have_elements); + return have_elements; + }); +} + +int nvme_queue_pair::map_prps(u16 cid, void* data, u64 datasize, u64* prp1, u64* prp2) +{ + u64 addr = (u64) data; + *prp1 = addr; + *prp2 = 0; + int numpages = 0; + u64 offset = addr - ( (addr >> NVME_PAGESHIFT) << NVME_PAGESHIFT ); + if(offset) numpages = 1; + + numpages += ( datasize - offset + NVME_PAGESIZE - 1) >> NVME_PAGESHIFT; + + if (numpages == 2) { + *prp2 = ((addr >> NVME_PAGESHIFT) +1 ) << NVME_PAGESHIFT; + } else if (numpages > 2) { + assert(numpages / 512 == 0); + u64* prplist = (u64*) memory::alloc_phys_contiguous_aligned(numpages * 8, 4096); + assert(prplist != nullptr); + *prp2 = mmu::virt_to_phys(prplist); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = prplist; + + addr = ((addr >> NVME_PAGESHIFT) +1 ) << NVME_PAGESHIFT; + prplist[0] = addr; + + for (int i = 1; i < numpages - 1; i++) { + addr += NVME_PAGESIZE; + prplist[i] = addr; + } + } + return 0; +} + +std::unique_ptr nvme_queue_pair::get_completion_queue_entry() +{ + if(!completion_queue_not_empty()) { + return nullptr; + } + + auto* tcqe = new nvme_cq_entry_t; + *tcqe = _cq_addr[_cq_head]; + std::unique_ptr cqe(tcqe); + assert(cqe->p == _cq_phase_tag); + + if(++_cq_head == _qsize) { + _cq_head -= _qsize; + _cq_phase_tag = !_cq_phase_tag; + } + return cqe; +} + + +bool nvme_queue_pair::completion_queue_not_empty() const +{ + bool a = reinterpret_cast(&_cq_addr[_cq_head])->p == _cq_phase_tag; + trace_nvme_completion_queue_not_empty(_driverid,_id,a); + return a;//_cq_addr[_cq_head].p == _cq_phase_tag; +} + +void nvme_queue_pair::enable_interrupts() +{ + _dev->msix_unmask_entry(_id); + trace_nvme_enable_interrupts(_driverid,_id); +} + +void nvme_queue_pair::disable_interrupts() +{ + _dev->msix_mask_entry(_id); + trace_nvme_disable_interrupts(_driverid,_id); +} + +//only use with interrupts disabled +std::unique_ptr nvme_queue_pair::check_for_completion(u16 cid) +{ + int msec = 1000; + int timeout = 50; + int i; + + std::unique_ptr cqe; + for(i = 0; i < timeout; i++) { + if(completion_queue_not_empty()) { + cqe = get_completion_queue_entry(); + assert(cqe->cid == cid); + if(cqe->sct != 0 || cqe->sc != 0) { + NVME_ERROR("polling nvme%d qid=%d, 
cid=%d, sct=%#x, sc=%#x\n", _driverid, _id, cid, cqe->sct, cqe->sc); + _sq_head = cqe->sqhd; //update sq_head + mmio_setl(_cq_doorbell, _cq_head); + return cqe; + } + + _sq_head = cqe->sqhd; //update sq_head + mmio_setl(_cq_doorbell, _cq_head); + return cqe; + } + usleep(msec); + } + NVME_ERROR("polling timeout nvme%d qid=%d cid=%d\n", _driverid, _id, cid); + return cqe; +} + +int nvme_io_queue_pair::make_request(bio* bio, u32 nsid=1) +{ + u64 slba = bio->bio_offset; + u32 nlb = bio->bio_bcount; //do the blockshift in nvme_driver + u16 cid; + + _lock.lock(); + cid = _sq_tail; + if(_sq_full) { + //Wait for free entries + _waiter.reset(*sched::thread::current()); + trace_nvme_wait_for_entry(_driverid,_id,_sq_tail,_sq_head); + sched::thread::wait_until([this] {return !(this->_sq_full);}); + _waiter.clear(); + } + /* + We need to check if there is an outstanding command that uses + _sq_tail as command id. + This happens if + 1.The SQ is full. Then we just have to wait for an open slot (see above) + 2.the Controller already read a SQE but didnt post a CQE yet. + This means we could post the command but need a different cid. To still + use the cid as index to find the corresponding bios we use a matrix + adding columns if we need them + */ + while(_pending_bios.at(cid / _qsize)[cid % _qsize]) { + cid += _qsize; + if(_pending_bios.size() <= (cid / _qsize)){ + auto bios_array = (struct bio**) malloc(sizeof(struct bio*) * _qsize); + memset(bios_array,0,sizeof(struct bio*) * _qsize); + _pending_bios.push_back(bios_array); + auto prplists = (u64**) malloc(sizeof(u64*)* _qsize); + memset(prplists,0,sizeof(u64*)* _qsize); + _prplists_in_use.push_back(prplists); + } + } + _pending_bios.at(cid / _qsize)[cid % _qsize] = bio; + + + + switch (bio->bio_cmd) { + case BIO_READ: + trace_nvme_read(_driverid, _id, cid, bio->bio_data, slba, nlb); + submit_rw(cid,(void*)mmu::virt_to_phys(bio->bio_data),slba,nlb, nsid, NVME_CMD_READ); + break; + + case BIO_WRITE: + trace_nvme_write(_driverid, _id, cid, bio->bio_data, slba, nlb); + submit_rw(cid,(void*)mmu::virt_to_phys(bio->bio_data),slba,nlb, nsid, NVME_CMD_WRITE); + break; + + case BIO_FLUSH: { + auto cmd = alloc_cmd(); + cmd->vs.common.opc = NVME_CMD_FLUSH; + cmd->vs.common.nsid = nsid; + cmd->vs.common.cid = cid; + submit_cmd_without_lock(std::move(cmd)); + } break; + + default: + NVME_ERROR("Operation not implemented\n"); + _lock.unlock(); + return ENOTBLK; + } + _lock.unlock(); + return 0; +} + +void nvme_io_queue_pair::req_done() +{ + std::unique_ptr cqe; + u16 cid; + while(true) + { + wait_for_completion_queue_entries(); + trace_nvme_io_queue_wake(_driverid,_id); + while((cqe = get_completion_queue_entry())) { + cid = cqe->cid; + if(cqe->sct != 0 || cqe->sc != 0) { + trace_nvme_req_done_error(_driverid,_id, cid, cqe->sct, cqe->sc, _pending_bios.at(cid / _qsize)[cid % _qsize]); + if(_pending_bios.at(cid / _qsize)[cid % _qsize]) + biodone(_pending_bios.at(cid / _qsize)[cid % _qsize],false); + NVME_ERROR("I/O queue: cid=%d, sct=%#x, sc=%#x, bio=%#x, slba=%llu, nlb=%llu\n",cqe->cid, cqe->sct, + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize], + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize]->bio_offset, + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize]->bio_bcount); + }else { + trace_nvme_req_done_success(_driverid,_id, cid, _pending_bios.at(cid / _qsize)[cid % _qsize]); + if(_pending_bios.at(cid / _qsize)[cid % _qsize]) + biodone(_pending_bios.at(cid / _qsize)[cid % _qsize],true); + } + + _pending_bios.at(cid / _qsize)[cid % _qsize] = nullptr; + 
if(_prplists_in_use.at(cid / _qsize)[cid % _qsize]) { + memory::free_phys_contiguous_aligned(_prplists_in_use.at(cid / _qsize)[cid % _qsize]); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = nullptr; + } + _sq_head = cqe->sqhd; //update sq_head + } + mmio_setl(_cq_doorbell, _cq_head); + if(_sq_full) { //wake up the requesting thread in case the submission queue was full before + _sq_full = false; + if(_waiter) + _waiter.wake_from_kernel_or_with_irq_disabled(); + } + } +} + +int nvme_io_queue_pair::submit_rw(u16 cid, void* data, u64 slba, u32 nlb, u32 nsid, int opc) +{ + auto cmd = alloc_cmd(); + u64 prp1 = 0, prp2 = 0; + u32 datasize = nlb << _ns[nsid]->blockshift; + + map_prps(cid, data, datasize, &prp1, &prp2); + cmd->rw.common.cid = cid; + cmd->rw.common.opc = opc; + cmd->rw.common.nsid = nsid; + cmd->rw.common.prp1 = prp1; + cmd->rw.common.prp2 = prp2; + cmd->rw.slba = slba; + cmd->rw.nlb = nlb - 1; + + return submit_cmd_without_lock(std::move(cmd)); +} + +void nvme_admin_queue_pair::req_done() +{ + std::unique_ptr cqe; + u16 cid; + while(true) + { + wait_for_completion_queue_entries(); + trace_nvme_admin_queue_wake(_driverid,_id); + while((cqe = get_completion_queue_entry())) { + cid = cqe->cid; + if(cqe->sct != 0 || cqe->sc != 0) { + trace_nvme_admin_req_done_error(_driverid,_id, cid, cqe->sct, cqe->sc); + NVME_ERROR("Admin queue cid=%d, sct=%#x, sc=%#x\n",cid,cqe->sct,cqe->sc); + }else { + trace_nvme_admin_req_done_success(_driverid,_id, cid); + } + + if(_prplists_in_use.at(cid / _qsize)[cid % _qsize]) { + memory::free_phys_contiguous_aligned(_prplists_in_use.at(cid / _qsize)[cid % _qsize]); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = nullptr; + } + _sq_head = cqe->sqhd; //update sq_head + _req_res = std::move(cqe); //save the cqe so that the requesting thread can return it + } + mmio_setl(_cq_doorbell, _cq_head); + + /*Wake up the thread that requested the admin command*/ + new_cq = true; + _req_waiter.wake_from_kernel_or_with_irq_disabled(); + } +} + +std::unique_ptr nvme_admin_queue_pair::submit_and_return_on_completion(std::unique_ptr cmd, void* data, unsigned int datasize) +{ + _lock.lock(); + + _req_waiter.reset(*sched::thread::current()); + + //for now admin cid = sq_tail + u16 cid = _sq_tail; + cmd->rw.common.cid = cid; + + if(data != nullptr && datasize > 0) { + map_prps(_sq_tail,data, datasize, &cmd->rw.common.prp1, &cmd->rw.common.prp2); + } + + trace_nvme_admin_queue_submit(_driverid,_id,cid); + submit_cmd_without_lock(std::move(cmd)); + + sched::thread::wait_until([this] {return this->new_cq;}); + _req_waiter.clear(); + + new_cq = false; + if(_prplists_in_use.at(0)[cid]) { + free(_prplists_in_use.at(0)[cid]); + } + + _lock.unlock(); + return std::move(_req_res); +} + +nvme_io_queue_pair::nvme_io_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ) : nvme_queue_pair( + did, + id, + qsize, + dev, + + sq_addr, + sq_doorbell, + + cq_addr, + cq_doorbell, + ns + ){ + auto bios_array = (bio**) malloc(sizeof(bio*) * qsize); + memset(bios_array, 0, sizeof(bio*) * qsize); + _pending_bios.push_back(bios_array); +} + +nvme_io_queue_pair::~nvme_io_queue_pair() +{ + for(auto vec : _pending_bios) + free(vec); +} + +nvme_admin_queue_pair::nvme_admin_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ) : 
nvme_queue_pair( + did, + id, + qsize, + dev, + + sq_addr, + sq_doorbell, + + cq_addr, + cq_doorbell, + ns + ){}; diff --git a/drivers/nvme-queue.hh b/drivers/nvme-queue.hh new file mode 100644 index 0000000000..f891c96d37 --- /dev/null +++ b/drivers/nvme-queue.hh @@ -0,0 +1,126 @@ +#ifndef NVME_QUEUE_H +#define NVME_QUEUE_H + +#include "drivers/nvme.hh" + +class nvme_queue_pair; + +class nvme_queue_pair +{ +public: + nvme_queue_pair( + int did, + u32 id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + + ~nvme_queue_pair(); + + u16 submit_cmd(std::unique_ptr cmd); + + virtual void req_done() {}; + void wait_for_completion_queue_entries(); + bool completion_queue_not_empty() const; + + void enable_interrupts(); + void disable_interrupts(); + + u32 _id; +protected: + int _driverid; + + u32 _qsize; + pci::device* _dev; + + nvme_sq_entry_t* _sq_addr; + u32 _sq_head; + u32 _sq_tail; + volatile u32* _sq_doorbell; + bool _sq_full; + + nvme_cq_entry_t* _cq_addr; + u32 _cq_head; + u32 _cq_tail; + volatile u32* _cq_doorbell; + int _cq_phase_tag; + + std::map _ns; + + std::vector _prplists_in_use; + + mutex _lock; + sched::thread_handle _waiter; + + void advance_sq_tail(); + int map_prps(u16 cid, void* data, u64 datasize, u64* prp1, u64* prp2); + + u16 submit_cmd_without_lock(std::unique_ptr cmd); + + u16 submit_cmd_batch_without_lock(std::vector> cmds); + + std::unique_ptr get_completion_queue_entry(); + + std::unique_ptr check_for_completion(u16 cid); +}; + +class nvme_io_queue_pair : public nvme_queue_pair { +public: + nvme_io_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + ~nvme_io_queue_pair(); + + int self_test(); + int make_request(struct bio* bio, u32 nsid); + void req_done(); + + int submit_io_batch(std::vector bios, u32 nsid=1); +private: + std::vector _pending_bios; + int submit_rw(u16 cid, void* data, u64 slba, u32 nlb, u32 nsid, int opc); + int submit_flush(); +}; + +class nvme_admin_queue_pair : public nvme_queue_pair { +public: + nvme_admin_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + + std::unique_ptr _req_res; + volatile bool new_cq; + void req_done(); + std::unique_ptr submit_and_return_on_completion(std::unique_ptr cmd, void* data=nullptr, unsigned int datasize=0); +private: + sched::thread_handle _req_waiter; +}; + +#endif \ No newline at end of file diff --git a/drivers/nvme-structs.h b/drivers/nvme-structs.h new file mode 100644 index 0000000000..af77563510 --- /dev/null +++ b/drivers/nvme-structs.h @@ -0,0 +1,647 @@ +/** + * Copyright (c) 2015-2016, Micron Technology, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * @brief NVMe header file + */ + +#ifndef NVME_STRUCTS_H +#define NVME_STRUCTS_H + +#include + +__BEGIN_DECLS + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + #pragma error "only support little endian CPU architecture" +#endif + +#ifndef _U_TYPE +#define _U_TYPE ///< bit size data types +typedef int8_t s8; ///< 8-bit signed +typedef int16_t s16; ///< 16-bit signed +typedef int32_t s32; ///< 32-bit signed +typedef int64_t s64; ///< 64-bit signed +typedef uint8_t u8; ///< 8-bit unsigned +typedef uint16_t u16; ///< 16-bit unsigned +typedef uint32_t u32; ///< 32-bit unsigned +typedef uint64_t u64; ///< 64-bit unsigned +#endif // _U_TYPE + +/// NVMe command op code +enum { + NVME_CMD_FLUSH = 0x0, ///< flush + NVME_CMD_WRITE = 0x1, ///< write + NVME_CMD_READ = 0x2, ///< read + NVME_CMD_WRITE_UNCOR = 0x4, ///< write uncorrectable + NVME_CMD_COMPARE = 0x5, ///< compare + NVME_CMD_DS_MGMT = 0x9, ///< dataset management +}; + +/// NVMe admin command op code +enum { + NVME_ACMD_DELETE_SQ = 0x0, ///< delete io submission queue + NVME_ACMD_CREATE_SQ = 0x1, ///< create io submission queue + NVME_ACMD_GET_LOG_PAGE = 0x2, ///< get log page + NVME_ACMD_DELETE_CQ = 0x4, ///< delete io completion queue + NVME_ACMD_CREATE_CQ = 0x5, ///< create io completion queue + NVME_ACMD_IDENTIFY = 0x6, ///< identify + NVME_ACMD_ABORT = 0x8, ///< abort + NVME_ACMD_SET_FEATURES = 0x9, ///< set features + NVME_ACMD_GET_FEATURES = 0xA, ///< get features + NVME_ACMD_ASYNC_EVENT = 0xC, ///< asynchronous event + NVME_ACMD_FW_ACTIVATE = 0x10, ///< firmware activate + NVME_ACMD_FW_DOWNLOAD = 0x11, ///< firmware image download +}; + +/// NVMe feature identifiers +enum { + NVME_FEATURE_ARBITRATION = 0x1, ///< arbitration + NVME_FEATURE_POWER_MGMT = 0x2, ///< power management + NVME_FEATURE_LBA_RANGE = 0x3, ///< LBA range type + NVME_FEATURE_TEMP_THRESHOLD = 0x4, ///< temperature threshold + NVME_FEATURE_ERROR_RECOVERY = 0x5, ///< error recovery + NVME_FEATURE_WRITE_CACHE = 0x6, ///< volatile write cache + NVME_FEATURE_NUM_QUEUES = 0x7, ///< number of queues + NVME_FEATURE_INT_COALESCING = 0x8, ///< interrupt coalescing + NVME_FEATURE_INT_VECTOR = 0x9, ///< interrupt vector config + NVME_FEATURE_WRITE_ATOMICITY = 0xA, ///< write atomicity + NVME_FEATURE_ASYNC_EVENT = 0xB, ///< async event config +}; + +/// Version +typedef union _nvme_version { + u32 val; ///< whole value + struct { + u8 rsvd; ///< reserved + u8 mnr; ///< minor version number + u16 mjr; ///< major version number + }; +} nvme_version_t; + +/// Admin queue 
attributes +typedef union _nvme_adminq_attr { + u32 val; ///< whole value + struct { + u16 asqs; ///< admin submission queue size + u16 acqs; ///< admin completion queue size + }; +} nvme_adminq_attr_t; + +/// Controller capabilities +typedef union _nvme_controller_cap { + u64 val; ///< whole value + struct { + u16 mqes; ///< max queue entries supported + u8 cqr : 1; ///< contiguous queues required + u8 ams : 2; ///< arbitration mechanism supported + u8 rsvd : 5; ///< reserved + u8 to; ///< timeout + + u32 dstrd : 4; ///< doorbell stride + u32 nssrs : 1; ///< NVM subsystem reset supported + u32 css : 8; ///< command set supported + u32 rsvd2 : 3; ///< reserved + u32 mpsmin : 4; ///< memory page size minimum + u32 mpsmax : 4; ///< memory page size maximum + u32 rsvd3 : 8; ///< reserved + }; +} nvme_controller_cap_t; + +/// Controller configuration register +typedef union _nvme_controller_config { + u32 val; ///< whole value + struct { + u32 en : 1; ///< enable + u32 rsvd : 3; ///< reserved + u32 css : 3; ///< I/O command set selected + u32 mps : 4; ///< memory page size + u32 ams : 3; ///< arbitration mechanism selected + u32 shn : 2; ///< shutdown notification + u32 iosqes : 4; ///< I/O submission queue entry size + u32 iocqes : 4; ///< I/O completion queue entry size + u32 rsvd2 : 8; ///< reserved + }; +} nvme_controller_config_t; + +/// Controller status register +typedef union _nvme_controller_status { + u32 val; ///< whole value + struct { + u32 rdy : 1; ///< ready + u32 cfs : 1; ///< controller fatal status + u32 shst : 2; ///< shutdown status + u32 rsvd : 28; ///< reserved + }; +} nvme_controller_status_t; + +/// Controller memory buffer location register +typedef union _nvme_cmbloc { + u32 val; ///< whole value + struct { + u32 bir : 3; ///< base indicator register + u32 rsvd : 9; ///< reserved + u32 ofst : 20; ///< offset (in cmbsz units) + }; +} nvme_cmbloc_t; + +/// Controller memory buffer size register +typedef union _nvme_cmbsz { + u32 val; ///< whole value + struct { + u32 sqs : 1; ///< submission queue support + u32 cqs : 1; ///< completion queue support + u32 lists : 1; ///< PRP SGL list support + u32 rds : 1; ///< read data support + u32 wds : 1; ///< write data support + u32 rsvd : 3; ///< reserved + u32 szu : 4; ///< size units (0=4K,1=64K,2=1M,3=16M,4=256M,5=4G,6=64G) + u32 sz : 20; ///< size (in cmbsz units) + }; +} nvme_cmbsz_t; + + + +enum nvme_sgl_descriptor_type { + NVME_SGL_DATA_BLOCK_TYPE = 0x0, + NVME_SGL_BIT_BUCKET_TYPE = 0x1, + NVME_SGL_SEGMENT_TYPE = 0x2, + NVME_SGL_LAST_SEGMENT_TYPE = 0x3, + NVME_SGL_KEYED_DATA_BLOCK_TYPE = 0x4, + NVME_SGL_TRANSPORT_DATA_BLOCK_TYPE = 0x5, + /* + *0x6 - 0xE reserved + */ + + NVME_SGL_VENDOR_SPECIFIC_TYPE = 0xF, +}; + +enum nvme_sgl_descriptor_subtype { + NVME_SGL_ADDRESS_SUBTYPE = 0x0, + NVME_SGL_OFFSET_SUBTYPE = 0x1, + //0xA - 0xF Nvme transport specific +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_unkeyed { + u64 addr; + u32 length; + u8 reserved[3]; + u8 subtype:4; + u8 type:4; +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_keyed { + u64 addr; + u64 length:24; + u64 key:32; + u64 subtype:4; + u64 type:4; +}; +union nvme_sgl_descriptor { + nvme_sgl_descriptor_keyed keyed; + nvme_sgl_descriptor_unkeyed unkeyed; +}; + +static_assert(sizeof(nvme_sgl_descriptor)==16); + + + + +/// Controller register (bar 0) +typedef struct _nvme_controller_reg { + nvme_controller_cap_t cap; ///< controller capabilities + nvme_version_t vs; ///< version + u32 intms; ///< interrupt mask set + u32 intmc; ///< interrupt 
mask clear + nvme_controller_config_t cc; ///< controller configuration + u32 rsvd; ///< reserved + nvme_controller_status_t csts; ///< controller status + u32 nssr; ///< NVM subsystem reset + nvme_adminq_attr_t aqa; ///< admin queue attributes + u64 asq; ///< admin submission queue base address + u64 acq; ///< admin completion queue base address + nvme_cmbloc_t cmbloc; ///< controller memory buffer location + nvme_cmbsz_t cmbsz; ///< controller memory buffer size + u32 rcss[1008]; ///< reserved and command set specific + u32 sq0tdbl[1024]; ///< sq0 tail doorbell at 0x1000 +} nvme_controller_reg_t; + +/// Common command header (cdw 0-9) +typedef struct _nvme_command_common { + u8 opc; ///< opcode + u8 fuse : 2; ///< fuse + u8 rsvd : 4; ///< reserved + u8 psdt : 2; ///< PRP or SGL for data transfer + u16 cid; ///< command id + u32 nsid; ///< namespace id + u64 cdw2_3; ///< reserved (cdw 2-3) + u64 mptr; ///< metadata pointer + union { + struct { + u64 prp1; ///< PRP entry 1 + u64 prp2; ///< PRP entry 2 + }; + nvme_sgl_descriptor sgl1; /// + +#include "drivers/nvme.hh" +#include "drivers/pci-device.hh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +using namespace memory; + +#include +#include + +#include +#include "drivers/io-test.hh" + +TRACEPOINT(trace_nvme_read_config, "capacity=%lu blk_size=%u max_io_size=%u", u64, u32, u64); +TRACEPOINT(trace_nvme_strategy, "bio=%p", struct bio*); +TRACEPOINT(trace_nvme_vwc_enabled, "sc=%#x sct=%#x", u16, u16); +TRACEPOINT(trace_nvme_number_of_queues, "cq num=%d, sq num=%d, iv_num=%d", u16, u16, u32); +TRACEPOINT(trace_nvme_identify_namespace, "nsid=%d, blockcount=%d, blocksize=%d", u32, u64, u16); +TRACEPOINT(trace_nvme_register_interrupt, "_io_queues[%d], iv=%d", int, int); + + +#define QEMU_VID 0x1b36 + +std::unique_ptr alloc_cmd() { + auto cmd = std::unique_ptr(new nvme_sq_entry_t); + assert(cmd); + memset(cmd.get(), 0, sizeof(nvme_ns_t)); + return cmd; +} + +struct nvme_priv { + devop_strategy_t strategy; + nvme* drv; + u32 nsid; +}; + +static void nvme_strategy(struct bio* bio) { + auto* prv = reinterpret_cast(bio->bio_dev->private_data); + trace_nvme_strategy(bio); + prv->drv->make_request(bio); +} + +static int +nvme_read(struct device *dev, struct uio *uio, int ioflags) +{ + return bdev_read(dev, uio, ioflags); +} + +static int +nvme_write(struct device *dev, struct uio *uio, int ioflags) +{ + return bdev_write(dev, uio, ioflags); +} + +static int +nvme_direct_rw(struct device *dev, struct uio *uio, int ioflags) +{ + auto* prv = reinterpret_cast(dev->private_data); + + assert((uio->uio_offset % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + assert((uio->uio_resid % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + + bio* complete_io = alloc_bio(); + + u8 opcode; + switch (uio->uio_rw) { + case UIO_READ : + opcode = BIO_READ; + break; + case UIO_WRITE : + opcode = BIO_WRITE; + break; + default : + return EINVAL; + } + + refcount_init(&complete_io->bio_refcnt, uio->uio_iovcnt); + + while(uio->uio_iovcnt > 0) + { + bio* bio = alloc_bio(); + bio->bio_cmd = opcode; + bio->bio_dev = dev; + + assert((uio->uio_iov->iov_len % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + + bio->bio_bcount = uio->uio_iov->iov_len; + bio->bio_data = uio->uio_iov->iov_base; + bio->bio_offset = uio->uio_offset; + + bio->bio_caller1 = complete_io; + bio->bio_private = complete_io->bio_private; + bio->bio_done = multiplex_bio_done; + + 
dev->driver->devops->strategy(bio); + + uio->uio_offset += uio->uio_iov->iov_len; + uio->uio_resid -= uio->uio_iov->iov_len; + uio->uio_iov++; + uio->uio_iovcnt--; + } + assert(uio->uio_resid == 0); + int ret = bio_wait(complete_io); + destroy_bio(complete_io); + + return ret; +} + +static int +nvme_open(struct device *dev, int ioflags) +{ + return 0; +} + +#include "drivers/blk_ioctl.hh" + +static struct devops nvme_devops { + nvme_open, + no_close, + NVME_DIRECT_RW_ENABLED ? nvme_direct_rw : nvme_read, + NVME_DIRECT_RW_ENABLED ? nvme_direct_rw : nvme_write, + blk_ioctl, + no_devctl, + multiplex_strategy, +}; + +struct driver nvme_driver = { + "nvme", + &nvme_devops, + sizeof(struct nvme_priv), +}; + +int nvme::_instance = 0; + +extern std::vector sched::cpus; + +nvme::nvme(pci::device &dev) + : _dev(dev) + , _msi(&dev) +{ + parse_pci_config(); + u16 command = dev.get_command(); + command |= 0x4 | 0x2 | 0x400; + dev.set_command(command); + + _id = _instance++; + + _doorbellstride = 1 << (2 + _control_reg->cap.dstrd); + + wait_for_controller_ready_change(1); + disable_controller(); + + init_controller_config(); + + create_admin_queue(); + + enable_controller(); + + identify_controller(); + + if(NVME_CHECK_FOR_ADDITIONAL_NAMESPACES) { + identify_active_namespaces(1); + } else { + identify_namespace(1); + } + + if(_identify_controller->vwc & 0x1 && NVME_VWC_ENABLED) { + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_WRITE_CACHE; + cmd->set_features.val = 1; + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + trace_nvme_vwc_enabled(res->sc,res->sct); + } + + if(NVME_QUEUE_PER_CPU_ENABLED) { + u16 num = sched::cpus.size(); + u16 ret; + set_number_of_queues(num, &ret); + create_io_queues_foreach_cpu(); + }else { + u16 ret; + set_number_of_queues(1, &ret); + assert(ret>=1); + create_io_queue(); + } + + if(_identify_controller->vid != QEMU_VID) { + set_interrupt_coalescing(20,2); + } + + struct nvme_priv* prv; + struct device *osv_dev; + + debugf("nvme: %s\n", _identify_controller->sn); + + for(const auto& ns : _ns_data) { + std::string dev_name; + if(ns.first == 1 && _id == 0) { + dev_name = "vblk"; + dev_name += std::to_string(_id); + } else { + dev_name = "nvme"; + dev_name += std::to_string(_id) + "n"; + dev_name += std::to_string(ns.first); + } + off_t size = ((off_t) ns.second->blockcount) << ns.second->blockshift; + + debugf("nvme: Add namespace %d of nvme device %d as %s, devsize=%lld\n", ns.first, _id, dev_name.c_str(), size); + + osv_dev = device_create(&nvme_driver,dev_name.c_str(), D_BLK); + prv = reinterpret_cast(osv_dev->private_data); + prv->strategy = nvme_strategy; + prv->drv = this; + prv->nsid = ns.first; + osv_dev->size = size; + /* + * IO size greater than 4096 << 9 would mean we need + * more than 1 page for the prplist which is not implemented + */ + osv_dev->max_io_size = 4096 << ((9 < _identify_controller->mdts)? 
9 : _identify_controller->mdts ); + + #if CONF_drivers_io_test + test_block_device(osv_dev, 20*1e6, 8); + test_block_device(osv_dev, 20*1e6, 512); + #endif + + read_partition_table(osv_dev); + } +} + +int nvme::set_number_of_queues(u16 num, u16* ret) +{ + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_NUM_QUEUES; + cmd->set_features.val = (num << 16) | num; + std::unique_ptr res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + u16 cq_num, sq_num; + cq_num = res->cs >> 16; + sq_num = res->cs & 0xffff; + + trace_nvme_number_of_queues(res->cs >> 16, res->cs & 0xffff,_dev.msix_get_num_entries()); + + if(res->sct != 0 || res->sc != 0) + return EIO; + + if(num > cq_num || num > sq_num) { + *ret = (cq_num > sq_num) ? cq_num : sq_num; + } else { + *ret = num; + } + return 0; +} +/*time in 100ms increments*/ +int nvme::set_interrupt_coalescing(u8 threshold, u8 time) +{ + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_INT_COALESCING; + cmd->set_features.val = threshold | (time << 8); + std::unique_ptr res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + + if(res->sct != 0 || res->sc != 0) + return EIO; + return 0; +} + +void nvme::enable_controller() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc); + + assert(cc.en == 0); + cc.en = 1; + + mmio_setl(&_control_reg->cc,cc.val); + int s = wait_for_controller_ready_change(1); + assert(s==0); +} + +void nvme::disable_controller() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc); + + assert(cc.en == 1); + cc.en = 0; + + mmio_setl(&_control_reg->cc,cc.val); + int s = wait_for_controller_ready_change(0); + assert(s==0); +} + +int nvme::wait_for_controller_ready_change(int ready) +{ + int timeout = mmio_getb(&_control_reg->cap.to) * 10000; // timeout in 0.05ms steps + nvme_controller_status_t csts; + for (int i = 0; i < timeout; i++) { + csts.val = mmio_getl(&_control_reg->csts); + if (csts.rdy == ready) return 0; + usleep(50); + } + NVME_ERROR("timeout=%d waiting for ready %d", timeout, ready); + return ETIME; +} + +void nvme::init_controller_config() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc.val); + cc.iocqes = 4; // completion queue entry size 16B + cc.iosqes = 6; // submission queue entry size 64B + cc.mps = 0; // memory page size 4096B + + mmio_setl(&_control_reg->cc, cc.val); +} + +void nvme::create_admin_queue() +{ + int qsize = NVME_ADMIN_QUEUE_SIZE; + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + + nvme_adminq_attr_t aqa; + aqa.val = 0; + aqa.asqs = aqa.acqs = qsize - 1; + + u32* sq_doorbell = _control_reg->sq0tdbl; + u32* cq_doorbell = (u32*) ((u64)sq_doorbell + _doorbellstride); + + _admin_queue = std::unique_ptr(new nvme_admin_queue_pair(_id,0, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data)); + + register_admin_interrupts(); + + mmio_setl(&_control_reg->aqa, aqa.val); + mmio_setq(&_control_reg->asq, (u64) mmu::virt_to_phys((void*) sqbuf)); + mmio_setq(&_control_reg->acq, (u64) mmu::virt_to_phys((void*) cqbuf)); +} + +int nvme::create_io_queue(int qsize, int qprio) +{ + u32* sq_doorbell; + u32* cq_doorbell; + int id = _io_queues.size() + 1; + int iv = id; + qsize = (qsize < 
_control_reg->cap.mqes) ? qsize : _control_reg->cap.mqes + 1; + + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + assert(sqbuf); + assert(cqbuf); + memset(sqbuf,0,sizeof(nvme_sq_entry_t)*qsize); + memset(cqbuf,0,sizeof(nvme_cq_entry_t)*qsize); + + // create completion queue + nvme_acmd_create_cq_t* cmd = (nvme_acmd_create_cq_t*) malloc(sizeof(nvme_acmd_create_cq_t)); + assert(cmd); + memset(cmd, 0, sizeof (*cmd)); + + cmd->qid = id; + cmd->qsize = qsize - 1; + cmd->iv = iv; + cmd->pc = 1; + cmd->ien = 1; + cmd->common.opc = NVME_ACMD_CREATE_CQ; + cmd->common.prp1 = (u64) mmu::virt_to_phys(cqbuf); + + // create submission queue + nvme_acmd_create_sq_t* cmd_sq = (nvme_acmd_create_sq_t*) malloc(sizeof(nvme_acmd_create_sq_t)); + assert(cmd_sq); + memset(cmd_sq, 0, sizeof(nvme_acmd_create_sq_t)); + + cmd_sq->pc = 1; + cmd_sq->qprio = qprio; // 0=urgent 1=high 2=medium 3=low + cmd_sq->qid = id; + cmd_sq->cqid = id; + cmd_sq->qsize = qsize - 1; + cmd_sq->common.opc = NVME_ACMD_CREATE_SQ; + cmd_sq->common.prp1 = (u64) mmu::virt_to_phys(sqbuf); + + sq_doorbell = (u32*) ((u64) _control_reg->sq0tdbl + 2 * _doorbellstride * id); + cq_doorbell = (u32*) ((u64) sq_doorbell + _doorbellstride); + + _io_queues.push_back(std::unique_ptr(new nvme_io_queue_pair(_id, iv, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data))); + + register_interrupt(iv,id-1); + + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd)); + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd_sq)); + + return id -1; +} + +void nvme::create_io_queues_foreach_cpu() +{ + int iv,id; + int qsize = NVME_IO_QUEUE_SIZE; + + assert(_io_queues.size()==0); + + u32* sq_doorbell; + u32* cq_doorbell; + + for(sched::cpu* cpu : sched::cpus) { + id = cpu->id; + iv = id + 1; + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + assert(sqbuf); + assert(cqbuf); + memset(sqbuf,0,sizeof(nvme_sq_entry_t)*qsize); + memset(cqbuf,0,sizeof(nvme_cq_entry_t)*qsize); + + nvme_acmd_create_cq_t* cmd = (nvme_acmd_create_cq_t*) malloc(sizeof(nvme_acmd_create_cq_t)); + assert(cmd); + memset(cmd, 0, sizeof (*cmd)); + + cmd->qid = iv; + cmd->qsize = qsize - 1; + cmd->iv = iv; + cmd->pc = 1; + cmd->ien = 1; + cmd->common.opc = NVME_ACMD_CREATE_CQ; + cmd->common.prp1 = (u64) mmu::virt_to_phys(cqbuf); + + // create submission queue + nvme_acmd_create_sq_t* cmd_sq = (nvme_acmd_create_sq_t*) malloc(sizeof(nvme_acmd_create_sq_t)); + assert(cmd_sq); + memset(cmd_sq, 0, sizeof(nvme_acmd_create_sq_t)); + + cmd_sq->pc = 1; + cmd_sq->qprio = 2; // 0=urgent 1=high 2=medium 3=low + cmd_sq->qid = iv; + cmd_sq->cqid = iv; + cmd_sq->qsize = qsize - 1; + cmd_sq->common.opc = NVME_ACMD_CREATE_SQ; + cmd_sq->common.prp1 = (u64) mmu::virt_to_phys(sqbuf); + + sq_doorbell = (u32*) ((u64) _control_reg->sq0tdbl + 2 * _doorbellstride * iv); + cq_doorbell = (u32*) ((u64) sq_doorbell + _doorbellstride); + + _io_queues.push_back(std::unique_ptr(new nvme_io_queue_pair(_id, iv, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data))); + + register_interrupt(iv,id,true,cpu); + + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd)); + 
_admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd_sq)); + } +} + +int nvme::identify_controller() +{ + assert(_admin_queue); + auto cmd = alloc_cmd(); + cmd->identify.cns = 1; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto data = new nvme_identify_ctlr_t; + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(data),4096); + + if(res->sc != 0 || res->sct != 0) { + NVME_ERROR("Identify controller failed nvme%d, sct=%d, sc=%d", _id, res->sct, res->sc); + return EIO; + } + + _identify_controller.reset(data); + return 0; +} + +int nvme::identify_namespace(u32 ns) +{ + assert(_admin_queue); + auto cmd = alloc_cmd(); + cmd->identify.cns = 0; + cmd->identify.common.nsid = ns; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto data = std::unique_ptr(new nvme_identify_ns_t); + + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(data.get()),4096); + if(res->sc != 0 || res->sct != 0) { + NVME_ERROR("Identify namespace failed nvme%d nsid=%d, sct=%d, sc=%d", _id, ns, res->sct, res->sc); + return EIO; + } + + _ns_data.insert(std::make_pair(ns, new nvme_ns_t)); + _ns_data[ns]->blockcount = data->ncap; + _ns_data[ns]->blockshift = data->lbaf[data->flbas & 0xF].lbads; + _ns_data[ns]->blocksize = 1 << _ns_data[ns]->blockshift; + _ns_data[ns]->bpshift = NVME_PAGESHIFT - _ns_data[ns]->blockshift; + _ns_data[ns]->id = ns; + + trace_nvme_identify_namespace(ns, _ns_data[ns]->blockcount, _ns_data[ns]->blocksize); + return 0; +} + +//identify all active namespaces with nsid >= start +int nvme::identify_active_namespaces(u32 start) +{ + assert(start >= 1); + assert(_identify_controller); + //max number of namespaces supported by the controller + u32 nn = _identify_controller->nn; + assert(nn > start); + + auto cmd = alloc_cmd(); + cmd->identify.cns = 2; + cmd->identify.common.nsid = start - 1; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto active_namespaces = (u64*) alloc_phys_contiguous_aligned(4096, 4); + memset(active_namespaces, 0, 4096); + + _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(active_namespaces), 4096); + int err; + for(int i=0; i < 1024; i++) { + if(active_namespaces[i]) { + err = identify_namespace(active_namespaces[i]); + if (err) { + free_phys_contiguous_aligned(active_namespaces); + return err; + } + } else { break;} + } + free_phys_contiguous_aligned(active_namespaces); + return 0; +} + +int nvme::make_request(bio* bio, u32 nsid) +{ + if(bio->bio_bcount % _ns_data[nsid]->blocksize || bio->bio_offset % _ns_data[nsid]->blocksize) { + NVME_ERROR("bio request not block-aligned length=%d, offset=%d blocksize=%d\n",bio->bio_bcount, bio->bio_offset, _ns_data[nsid]->blocksize); + return EINVAL; + } + bio->bio_offset = bio->bio_offset >> _ns_data[nsid]->blockshift; + bio->bio_bcount = bio->bio_bcount >> _ns_data[nsid]->blockshift; + + assert((bio->bio_offset + bio->bio_bcount) <= _ns_data[nsid]->blockcount); + + if(bio->bio_cmd == BIO_FLUSH && (_identify_controller->vwc == 0 || !NVME_VWC_ENABLED )) { + biodone(bio,true); + return 0; + } + + if(sched::current_cpu->id >= _io_queues.size()) + return _io_queues[0]->make_request(bio, nsid); + + return _io_queues[sched::current_cpu->id]->make_request(bio, nsid); +} + +void nvme::register_admin_interrupts() +{ + sched::thread* aq_thread = sched::thread::make([this] { this->_admin_queue->req_done(); }, + sched::thread::attr().name("nvme"+ 
std::to_string(_id)+"_aq_req_done")); + aq_thread->start(); + + bool ok = msix_register(0, [this] { this->_admin_queue->disable_interrupts(); }, aq_thread); + _dev.msix_unmask_entry(0); + if(not ok) + printf("admin interrupt registration failed\n"); +} + +bool nvme::msix_register(unsigned iv, + // high priority ISR + std::function isr, + // bottom half + sched::thread *t, + bool assign_affinity) +{ + // Enable the device msix capability, + // masks all interrupts... + if (_dev.is_msix()) { + _dev.msix_enable(); + } else { + return false; + } + _dev.msix_mask_all(); + + if(_msix_vectors.empty()) + _msix_vectors = std::vector>(_dev.msix_get_num_entries()); + + auto vec = std::unique_ptr(new msix_vector(&_dev)); + bool assign_ok; + _dev.msix_mask_entry(iv); + if (t) { + assign_ok = + _msi.assign_isr(vec.get(), + [=]() mutable { + isr(); + t->wake_with_irq_disabled(); + }); + } else { + return false; + } + if (!assign_ok) { + return false; + } + bool setup_ok = _msi.setup_entry(iv, vec.get()); + if (!setup_ok) { + return false; + } + if (assign_affinity) { + vec->set_affinity(t->get_cpu()->arch.apic_id); + } + + if(iv < _msix_vectors.size()) { + _msix_vectors.at(iv) = std::move(vec); + } else { + NVME_ERROR("binding_entry %d registration failed\n",iv); + return false; + } + _msix_vectors.at(iv)->msix_unmask_entries(); + + _dev.msix_unmask_all(); + return true; +} +//qid should be the index that corresponds to the queue in _io_queues. +//In general qid = iv - 1 +bool nvme::register_interrupt(unsigned int iv, unsigned int qid, bool pin_t, sched::cpu* cpu) +{ + sched::thread* t; + bool ok; + + if(_io_queues.size() <= qid) { + NVME_ERROR("queue %d not initialized\n",qid); + return false; + } + + if(_io_queues[qid]->_id != iv) + printf("Warning: Queue %d ->_id = %d != iv %d\n",qid,_io_queues[qid]->_id,iv); + + trace_nvme_register_interrupt(qid, iv); + t = sched::thread::make([this,qid] { this->_io_queues[qid]->req_done(); }, + sched::thread::attr().name("nvme" + std::to_string(_id) + "_ioq" + std::to_string(qid) + "_iv" +std::to_string(iv))); + t->start(); + if(pin_t && cpu) { + sched::thread::pin(t,cpu); + } + + ok = msix_register(iv, [this,qid] { this->_io_queues[qid]->disable_interrupts(); }, t,pin_t); + _dev.msix_unmask_entry(iv); + if(not ok) + NVME_ERROR("Interrupt registration failed: queue=%d interruptvector=%d\n",qid,iv); + return ok; +} + +void nvme::dump_config(void) +{ + u8 B, D, F; + _dev.get_bdf(B, D, F); + + _dev.dump_config(); + nvme_d("%s [%x:%x.%x] vid:id= %x:%x", get_name().c_str(), + (u16)B, (u16)D, (u16)F, + _dev.get_vendor_id(), + _dev.get_device_id()); +} + +void nvme::parse_pci_config() +{ + _bar0 = _dev.get_bar(1); + _bar0->map(); + if (_bar0 == nullptr) { + throw std::runtime_error("BAR1 is absent"); + } + assert(_bar0->is_mapped()); + _control_reg = (nvme_controller_reg_t*) _bar0->get_mmio(); +} + +hw_driver* nvme::probe(hw_device* dev) +{ + if (auto pci_dev = dynamic_cast(dev)) { + if ((pci_dev->get_base_class_code()==1) && (pci_dev->get_sub_class_code()==8) && (pci_dev->get_programming_interface()==2)) // detect NVMe device + return aligned_new(*pci_dev); + } + return nullptr; +} diff --git a/drivers/nvme.hh b/drivers/nvme.hh new file mode 100644 index 0000000000..03604fc420 --- /dev/null +++ b/drivers/nvme.hh @@ -0,0 +1,118 @@ +#ifndef NVME_DRIVER_H +#define NVME_DRIVER_H + +#include "drivers/nvme-structs.h" +#include "drivers/driver.hh" +#include "drivers/pci-device.hh" +#include +#include +#include +#include "drivers/nvme-queue.hh" +#include +#include +#include + 
+#define nvme_tag "nvme" +#define nvme_d(...) tprintf_d(nvme_tag, __VA_ARGS__) +#define nvme_i(...) tprintf_i(nvme_tag, __VA_ARGS__) +#define nvme_w(...) tprintf_w(nvme_tag, __VA_ARGS__) +#define nvme_e(...) tprintf_e(nvme_tag, __VA_ARGS__) + +#define NVME_ERROR(...) nvme_e(__VA_ARGS__) + +#define NVME_PAGESIZE 4096 +#define NVME_PAGESHIFT 12 + +/*bdev block cache will not be used if enabled*/ +#define NVME_DIRECT_RW_ENABLED 0 + +#define NVME_QUEUE_PER_CPU_ENABLED 0 + +//Volatile Write Cache +#define NVME_VWC_ENABLED 1 + +//checks for all active namespaces instead of just ns 1 +#define NVME_CHECK_FOR_ADDITIONAL_NAMESPACES 1 + +#define NVME_ADMIN_QUEUE_SIZE 8 + +/*Will be lower if the device doesnt support the +specified queue size */ +#define NVME_IO_QUEUE_SIZE 256 + +class nvme_io_queue_pair; +class nvme_admin_queue_pair; + +class nvme : public hw_driver { +public: + explicit nvme(pci::device& dev); + virtual ~nvme() {}; + + virtual std::string get_name() const { return "nvme"; } + + virtual void dump_config(); + + int make_request(struct bio* bio, u32 nsid=1); + static hw_driver* probe(hw_device* dev); + + int set_feature(); + int get_feature(); + + int set_number_of_queues(u16 num, u16* ret); + int set_interrupt_coalescing(u8 threshold, u8 time); + + int get_interrupt_coalescing(); + + int create_io_queue(int qsize=NVME_IO_QUEUE_SIZE, int qprio=2); + + bool register_interrupt(unsigned int iv,unsigned int qid,bool pin_t=false, sched::cpu* cpu = NULL); + + int shutdown(); + + std::map _ns_data; + +private: + int identify_controller(); + int identify_namespace(u32 ns); + int identify_active_namespaces(u32 start); + + void create_admin_queue(); + void register_admin_interrupts(); + + void init_controller_config(); + void create_io_queues_foreach_cpu(); + + void enable_controller(); + void disable_controller(); + int wait_for_controller_ready_change(int ready); + + void parse_pci_config(); + + nvme_controller_reg_t* _control_reg; + + //maintains the nvme instance number for multiple adapters + static int _instance; + int _id; + + std::vector> _msix_vectors; + bool msix_register(unsigned iv, + // high priority ISR + std::function isr, + // bottom half + sched::thread *t, + // set affinity of the vector to the cpu running t + bool assign_affinity=false); + + std::unique_ptr _admin_queue; + + std::vector> _io_queues; + u32 _doorbellstride; + + std::unique_ptr _identify_controller; + + pci::device& _dev; + interrupt_manager _msi; + + pci::bar *_bar0 = nullptr; +}; +#endif diff --git a/drivers/virtio-blk.cc b/drivers/virtio-blk.cc index b643c991b7..e85909ae92 100644 --- a/drivers/virtio-blk.cc +++ b/drivers/virtio-blk.cc @@ -83,12 +83,14 @@ blk_write(struct device *dev, struct uio *uio, int ioflags) return bdev_write(dev, uio, ioflags); } +#include "drivers/blk_ioctl.hh" + static struct devops blk_devops { no_open, no_close, blk_read, blk_write, - no_ioctl, + blk_ioctl, no_devctl, multiplex_strategy, }; diff --git a/fs/vfs/kern_physio.cc b/fs/vfs/kern_physio.cc index c7c99c724d..6f4207af85 100644 --- a/fs/vfs/kern_physio.cc +++ b/fs/vfs/kern_physio.cc @@ -72,7 +72,7 @@ biofinish(struct bio *bp, struct devstat *stat, int error) biodone(bp, error); } -static void multiplex_bio_done(struct bio *b) +void multiplex_bio_done(struct bio *b) { struct bio *bio = static_cast(b->bio_caller1); bool error = b->bio_flags & BIO_ERROR; @@ -80,13 +80,8 @@ static void multiplex_bio_done(struct bio *b) // If there is an error, we store it in the original bio flags. 
-    // This path gets slower because then we need to end up taking the
-    // bio_mutex twice. But that should be fine.
-    if (error) {
-        WITH_LOCK(bio->bio_mutex) {
-            bio->bio_flags |= BIO_ERROR;
-        }
-    }
+    if (error)
+        atomic_set_char(reinterpret_cast<volatile u_char*>(&bio->bio_flags), BIO_ERROR);
 
     // Last one releases it. We set the biodone to always be "ok", because
     // if an error exists, we have already set that in the previous operation
diff --git a/include/osv/bio.h b/include/osv/bio.h
index 06a433476f..116d936eff 100644
--- a/include/osv/bio.h
+++ b/include/osv/bio.h
@@ -126,6 +126,8 @@ void biodone(struct bio *bio, bool ok);
 struct devstat;
 void biofinish(struct bio *bp, struct devstat *stat, int error);
 
+void multiplex_bio_done(struct bio *b);
+
 __END_DECLS
 
 #endif /* !_SYS_BIO_H_ */
diff --git a/include/osv/buf.h b/include/osv/buf.h
index 8799bd54b8..d1a3b6626f 100755
--- a/include/osv/buf.h
+++ b/include/osv/buf.h
@@ -43,7 +43,7 @@ struct buf: boost::intrusive::list_base_hook<> {
     int b_flags;            /* see defines below */
     struct device *b_dev;   /* device */
-    int b_blkno;            /* block # on device */
+    off_t b_blkno;          /* block # on device */
     mutex_t b_lock;         /* lock for access */
     void *b_data;           /* pointer to data buffer */
 };
diff --git a/scripts/run.py b/scripts/run.py
index 01fc201f63..695bb64a77 100755
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -172,6 +172,10 @@ def start_osv_qemu(options):
                 "-device", "virtio-scsi-pci,id=scsi0%s" % options.virtio_device_suffix,
                 "-drive", "file=%s,if=none,id=hd0,media=disk,%s" % (options.image_file, aio),
                 "-device", "scsi-hd,bus=scsi0.0,drive=hd0,scsi-id=1,lun=0%s" % boot_index]
+    elif options.nvme:
+        args += [
+            "-device", "nvme,serial=deadbeef,drive=nvm%s" % (boot_index),
+            "-drive", "file=%s,if=none,id=nvm,%s" % (options.image_file, aio)]
     elif options.ide:
         args += [
             "-hda", options.image_file]
@@ -197,7 +201,18 @@ def start_osv_qemu(options):
             "-device", "vhost-user-fs-pci,queue-size=1024,chardev=char0,tag=%s%s" % (options.virtio_fs_tag, dax),
             "-object", "memory-backend-file,id=mem,size=%s,mem-path=/dev/shm,share=on" % options.memsize,
             "-numa", "node,memdev=mem"]
-
+
+    if options.second_nvme_image:
+        print("Attaching second image as NVMe device")
+        args += [
+            "-drive", "file=%s,if=none,id=nvm1" % (options.second_nvme_image),
+            "-device", "nvme,serial=deadbeef,drive=nvm1"]
+
+    if options.pass_pci:
+        print("Passing through PCI device %s" % options.pass_pci)
+        args += [
+            "-device", "vfio-pci,host=%s" % (options.pass_pci)]
+
     if options.no_shutdown:
         args += ["-no-reboot", "-no-shutdown"]
@@ -532,6 +547,8 @@ def main(options):
                         help="don't start OSv till otherwise specified, e.g. through the QEMU monitor or a remote gdb")
     parser.add_argument("-i", "--image", action="store", default=None, metavar="IMAGE",
                         help="path to disk image file. defaults to build/$mode/usr.img")
+    parser.add_argument("-N", "--nvme", action="store_true", default=False,
+                        help="use NVMe instead of virtio-blk")
     parser.add_argument("-S", "--scsi", action="store_true", default=False,
                         help="use virtio-scsi instead of virtio-blk")
     parser.add_argument("-A", "--sata", action="store_true", default=False,
@@ -626,6 +643,10 @@ def main(options):
                         help="static ip addresses (forwarded to respective kernel command line option)")
     parser.add_argument("--bootchart", action="store_true",
                         help="bootchart mode (forwarded to respective kernel command line option")
+    parser.add_argument("--second-nvme-image", action="store",
+                        help="path to an optional disk image that should be attached to the instance as an NVMe device")
+    parser.add_argument("--pass-pci", action="store",
+                        help="pass through the PCI device in the given slot; the device must be bound to the vfio-pci driver")
     cmdargs = parser.parse_args()
     cmdargs.opt_path = "debug" if cmdargs.debug else "release" if cmdargs.release else "last"
diff --git a/scripts/test.py b/scripts/test.py
index 5b4c7ae3aa..cec533fcba 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -158,6 +158,7 @@ def main():
     parser.add_argument("--run_options", action="store", help="pass extra options to run.py")
     parser.add_argument("-m", "--manifest", action="store", default="modules/tests/usr.manifest", help="test manifest")
     parser.add_argument("-d", "--disabled_list", action="append", help="test to be disabled", default=[])
+    parser.add_argument("--nvme", action="store_true", default=False, help="run tests with NVMe")
     parser.add_argument("--arch", action="store", choices=["x86_64","aarch64"], default=host_arch,
                         help="specify QEMU architecture: x86_64, aarch64")
     cmdargs = parser.parse_args()
@@ -175,6 +176,10 @@ def main():
         disabled_list.extend(firecracker_disabled_list)
     else:
         disabled_list.extend(qemu_disabled_list)
+
+    if cmdargs.nvme:
+        print("Running tests with NVMe")
+        run_py_args = run_py_args + ['--nvme']
     if cmdargs.arch == 'aarch64':
         if host_arch != cmdargs.arch:
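--
A minimal usage sketch of the new options added above (placed after the
signature delimiter so it does not affect patch application). This assumes
an image built with the nvme driver compiled in (conf_drivers_nvme=1); the
image path and the PCI slot address are placeholders, not values from the
patch:

    # boot OSv from an emulated NVMe device instead of virtio-blk
    ./scripts/run.py --nvme

    # attach an additional image as a second NVMe device (nvme1n1 in the guest)
    ./scripts/run.py --second-nvme-image ./disk2.img

    # pass through a host PCI device that is bound to vfio-pci
    ./scripts/run.py --pass-pci 01:00.0

    # run the test suite against an NVMe boot disk
    ./scripts/test.py --nvme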