From 2b84047aa6393bf871c6ab9ffb87bd6160cc5294 Mon Sep 17 00:00:00 2001
From: Jan Braunwarth
Date: Thu, 7 Dec 2023 18:11:02 +0100
Subject: [PATCH] Implement an NVMe driver

The implemented driver allows OSv to boot from an emulated NVMe device
and to handle additional NVMe devices, either emulated or passed
through from the host. Booting directly from an NVMe device via PCI
passthrough still needs to be tested.

The nvme driver creates nvme_queue_pairs to interact with the device
controller. An nvme_queue_pair manages a submission queue and the
corresponding completion queue.

The nvme driver registers every namespace as a device and forwards
requests to the queues. Namespace 1 on the first NVMe drive is named
"vblk0". Further devices are named nvmeXnY, where X is the driver
instance id (starting at 0) and Y is the namespace id (starting at 1).

Read/write requests on the device file go through the block cache
layer. This can reduce performance quite a bit, since the block cache
splits every request into sequential 512-byte requests. Setting
NVME_DIRECT_RW_ENABLED in drivers/nvme.hh disables the block cache.

All queues use MSI-X; one interrupt vector is registered for every
queue. There is very noticeable interrupt overhead when using PCI
passthrough. Interrupt coalescing reduces it, but this needs to be
investigated further.

Add options to ./scripts/run.py:
--nvme to start OSv on an NVMe device emulated by QEMU
--second-nvme-image to attach an additional image as an NVMe device
--pass-pci to pass through a PCI device from the host; the device
  needs to be bound to vfio-pci

drivers/blk_ioctl.hh implements the BLKGETSIZE64 and BLKFLSBUF ioctls,
which are used by fio.

drivers/io-test.cc is a simple IOPS test that can be activated by
building with conf_drivers_io_test=1; it runs during initialization of
the NVMe device.

Signed-off-by: Jan Braunwarth
---
(Illustrative standalone sketches of the mechanisms described above
follow after the diff.)

 Makefile                  |   9 +-
 arch/x64/arch-setup.cc    |   6 +
 conf/profiles/x64/all.mk  |   1 +
 conf/profiles/x64/base.mk |   6 +
 conf/profiles/x64/nvme.mk |   3 +
 core/debug.cc             |   3 +-
 drivers/blk_ioctl.hh      |  59 ++++
 drivers/io-test.cc        | 128 +++++++
 drivers/io-test.hh        |  12 +
 drivers/nvme-queue.cc     | 464 +++++++++++++++++++++++++
 drivers/nvme-queue.hh     | 126 +++++++
 drivers/nvme-structs.h    | 647 +++++++++++++++++++++++++++++++++++
 drivers/nvme.cc           | 693 ++++++++++++++++++++++++++++++++++++++
 drivers/nvme.hh           | 118 +++++++
 drivers/virtio-blk.cc     |   4 +-
 fs/vfs/kern_physio.cc     |  11 +-
 include/osv/bio.h         |   2 +
 include/osv/buf.h         |   2 +-
 scripts/run.py            |  23 +-
 scripts/test.py           |   5 +
 20 files changed, 2309 insertions(+), 13 deletions(-)
 create mode 100644 conf/profiles/x64/nvme.mk
 create mode 100644 drivers/blk_ioctl.hh
 create mode 100644 drivers/io-test.cc
 create mode 100644 drivers/io-test.hh
 create mode 100644 drivers/nvme-queue.cc
 create mode 100644 drivers/nvme-queue.hh
 create mode 100644 drivers/nvme-structs.h
 create mode 100644 drivers/nvme.cc
 create mode 100644 drivers/nvme.hh

diff --git a/Makefile b/Makefile
index e24cb35191..bfb2291c75 100644
--- a/Makefile
+++ b/Makefile
@@ -286,7 +286,7 @@ post-includes-bsd += -isystem bsd/$(arch)
 $(out)/musl/%.o: pre-include-api = -isystem include/api/internal_musl_headers -isystem musl/src/include
 ifneq ($(werror),0)
-	CFLAGS_WERROR = -Werror
+	CFLAGS_WERROR = -Wall
 endif
 # $(call compiler-flag, -ffoo, option, file)
 # returns option if file builds with -ffoo, empty otherwise
@@ -889,6 +889,13 @@ drivers += drivers/virtio-vring.o
 ifeq ($(conf_drivers_mmio),1)
 drivers += drivers/virtio-mmio.o
 endif
+ifeq ($(conf_drivers_nvme),1)
+drivers += drivers/nvme.o
+drivers += drivers/nvme-queue.o
+endif
+ifeq ($(conf_drivers_io_test),1)
+drivers += drivers/io-test.o
+endif
 drivers += drivers/virtio-net.o
 drivers += drivers/virtio-blk.o
 drivers += drivers/virtio-scsi.o
diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index e1fb53808c..0d3ab2f59a 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -310,6 +310,9 @@ void arch_init_premain()
 #if CONF_drivers_ide
 #include "drivers/ide.hh"
 #endif
+#if CONF_drivers_nvme
+#include "drivers/nvme.hh"
+#endif
 
 extern bool opt_pci_disabled;
 void arch_init_drivers()
@@ -364,6 +367,9 @@ void arch_init_drivers()
 #endif
 #if CONF_drivers_ide
     drvman->register_driver(ide::ide_drive::probe);
+#endif
+#if CONF_drivers_nvme
+    drvman->register_driver(nvme::probe);
 #endif
     boot_time.event("drivers probe");
     drvman->load_all();
diff --git a/conf/profiles/x64/all.mk b/conf/profiles/x64/all.mk
index c13790e2be..3d69e8a74d 100644
--- a/conf/profiles/x64/all.mk
+++ b/conf/profiles/x64/all.mk
@@ -4,5 +4,6 @@ include conf/profiles/$(arch)/virtio-mmio.mk
 include conf/profiles/$(arch)/virtio-pci.mk
 include conf/profiles/$(arch)/vmware.mk
 include conf/profiles/$(arch)/xen.mk
+include conf/profiles/$(arch)/nvme.mk
 
 conf_drivers_vga?=1
diff --git a/conf/profiles/x64/base.mk b/conf/profiles/x64/base.mk
index 26dd054ed8..0ce84912fb 100644
--- a/conf/profiles/x64/base.mk
+++ b/conf/profiles/x64/base.mk
@@ -38,6 +38,11 @@ export conf_drivers_pci?=1
 export conf_drivers_scsi?=1
 endif
 
+export conf_drivers_nvme?=0
+ifeq ($(conf_drivers_nvme),1)
+export conf_drivers_pci?=1
+endif
+
 export conf_drivers_vmxnet3?=0
 ifeq ($(conf_drivers_vmxnet3),1)
 export conf_drivers_pci?=1
@@ -72,3 +77,4 @@ export conf_drivers_virtio?=0
 export conf_drivers_pci?=0
 export conf_drivers_mmio?=0
 export conf_drivers_scsi?=0
+export conf_drivers_io_test?=0
diff --git a/conf/profiles/x64/nvme.mk b/conf/profiles/x64/nvme.mk
new file mode 100644
index 0000000000..fe21d26bde
--- /dev/null
+++ b/conf/profiles/x64/nvme.mk
@@ -0,0 +1,3 @@
+conf_drivers_pci?=1
+
+conf_drivers_nvme?=1
diff --git a/core/debug.cc b/core/debug.cc
index cf0a06f4ee..1f7f3685c7 100644
--- a/core/debug.cc
+++ b/core/debug.cc
@@ -43,10 +43,11 @@ bool logger::parse_configuration(void)
     add_tag("virtio-blk", logger_warn);
     add_tag("virtio-net", logger_warn);
     add_tag("vmxnet3", logger_warn);
-    add_tag("pci", logger_info);
+    add_tag("pci", logger_debug);
     add_tag("poll", logger_info);
     add_tag("dhcp", logger_info);
     add_tag("acpi", logger_error);
+    add_tag("nvme", logger_debug);
 
     return (true);
 }
diff --git a/drivers/blk_ioctl.hh b/drivers/blk_ioctl.hh
new file mode 100644
index 0000000000..55a2e32d70
--- /dev/null
+++ b/drivers/blk_ioctl.hh
@@ -0,0 +1,59 @@
+#ifndef BLK_IOCTL_H
+#define BLK_IOCTL_H
+
+#define _IOC_NRBITS   8
+#define _IOC_TYPEBITS 8
+#define _IOC_SIZEBITS 13
+#define _IOC_DIRBITS  3
+
+#define _IOC_NRMASK   ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK  ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT   0
+#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT  (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+#define _IOC_DIR(nr)  (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYP(nr)  (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)   (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+#define BLKGETSIZE64 114
+#define BLKFLSBUF  97
+#define BLKDISCARD 119
+
+TRACEPOINT(trace_blk_ioctl,
"dev=%s type=%#x nr=%d size=%d, dir=%d", char*, int, int, int, int);
+
+void no_bio_done(bio* b) { delete b; }
+
+int
+blk_ioctl(struct device* dev, u_long io_cmd, void* buf)
+{
+    assert(dev);
+    trace_blk_ioctl(dev->name, _IOC_TYP(io_cmd), _IOC_NR(io_cmd), _IOC_SIZE(io_cmd), _IOC_DIR(io_cmd));
+
+    switch (_IOC_NR(io_cmd)) {
+    case BLKGETSIZE64:
+        //device capacity in bytes
+        *(off_t*) buf = dev->size;
+        break;
+    case BLKFLSBUF: {
+        auto* bio = alloc_bio();
+        bio->bio_dev = dev;
+        bio->bio_done = no_bio_done;
+        bio->bio_cmd = BIO_FLUSH;
+
+        dev->driver->devops->strategy(bio);
+        }
+        break;
+    default:
+        printf("ioctl not defined; type:%#x nr:%d size:%d, dir:%d\n", _IOC_TYP(io_cmd), _IOC_NR(io_cmd), _IOC_SIZE(io_cmd), _IOC_DIR(io_cmd));
+        return EINVAL;
+    }
+    return 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/drivers/io-test.cc b/drivers/io-test.cc
new file mode 100644
index 0000000000..e46f914d05
--- /dev/null
+++ b/drivers/io-test.cc
@@ -0,0 +1,128 @@
+#include "drivers/io-test.hh"
+//NOTE: the original #include targets were lost in extraction; the headers
+//below are the set this file appears to need
+#include "drivers/clock.hh"
+#include <osv/sched.hh>
+#include <osv/bio.h>
+#include <osv/mempool.hh>
+#include <atomic>
+#include <random>
+
+volatile bool running;
+volatile u64 completed_io;
+volatile u64 requested_io;
+std::atomic<u32> open_req;  //template argument reconstructed
+u32 max_open;
+u64 max_ios;
+
+void test_block_device(struct device *dev, int test_duration, int blcks_per_io, int blocksize, int blockshift)
+{
+    int report_step = 1e6;
+    int io_size = blocksize * blcks_per_io;
+    completed_io = 0;
+    requested_io = 0;
+    open_req.store(0);
+    max_open = 64;
+    max_ios = 1 << 30;
+
+    printf("Start IO test dev : %s, IO size : %d\n", dev->name, io_size);
+    sched::thread *t;
+    t = sched::thread::make([dev,io_size,blockshift] { requesting(dev,io_size,blockshift);},
+            sched::thread::attr().name("IO_Test_Request"));
+
+    sched::thread *timer;
+    timer = sched::thread::make([test_duration] { usleep(test_duration);},
+            sched::thread::attr().name("IO_Test_Timer"));
+
+    sched::thread *repo;
+    repo = sched::thread::make([test_duration,report_step,io_size] { reporting(test_duration,report_step,io_size);},
+            sched::thread::attr().name("IO_Test_Report"));
+    auto c = clock::get();
+
+    running = true;
+    u64 start = c->time();
+    timer->start();
+    t->start();
+    repo->start();
+
+    timer->join();
+    running = false;
+    u64 com = completed_io;
+    u64 end = c->time();
+    int iops = (com * 1e9) / (end - start);
+
+    t->join();
+    repo->join();
+    printf("Test results runtime: %llu, completed IO : %llu, IOPS : %d\n", end-start, com, iops);
+}
+
+void reporting(int test_duration, int report_step, int io_size) {
+    u32 prev_compl = completed_io;
+    u32 compl_diff;
+    u32 compl_tem;
+    auto c = clock::get();
+    //nanosecond timestamps; these must be 64-bit, not int
+    s64 time_diff;
+    s64 time_tem;
+    s64 prev_time = c->time();
+    while(running) {
+        usleep(report_step);
+        compl_tem = completed_io;
+        time_tem = c->time();
+
+        compl_diff = compl_tem - prev_compl;
+        prev_compl = compl_tem;
+        time_diff = time_tem - prev_time;
+        prev_time = time_tem;
+        double iops = (compl_diff * 1e9) / (double) time_diff;
+
+        printf("Timestep: %ld, completed : %d, IOPS : %lf, open : %d\n", time_diff, compl_diff, iops, open_req.load());
+    }
+}
+
+void requesting(struct device *dev, u32 io_size, int blockshift) {
+    void* buff;
+    bio* bio;
+    off_t max_blocks = dev->size >> blockshift;
+    off_t max_offset = (max_blocks - 1) - (io_size >> blockshift);
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> distrib(0, max_offset);
+
+    while(running) {
+        if(requested_io >= max_ios)
+            break;
+
+        buff = memory::alloc_phys_contiguous_aligned(io_size, 2);
+        assert(buff);
+        memset(buff, 1, io_size);
+
+        bio = alloc_bio();
+        bio->bio_dev = dev;
+        bio->bio_data = buff;
+        bio->bio_done = io_done;
+        bio->bio_length = io_size;
+        bio->bio_bcount = io_size;
+        bio->bio_cmd = BIO_READ;
+
+        bio->bio_offset = ((off_t) distrib(gen)) << blockshift;
+
+        while(max_open <= open_req) {
+            usleep(10);
+        }
+        open_req.fetch_add(1);
+        atomic_add_64(&requested_io, 1);
+        dev->driver->devops->strategy(bio);
+    }
+}
+
+void io_done(struct bio* bio) {
+    if(bio->bio_flags != BIO_DONE) {
+        printf("BIO_Error during IO Test: %x\n", bio->bio_flags);
+    }
+    atomic_fetchadd_long(&completed_io, 1);
+
+    open_req.fetch_add(-1);
+
+    //the buffer came from alloc_phys_contiguous_aligned, so release it
+    //with the matching free
+    memory::free_phys_contiguous_aligned(bio->bio_data);
+    delete bio;
+}
\ No newline at end of file
diff --git a/drivers/io-test.hh b/drivers/io-test.hh
new file mode 100644
index 0000000000..4ff5154ed9
--- /dev/null
+++ b/drivers/io-test.hh
@@ -0,0 +1,12 @@
+#ifndef IO_TEST_H
+#define IO_TEST_H
+
+#include <osv/device.h>
+#include <osv/bio.h>
+
+void requesting(struct device *dev, u32 io_size, int blockshift);
+void reporting(int test_duration, int report_step, int io_size);
+void io_done(struct bio* bio);
+void test_block_device(struct device *dev, int test_duration, int blcks_per_io, int blocksize=512, int blockshift=9);
+
+#endif
\ No newline at end of file
diff --git a/drivers/nvme-queue.cc b/drivers/nvme-queue.cc
new file mode 100644
index 0000000000..b2605271a0
--- /dev/null
+++ b/drivers/nvme-queue.cc
@@ -0,0 +1,464 @@
+//NOTE: the original #include targets were lost in extraction; the headers
+//below are the set this file appears to need
+#include <string.h>
+
+#include "nvme-queue.hh"
+#include <osv/trace.hh>
+#include <osv/mempool.hh>
+#include <osv/mmu.hh>
+#include <osv/sched.hh>
+
+extern std::unique_ptr<nvme_sq_entry_t> alloc_cmd();
+
+TRACEPOINT(trace_nvme_io_queue_wake, "nvme%d qid=%d", int, int);
+TRACEPOINT(trace_nvme_wait_for_completion_queue_entries, "nvme%d qid=%d,have_elements=%d", int, int, bool);
+TRACEPOINT(trace_nvme_completion_queue_not_empty, "nvme%d qid=%d,not_empty=%d", int, int, bool);
+TRACEPOINT(trace_nvme_enable_interrupts, "nvme%d qid=%d", int, int);
+TRACEPOINT(trace_nvme_disable_interrupts, "nvme%d qid=%d", int, int);
+
+TRACEPOINT(trace_nvme_read, "nvme%d qid=%d cid=%d, bio data=%#x, slba=%d, nlb=%d", int, int, u16, void*, u64, u32);
+TRACEPOINT(trace_nvme_write, "nvme%d qid=%d cid=%d, bio data=%#x, slba=%d, nlb=%d", int, int, u16, void*, u64, u32);
+
+TRACEPOINT(trace_nvme_req_done_error, "nvme%d qid=%d, cid=%d, status type=%#x, status code=%#x, bio=%#x", int, int, u16, u8, u8, bio*);
+TRACEPOINT(trace_nvme_req_done_success, "nvme%d qid=%d, cid=%d, bio=%#x", int, int, u16, bio*);
+
+TRACEPOINT(trace_nvme_admin_queue_wake, "nvme%d qid=%d", int, int);
+
+TRACEPOINT(trace_nvme_admin_queue_submit, "nvme%d qid=%d, cid=%d", int, int, int);
+TRACEPOINT(trace_nvme_admin_req_done_error, "nvme%d qid=%d, cid=%d, status type=%#x, status code=%#x", int, int, u16, u8, u8);
+TRACEPOINT(trace_nvme_admin_req_done_success, "nvme%d qid=%d, cid=%d", int, int, u16);
+
+TRACEPOINT(trace_advance_sq_tail_full, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int);
+TRACEPOINT(trace_nvme_wait_for_entry, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int);
+
+nvme_queue_pair::nvme_queue_pair(
+    int did,
+    u32 id,
+    int qsize,
+    pci::device &dev,
+    nvme_sq_entry_t* sq_addr,
+    u32* sq_doorbell,
+    nvme_cq_entry_t* cq_addr,
+    u32* cq_doorbell,
+    std::map<u32, nvme_ns_t*>& ns)
+    : _id(id)
+    ,_driverid(did)
+    ,_qsize(qsize)
+    ,_dev(&dev)
+    ,_sq_addr(sq_addr)
+    ,_sq_head(0)
+    ,_sq_tail(0)
+    ,_sq_doorbell(sq_doorbell)
+    ,_sq_full(false)
+    ,_cq_addr(cq_addr)
+    ,_cq_head(0)
+    ,_cq_tail(0)
+    ,_cq_doorbell(cq_doorbell)
+    ,_cq_phase_tag(1)
+    ,_ns(ns)
+{
+    auto prplists = (u64**) malloc(sizeof(u64*)*qsize);
+    memset(prplists, 0, sizeof(u64*)*qsize);
+
_prplists_in_use.push_back(prplists); + + assert(!completion_queue_not_empty()); +} + +nvme_queue_pair::~nvme_queue_pair() +{ + memory::free_phys_contiguous_aligned(_sq_addr); + memory::free_phys_contiguous_aligned(_cq_addr); + for(auto vec: _prplists_in_use) + memory::free_phys_contiguous_aligned(vec); +} + +inline void nvme_queue_pair::advance_sq_tail() +{ + _sq_tail = (_sq_tail + 1) % _qsize; + if(_sq_tail == _sq_head) { + _sq_full = true; + trace_advance_sq_tail_full(_driverid,_id,_sq_tail,_sq_head); + } +} + +u16 nvme_queue_pair::submit_cmd(std::unique_ptr cmd) +{ u16 ret; + WITH_LOCK(_lock) + { + ret = submit_cmd_without_lock(std::move(cmd)); + } + return ret; +} + +u16 nvme_queue_pair::submit_cmd_without_lock(std::unique_ptr cmd) +{ + _sq_addr[_sq_tail] = *cmd; + advance_sq_tail(); + mmio_setl(_sq_doorbell,_sq_tail); + return _sq_tail; +} + +void nvme_queue_pair::wait_for_completion_queue_entries() +{ + sched::thread::wait_until([this] { + bool have_elements = this->completion_queue_not_empty(); + if (!have_elements) { + this->enable_interrupts(); + //check if we got a new cqe between completion_queue_not_empty() + //and enable_interrupts() + have_elements = this->completion_queue_not_empty(); + if (have_elements) { + this->disable_interrupts(); + } + } + + trace_nvme_wait_for_completion_queue_entries(_driverid,_id,have_elements); + return have_elements; + }); +} + +int nvme_queue_pair::map_prps(u16 cid, void* data, u64 datasize, u64* prp1, u64* prp2) +{ + u64 addr = (u64) data; + *prp1 = addr; + *prp2 = 0; + int numpages = 0; + u64 offset = addr - ( (addr >> NVME_PAGESHIFT) << NVME_PAGESHIFT ); + if(offset) numpages = 1; + + numpages += ( datasize - offset + NVME_PAGESIZE - 1) >> NVME_PAGESHIFT; + + if (numpages == 2) { + *prp2 = ((addr >> NVME_PAGESHIFT) +1 ) << NVME_PAGESHIFT; + } else if (numpages > 2) { + assert(numpages / 512 == 0); + u64* prplist = (u64*) memory::alloc_phys_contiguous_aligned(numpages * 8, 4096); + assert(prplist != nullptr); + *prp2 = mmu::virt_to_phys(prplist); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = prplist; + + addr = ((addr >> NVME_PAGESHIFT) +1 ) << NVME_PAGESHIFT; + prplist[0] = addr; + + for (int i = 1; i < numpages - 1; i++) { + addr += NVME_PAGESIZE; + prplist[i] = addr; + } + } + return 0; +} + +std::unique_ptr nvme_queue_pair::get_completion_queue_entry() +{ + if(!completion_queue_not_empty()) { + return nullptr; + } + + auto* tcqe = new nvme_cq_entry_t; + *tcqe = _cq_addr[_cq_head]; + std::unique_ptr cqe(tcqe); + assert(cqe->p == _cq_phase_tag); + + if(++_cq_head == _qsize) { + _cq_head -= _qsize; + _cq_phase_tag = !_cq_phase_tag; + } + return cqe; +} + + +bool nvme_queue_pair::completion_queue_not_empty() const +{ + bool a = reinterpret_cast(&_cq_addr[_cq_head])->p == _cq_phase_tag; + trace_nvme_completion_queue_not_empty(_driverid,_id,a); + return a;//_cq_addr[_cq_head].p == _cq_phase_tag; +} + +void nvme_queue_pair::enable_interrupts() +{ + _dev->msix_unmask_entry(_id); + trace_nvme_enable_interrupts(_driverid,_id); +} + +void nvme_queue_pair::disable_interrupts() +{ + _dev->msix_mask_entry(_id); + trace_nvme_disable_interrupts(_driverid,_id); +} + +//only use with interrupts disabled +std::unique_ptr nvme_queue_pair::check_for_completion(u16 cid) +{ + int msec = 1000; + int timeout = 50; + int i; + + std::unique_ptr cqe; + for(i = 0; i < timeout; i++) { + if(completion_queue_not_empty()) { + cqe = get_completion_queue_entry(); + assert(cqe->cid == cid); + if(cqe->sct != 0 || cqe->sc != 0) { + NVME_ERROR("polling nvme%d qid=%d, 
cid=%d, sct=%#x, sc=%#x\n", _driverid, _id, cid, cqe->sct, cqe->sc); + _sq_head = cqe->sqhd; //update sq_head + mmio_setl(_cq_doorbell, _cq_head); + return cqe; + } + + _sq_head = cqe->sqhd; //update sq_head + mmio_setl(_cq_doorbell, _cq_head); + return cqe; + } + usleep(msec); + } + NVME_ERROR("polling timeout nvme%d qid=%d cid=%d\n", _driverid, _id, cid); + return cqe; +} + +int nvme_io_queue_pair::make_request(bio* bio, u32 nsid=1) +{ + u64 slba = bio->bio_offset; + u32 nlb = bio->bio_bcount; //do the blockshift in nvme_driver + u16 cid; + + _lock.lock(); + cid = _sq_tail; + if(_sq_full) { + //Wait for free entries + _waiter.reset(*sched::thread::current()); + trace_nvme_wait_for_entry(_driverid,_id,_sq_tail,_sq_head); + sched::thread::wait_until([this] {return !(this->_sq_full);}); + _waiter.clear(); + } + /* + We need to check if there is an outstanding command that uses + _sq_tail as command id. + This happens if + 1.The SQ is full. Then we just have to wait for an open slot (see above) + 2.the Controller already read a SQE but didnt post a CQE yet. + This means we could post the command but need a different cid. To still + use the cid as index to find the corresponding bios we use a matrix + adding columns if we need them + */ + while(_pending_bios.at(cid / _qsize)[cid % _qsize]) { + cid += _qsize; + if(_pending_bios.size() <= (cid / _qsize)){ + auto bios_array = (struct bio**) malloc(sizeof(struct bio*) * _qsize); + memset(bios_array,0,sizeof(struct bio*) * _qsize); + _pending_bios.push_back(bios_array); + auto prplists = (u64**) malloc(sizeof(u64*)* _qsize); + memset(prplists,0,sizeof(u64*)* _qsize); + _prplists_in_use.push_back(prplists); + } + } + _pending_bios.at(cid / _qsize)[cid % _qsize] = bio; + + + + switch (bio->bio_cmd) { + case BIO_READ: + trace_nvme_read(_driverid, _id, cid, bio->bio_data, slba, nlb); + submit_rw(cid,(void*)mmu::virt_to_phys(bio->bio_data),slba,nlb, nsid, NVME_CMD_READ); + break; + + case BIO_WRITE: + trace_nvme_write(_driverid, _id, cid, bio->bio_data, slba, nlb); + submit_rw(cid,(void*)mmu::virt_to_phys(bio->bio_data),slba,nlb, nsid, NVME_CMD_WRITE); + break; + + case BIO_FLUSH: { + auto cmd = alloc_cmd(); + cmd->vs.common.opc = NVME_CMD_FLUSH; + cmd->vs.common.nsid = nsid; + cmd->vs.common.cid = cid; + submit_cmd_without_lock(std::move(cmd)); + } break; + + default: + NVME_ERROR("Operation not implemented\n"); + _lock.unlock(); + return ENOTBLK; + } + _lock.unlock(); + return 0; +} + +void nvme_io_queue_pair::req_done() +{ + std::unique_ptr cqe; + u16 cid; + while(true) + { + wait_for_completion_queue_entries(); + trace_nvme_io_queue_wake(_driverid,_id); + while((cqe = get_completion_queue_entry())) { + cid = cqe->cid; + if(cqe->sct != 0 || cqe->sc != 0) { + trace_nvme_req_done_error(_driverid,_id, cid, cqe->sct, cqe->sc, _pending_bios.at(cid / _qsize)[cid % _qsize]); + if(_pending_bios.at(cid / _qsize)[cid % _qsize]) + biodone(_pending_bios.at(cid / _qsize)[cid % _qsize],false); + NVME_ERROR("I/O queue: cid=%d, sct=%#x, sc=%#x, bio=%#x, slba=%llu, nlb=%llu\n",cqe->cid, cqe->sct, + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize], + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize]->bio_offset, + cqe->sc,_pending_bios.at(cid / _qsize)[cid % _qsize]->bio_bcount); + }else { + trace_nvme_req_done_success(_driverid,_id, cid, _pending_bios.at(cid / _qsize)[cid % _qsize]); + if(_pending_bios.at(cid / _qsize)[cid % _qsize]) + biodone(_pending_bios.at(cid / _qsize)[cid % _qsize],true); + } + + _pending_bios.at(cid / _qsize)[cid % _qsize] = nullptr; + 
if(_prplists_in_use.at(cid / _qsize)[cid % _qsize]) { + memory::free_phys_contiguous_aligned(_prplists_in_use.at(cid / _qsize)[cid % _qsize]); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = nullptr; + } + _sq_head = cqe->sqhd; //update sq_head + } + mmio_setl(_cq_doorbell, _cq_head); + if(_sq_full) { //wake up the requesting thread in case the submission queue was full before + _sq_full = false; + if(_waiter) + _waiter.wake_from_kernel_or_with_irq_disabled(); + } + } +} + +int nvme_io_queue_pair::submit_rw(u16 cid, void* data, u64 slba, u32 nlb, u32 nsid, int opc) +{ + auto cmd = alloc_cmd(); + u64 prp1 = 0, prp2 = 0; + u32 datasize = nlb << _ns[nsid]->blockshift; + + map_prps(cid, data, datasize, &prp1, &prp2); + cmd->rw.common.cid = cid; + cmd->rw.common.opc = opc; + cmd->rw.common.nsid = nsid; + cmd->rw.common.prp1 = prp1; + cmd->rw.common.prp2 = prp2; + cmd->rw.slba = slba; + cmd->rw.nlb = nlb - 1; + + return submit_cmd_without_lock(std::move(cmd)); +} + +void nvme_admin_queue_pair::req_done() +{ + std::unique_ptr cqe; + u16 cid; + while(true) + { + wait_for_completion_queue_entries(); + trace_nvme_admin_queue_wake(_driverid,_id); + while((cqe = get_completion_queue_entry())) { + cid = cqe->cid; + if(cqe->sct != 0 || cqe->sc != 0) { + trace_nvme_admin_req_done_error(_driverid,_id, cid, cqe->sct, cqe->sc); + NVME_ERROR("Admin queue cid=%d, sct=%#x, sc=%#x\n",cid,cqe->sct,cqe->sc); + }else { + trace_nvme_admin_req_done_success(_driverid,_id, cid); + } + + if(_prplists_in_use.at(cid / _qsize)[cid % _qsize]) { + memory::free_phys_contiguous_aligned(_prplists_in_use.at(cid / _qsize)[cid % _qsize]); + _prplists_in_use.at(cid / _qsize)[cid % _qsize] = nullptr; + } + _sq_head = cqe->sqhd; //update sq_head + _req_res = std::move(cqe); //save the cqe so that the requesting thread can return it + } + mmio_setl(_cq_doorbell, _cq_head); + + /*Wake up the thread that requested the admin command*/ + new_cq = true; + _req_waiter.wake_from_kernel_or_with_irq_disabled(); + } +} + +std::unique_ptr nvme_admin_queue_pair::submit_and_return_on_completion(std::unique_ptr cmd, void* data, unsigned int datasize) +{ + _lock.lock(); + + _req_waiter.reset(*sched::thread::current()); + + //for now admin cid = sq_tail + u16 cid = _sq_tail; + cmd->rw.common.cid = cid; + + if(data != nullptr && datasize > 0) { + map_prps(_sq_tail,data, datasize, &cmd->rw.common.prp1, &cmd->rw.common.prp2); + } + + trace_nvme_admin_queue_submit(_driverid,_id,cid); + submit_cmd_without_lock(std::move(cmd)); + + sched::thread::wait_until([this] {return this->new_cq;}); + _req_waiter.clear(); + + new_cq = false; + if(_prplists_in_use.at(0)[cid]) { + free(_prplists_in_use.at(0)[cid]); + } + + _lock.unlock(); + return std::move(_req_res); +} + +nvme_io_queue_pair::nvme_io_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ) : nvme_queue_pair( + did, + id, + qsize, + dev, + + sq_addr, + sq_doorbell, + + cq_addr, + cq_doorbell, + ns + ){ + auto bios_array = (bio**) malloc(sizeof(bio*) * qsize); + memset(bios_array, 0, sizeof(bio*) * qsize); + _pending_bios.push_back(bios_array); +} + +nvme_io_queue_pair::~nvme_io_queue_pair() +{ + for(auto vec : _pending_bios) + free(vec); +} + +nvme_admin_queue_pair::nvme_admin_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ) : 
nvme_queue_pair( + did, + id, + qsize, + dev, + + sq_addr, + sq_doorbell, + + cq_addr, + cq_doorbell, + ns + ){}; diff --git a/drivers/nvme-queue.hh b/drivers/nvme-queue.hh new file mode 100644 index 0000000000..f891c96d37 --- /dev/null +++ b/drivers/nvme-queue.hh @@ -0,0 +1,126 @@ +#ifndef NVME_QUEUE_H +#define NVME_QUEUE_H + +#include "drivers/nvme.hh" + +class nvme_queue_pair; + +class nvme_queue_pair +{ +public: + nvme_queue_pair( + int did, + u32 id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + + ~nvme_queue_pair(); + + u16 submit_cmd(std::unique_ptr cmd); + + virtual void req_done() {}; + void wait_for_completion_queue_entries(); + bool completion_queue_not_empty() const; + + void enable_interrupts(); + void disable_interrupts(); + + u32 _id; +protected: + int _driverid; + + u32 _qsize; + pci::device* _dev; + + nvme_sq_entry_t* _sq_addr; + u32 _sq_head; + u32 _sq_tail; + volatile u32* _sq_doorbell; + bool _sq_full; + + nvme_cq_entry_t* _cq_addr; + u32 _cq_head; + u32 _cq_tail; + volatile u32* _cq_doorbell; + int _cq_phase_tag; + + std::map _ns; + + std::vector _prplists_in_use; + + mutex _lock; + sched::thread_handle _waiter; + + void advance_sq_tail(); + int map_prps(u16 cid, void* data, u64 datasize, u64* prp1, u64* prp2); + + u16 submit_cmd_without_lock(std::unique_ptr cmd); + + u16 submit_cmd_batch_without_lock(std::vector> cmds); + + std::unique_ptr get_completion_queue_entry(); + + std::unique_ptr check_for_completion(u16 cid); +}; + +class nvme_io_queue_pair : public nvme_queue_pair { +public: + nvme_io_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + ~nvme_io_queue_pair(); + + int self_test(); + int make_request(struct bio* bio, u32 nsid); + void req_done(); + + int submit_io_batch(std::vector bios, u32 nsid=1); +private: + std::vector _pending_bios; + int submit_rw(u16 cid, void* data, u64 slba, u32 nlb, u32 nsid, int opc); + int submit_flush(); +}; + +class nvme_admin_queue_pair : public nvme_queue_pair { +public: + nvme_admin_queue_pair( + int did, + int id, + int qsize, + pci::device& dev, + + nvme_sq_entry_t* sq_addr, + u32* sq_doorbell, + + nvme_cq_entry_t* cq_addr, + u32* cq_doorbell, + std::map& ns + ); + + std::unique_ptr _req_res; + volatile bool new_cq; + void req_done(); + std::unique_ptr submit_and_return_on_completion(std::unique_ptr cmd, void* data=nullptr, unsigned int datasize=0); +private: + sched::thread_handle _req_waiter; +}; + +#endif \ No newline at end of file diff --git a/drivers/nvme-structs.h b/drivers/nvme-structs.h new file mode 100644 index 0000000000..af77563510 --- /dev/null +++ b/drivers/nvme-structs.h @@ -0,0 +1,647 @@ +/** + * Copyright (c) 2015-2016, Micron Technology, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * @brief NVMe header file + */ + +#ifndef NVME_STRUCTS_H +#define NVME_STRUCTS_H + +#include + +__BEGIN_DECLS + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + #pragma error "only support little endian CPU architecture" +#endif + +#ifndef _U_TYPE +#define _U_TYPE ///< bit size data types +typedef int8_t s8; ///< 8-bit signed +typedef int16_t s16; ///< 16-bit signed +typedef int32_t s32; ///< 32-bit signed +typedef int64_t s64; ///< 64-bit signed +typedef uint8_t u8; ///< 8-bit unsigned +typedef uint16_t u16; ///< 16-bit unsigned +typedef uint32_t u32; ///< 32-bit unsigned +typedef uint64_t u64; ///< 64-bit unsigned +#endif // _U_TYPE + +/// NVMe command op code +enum { + NVME_CMD_FLUSH = 0x0, ///< flush + NVME_CMD_WRITE = 0x1, ///< write + NVME_CMD_READ = 0x2, ///< read + NVME_CMD_WRITE_UNCOR = 0x4, ///< write uncorrectable + NVME_CMD_COMPARE = 0x5, ///< compare + NVME_CMD_DS_MGMT = 0x9, ///< dataset management +}; + +/// NVMe admin command op code +enum { + NVME_ACMD_DELETE_SQ = 0x0, ///< delete io submission queue + NVME_ACMD_CREATE_SQ = 0x1, ///< create io submission queue + NVME_ACMD_GET_LOG_PAGE = 0x2, ///< get log page + NVME_ACMD_DELETE_CQ = 0x4, ///< delete io completion queue + NVME_ACMD_CREATE_CQ = 0x5, ///< create io completion queue + NVME_ACMD_IDENTIFY = 0x6, ///< identify + NVME_ACMD_ABORT = 0x8, ///< abort + NVME_ACMD_SET_FEATURES = 0x9, ///< set features + NVME_ACMD_GET_FEATURES = 0xA, ///< get features + NVME_ACMD_ASYNC_EVENT = 0xC, ///< asynchronous event + NVME_ACMD_FW_ACTIVATE = 0x10, ///< firmware activate + NVME_ACMD_FW_DOWNLOAD = 0x11, ///< firmware image download +}; + +/// NVMe feature identifiers +enum { + NVME_FEATURE_ARBITRATION = 0x1, ///< arbitration + NVME_FEATURE_POWER_MGMT = 0x2, ///< power management + NVME_FEATURE_LBA_RANGE = 0x3, ///< LBA range type + NVME_FEATURE_TEMP_THRESHOLD = 0x4, ///< temperature threshold + NVME_FEATURE_ERROR_RECOVERY = 0x5, ///< error recovery + NVME_FEATURE_WRITE_CACHE = 0x6, ///< volatile write cache + NVME_FEATURE_NUM_QUEUES = 0x7, ///< number of queues + NVME_FEATURE_INT_COALESCING = 0x8, ///< interrupt coalescing + NVME_FEATURE_INT_VECTOR = 0x9, ///< interrupt vector config + NVME_FEATURE_WRITE_ATOMICITY = 0xA, ///< write atomicity + NVME_FEATURE_ASYNC_EVENT = 0xB, ///< async event config +}; + +/// Version +typedef union _nvme_version { + u32 val; ///< whole value + struct { + u8 rsvd; ///< reserved + u8 mnr; ///< minor version number + u16 mjr; ///< major version number + }; +} nvme_version_t; + +/// Admin queue 
attributes +typedef union _nvme_adminq_attr { + u32 val; ///< whole value + struct { + u16 asqs; ///< admin submission queue size + u16 acqs; ///< admin completion queue size + }; +} nvme_adminq_attr_t; + +/// Controller capabilities +typedef union _nvme_controller_cap { + u64 val; ///< whole value + struct { + u16 mqes; ///< max queue entries supported + u8 cqr : 1; ///< contiguous queues required + u8 ams : 2; ///< arbitration mechanism supported + u8 rsvd : 5; ///< reserved + u8 to; ///< timeout + + u32 dstrd : 4; ///< doorbell stride + u32 nssrs : 1; ///< NVM subsystem reset supported + u32 css : 8; ///< command set supported + u32 rsvd2 : 3; ///< reserved + u32 mpsmin : 4; ///< memory page size minimum + u32 mpsmax : 4; ///< memory page size maximum + u32 rsvd3 : 8; ///< reserved + }; +} nvme_controller_cap_t; + +/// Controller configuration register +typedef union _nvme_controller_config { + u32 val; ///< whole value + struct { + u32 en : 1; ///< enable + u32 rsvd : 3; ///< reserved + u32 css : 3; ///< I/O command set selected + u32 mps : 4; ///< memory page size + u32 ams : 3; ///< arbitration mechanism selected + u32 shn : 2; ///< shutdown notification + u32 iosqes : 4; ///< I/O submission queue entry size + u32 iocqes : 4; ///< I/O completion queue entry size + u32 rsvd2 : 8; ///< reserved + }; +} nvme_controller_config_t; + +/// Controller status register +typedef union _nvme_controller_status { + u32 val; ///< whole value + struct { + u32 rdy : 1; ///< ready + u32 cfs : 1; ///< controller fatal status + u32 shst : 2; ///< shutdown status + u32 rsvd : 28; ///< reserved + }; +} nvme_controller_status_t; + +/// Controller memory buffer location register +typedef union _nvme_cmbloc { + u32 val; ///< whole value + struct { + u32 bir : 3; ///< base indicator register + u32 rsvd : 9; ///< reserved + u32 ofst : 20; ///< offset (in cmbsz units) + }; +} nvme_cmbloc_t; + +/// Controller memory buffer size register +typedef union _nvme_cmbsz { + u32 val; ///< whole value + struct { + u32 sqs : 1; ///< submission queue support + u32 cqs : 1; ///< completion queue support + u32 lists : 1; ///< PRP SGL list support + u32 rds : 1; ///< read data support + u32 wds : 1; ///< write data support + u32 rsvd : 3; ///< reserved + u32 szu : 4; ///< size units (0=4K,1=64K,2=1M,3=16M,4=256M,5=4G,6=64G) + u32 sz : 20; ///< size (in cmbsz units) + }; +} nvme_cmbsz_t; + + + +enum nvme_sgl_descriptor_type { + NVME_SGL_DATA_BLOCK_TYPE = 0x0, + NVME_SGL_BIT_BUCKET_TYPE = 0x1, + NVME_SGL_SEGMENT_TYPE = 0x2, + NVME_SGL_LAST_SEGMENT_TYPE = 0x3, + NVME_SGL_KEYED_DATA_BLOCK_TYPE = 0x4, + NVME_SGL_TRANSPORT_DATA_BLOCK_TYPE = 0x5, + /* + *0x6 - 0xE reserved + */ + + NVME_SGL_VENDOR_SPECIFIC_TYPE = 0xF, +}; + +enum nvme_sgl_descriptor_subtype { + NVME_SGL_ADDRESS_SUBTYPE = 0x0, + NVME_SGL_OFFSET_SUBTYPE = 0x1, + //0xA - 0xF Nvme transport specific +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_unkeyed { + u64 addr; + u32 length; + u8 reserved[3]; + u8 subtype:4; + u8 type:4; +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_keyed { + u64 addr; + u64 length:24; + u64 key:32; + u64 subtype:4; + u64 type:4; +}; +union nvme_sgl_descriptor { + nvme_sgl_descriptor_keyed keyed; + nvme_sgl_descriptor_unkeyed unkeyed; +}; + +static_assert(sizeof(nvme_sgl_descriptor)==16); + + + + +/// Controller register (bar 0) +typedef struct _nvme_controller_reg { + nvme_controller_cap_t cap; ///< controller capabilities + nvme_version_t vs; ///< version + u32 intms; ///< interrupt mask set + u32 intmc; ///< interrupt 
mask clear + nvme_controller_config_t cc; ///< controller configuration + u32 rsvd; ///< reserved + nvme_controller_status_t csts; ///< controller status + u32 nssr; ///< NVM subsystem reset + nvme_adminq_attr_t aqa; ///< admin queue attributes + u64 asq; ///< admin submission queue base address + u64 acq; ///< admin completion queue base address + nvme_cmbloc_t cmbloc; ///< controller memory buffer location + nvme_cmbsz_t cmbsz; ///< controller memory buffer size + u32 rcss[1008]; ///< reserved and command set specific + u32 sq0tdbl[1024]; ///< sq0 tail doorbell at 0x1000 +} nvme_controller_reg_t; + +/// Common command header (cdw 0-9) +typedef struct _nvme_command_common { + u8 opc; ///< opcode + u8 fuse : 2; ///< fuse + u8 rsvd : 4; ///< reserved + u8 psdt : 2; ///< PRP or SGL for data transfer + u16 cid; ///< command id + u32 nsid; ///< namespace id + u64 cdw2_3; ///< reserved (cdw 2-3) + u64 mptr; ///< metadata pointer + union { + struct { + u64 prp1; ///< PRP entry 1 + u64 prp2; ///< PRP entry 2 + }; + nvme_sgl_descriptor sgl1; /// + +#include "drivers/nvme.hh" +#include "drivers/pci-device.hh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +using namespace memory; + +#include +#include + +#include +#include "drivers/io-test.hh" + +TRACEPOINT(trace_nvme_read_config, "capacity=%lu blk_size=%u max_io_size=%u", u64, u32, u64); +TRACEPOINT(trace_nvme_strategy, "bio=%p", struct bio*); +TRACEPOINT(trace_nvme_vwc_enabled, "sc=%#x sct=%#x", u16, u16); +TRACEPOINT(trace_nvme_number_of_queues, "cq num=%d, sq num=%d, iv_num=%d", u16, u16, u32); +TRACEPOINT(trace_nvme_identify_namespace, "nsid=%d, blockcount=%d, blocksize=%d", u32, u64, u16); +TRACEPOINT(trace_nvme_register_interrupt, "_io_queues[%d], iv=%d", int, int); + + +#define QEMU_VID 0x1b36 + +std::unique_ptr alloc_cmd() { + auto cmd = std::unique_ptr(new nvme_sq_entry_t); + assert(cmd); + memset(cmd.get(), 0, sizeof(nvme_ns_t)); + return cmd; +} + +struct nvme_priv { + devop_strategy_t strategy; + nvme* drv; + u32 nsid; +}; + +static void nvme_strategy(struct bio* bio) { + auto* prv = reinterpret_cast(bio->bio_dev->private_data); + trace_nvme_strategy(bio); + prv->drv->make_request(bio); +} + +static int +nvme_read(struct device *dev, struct uio *uio, int ioflags) +{ + return bdev_read(dev, uio, ioflags); +} + +static int +nvme_write(struct device *dev, struct uio *uio, int ioflags) +{ + return bdev_write(dev, uio, ioflags); +} + +static int +nvme_direct_rw(struct device *dev, struct uio *uio, int ioflags) +{ + auto* prv = reinterpret_cast(dev->private_data); + + assert((uio->uio_offset % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + assert((uio->uio_resid % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + + bio* complete_io = alloc_bio(); + + u8 opcode; + switch (uio->uio_rw) { + case UIO_READ : + opcode = BIO_READ; + break; + case UIO_WRITE : + opcode = BIO_WRITE; + break; + default : + return EINVAL; + } + + refcount_init(&complete_io->bio_refcnt, uio->uio_iovcnt); + + while(uio->uio_iovcnt > 0) + { + bio* bio = alloc_bio(); + bio->bio_cmd = opcode; + bio->bio_dev = dev; + + assert((uio->uio_iov->iov_len % prv->drv->_ns_data[prv->nsid]->blocksize) == 0); + + bio->bio_bcount = uio->uio_iov->iov_len; + bio->bio_data = uio->uio_iov->iov_base; + bio->bio_offset = uio->uio_offset; + + bio->bio_caller1 = complete_io; + bio->bio_private = complete_io->bio_private; + bio->bio_done = multiplex_bio_done; + + 
dev->driver->devops->strategy(bio); + + uio->uio_offset += uio->uio_iov->iov_len; + uio->uio_resid -= uio->uio_iov->iov_len; + uio->uio_iov++; + uio->uio_iovcnt--; + } + assert(uio->uio_resid == 0); + int ret = bio_wait(complete_io); + destroy_bio(complete_io); + + return ret; +} + +static int +nvme_open(struct device *dev, int ioflags) +{ + return 0; +} + +#include "drivers/blk_ioctl.hh" + +static struct devops nvme_devops { + nvme_open, + no_close, + NVME_DIRECT_RW_ENABLED ? nvme_direct_rw : nvme_read, + NVME_DIRECT_RW_ENABLED ? nvme_direct_rw : nvme_write, + blk_ioctl, + no_devctl, + multiplex_strategy, +}; + +struct driver nvme_driver = { + "nvme", + &nvme_devops, + sizeof(struct nvme_priv), +}; + +int nvme::_instance = 0; + +extern std::vector sched::cpus; + +nvme::nvme(pci::device &dev) + : _dev(dev) + , _msi(&dev) +{ + parse_pci_config(); + u16 command = dev.get_command(); + command |= 0x4 | 0x2 | 0x400; + dev.set_command(command); + + _id = _instance++; + + _doorbellstride = 1 << (2 + _control_reg->cap.dstrd); + + wait_for_controller_ready_change(1); + disable_controller(); + + init_controller_config(); + + create_admin_queue(); + + enable_controller(); + + identify_controller(); + + if(NVME_CHECK_FOR_ADDITIONAL_NAMESPACES) { + identify_active_namespaces(1); + } else { + identify_namespace(1); + } + + if(_identify_controller->vwc & 0x1 && NVME_VWC_ENABLED) { + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_WRITE_CACHE; + cmd->set_features.val = 1; + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + trace_nvme_vwc_enabled(res->sc,res->sct); + } + + if(NVME_QUEUE_PER_CPU_ENABLED) { + u16 num = sched::cpus.size(); + u16 ret; + set_number_of_queues(num, &ret); + create_io_queues_foreach_cpu(); + }else { + u16 ret; + set_number_of_queues(1, &ret); + assert(ret>=1); + create_io_queue(); + } + + if(_identify_controller->vid != QEMU_VID) { + set_interrupt_coalescing(20,2); + } + + struct nvme_priv* prv; + struct device *osv_dev; + + debugf("nvme: %s\n", _identify_controller->sn); + + for(const auto& ns : _ns_data) { + std::string dev_name; + if(ns.first == 1 && _id == 0) { + dev_name = "vblk"; + dev_name += std::to_string(_id); + } else { + dev_name = "nvme"; + dev_name += std::to_string(_id) + "n"; + dev_name += std::to_string(ns.first); + } + off_t size = ((off_t) ns.second->blockcount) << ns.second->blockshift; + + debugf("nvme: Add namespace %d of nvme device %d as %s, devsize=%lld\n", ns.first, _id, dev_name.c_str(), size); + + osv_dev = device_create(&nvme_driver,dev_name.c_str(), D_BLK); + prv = reinterpret_cast(osv_dev->private_data); + prv->strategy = nvme_strategy; + prv->drv = this; + prv->nsid = ns.first; + osv_dev->size = size; + /* + * IO size greater than 4096 << 9 would mean we need + * more than 1 page for the prplist which is not implemented + */ + osv_dev->max_io_size = 4096 << ((9 < _identify_controller->mdts)? 
9 : _identify_controller->mdts ); + + #if CONF_drivers_io_test + test_block_device(osv_dev, 20*1e6, 8); + test_block_device(osv_dev, 20*1e6, 512); + #endif + + read_partition_table(osv_dev); + } +} + +int nvme::set_number_of_queues(u16 num, u16* ret) +{ + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_NUM_QUEUES; + cmd->set_features.val = (num << 16) | num; + std::unique_ptr res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + u16 cq_num, sq_num; + cq_num = res->cs >> 16; + sq_num = res->cs & 0xffff; + + trace_nvme_number_of_queues(res->cs >> 16, res->cs & 0xffff,_dev.msix_get_num_entries()); + + if(res->sct != 0 || res->sc != 0) + return EIO; + + if(num > cq_num || num > sq_num) { + *ret = (cq_num > sq_num) ? cq_num : sq_num; + } else { + *ret = num; + } + return 0; +} +/*time in 100ms increments*/ +int nvme::set_interrupt_coalescing(u8 threshold, u8 time) +{ + auto cmd = alloc_cmd(); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = NVME_FEATURE_INT_COALESCING; + cmd->set_features.val = threshold | (time << 8); + std::unique_ptr res = _admin_queue->submit_and_return_on_completion(std::move(cmd)); + + if(res->sct != 0 || res->sc != 0) + return EIO; + return 0; +} + +void nvme::enable_controller() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc); + + assert(cc.en == 0); + cc.en = 1; + + mmio_setl(&_control_reg->cc,cc.val); + int s = wait_for_controller_ready_change(1); + assert(s==0); +} + +void nvme::disable_controller() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc); + + assert(cc.en == 1); + cc.en = 0; + + mmio_setl(&_control_reg->cc,cc.val); + int s = wait_for_controller_ready_change(0); + assert(s==0); +} + +int nvme::wait_for_controller_ready_change(int ready) +{ + int timeout = mmio_getb(&_control_reg->cap.to) * 10000; // timeout in 0.05ms steps + nvme_controller_status_t csts; + for (int i = 0; i < timeout; i++) { + csts.val = mmio_getl(&_control_reg->csts); + if (csts.rdy == ready) return 0; + usleep(50); + } + NVME_ERROR("timeout=%d waiting for ready %d", timeout, ready); + return ETIME; +} + +void nvme::init_controller_config() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc.val); + cc.iocqes = 4; // completion queue entry size 16B + cc.iosqes = 6; // submission queue entry size 64B + cc.mps = 0; // memory page size 4096B + + mmio_setl(&_control_reg->cc, cc.val); +} + +void nvme::create_admin_queue() +{ + int qsize = NVME_ADMIN_QUEUE_SIZE; + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + + nvme_adminq_attr_t aqa; + aqa.val = 0; + aqa.asqs = aqa.acqs = qsize - 1; + + u32* sq_doorbell = _control_reg->sq0tdbl; + u32* cq_doorbell = (u32*) ((u64)sq_doorbell + _doorbellstride); + + _admin_queue = std::unique_ptr(new nvme_admin_queue_pair(_id,0, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data)); + + register_admin_interrupts(); + + mmio_setl(&_control_reg->aqa, aqa.val); + mmio_setq(&_control_reg->asq, (u64) mmu::virt_to_phys((void*) sqbuf)); + mmio_setq(&_control_reg->acq, (u64) mmu::virt_to_phys((void*) cqbuf)); +} + +int nvme::create_io_queue(int qsize, int qprio) +{ + u32* sq_doorbell; + u32* cq_doorbell; + int id = _io_queues.size() + 1; + int iv = id; + qsize = (qsize < 
_control_reg->cap.mqes) ? qsize : _control_reg->cap.mqes + 1; + + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + assert(sqbuf); + assert(cqbuf); + memset(sqbuf,0,sizeof(nvme_sq_entry_t)*qsize); + memset(cqbuf,0,sizeof(nvme_cq_entry_t)*qsize); + + // create completion queue + nvme_acmd_create_cq_t* cmd = (nvme_acmd_create_cq_t*) malloc(sizeof(nvme_acmd_create_cq_t)); + assert(cmd); + memset(cmd, 0, sizeof (*cmd)); + + cmd->qid = id; + cmd->qsize = qsize - 1; + cmd->iv = iv; + cmd->pc = 1; + cmd->ien = 1; + cmd->common.opc = NVME_ACMD_CREATE_CQ; + cmd->common.prp1 = (u64) mmu::virt_to_phys(cqbuf); + + // create submission queue + nvme_acmd_create_sq_t* cmd_sq = (nvme_acmd_create_sq_t*) malloc(sizeof(nvme_acmd_create_sq_t)); + assert(cmd_sq); + memset(cmd_sq, 0, sizeof(nvme_acmd_create_sq_t)); + + cmd_sq->pc = 1; + cmd_sq->qprio = qprio; // 0=urgent 1=high 2=medium 3=low + cmd_sq->qid = id; + cmd_sq->cqid = id; + cmd_sq->qsize = qsize - 1; + cmd_sq->common.opc = NVME_ACMD_CREATE_SQ; + cmd_sq->common.prp1 = (u64) mmu::virt_to_phys(sqbuf); + + sq_doorbell = (u32*) ((u64) _control_reg->sq0tdbl + 2 * _doorbellstride * id); + cq_doorbell = (u32*) ((u64) sq_doorbell + _doorbellstride); + + _io_queues.push_back(std::unique_ptr(new nvme_io_queue_pair(_id, iv, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data))); + + register_interrupt(iv,id-1); + + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd)); + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd_sq)); + + return id -1; +} + +void nvme::create_io_queues_foreach_cpu() +{ + int iv,id; + int qsize = NVME_IO_QUEUE_SIZE; + + assert(_io_queues.size()==0); + + u32* sq_doorbell; + u32* cq_doorbell; + + for(sched::cpu* cpu : sched::cpus) { + id = cpu->id; + iv = id + 1; + nvme_sq_entry_t* sqbuf = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_sq_entry_t),4096); + nvme_cq_entry_t* cqbuf = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(qsize * sizeof(nvme_cq_entry_t),4096); + assert(sqbuf); + assert(cqbuf); + memset(sqbuf,0,sizeof(nvme_sq_entry_t)*qsize); + memset(cqbuf,0,sizeof(nvme_cq_entry_t)*qsize); + + nvme_acmd_create_cq_t* cmd = (nvme_acmd_create_cq_t*) malloc(sizeof(nvme_acmd_create_cq_t)); + assert(cmd); + memset(cmd, 0, sizeof (*cmd)); + + cmd->qid = iv; + cmd->qsize = qsize - 1; + cmd->iv = iv; + cmd->pc = 1; + cmd->ien = 1; + cmd->common.opc = NVME_ACMD_CREATE_CQ; + cmd->common.prp1 = (u64) mmu::virt_to_phys(cqbuf); + + // create submission queue + nvme_acmd_create_sq_t* cmd_sq = (nvme_acmd_create_sq_t*) malloc(sizeof(nvme_acmd_create_sq_t)); + assert(cmd_sq); + memset(cmd_sq, 0, sizeof(nvme_acmd_create_sq_t)); + + cmd_sq->pc = 1; + cmd_sq->qprio = 2; // 0=urgent 1=high 2=medium 3=low + cmd_sq->qid = iv; + cmd_sq->cqid = iv; + cmd_sq->qsize = qsize - 1; + cmd_sq->common.opc = NVME_ACMD_CREATE_SQ; + cmd_sq->common.prp1 = (u64) mmu::virt_to_phys(sqbuf); + + sq_doorbell = (u32*) ((u64) _control_reg->sq0tdbl + 2 * _doorbellstride * iv); + cq_doorbell = (u32*) ((u64) sq_doorbell + _doorbellstride); + + _io_queues.push_back(std::unique_ptr(new nvme_io_queue_pair(_id, iv, qsize, _dev, sqbuf, sq_doorbell, cqbuf, cq_doorbell, _ns_data))); + + register_interrupt(iv,id,true,cpu); + + _admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd)); + 
_admin_queue->submit_and_return_on_completion(std::unique_ptr((nvme_sq_entry_t*)cmd_sq)); + } +} + +int nvme::identify_controller() +{ + assert(_admin_queue); + auto cmd = alloc_cmd(); + cmd->identify.cns = 1; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto data = new nvme_identify_ctlr_t; + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(data),4096); + + if(res->sc != 0 || res->sct != 0) { + NVME_ERROR("Identify controller failed nvme%d, sct=%d, sc=%d", _id, res->sct, res->sc); + return EIO; + } + + _identify_controller.reset(data); + return 0; +} + +int nvme::identify_namespace(u32 ns) +{ + assert(_admin_queue); + auto cmd = alloc_cmd(); + cmd->identify.cns = 0; + cmd->identify.common.nsid = ns; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto data = std::unique_ptr(new nvme_identify_ns_t); + + auto res = _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(data.get()),4096); + if(res->sc != 0 || res->sct != 0) { + NVME_ERROR("Identify namespace failed nvme%d nsid=%d, sct=%d, sc=%d", _id, ns, res->sct, res->sc); + return EIO; + } + + _ns_data.insert(std::make_pair(ns, new nvme_ns_t)); + _ns_data[ns]->blockcount = data->ncap; + _ns_data[ns]->blockshift = data->lbaf[data->flbas & 0xF].lbads; + _ns_data[ns]->blocksize = 1 << _ns_data[ns]->blockshift; + _ns_data[ns]->bpshift = NVME_PAGESHIFT - _ns_data[ns]->blockshift; + _ns_data[ns]->id = ns; + + trace_nvme_identify_namespace(ns, _ns_data[ns]->blockcount, _ns_data[ns]->blocksize); + return 0; +} + +//identify all active namespaces with nsid >= start +int nvme::identify_active_namespaces(u32 start) +{ + assert(start >= 1); + assert(_identify_controller); + //max number of namespaces supported by the controller + u32 nn = _identify_controller->nn; + assert(nn > start); + + auto cmd = alloc_cmd(); + cmd->identify.cns = 2; + cmd->identify.common.nsid = start - 1; + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + auto active_namespaces = (u64*) alloc_phys_contiguous_aligned(4096, 4); + memset(active_namespaces, 0, 4096); + + _admin_queue->submit_and_return_on_completion(std::move(cmd), (void*) mmu::virt_to_phys(active_namespaces), 4096); + int err; + for(int i=0; i < 1024; i++) { + if(active_namespaces[i]) { + err = identify_namespace(active_namespaces[i]); + if (err) { + free_phys_contiguous_aligned(active_namespaces); + return err; + } + } else { break;} + } + free_phys_contiguous_aligned(active_namespaces); + return 0; +} + +int nvme::make_request(bio* bio, u32 nsid) +{ + if(bio->bio_bcount % _ns_data[nsid]->blocksize || bio->bio_offset % _ns_data[nsid]->blocksize) { + NVME_ERROR("bio request not block-aligned length=%d, offset=%d blocksize=%d\n",bio->bio_bcount, bio->bio_offset, _ns_data[nsid]->blocksize); + return EINVAL; + } + bio->bio_offset = bio->bio_offset >> _ns_data[nsid]->blockshift; + bio->bio_bcount = bio->bio_bcount >> _ns_data[nsid]->blockshift; + + assert((bio->bio_offset + bio->bio_bcount) <= _ns_data[nsid]->blockcount); + + if(bio->bio_cmd == BIO_FLUSH && (_identify_controller->vwc == 0 || !NVME_VWC_ENABLED )) { + biodone(bio,true); + return 0; + } + + if(sched::current_cpu->id >= _io_queues.size()) + return _io_queues[0]->make_request(bio, nsid); + + return _io_queues[sched::current_cpu->id]->make_request(bio, nsid); +} + +void nvme::register_admin_interrupts() +{ + sched::thread* aq_thread = sched::thread::make([this] { this->_admin_queue->req_done(); }, + sched::thread::attr().name("nvme"+ 
std::to_string(_id)+"_aq_req_done")); + aq_thread->start(); + + bool ok = msix_register(0, [this] { this->_admin_queue->disable_interrupts(); }, aq_thread); + _dev.msix_unmask_entry(0); + if(not ok) + printf("admin interrupt registration failed\n"); +} + +bool nvme::msix_register(unsigned iv, + // high priority ISR + std::function isr, + // bottom half + sched::thread *t, + bool assign_affinity) +{ + // Enable the device msix capability, + // masks all interrupts... + if (_dev.is_msix()) { + _dev.msix_enable(); + } else { + return false; + } + _dev.msix_mask_all(); + + if(_msix_vectors.empty()) + _msix_vectors = std::vector>(_dev.msix_get_num_entries()); + + auto vec = std::unique_ptr(new msix_vector(&_dev)); + bool assign_ok; + _dev.msix_mask_entry(iv); + if (t) { + assign_ok = + _msi.assign_isr(vec.get(), + [=]() mutable { + isr(); + t->wake_with_irq_disabled(); + }); + } else { + return false; + } + if (!assign_ok) { + return false; + } + bool setup_ok = _msi.setup_entry(iv, vec.get()); + if (!setup_ok) { + return false; + } + if (assign_affinity) { + vec->set_affinity(t->get_cpu()->arch.apic_id); + } + + if(iv < _msix_vectors.size()) { + _msix_vectors.at(iv) = std::move(vec); + } else { + NVME_ERROR("binding_entry %d registration failed\n",iv); + return false; + } + _msix_vectors.at(iv)->msix_unmask_entries(); + + _dev.msix_unmask_all(); + return true; +} +//qid should be the index that corresponds to the queue in _io_queues. +//In general qid = iv - 1 +bool nvme::register_interrupt(unsigned int iv, unsigned int qid, bool pin_t, sched::cpu* cpu) +{ + sched::thread* t; + bool ok; + + if(_io_queues.size() <= qid) { + NVME_ERROR("queue %d not initialized\n",qid); + return false; + } + + if(_io_queues[qid]->_id != iv) + printf("Warning: Queue %d ->_id = %d != iv %d\n",qid,_io_queues[qid]->_id,iv); + + trace_nvme_register_interrupt(qid, iv); + t = sched::thread::make([this,qid] { this->_io_queues[qid]->req_done(); }, + sched::thread::attr().name("nvme" + std::to_string(_id) + "_ioq" + std::to_string(qid) + "_iv" +std::to_string(iv))); + t->start(); + if(pin_t && cpu) { + sched::thread::pin(t,cpu); + } + + ok = msix_register(iv, [this,qid] { this->_io_queues[qid]->disable_interrupts(); }, t,pin_t); + _dev.msix_unmask_entry(iv); + if(not ok) + NVME_ERROR("Interrupt registration failed: queue=%d interruptvector=%d\n",qid,iv); + return ok; +} + +void nvme::dump_config(void) +{ + u8 B, D, F; + _dev.get_bdf(B, D, F); + + _dev.dump_config(); + nvme_d("%s [%x:%x.%x] vid:id= %x:%x", get_name().c_str(), + (u16)B, (u16)D, (u16)F, + _dev.get_vendor_id(), + _dev.get_device_id()); +} + +void nvme::parse_pci_config() +{ + _bar0 = _dev.get_bar(1); + _bar0->map(); + if (_bar0 == nullptr) { + throw std::runtime_error("BAR1 is absent"); + } + assert(_bar0->is_mapped()); + _control_reg = (nvme_controller_reg_t*) _bar0->get_mmio(); +} + +hw_driver* nvme::probe(hw_device* dev) +{ + if (auto pci_dev = dynamic_cast(dev)) { + if ((pci_dev->get_base_class_code()==1) && (pci_dev->get_sub_class_code()==8) && (pci_dev->get_programming_interface()==2)) // detect NVMe device + return aligned_new(*pci_dev); + } + return nullptr; +} diff --git a/drivers/nvme.hh b/drivers/nvme.hh new file mode 100644 index 0000000000..03604fc420 --- /dev/null +++ b/drivers/nvme.hh @@ -0,0 +1,118 @@ +#ifndef NVME_DRIVER_H +#define NVME_DRIVER_H + +#include "drivers/nvme-structs.h" +#include "drivers/driver.hh" +#include "drivers/pci-device.hh" +#include +#include +#include +#include "drivers/nvme-queue.hh" +#include +#include +#include + 
+#define nvme_tag "nvme" +#define nvme_d(...) tprintf_d(nvme_tag, __VA_ARGS__) +#define nvme_i(...) tprintf_i(nvme_tag, __VA_ARGS__) +#define nvme_w(...) tprintf_w(nvme_tag, __VA_ARGS__) +#define nvme_e(...) tprintf_e(nvme_tag, __VA_ARGS__) + +#define NVME_ERROR(...) nvme_e(__VA_ARGS__) + +#define NVME_PAGESIZE 4096 +#define NVME_PAGESHIFT 12 + +/*bdev block cache will not be used if enabled*/ +#define NVME_DIRECT_RW_ENABLED 0 + +#define NVME_QUEUE_PER_CPU_ENABLED 0 + +//Volatile Write Cache +#define NVME_VWC_ENABLED 1 + +//checks for all active namespaces instead of just ns 1 +#define NVME_CHECK_FOR_ADDITIONAL_NAMESPACES 1 + +#define NVME_ADMIN_QUEUE_SIZE 8 + +/*Will be lower if the device doesnt support the +specified queue size */ +#define NVME_IO_QUEUE_SIZE 256 + +class nvme_io_queue_pair; +class nvme_admin_queue_pair; + +class nvme : public hw_driver { +public: + explicit nvme(pci::device& dev); + virtual ~nvme() {}; + + virtual std::string get_name() const { return "nvme"; } + + virtual void dump_config(); + + int make_request(struct bio* bio, u32 nsid=1); + static hw_driver* probe(hw_device* dev); + + int set_feature(); + int get_feature(); + + int set_number_of_queues(u16 num, u16* ret); + int set_interrupt_coalescing(u8 threshold, u8 time); + + int get_interrupt_coalescing(); + + int create_io_queue(int qsize=NVME_IO_QUEUE_SIZE, int qprio=2); + + bool register_interrupt(unsigned int iv,unsigned int qid,bool pin_t=false, sched::cpu* cpu = NULL); + + int shutdown(); + + std::map _ns_data; + +private: + int identify_controller(); + int identify_namespace(u32 ns); + int identify_active_namespaces(u32 start); + + void create_admin_queue(); + void register_admin_interrupts(); + + void init_controller_config(); + void create_io_queues_foreach_cpu(); + + void enable_controller(); + void disable_controller(); + int wait_for_controller_ready_change(int ready); + + void parse_pci_config(); + + nvme_controller_reg_t* _control_reg; + + //maintains the nvme instance number for multiple adapters + static int _instance; + int _id; + + std::vector> _msix_vectors; + bool msix_register(unsigned iv, + // high priority ISR + std::function isr, + // bottom half + sched::thread *t, + // set affinity of the vector to the cpu running t + bool assign_affinity=false); + + std::unique_ptr _admin_queue; + + std::vector> _io_queues; + u32 _doorbellstride; + + std::unique_ptr _identify_controller; + + pci::device& _dev; + interrupt_manager _msi; + + pci::bar *_bar0 = nullptr; +}; +#endif diff --git a/drivers/virtio-blk.cc b/drivers/virtio-blk.cc index b643c991b7..e85909ae92 100644 --- a/drivers/virtio-blk.cc +++ b/drivers/virtio-blk.cc @@ -83,12 +83,14 @@ blk_write(struct device *dev, struct uio *uio, int ioflags) return bdev_write(dev, uio, ioflags); } +#include "drivers/blk_ioctl.hh" + static struct devops blk_devops { no_open, no_close, blk_read, blk_write, - no_ioctl, + blk_ioctl, no_devctl, multiplex_strategy, }; diff --git a/fs/vfs/kern_physio.cc b/fs/vfs/kern_physio.cc index c7c99c724d..6f4207af85 100644 --- a/fs/vfs/kern_physio.cc +++ b/fs/vfs/kern_physio.cc @@ -72,7 +72,7 @@ biofinish(struct bio *bp, struct devstat *stat, int error) biodone(bp, error); } -static void multiplex_bio_done(struct bio *b) +void multiplex_bio_done(struct bio *b) { struct bio *bio = static_cast(b->bio_caller1); bool error = b->bio_flags & BIO_ERROR; @@ -80,13 +80,8 @@ static void multiplex_bio_done(struct bio *b) // If there is an error, we store it in the original bio flags. 
-    // This path gets slower because then we need to end up taking the
-    // bio_mutex twice. But that should be fine.
-    if (error) {
-        WITH_LOCK(bio->bio_mutex) {
-            bio->bio_flags |= BIO_ERROR;
-        }
-    }
+    if (error)
+        atomic_set_char(reinterpret_cast<volatile u_char*>(&bio->bio_flags), BIO_ERROR);
 
     // Last one releases it. We set the biodone to always be "ok", because
     // if an error exists, we have already set that in the previous operation
diff --git a/include/osv/bio.h b/include/osv/bio.h
index 06a433476f..116d936eff 100644
--- a/include/osv/bio.h
+++ b/include/osv/bio.h
@@ -126,6 +126,8 @@ void biodone(struct bio *bio, bool ok);
 struct devstat;
 void biofinish(struct bio *bp, struct devstat *stat, int error);
 
+void multiplex_bio_done(struct bio *b);
+
 __END_DECLS
 
 #endif /* !_SYS_BIO_H_ */
diff --git a/include/osv/buf.h b/include/osv/buf.h
index 8799bd54b8..d1a3b6626f 100755
--- a/include/osv/buf.h
+++ b/include/osv/buf.h
@@ -43,7 +43,7 @@ struct buf: boost::intrusive::list_base_hook<> {
     int b_flags;            /* see defines below */
     struct device *b_dev;   /* device */
-    int b_blkno;            /* block # on device */
+    off_t b_blkno;          /* block # on device */
     mutex_t b_lock;         /* lock for access */
     void *b_data;           /* pointer to data buffer */
 };
diff --git a/scripts/run.py b/scripts/run.py
index 01fc201f63..695bb64a77 100755
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -172,6 +172,10 @@ def start_osv_qemu(options):
                 "-device", "virtio-scsi-pci,id=scsi0%s" % options.virtio_device_suffix,
                 "-drive", "file=%s,if=none,id=hd0,media=disk,%s" % (options.image_file, aio),
                 "-device", "scsi-hd,bus=scsi0.0,drive=hd0,scsi-id=1,lun=0%s" % boot_index]
+    elif options.nvme:
+        args += [
+            "-device", "nvme,serial=deadbeef,drive=nvm%s" % (boot_index),
+            "-drive", "file=%s,if=none,id=nvm,%s" % (options.image_file, aio)]
     elif options.ide:
         args += [
             "-hda", options.image_file]
@@ -197,7 +201,18 @@ def start_osv_qemu(options):
             "-device", "vhost-user-fs-pci,queue-size=1024,chardev=char0,tag=%s%s" % (options.virtio_fs_tag, dax),
             "-object", "memory-backend-file,id=mem,size=%s,mem-path=/dev/shm,share=on" % options.memsize,
             "-numa", "node,memdev=mem"]
-
+
+    if options.second_nvme_image:
+        print("Attaching second image as NVMe device")
+        args += [
+            "-drive", "file=%s,if=none,id=nvm1" % (options.second_nvme_image),
+            "-device", "nvme,serial=deadbeef,drive=nvm1"]
+
+    if options.pass_pci:
+        print("Passing through PCI device %s" % options.pass_pci)
+        args += [
+            "-device", "vfio-pci,host=%s" % (options.pass_pci)]
+
     if options.no_shutdown:
         args += ["-no-reboot", "-no-shutdown"]
@@ -532,6 +547,8 @@ def main(options):
                         help="don't start OSv till otherwise specified, e.g. through the QEMU monitor or a remote gdb")
     parser.add_argument("-i", "--image", action="store", default=None, metavar="IMAGE",
                         help="path to disk image file. defaults to build/$mode/usr.img")
+    parser.add_argument("-N", "--nvme", action="store_true", default=False,
+                        help="use NVMe instead of virtio-blk")
     parser.add_argument("-S", "--scsi", action="store_true", default=False,
                         help="use virtio-scsi instead of virtio-blk")
     parser.add_argument("-A", "--sata", action="store_true", default=False,
@@ -626,6 +643,10 @@ def main(options):
                         help="static ip addresses (forwarded to respective kernel command line option)")
     parser.add_argument("--bootchart", action="store_true",
                         help="bootchart mode (forwarded to respective kernel command line option")
+    parser.add_argument("--second-nvme-image", action="store",
+                        help="path to an optional disk image that should be attached to the instance as an NVMe device")
+    parser.add_argument("--pass-pci", action="store",
+                        help="pass through the PCI device in the given slot; the device must be bound to the vfio-pci driver")
     cmdargs = parser.parse_args()
     cmdargs.opt_path = "debug" if cmdargs.debug else "release" if cmdargs.release else "last"
diff --git a/scripts/test.py b/scripts/test.py
index 5b4c7ae3aa..cec533fcba 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -158,6 +158,7 @@ def main():
     parser.add_argument("--run_options", action="store", help="pass extra options to run.py")
     parser.add_argument("-m", "--manifest", action="store", default="modules/tests/usr.manifest", help="test manifest")
     parser.add_argument("-d", "--disabled_list", action="append", help="test to be disabled", default=[])
+    parser.add_argument("--nvme", action="store_true", default=False, help="run tests with NVMe")
     parser.add_argument("--arch", action="store", choices=["x86_64","aarch64"], default=host_arch,
                         help="specify QEMU architecture: x86_64, aarch64")
     cmdargs = parser.parse_args()
@@ -175,6 +176,10 @@ def main():
         disabled_list.extend(firecracker_disabled_list)
     else:
         disabled_list.extend(qemu_disabled_list)
+
+    if cmdargs.nvme:
+        print("Running tests with NVMe")
+        run_py_args = run_py_args + ['--nvme']
     if cmdargs.arch == 'aarch64':
         if host_arch != cmdargs.arch:
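--
A minimal usage sketch of the new options added above (placed after the
signature delimiter so it does not affect patch application). This assumes
an image built with the nvme driver compiled in (conf_drivers_nvme=1); the
image path and the PCI slot address are placeholders, not values from the
patch:

    # boot OSv from an emulated NVMe device instead of virtio-blk
    ./scripts/run.py --nvme

    # attach an additional image as a second NVMe device (nvme1n1 in the guest)
    ./scripts/run.py --second-nvme-image ./disk2.img

    # pass through a host PCI device that is bound to vfio-pci
    ./scripts/run.py --pass-pci 01:00.0

    # run the test suite against an NVMe boot disk
    ./scripts/test.py --nvme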