diff --git a/README.md b/README.md index d0acc4658..a7228e772 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,30 @@ # Vortex GPGPU -Vortex is a full-stack open-source RISC-V GPGPU. +Vortex is a full-stack open-source RISC-V GPGPU. Vortex supports multiple **backend drivers**, including our C++ simulator (simx), an RTL simulator, and physical Xilinx and Altera FPGAs, all controlled by a single driver script. The chosen driver determines the corresponding code invoked to run Vortex. Generally, developers will prototype their intended design in simx before going forward with an RTL implementation. Alternatively, you can get up and running by selecting a driver of your choice and running a demo program. + +## Website +Vortex news can be found on its [website](https://vortex.cc.gatech.edu/) + +## Citation +``` +@inproceedings{10.1145/3466752.3480128, + author = {Tine, Blaise and Yalamarthy, Krishna Praveen and Elsabbagh, Fares and Kim, Hyesoon}, + title = {Vortex: Extending the RISC-V ISA for GPGPU and 3D-Graphics}, + year = {2021}, + isbn = {9781450385572}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + url = {https://doi.org/10.1145/3466752.3480128}, + doi = {10.1145/3466752.3480128}, + abstract = {The importance of open-source hardware and software has been increasing. However, despite GPUs being one of the more popular accelerators across various applications, there is very little open-source GPU infrastructure in the public domain. We argue that one of the reasons for the lack of open-source infrastructure for GPUs is rooted in the complexity of their ISA and software stacks. In this work, we first propose an ISA extension to RISC-V that supports GPGPUs and graphics. The main goal of the ISA extension proposal is to minimize the ISA changes so that the corresponding changes to the open-source ecosystem are also minimal, which makes for a sustainable development ecosystem. 
To demonstrate the feasibility of the minimally extended RISC-V ISA, we implemented the complete software and hardware stacks of Vortex on FPGA. Vortex is a PCIe-based soft GPU that supports OpenCL and OpenGL. Vortex can be used in a variety of applications, including machine learning, graph analytics, and graphics rendering. Vortex can scale up to 32 cores on an Altera Stratix 10 FPGA, delivering a peak performance of 25.6 GFlops at 200 Mhz.}, + booktitle = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture}, + pages = {754–766}, + numpages = {13}, + keywords = {reconfigurable computing, memory systems., computer graphics}, + location = {Virtual Event, Greece}, + series = {MICRO '21} +} +``` ## Specifications @@ -30,12 +54,14 @@ Vortex is a full-stack open-source RISC-V GPGPU. - `ci`: Continuous integration scripts. - `miscs`: Miscellaneous resources. -## Build Instructions -More detailed build instructions can be found [here](docs/install_vortex.md). +## Quick Start +If you are interested in a stable release of Vortex, you can download the latest release [here](https://github.com/vortexgpgpu/vortex/releases/latest). Otherwise, you can pull the most recent, but (potentially) unstable version as shown below. The following steps demonstrate how to build and run Vortex with the default driver: SimX. If you are interested in a different backend, look [here](docs/simulation.md). + ### Supported OS Platforms - Ubuntu 18.04, 20.04, 22.04, 24.04 - Centos 7 ### Toolchain Dependencies +The following dependencies will be fetched prebuilt by `toolchain_install.sh`. - [POCL](http://portablecl.org/) - [LLVM](https://llvm.org/) - [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain) @@ -105,4 +131,4 @@ echo "source /ci/toolchain_env.sh" >> ~/.bashrc ```sh ./ci/blackbox.sh --app=demo --debug=3 ``` -- For additional information, check out the /docs. 
+- For additional information, check out the [documentation](docs/index.md) diff --git a/ci/install_dependencies.sh b/ci/install_dependencies.sh index a62ed253b..4dab27786 100755 --- a/ci/install_dependencies.sh +++ b/ci/install_dependencies.sh @@ -31,7 +31,7 @@ check_gcc_version() { apt-get update -y # install system dependencies -apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache +apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache cmake # Check and install GCC 11 if necessary if check_gcc_version; then diff --git a/ci/regression.sh.in b/ci/regression.sh.in index d315e67d7..53819490f 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -163,8 +163,9 @@ cache() CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx # test cache ways - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=1 -DDCACHE_NUM_WAYS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx # test cache banking CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx @@ -174,11 +175,17 @@ cache() CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + # replacement policy + CONFIGS="-DDCACHE_REPL_POLICY=0" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_REPL_POLICY=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_REPL_POLICY=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + # test writeback - CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim 
--app=mstress - CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress - CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress - CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=0 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=simx --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress # cache clustering CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2 diff --git a/configure b/configure index d2483a796..fbcd3f130 100755 --- a/configure +++ b/configure @@ -65,7 +65,7 @@ copy_files() { filename_no_ext="${filename%.in}" dest_file="$dest_dir/$filename_no_ext" mkdir -p "$dest_dir" - sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@CURRENTDIR@|$CURRENT_DIR|g" "$file" > "$dest_file" + sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g" "$file" > "$dest_file" # apply permissions to bash scripts read -r firstline < "$dest_file" if [[ "$firstline" =~ ^#!.*bash ]]; then @@ -169,8 +169,8 @@ fi SUBDIRS=("." 
"!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*") # Get the directory of the script -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +SOURCE_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -THIRD_PARTY_DIR=$SCRIPT_DIR/third_party +THIRD_PARTY_DIR=$SOURCE_DIR/third_party -copy_files "$SCRIPT_DIR" "$CURRENT_DIR" +copy_files "$SOURCE_DIR" "$CURRENT_DIR" diff --git a/docs/altera_fpga_guide.md b/docs/altera_fpga_guide.md deleted file mode 100644 index ba95d942a..000000000 --- a/docs/altera_fpga_guide.md +++ /dev/null @@ -1,92 +0,0 @@ -# FPGA Startup and Configuration Guide - -OPAE Environment Setup ----------------------- - - $ source /opt/inteldevstack/init_env_user.sh - $ export OPAE_HOME=/opt/opae/1.1.2 - $ export PATH=$OPAE_HOME/bin:$PATH - $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH - $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH - $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH - -OPAE Build ------------------- - -The FPGA has to following configuration options: -- DEVICE_FAMILY=arria10 | stratix10 -- NUM_CORES=#n - -Command line: - - $ cd hw/syn/altera/opae - $ PREFIX=test1 TARGET=fpga NUM_CORES=4 make - -A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete. -Setting TARGET=ase will build the project for simulation using Intel ASE. 
- - -OPAE Build Configuration ------------------------- - -The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when build the processor.For example, have the following parameters that can be configured: -- `NUM_WARPS`: Number of warps per cores -- `NUM_THREADS`: Number of threads per warps -- `PERF_ENABLE`: enable the use of all profile counters - -You can configure the synthesis build from the command line: - - $ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make - -OPAE Build Progress -------------------- - -You could check the last 10 lines in the build log for possible errors until build completion. - - $ tail -n 10 /synth/build.log - -Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs. - - $ ps -u - -If the build fails and you need to restart it, clean up the build folder using the following command: - - $ make clean - -The bitstream file `vortex_afu.gbs` should exist when the build is done: - - $ ls -lsa /synth/vortex_afu.gbs - - -Signing the bitstream and Programming the FPGA ----------------------------------------------- - - $ cd - $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs - $ fpgasupdate vortex_afu_unsigned_ssl.gbs - -Sample FPGA Run Test --------------------- - -Ensure you have the correct opae runtime for the FPGA target - - $ TARGET=FPGA make -C runtime/opae - -Run the following from your Vortex build directory - - $ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" - -Testing Vortex using OPAE with Intel ASE Simulation ---------------------------------------------------- - -Building ASE synthesis - - $ TARGET=asesim make -C runtime/opae - -Building ASE runtime - - $ TARGET=asesim make -C runtime/opae - -Running ASE simulation - - $ ASE_LOG=0 ASE_WORKDIR=/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16" \ No newline at end of file diff --git 
a/docs/contributing.md b/docs/contributing.md index 14e0ccd0c..0250e9f9f 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,18 +1,37 @@ -# Contributing to Vortex on Github +# Contributing to Vortex -## Github Details -- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private) -- todo: Most current development is on `vortex` -- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time +## Github +Vortex uses Github to host its git repositories. +There are a lot of ways to use the features on Github for collaboration. +Therefore, this documentation details the standard procedure for contributing to Vortex. +Development of Vortex is consolidated in this repo, `vortex`, and any associated forks. +Previously, there was active work done on a private repo named `vortex-dev`. +`vortex-dev` has officially been deprecated and fully merged into this public repo, `vortex`. +If you are returning to this project and have legacy versions of Vortex, you can use the release branches to access older versions. ## Contribution Process -- You should create a new branch from develop that is clearly named with the feature that you want to add -- Avoid pushing directly to the `master` branch instead you will need to make a Pull Request (PR) -- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it -- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`) -- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run -- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch +In an effort to keep `vortex` organized, permissions to directly create branches and push code have been limited to admins. 
+However, contributions are strongly encouraged and keep the project moving forward! Here is the procedure for contributing: +1. Create a fork of `vortex` +2. In your fork, create a branch from `master` whose name briefly describes the work you are adding (e.g. `develop-documentation`) +3. Make your changes on the new branch in your fork. You may create as many commits as you need, which might be common if you are making multiple iterations +4. Since you are the owner of your fork, you have full permissions to push commits to your fork +5. When you are satisfied with the changes on your fork, you can open a PR from your fork using the online interface +6. If you recently made a push, you will automatically get a prompt on Github online to create a PR, which you can press +7. Otherwise, you can go to your fork on Github online and manually create a PR (todo) +(todo): how to name and format your PR, what information you should add to the PR, does not need to be too strict if you are attending the weekly meetings* +8. Github uses the following semantics: `base repository` gets the changes from your `head repository` +9. Therefore, you should set the `base repository` to `vortexgpgpu/vortex` and the `base` branch to `master` since the master branch is protected by reviewed PRs. +10. And you should assign the `head repository` to `<your-username>/vortex` (which represents your fork of vortex) and the `compare` branch to the one created in step 2 +11. Now that your intended PR has been specified, you should review the status. Check for merge conflicts, whether all your commits are present, and whether all the modified files make sense +12. You can still make a PR if there are issues in step 11, just make sure the structure is correct according to steps 8-10 +13. Once the PR is made, the CI pipeline will run automatically, testing your changes +14. Remember, a PR is flexible: if you need to make changes to the code, you can go back to your branch of the fork to commit and push any updates +15. 
As long as the `head repository`'s `compare` branch is the one you edited, the PR will automatically get the most recent changes +16. When all merge conflicts are resolved, changes are made, and tests pass, you can have an admin merge your PR -## Creating and Adding Tests -see `testing.md` \ No newline at end of file +## What Makes a Good Contribution? +- If you are contributing code changes, then review [testing.md](./testing.md) to ensure your tests are integrated into the [CI pipeline](continuous_integration.md) +- During a PR, you should consider the advice provided by your reviewers. Remember, you can keep adding commits to an open PR! +- If your change aims to fix an issue opened on Github, please tag that issue in the PR itself \ No newline at end of file diff --git a/docs/environment_setup.md b/docs/environment_setup.md index a55060ee5..ccd97c55e 100644 --- a/docs/environment_setup.md +++ b/docs/environment_setup.md @@ -1,16 +1,19 @@ # Environment Setup -These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`. +These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). 
Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`. ## Set Up on Your Own System -The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md). +The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md). ## Servers for Georgia Tech Students and Collaborators + ### Volvo + Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up. Setup on Volvo: + 1. Connect to Georgia Tech's VPN or ssh into another machine on campus 2. `ssh volvo.cc.gatech.edu` 3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` @@ -19,9 +22,11 @@ Setup on Volvo: 6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood` ### Nio + Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio. Setup on Nio: + 1. Connect to Georgia Tech's VPN or ssh into another machine on campus 2. `ssh nio.cc.gatech.edu` 3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` @@ -29,11 +34,12 @@ Setup on Nio: 5. `make -s` in the `vortex` root directory 6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood` - ## Docker (Experimental) + Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. 
Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported. ### Setup with Docker + 1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` 2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo. 3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .` diff --git a/docs/fpga_setup.md b/docs/fpga_setup.md new file mode 100644 index 000000000..d909d8687 --- /dev/null +++ b/docs/fpga_setup.md @@ -0,0 +1,217 @@ +# FPGA Startup and Configuration Guide + +## Gaining Access to FPGAs with CRNCH +If you are associated with Georgia Tech (or related workshops) you can use CRNCH's server to gain remote access to FPGAs. Otherwise, you can skip to the Xilinx or Intel (Altera) synthesis steps below. + +## What is CRNCH? + +**C**enter for **R**esearch into **N**ovel **C**omputing **H**ierarchies + +## What does CRNCH Offer? + +**The Rogues Gallery (RG)**: new concept focused on developing our understanding of next-generation hardware with a focus on unorthodox and uncommon technologies. **RG** will acquire new and unique hardware (ie, the aforementioned “*rogues*”) from vendors, research labs, and startups and make this hardware available to students, faculty, and industry collaborators within a managed data center environment + +## Why are the Rogues Important? + +By exposing students and researchers to this set of unique hardware, we hope to foster cross-cutting discussions about hardware designs that will drive future *performance improvements in computing long after the Moore’s Law era of “cheap transistors” ends*. Specifically, the Rogues Gallery contains FPGAs on which Vortex hardware can be synthesized. + +## How is the Rogues Gallery Funded? 
+ +The Rogues Gallery testbed is primarily supported by the National Science Foundation (NSF) under NSF Award Number [#2016701](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2016701&HistoricalAwards=false) + +## Rogues Gallery Documentation + +You can read about RG in more detail on its official documentation [page](https://gt-crnch-rg.readthedocs.io/en/main/index.html#). + +You can listen to a talk about RG [here](https://mediaspace.gatech.edu/media/Jeff%20Young%20-%20Rogues%20Gallery%20-%20CRNCH%20Summit%202021/1_lqlgr0jj) + +[CRNCH Summit 2023](https://github.com/gt-crnch/crnch-summit-2023/tree/main) + +## Request Access for Rogues Gallery + +You should use [this form](https://crnch-rg.cc.gatech.edu/request-rogues-gallery-access/) to request access to RG’s reconfigurable computing (vortex fpga) resources. You should receive an email when your ticket is created. Once it gets processed, you should get an email confirming that your access has been granted. It might take some time to get processed. + +## How to Access Rogues Gallery? +There are two methods of accessing CRNCH's Rogues Gallery: +1) Web-based GUI: [rg-ood.crnch.gatech.edu](http://rg-ood.crnch.gatech.edu/) +2) SSH: `ssh <your-gt-username>@rg-login.crnch.gatech.edu` + + +## Where should I keep my files? +The CRNCH servers have a folder called `USERSCRATCH` which can be found in your home directory: `echo $HOME`. You should keep all your files in this folder since it is available across all the Rogues Gallery Nodes. + +## **What Machines are Available in the Rogues Gallery?** + +A complete list of machines can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/rg-hardware.html). Furthermore, you can find detailed information about the FPGA hardware [here](https://gt-crnch-rg.readthedocs.io/en/main/reconfig/xilinx/xilinx-getting-started.html). + +## Allocate an FPGA Node +Once you’ve connected to the CRNCH login node, you can use the Slurm scheduler to request an interactive job using `salloc`. 
This [page](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm.html) explains why we use Slurm to request resources. Documentation for `salloc` can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm-examples.html). + + +To request 16 cores and 64GB of RAM for 6 hours on flubber1, an FPGA dev node: +```bash +salloc -p rg-fpga --nodes=1 --ntasks-per-node=16 --mem=64G --nodelist flubber1 --time=06:00:00 +``` +Synthesis for Xilinx Boards +---------------------- +Once you are logged in, you will need to complete some first-time configuration. If you are interested in the Intel (Altera) synthesis steps, scroll down below. + +### Source Configuration Scripts +``` +# From any directory +$ source /opt/xilinx/xrt/setup.sh +$ source /tools/reconfig/xilinx/Vitis/2023.1/settings64.sh +``` + +### Check Installed FPGA Platforms +Run `platforminfo -l`, which reports the correct name of the platform installed on the current FPGA node. Use that name for the `PLATFORM` variable below. If the command fails, there was an issue with the previous two `source` commands. + +### Install Vortex Toolchain +The Xilinx synthesis process requires verilator to generate the bitstream. Eventually, you will need the whole toolchain to run the bitstream on the FPGA. Therefore, the Vortex toolchain can be installed as follows. 
If you complete these steps properly, you should only need to complete them once and you can skip to `Activate Vortex Toolchain` +``` +# Make a build directory from root and configure scripts for your environment +mkdir build && cd build && ../configure --tooldir=$HOME/tools + +# Install the whole prebuilt toolchain +./ci/toolchain_install.sh --all + +# Add environment variables to bashrc +echo "source /vortex/build/ci/toolchain_env.sh" >> ~/.bashrc +``` + +### Activate Vortex Toolchain +``` +# From any directory +source ~/.bashrc + +# Check environment setup +verilator --version +``` + +### Build the FPGA Bitstream +The root directory contains the path `hw/syn/xilinx/xrt` which has the makefile used to generate the Vortex bitstream. + +``` + $ cd hw/syn/xilinx/xrt + $ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=1 make > build_u250_hw_1c.log 2>&1 & +``` +Will run the synthesis under new build directory: BUILD_DIR := "\\_\\_\" +The generated bitstream will be located under /bin/vortex_afu.xclbin + +For long-running jobs, invocation of this makefile can be made of the following form: + +`[CONFIGS=] [PREFIX=] [NUM_CORES=<#>] TARGET=hw|hw_emu PLATFORM= nohup make > 2>&1 &` + +For example: + +```bash +CONFIGS="-DL2_ENABLE -DDCACHE_SIZE=8192" PREFIX=build_4c_u280 NUM_CORES=4 TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202310_1 nohup make > build_u250_hw_4c.log 2>&1 & +``` + +The build is complete when the bitstream file `vortex_afu.xclbin` exists in `hw|hw_emu/bin`. 
+ +### Running a Program on Xilinx FPGA + +The [blackbox.sh](./simulation.md) script within the build directory can be used to run a test with Vortex’s xrt driver using the following command: + +`FPGA_BIN_DIR=<path to bitstream directory> TARGET=hw|hw_emu PLATFORM=<platform baseName> ./ci/blackbox.sh --driver=xrt --app=<test name>` + +For example: + +```FPGA_BIN_DIR=$(realpath hw/syn/xilinx/xrt/build_4c_u280_xilinx_u280_gen3x16_xdma_1_202211_1_hw/bin) TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1 ./ci/blackbox.sh --driver=xrt --app=demo``` + +Synthesis for Intel (Altera) Boards +---------------------- + +### OPAE Environment Setup + + + $ source /opt/inteldevstack/init_env_user.sh + $ export OPAE_HOME=/opt/opae/1.1.2 + $ export PATH=$OPAE_HOME/bin:$PATH + $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH + $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH + $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH + +### OPAE Build + +The FPGA has the following configuration options: +- DEVICE_FAMILY=arria10 | stratix10 +- NUM_CORES=#n + +Command line: + + $ cd hw/syn/altera/opae + $ PREFIX=test1 TARGET=fpga NUM_CORES=4 make + +A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete. +Setting TARGET=ase will build the project for simulation using Intel ASE. + + +### OPAE Build Configuration + +The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured: +- `NUM_WARPS`: Number of warps per core +- `NUM_THREADS`: Number of threads per warp +- `PERF_ENABLE`: enable the use of all profile counters + +You can configure the synthesis build from the command line: + + $ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make + +### OPAE Build Progress + +You can check the last 10 lines in the build log for possible errors until build completion. 
+ + $ tail -n 10 /build.log + +Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs. + + $ ps -u + +If the build fails and you need to restart it, clean up the build folder using the following command: + + $ make clean + +The file `vortex_afu.gbs` should exist when the build is done: + + $ ls -lsa /synth/vortex_afu.gbs + + +### Signing the bitstream and Programming the FPGA + + $ cd + $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs + $ fpgasupdate vortex_afu_unsigned_ssl.gbs + +### Sample FPGA Run Test +Ensure you have the correct opae runtime for the FPGA target + +``` +$ TARGET=FPGA make -C runtime/opae +``` + +Run the [blackbox.sh](./simulation.md) from your Vortex build directory + +``` +$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" +``` + +### FPGA sample test running OpenCL sgemm kernel + +You can use the `blackbox.sh` script to run the following from your Vortex build directory + + $ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" + +### Testing Vortex using OPAE with Intel ASE Simulation +Building ASE synthesis + +```$ TARGET=asesim make -C runtime/opae``` + +Building ASE runtime + +```$ TARGET=asesim make -C runtime/opae``` + +Running ASE simulation + +```$ ASE_LOG=0 ASE_WORKDIR=/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"``` diff --git a/docs/index.md b/docs/index.md index 14a45f335..351e41fbb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,32 +2,8 @@ ## Table of Contents -- [Codebase Layout](codebase.md) -- [Microarchitecture](microarchitecture.md) -- [Cache Subsystem](cache_subsystem.md) -- [Software](software.md) -- [Simulation](simulation.md) -- [Altera FPGA Setup Guide](altera_fpga_guide.md) -- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md) -- [Debugging](debugging.md) -- [Useful Links](references.md) - -## Installation - -- For the different environments Vortex supports, 
[read this document](environment_setup.md). -- To install on your own system, [follow this document](install_vortex.md). - -## Quick Start Scenarios - -Running Vortex simulators with different configurations: -- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads - - $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic - -- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads - - $ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo - -- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads - - $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood +- [Codebase Layout](codebase.md): Summary of repo file tree +- [Microarchitecture](microarchitecture.md): Vortex Pipeline and cache microarchitectural details and reconfigurability +- [Simulation](simulation.md): Details for building and running each simulation driver +- [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing +- [Debugging](debugging.md): Debugging configurations for each Vortex driver diff --git a/docs/microarchitecture.md b/docs/microarchitecture.md index 3459abcc4..85fa52fd5 100644 --- a/docs/microarchitecture.md +++ b/docs/microarchitecture.md @@ -77,4 +77,7 @@ Vortex has a 6-stage pipeline: - Sockets - Grouping multiple cores sharing L1 cache - Clusters - - Grouping of sockets sharing L2 cache \ No newline at end of file + - Grouping of sockets sharing L2 cache + +### Vortex Cache Subsystem +More details about the cache subsystem are provided [here](./cache_subsystem.md). 
\ No newline at end of file diff --git a/docs/simulation.md b/docs/simulation.md index 86ce1f135..4201a64d4 100644 --- a/docs/simulation.md +++ b/docs/simulation.md @@ -6,13 +6,16 @@ ### Cycle-Approximate Simulation -SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simX` folder. +SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simx` folder. The [readme](../README.md) has the most detailed instructions for building and running SimX. + +- To install on your own system, [follow this document](install_vortex.md). +- For the different Georgia Tech environments Vortex supports, [read this document](environment_setup.md). ### FGPA Simulation -The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) +The guide to building Vortex for an FPGA with specific configurations is located [here](fpga_setup.md). You can find instructions for both Xilinx- and Altera-based FPGAs. -### How to Test +### How to Test (using `blackbox.sh`) Running tests under specific drivers (rtlsim,simx,fpga) is done using the script named `blackbox.sh` located in the `ci` folder. Running command `./ci/blackbox.sh --help` from the Vortex root directory will display the following command line arguments for `blackbox.sh`: @@ -47,4 +50,20 @@ PERF: core1: instrs=90693, cycles=53108, IPC=1.707709 PERF: core2: instrs=90849, cycles=53107, IPC=1.710678 PERF: core3: instrs=90836, cycles=50347, IPC=1.804199 PERF: instrs=363180, cycles=53108, IPC=6.838518 -``` \ No newline at end of file +``` + +## Additional Quick Start Scenarios + +Running Vortex simulators with different configurations and drivers is supported. 
For example: + +- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads + + $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic + +- Run demo driver test with opae driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads + + $ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo + +- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads + + $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood \ No newline at end of file diff --git a/docs/testing.md b/docs/testing.md index b2ae8fb2c..739193ce3 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -2,7 +2,7 @@ ## Running a Vortex application -The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. +The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. It gets copied into the `build` directory with all the environment variables resolved, so you should run it from the `build` directory as follows: You can query the commandline options of the tool using: $ ./ci/blackbox.sh --help @@ -49,4 +49,4 @@ Compile your test: `$ make -C tests/regression/` Run your test: `$ ./ci/blackbox.sh --driver=simx --app= --debug` ## Adding Your Tests to the CI Pipeline -See `continuous_integration.md` \ No newline at end of file +If you are a contributor, then you will need to add tests that integrate into the continuous integration pipeline. Remember, Pull Requests cannot be merged unless new code has tests and existing tests do not regress. Furthermore, if you are contributing a new feature, it is recommended that you add the ability to enable / disable the new feature that you are adding. See more at [contributing.md](contributing.md) and [continuous_integration.md](continuous_integration.md). 
\ No newline at end of file diff --git a/docs/xilinx_fpga_guide.md b/docs/xilinx_fpga_guide.md deleted file mode 100644 index 959ca6773..000000000 --- a/docs/xilinx_fpga_guide.md +++ /dev/null @@ -1,52 +0,0 @@ -# FPGA Startup and Configuration Guide - -XRT Environment Setup ----------------------- - - $ source /opt/xilinx/Vitis/2023.1/settings64.sh - $ source /opt/xilinx/xrt/setup.sh - - -Check Installed FPGA Platforms ------------------------------- - - $ platforminfo -l - - -Build FPGA image ----------------- - - $ cd hw/syn/xilinx/xrt - $ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make - -Will run the synthesis under new build directory: BUILD_DIR := "\\_\\_\" - -The generated bitstream will be located under /bin/vortex_afu.xclbin - -Sample FPGA Run Test --------------------- - -Ensure you have the correct opae runtime for the FPGA target - - $ make -C runtime/xrt clean - $ TARGET=hw make -C runtime/xrt - -Run the following from your Vortex build directory - - $ TARGET=hw FPGA_BIN_DIR=/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128" - -Testing Vortex using XRT Hardware Emulation -------------------------------------------- - -Building XRT's hw_emu target - - $ cd hw/syn/xilinx/xrt - $ PREFIX=test2 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw_emu make - -Building XRT hw_meu runtime - - $ TARGET=hw_emu make -C runtime/xrt - -Running XRT hw_emu simulation - - $ TARGET=hw_emu FPGA_BIN_DIR=/bin ./ci/blackbox.sh --driver=xrt --app=sgemm \ No newline at end of file diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 73d9b34ab..853881c08 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -67,7 +67,7 @@ module VX_cluster import VX_gpu_pkg::*; #( ); VX_gbar_unit #( - .INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID)) + .INSTANCE_ID (`SFORMATF(("gbar%0d", CLUSTER_ID))) ) gbar_unit ( .clk (clk), .reset (reset), @@ -84,7 +84,7 @@ module VX_cluster import VX_gpu_pkg::*; #( `RESET_RELAY (l2_reset, 
reset); VX_cache_wrap #( - .INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-l2cache", INSTANCE_ID))), .CACHE_SIZE (`L2_CACHE_SIZE), .LINE_SIZE (`L2_LINE_SIZE), .NUM_BANKS (`L2_NUM_BANKS), @@ -98,8 +98,10 @@ module VX_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L2_WRITEBACK), - .DIRTY_BYTES (`L2_WRITEBACK), + .DIRTY_BYTES (`L2_DIRTYBYTES), + .REPL_POLICY (`L2_REPL_POLICY), .UUID_WIDTH (`UUID_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .CORE_OUT_BUF (3), .MEM_OUT_BUF (3), .NC_ENABLE (1), @@ -129,7 +131,7 @@ module VX_cluster import VX_gpu_pkg::*; #( VX_socket #( .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id), - .INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id)) + .INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id))) ) socket ( `SCOPE_IO_BIND (scope_socket+socket_id) @@ -152,6 +154,6 @@ module VX_cluster import VX_gpu_pkg::*; #( ); end - `BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1)); + `BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1)); endmodule diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 0ecb37309..3badaa3d3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -174,6 +174,10 @@ `define L3_LINE_SIZE `MEM_BLOCK_SIZE `endif +`ifndef MEMORY_BANKS +`define MEMORY_BANKS 2 +`endif + `ifdef XLEN_64 `ifndef STACK_BASE_ADDR @@ -570,7 +574,12 @@ // Number of Associative Ways `ifndef ICACHE_NUM_WAYS -`define ICACHE_NUM_WAYS 1 +`define ICACHE_NUM_WAYS 4 +`endif + +// Replacement Policy +`ifndef ICACHE_REPL_POLICY +`define ICACHE_REPL_POLICY 1 `endif // Dcache Configurable Knobs ////////////////////////////////////////////////// @@ -619,12 +628,12 @@ // Memory Response Queue Size `ifndef DCACHE_MRSQ_SIZE -`define DCACHE_MRSQ_SIZE 0 +`define DCACHE_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef DCACHE_NUM_WAYS -`define DCACHE_NUM_WAYS 1 +`define DCACHE_NUM_WAYS 4 `endif // Enable Cache Writeback 
@@ -632,6 +641,16 @@ `define DCACHE_WRITEBACK 0 `endif +// Enable Cache Dirty bytes +`ifndef DCACHE_DIRTYBYTES +`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK +`endif + +// Replacement Policy +`ifndef DCACHE_REPL_POLICY +`define DCACHE_REPL_POLICY 1 +`endif + // LMEM Configurable Knobs //////////////////////////////////////////////////// `ifndef LMEM_DISABLE @@ -654,12 +673,8 @@ // Cache Size `ifndef L2_CACHE_SIZE -`ifdef ALTERA_S10 -`define L2_CACHE_SIZE 2097152 -`else `define L2_CACHE_SIZE 1048576 `endif -`endif // Number of Banks `ifndef L2_NUM_BANKS @@ -683,12 +698,12 @@ // Memory Response Queue Size `ifndef L2_MRSQ_SIZE -`define L2_MRSQ_SIZE 0 +`define L2_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef L2_NUM_WAYS -`define L2_NUM_WAYS 2 +`define L2_NUM_WAYS 8 `endif // Enable Cache Writeback @@ -696,15 +711,21 @@ `define L2_WRITEBACK 0 `endif +// Enable Cache Dirty bytes +`ifndef L2_DIRTYBYTES +`define L2_DIRTYBYTES `L2_WRITEBACK +`endif + +// Replacement Policy +`ifndef L2_REPL_POLICY +`define L2_REPL_POLICY 1 +`endif + // L3cache Configurable Knobs ///////////////////////////////////////////////// // Cache Size `ifndef L3_CACHE_SIZE -`ifdef ALTERA_S10 `define L3_CACHE_SIZE 2097152 -`else -`define L3_CACHE_SIZE 1048576 -`endif `endif // Number of Banks @@ -729,12 +750,12 @@ // Memory Response Queue Size `ifndef L3_MRSQ_SIZE -`define L3_MRSQ_SIZE 0 +`define L3_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef L3_NUM_WAYS -`define L3_NUM_WAYS 4 +`define L3_NUM_WAYS 8 `endif // Enable Cache Writeback @@ -742,8 +763,14 @@ `define L3_WRITEBACK 0 `endif -`ifndef MEMORY_BANKS -`define MEMORY_BANKS 2 +// Enable Cache Dirty bytes +`ifndef L3_DIRTYBYTES +`define L3_DIRTYBYTES `L3_WRITEBACK +`endif + +// Replacement Policy +`ifndef L3_REPL_POLICY +`define L3_REPL_POLICY 1 `endif // Number of Memory Ports from LLC diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 4ccb00880..6519984ad 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ 
-335,10 +335,10 @@ .data_out (dst) \ ) -`define BUFFER_EX(dst, src, ena, latency) \ +`define BUFFER_EX(dst, src, ena, RSTW, latency) \ VX_pipe_register #( \ .DATAW ($bits(dst)), \ - .RESETW ($bits(dst)), \ + .RESETW (RSTW), \ .DEPTH (latency) \ ) __``dst``__ ( \ .clk (clk), \ @@ -348,7 +348,7 @@ .data_out (dst) \ ) -`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1) +`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1) `define POP_COUNT_EX(out, in, model) \ VX_popcount #( \ diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 3e9042737..d874b9b2b 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -37,16 +37,13 @@ endgenerate `define ASSERT(cond, msg) \ assert(cond) else $error msg -`define RUNTIME_ASSERT(cond, msg) \ - always @(posedge clk) begin \ - assert(cond) else $error msg; \ +`define RUNTIME_ASSERT(cond, msg) \ + always @(posedge clk) begin \ + if (!reset) begin \ + `ASSERT(cond, msg); \ + end \ end -`define __SCOPE -`define __SCOPE_X -`define __SCOPE_ON -`define __SCOPE_OFF - `ifndef TRACING_ALL `define TRACING_ON /* verilator tracing_on */ `define TRACING_OFF /* verilator tracing_off */ @@ -128,6 +125,8 @@ endgenerate end `endif +`define SFORMATF(x) $sformatf x + `else // SYNTHESIS `define STATIC_ASSERT(cond, msg) @@ -137,6 +136,7 @@ endgenerate `define DEBUG_BLOCK(x) `define TRACE(level, args) +`define SFORMATF(x) "" `define TRACING_ON `define TRACING_OFF @@ -153,45 +153,39 @@ endgenerate `define UNUSED_PIN(x) . 
x () `define UNUSED_ARG(x) x -`define __SCOPE (* mark_debug="true" *) - -`define __SCOPE_X - -`define __SCOPE_ON \ - `undef __SCOPE_X \ - `define __SCOPE_X `__SCOPE - -`define __SCOPE_OFF \ - `undef __SCOPE_X \ - `define __SCOPE_X - `endif /////////////////////////////////////////////////////////////////////////////// `ifdef QUARTUS `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) $bits(x.data) +`define MAX_LUTRAM 1024 +`define USE_BLOCK_BRAM (* ramstyle = "block" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) `define DISABLE_BRAM (* ramstyle = "logic" *) `define PRESERVE_NET (* preserve *) +`define BLACKBOX_CELL (* black_box *) `define STRING string `elsif VIVADO `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) $bits(x.data) +`define MAX_LUTRAM 1024 +`define USE_BLOCK_BRAM (* ram_style = "block" *) `define USE_FAST_BRAM (* ram_style = "distributed" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) `define DISABLE_BRAM (* ram_style = "registers" *) `define PRESERVE_NET (* keep = "true" *) +`define BLACKBOX_CELL (* black_box *) `define STRING `else `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) x.DATA_WIDTH +`define MAX_LUTRAM 1024 +`define USE_BLOCK_BRAM `define USE_FAST_BRAM `define NO_RW_RAM_CHECK `define DISABLE_BRAM `define PRESERVE_NET +`define BLACKBOX_CELL `define STRING string `endif @@ -217,7 +211,7 @@ endgenerate `define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x))) -`define UP(x) (((x) != 0) ? (x) : 1) +`define UP(x) (((x) > 0) ? 
(x) : 1) `define CDIV(n,d) ((n + d - 1) / (d)) diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 69ff88a2c..87dcbd02e 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -85,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*; #( `RESET_RELAY (icache_reset, reset); VX_cache_cluster #( - .INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-icache", INSTANCE_ID))), .NUM_UNITS (`NUM_ICACHES), .NUM_INPUTS (`SOCKET_SIZE), .TAG_SEL_IDX (0), @@ -100,8 +100,10 @@ module VX_socket import VX_gpu_pkg::*; #( .MRSQ_SIZE (`ICACHE_MRSQ_SIZE), .MREQ_SIZE (`ICACHE_MREQ_SIZE), .TAG_WIDTH (ICACHE_TAG_WIDTH), + .FLAGS_WIDTH (0), .UUID_WIDTH (`UUID_WIDTH), .WRITE_ENABLE (0), + .REPL_POLICY (`ICACHE_REPL_POLICY), .NC_ENABLE (0), .CORE_OUT_BUF (3), .MEM_OUT_BUF (2) @@ -130,7 +132,7 @@ module VX_socket import VX_gpu_pkg::*; #( `RESET_RELAY (dcache_reset, reset); VX_cache_cluster #( - .INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-dcache", INSTANCE_ID))), .NUM_UNITS (`NUM_DCACHES), .NUM_INPUTS (`SOCKET_SIZE), .TAG_SEL_IDX (0), @@ -146,9 +148,11 @@ module VX_socket import VX_gpu_pkg::*; #( .MREQ_SIZE (`DCACHE_WRITEBACK ? 
`DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .UUID_WIDTH (`UUID_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`DCACHE_WRITEBACK), - .DIRTY_BYTES (`DCACHE_WRITEBACK), + .DIRTY_BYTES (`DCACHE_DIRTYBYTES), + .REPL_POLICY (`DCACHE_REPL_POLICY), .NC_ENABLE (1), .CORE_OUT_BUF (3), .MEM_OUT_BUF (2) @@ -208,7 +212,7 @@ module VX_socket import VX_gpu_pkg::*; #( VX_core #( .CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id), - .INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id)) + .INSTANCE_ID (`SFORMATF(("%s-core%0d", INSTANCE_ID, core_id))) ) core ( `SCOPE_IO_BIND (scope_core + core_id) @@ -233,6 +237,6 @@ module VX_socket import VX_gpu_pkg::*; #( ); end - `BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1)); + `BUFFER_EX(busy, (| per_core_busy), 1'b1, 1, (`SOCKET_SIZE > 1)); endmodule diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index e07aaae4d..bce771340 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -84,8 +84,10 @@ module Vortex import VX_gpu_pkg::*; ( .TAG_WIDTH (L2_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L3_WRITEBACK), - .DIRTY_BYTES (`L3_WRITEBACK), + .DIRTY_BYTES (`L3_DIRTYBYTES), + .REPL_POLICY (`L3_REPL_POLICY), .UUID_WIDTH (`UUID_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .CORE_OUT_BUF (3), .MEM_OUT_BUF (3), .NC_ENABLE (1), @@ -138,7 +140,7 @@ module Vortex import VX_gpu_pkg::*; ( VX_cluster #( .CLUSTER_ID (cluster_id), - .INSTANCE_ID ($sformatf("cluster%0d", cluster_id)) + .INSTANCE_ID (`SFORMATF(("cluster%0d", cluster_id))) ) cluster ( `SCOPE_IO_BIND (scope_cluster + cluster_id) @@ -157,7 +159,7 @@ module Vortex import VX_gpu_pkg::*; ( ); end - `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1)); + `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, 1, (`NUM_CLUSTERS > 1)); `ifdef PERF_ENABLE @@ -202,13 +204,13 @@ module Vortex import VX_gpu_pkg::*; ( always @(posedge clk) begin if (mem_req_fire) begin if (mem_req_rw) begin - `TRACE(1, ("%t: MEM Wr Req: 
addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data, mem_req_uuid)) + `TRACE(2, ("%t: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data, mem_req_uuid)) end else begin - `TRACE(1, ("%t: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_uuid)) + `TRACE(2, ("%t: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_uuid)) end end if (mem_rsp_fire) begin - `TRACE(1, ("%t: MEM Rd Rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, mem_rsp_tag, mem_rsp_data, mem_rsp_uuid)) + `TRACE(2, ("%t: MEM Rd Rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, mem_rsp_tag, mem_rsp_data, mem_rsp_uuid)) end end `endif diff --git a/hw/rtl/afu/opae/vortex_afu.sv b/hw/rtl/afu/opae/vortex_afu.sv index 7e0bcfaed..f21f851c0 100644 --- a/hw/rtl/afu/opae/vortex_afu.sv +++ b/hw/rtl/afu/opae/vortex_afu.sv @@ -968,7 +968,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ wire [COUT_TID_WIDTH-1:0] cout_tid; - VX_encoder #( + VX_onehot_encoder #( .N (`VX_MEM_BYTEEN_WIDTH) ) cout_tid_enc ( .data_in (vx_mem_req_byteen), diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 2b1bfb7c2..7d13344a4 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -373,7 +373,9 @@ module VX_afu_wrap #( `SCOPE_IO_UNUSED(0) `endif `endif + `ifdef CHIPSCOPE +`ifdef DBG_SCOPE_AFU ila_afu ila_afu_inst ( .clk (clk), .probe0 ({ @@ -394,6 +396,7 @@ module VX_afu_wrap #( }) ); `endif +`endif `ifdef SIMULATION `ifndef VERILATOR diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index a01ae0e0b..e50f8ef44 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -33,7 +33,7 @@ 
module VX_bank_flush #( output wire flush_init, output wire flush_valid, output wire [`CS_LINE_SEL_BITS-1:0] flush_line, - output wire [NUM_WAYS-1:0] flush_way, + output wire [`CS_WAY_SEL_WIDTH-1:0] flush_way, input wire flush_ready, input wire mshr_empty, input wire bank_empty @@ -48,20 +48,21 @@ module VX_bank_flush #( localparam STATE_WAIT2 = 4; localparam STATE_DONE = 5; - reg [2:0] state_r, state_n; + reg [2:0] state, state_n; - reg [CTR_WIDTH-1:0] counter_r; + reg [CTR_WIDTH-1:0] counter; always @(*) begin - state_n = state_r; - case (state_r) - STATE_IDLE: begin + state_n = state; + case (state) + //STATE_IDLE: + default : begin if (flush_begin) begin state_n = STATE_WAIT1; end end STATE_INIT: begin - if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin + if (counter == ((2 ** `CS_LINE_SEL_BITS)-1)) begin state_n = STATE_IDLE; end end @@ -72,7 +73,7 @@ module VX_bank_flush #( end end STATE_FLUSH: begin - if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin + if (counter == ((2 ** CTR_WIDTH)-1) && flush_ready) begin state_n = (BANK_ID == 0) ? 
STATE_DONE : STATE_WAIT2; end end @@ -93,37 +94,30 @@ module VX_bank_flush #( always @(posedge clk) begin if (reset) begin - state_r <= STATE_INIT; - counter_r <= '0; + state <= STATE_INIT; + counter <= '0; end else begin - state_r <= state_n; - if (state_r != STATE_IDLE) begin - if ((state_r == STATE_INIT) - || ((state_r == STATE_FLUSH) && flush_ready)) begin - counter_r <= counter_r + CTR_WIDTH'(1); + state <= state_n; + if (state != STATE_IDLE) begin + if ((state == STATE_INIT) + || ((state == STATE_FLUSH) && flush_ready)) begin + counter <= counter + CTR_WIDTH'(1); end end else begin - counter_r <= '0; + counter <= '0; end end end - assign flush_end = (state_r == STATE_DONE); - assign flush_init = (state_r == STATE_INIT); - assign flush_valid = (state_r == STATE_FLUSH); - assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0]; + assign flush_end = (state == STATE_DONE); + assign flush_init = (state == STATE_INIT); + assign flush_valid = (state == STATE_FLUSH); + assign flush_line = counter[`CS_LINE_SEL_BITS-1:0]; - if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin : g_flush_way - VX_decoder #( - .N (`CS_WAY_SEL_BITS), - .D (NUM_WAYS) - ) ctr_decoder ( - .data_in (counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]), - .valid_in (1'b1), - .data_out (flush_way) - ); + if (WRITEBACK && (NUM_WAYS > 1)) begin : g_flush_way + assign flush_way = counter[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]; end else begin : g_flush_way_all - assign flush_way = {NUM_WAYS{1'b1}}; + assign flush_way = '0; end endmodule diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 06887944c..d8a5dbaa2 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -20,22 +20,22 @@ module VX_cache import VX_gpu_pkg::*; #( parameter NUM_REQS = 4, // Size of cache in bytes - parameter CACHE_SIZE = 4096, + parameter CACHE_SIZE = 32768, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, // Number of 
associative ways - parameter NUM_WAYS = 1, + parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = `XLEN/8, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -48,17 +48,23 @@ module VX_cache import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_CYCLIC, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // Core response output register - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output register - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( // PERF `ifdef PERF_ENABLE @@ -76,10 +82,6 @@ module VX_cache import VX_gpu_pkg::*; #( `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable")) `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback")) - // In writeback mode, memory fill response may issue a new memory request to handle evicted blocks. - // We need to ensure that the memory request queue never fills up to avoid deadlock. 
- `STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE")) - localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE); @@ -90,7 +92,7 @@ module VX_cache import VX_gpu_pkg::*; #( localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS); - localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1; + localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH); localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH; localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH; @@ -206,13 +208,13 @@ module VX_cache import VX_gpu_pkg::*; #( wire [LINE_SIZE-1:0] mem_req_byteen; wire [`CS_LINE_WIDTH-1:0] mem_req_data; wire [MEM_TAG_WIDTH-1:0] mem_req_tag; - wire mem_req_flush; + wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags; wire mem_req_ready; - wire mem_req_flush_b; + wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flush_b; VX_elastic_buffer #( - .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1), + .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), .SIZE (MEM_REQ_REG_DISABLE ? 
`TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( @@ -220,13 +222,18 @@ module VX_cache import VX_gpu_pkg::*; #( .reset (reset), .valid_in (mem_req_valid), .ready_in (mem_req_ready), - .data_in ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag, mem_req_flush}), + .data_in ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag, mem_req_flags}), .data_out ({mem_bus_tmp_if.req_data.rw, mem_bus_tmp_if.req_data.byteen, mem_bus_tmp_if.req_data.addr, mem_bus_tmp_if.req_data.data, mem_bus_tmp_if.req_data.tag, mem_req_flush_b}), .valid_out (mem_bus_tmp_if.req_valid), .ready_out (mem_bus_tmp_if.req_ready) ); - assign mem_bus_tmp_if.req_data.flags = mem_req_flush_b ? `MEM_REQ_FLAGS_WIDTH'(1 << `MEM_REQ_FLAG_FLUSH) : '0; + if (FLAGS_WIDTH != 0) begin : g_mem_req_flags + assign mem_bus_tmp_if.req_data.flags = mem_req_flush_b; + end else begin : g_no_mem_req_flags + assign mem_bus_tmp_if.req_data.flags = '0; + `UNUSED_VAR (mem_req_flush_b) + end if (WRITE_ENABLE) begin : g_mem_bus_if `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if); @@ -244,7 +251,7 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx; - wire [NUM_BANKS-1:0] per_bank_core_req_flush; + wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; @@ -259,7 +266,7 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen; wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data; wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag; - wire [NUM_BANKS-1:0] per_bank_mem_req_flush; + wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags; wire [NUM_BANKS-1:0] per_bank_mem_req_ready; wire 
[NUM_BANKS-1:0] per_bank_mem_rsp_ready; @@ -276,7 +283,7 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen; wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data; wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag; - wire [NUM_REQS-1:0] core_req_flush; + wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags; wire [NUM_REQS-1:0] core_req_ready; wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr; @@ -293,7 +300,7 @@ module VX_cache import VX_gpu_pkg::*; #( assign core_req_addr[i] = core_bus2_if[i].req_data.addr; assign core_req_data[i] = core_bus2_if[i].req_data.data; assign core_req_tag[i] = core_bus2_if[i].req_data.tag; - assign core_req_flush[i] = core_bus2_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH]; + assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags); assign core_bus2_if[i].req_ready = core_req_ready[i]; end @@ -325,7 +332,7 @@ module VX_cache import VX_gpu_pkg::*; #( core_req_byteen[i], core_req_data[i], core_req_tag[i], - core_req_flush[i] + core_req_flags[i] }; end @@ -366,7 +373,7 @@ module VX_cache import VX_gpu_pkg::*; #( per_bank_core_req_byteen[i], per_bank_core_req_data[i], per_bank_core_req_tag[i], - per_bank_core_req_flush[i] + per_bank_core_req_flags[i] } = core_req_data_out[i]; end @@ -378,23 +385,25 @@ module VX_cache import VX_gpu_pkg::*; #( VX_cache_bank #( .BANK_ID (bank_id), - .INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)), + .INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .DIRTY_BYTES (DIRTY_BYTES), - .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), - 
.CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)), - .MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF)) + .FLAGS_WIDTH (FLAGS_WIDTH), + .CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : 1), + .MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : 1) ) bank ( .clk (clk), .reset (reset), @@ -414,7 +423,7 @@ module VX_cache import VX_gpu_pkg::*; #( .core_req_data (per_bank_core_req_data[bank_id]), .core_req_tag (per_bank_core_req_tag[bank_id]), .core_req_idx (per_bank_core_req_idx[bank_id]), - .core_req_flush (per_bank_core_req_flush[bank_id]), + .core_req_flags (per_bank_core_req_flags[bank_id]), .core_req_ready (per_bank_core_req_ready[bank_id]), // Core response @@ -431,7 +440,7 @@ module VX_cache import VX_gpu_pkg::*; #( .mem_req_byteen (per_bank_mem_req_byteen[bank_id]), .mem_req_data (per_bank_mem_req_data[bank_id]), .mem_req_tag (per_bank_mem_req_tag[bank_id]), - .mem_req_flush (per_bank_mem_req_flush[bank_id]), + .mem_req_flags (per_bank_mem_req_flags[bank_id]), .mem_req_ready (per_bank_mem_req_ready[bank_id]), // Memory response @@ -487,7 +496,7 @@ module VX_cache import VX_gpu_pkg::*; #( // Memory request arbitration - wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + 1)-1:0] data_in; + wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH))-1:0] data_in; for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_data_in assign data_in[i] = { @@ -496,7 +505,7 @@ module VX_cache import VX_gpu_pkg::*; #( per_bank_mem_req_byteen[i], per_bank_mem_req_data[i], per_bank_mem_req_tag[i], - per_bank_mem_req_flush[i] + per_bank_mem_req_flags[i] }; end @@ -504,7 +513,7 @@ module VX_cache import VX_gpu_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), - .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + 1), + .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), .ARBITER 
("R") ) mem_req_arb ( .clk (clk), @@ -512,7 +521,7 @@ module VX_cache import VX_gpu_pkg::*; #( .valid_in (per_bank_mem_req_valid), .ready_in (per_bank_mem_req_ready), .data_in (data_in), - .data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data, bank_mem_req_tag, mem_req_flush}), + .data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data, bank_mem_req_tag, mem_req_flags}), .valid_out (mem_req_valid), .ready_out (mem_req_ready), `UNUSED_PIN (sel_out) diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 054b7c589..fdee28bf1 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -47,12 +47,18 @@ module VX_cache_bank #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_CYCLIC, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // Core response output register parameter CORE_OUT_REG = 0, @@ -82,7 +88,7 @@ module VX_cache_bank #( input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id) input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array - input wire core_req_flush, // flush enable + input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags, output wire core_req_ready, // Core Response @@ -99,7 +105,7 @@ module VX_cache_bank #( output wire [LINE_SIZE-1:0] mem_req_byteen, output wire [`CS_LINE_WIDTH-1:0] mem_req_data, output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, - output wire mem_req_flush, + output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags, input wire mem_req_ready, // Memory response @@ -138,43 +144,45 @@ module VX_cache_bank #( wire [MSHR_ADDR_WIDTH-1:0] replay_id; wire replay_ready; - wire is_init_st0, is_init_st1; + + wire valid_sel, valid_st0, valid_st1; + wire is_init_st0; + 
wire is_creq_st0, is_creq_st1; + wire is_fill_st0, is_fill_st1; wire is_flush_st0, is_flush_st1; - wire [NUM_WAYS-1:0] flush_way_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1; wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; - wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1; + wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0, line_idx_st1; + wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1; + wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1; wire rw_sel, rw_st0, rw_st1; - wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1; + wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1; wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1; wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1; wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1; - wire [`CS_WORD_WIDTH-1:0] read_data_st1; + wire [`CS_WORD_WIDTH-1:0] write_word_st0, write_word_st1; wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1; - wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1; - wire valid_sel, valid_st0, valid_st1; - wire is_creq_st0, is_creq_st1; - wire is_fill_st0, is_fill_st1; + wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; + wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0; + wire is_dirty_st0, is_dirty_st1; wire is_replay_st0, is_replay_st1; - wire creq_flush_sel, creq_flush_st0, creq_flush_st1; - wire evict_dirty_st0, evict_dirty_st1; - wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1; - wire [NUM_WAYS-1:0] tag_matches_st0; - wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; - wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1; + wire is_hit_st0, is_hit_st1; + wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1; wire mshr_pending_st0, mshr_pending_st1; + wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1; wire mshr_empty; wire flush_valid; wire init_valid; wire [`CS_LINE_SEL_BITS-1:0] flush_sel; - wire [NUM_WAYS-1:0] flush_way; + wire 
[`CS_WAY_SEL_WIDTH-1:0] flush_way; wire flush_ready; // ensure we have no pending memory request in the bank wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; - // flush unit VX_bank_flush #( .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), @@ -196,11 +204,7 @@ module VX_cache_bank #( .bank_empty (no_pending_req) ); - wire rdw_hazard1_sel; - wire rdw_hazard2_sel; - reg rdw_hazard3_st1; - - wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1; + wire pipe_stall = crsp_queue_stall; // inputs arbitration: // mshr replay has highest priority to maximize utilization since there is no miss. @@ -219,28 +223,26 @@ module VX_cache_bank #( wire creq_enable = creq_grant && core_req_valid; assign replay_ready = replay_grant - && ~rdw_hazard1_sel + && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough && ~pipe_stall; assign mem_rsp_ready = fill_grant - && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions - && ~rdw_hazard2_sel + && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback && ~pipe_stall; assign flush_ready = flush_grant - && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions - && ~rdw_hazard2_sel + && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback && ~pipe_stall; assign core_req_ready = creq_grant - && ~mreq_queue_alm_full - && ~mshr_alm_full + && ~mreq_queue_alm_full // needed for fill requests + && ~mshr_alm_full // needed for mshr allocation && ~pipe_stall; wire init_fire = init_valid; wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire flush_fire = flush_valid && flush_ready; + wire flush_fire = flush_valid && flush_ready; wire core_req_fire = core_req_valid && core_req_ready; wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0]; @@ -264,14 +266,13 @@ module VX_cache_bank #( assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; assign rw_sel = replay_valid ? 
replay_rw : core_req_rw; assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen; - assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel; + assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : + (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); + assign word_idx_sel= replay_valid ? replay_wsel : core_req_wsel; assign req_idx_sel = replay_valid ? replay_idx : core_req_idx; assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) : (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag)); - assign creq_flush_sel = core_req_valid && core_req_flush; - - assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : - (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); + assign flags_sel = core_req_valid ? core_req_flags : '0; if (WRITE_ENABLE) begin : g_data_sel for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i @@ -293,15 +294,21 @@ module VX_cache_bank #( assign req_uuid_sel = '0; end + wire is_init_sel = init_valid; + wire is_creq_sel = creq_enable || replay_enable; + wire is_fill_sel = fill_enable; + wire is_flush_sel = flush_enable; + wire is_replay_sel = replay_enable; + VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), - .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, 
is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) + .data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, flags_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id}), + .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, flags_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0}) ); if (UUID_WIDTH != 0) begin : g_req_uuid_st0 @@ -310,147 +317,121 @@ module VX_cache_bank #( assign req_uuid_st0 = '0; end - wire do_init_st0 = valid_st0 && is_init_st0; - wire do_flush_st0 = valid_st0 && is_flush_st0; - wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0; - wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0; - wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0; - wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0; - wire do_fill_st0 = valid_st0 && is_fill_st0; - wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0; - wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0; - wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0; + wire is_read_st0 = is_creq_st0 && ~rw_st0; + wire is_write_st0 = is_creq_st0 && rw_st0; + + wire do_init_st0 = valid_st0 && is_init_st0; + wire do_flush_st0 = valid_st0 && is_flush_st0; + wire do_read_st0 = valid_st0 && is_read_st0; + wire do_write_st0 = valid_st0 && is_write_st0; + wire do_fill_st0 = valid_st0 && is_fill_st0; - wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; + wire is_read_st1 = is_creq_st1 && ~rw_st1; + wire is_write_st1 = is_creq_st1 && rw_st1; - assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; + wire do_read_st1 = valid_st1 && is_read_st1; + wire do_write_st1 = valid_st1 && is_write_st1; - wire [NUM_WAYS-1:0] evict_way_st0; - wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; + assign line_idx_st0 = 
addr_st0[`CS_LINE_SEL_BITS-1:0]; + assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0); + + assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0]; + + wire do_lookup_st0 = do_read_st0 || do_write_st0; + wire do_lookup_st1 = do_read_st1 || do_write_st1; + + wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0; + wire [NUM_WAYS-1:0] tag_matches_st0; + + VX_cache_repl #( + .CACHE_SIZE (CACHE_SIZE), + .LINE_SIZE (LINE_SIZE), + .NUM_BANKS (NUM_BANKS), + .NUM_WAYS (NUM_WAYS), + .REPL_POLICY (REPL_POLICY) + ) cache_repl ( + .clk (clk), + .reset (reset), + .stall (pipe_stall), + .hit_valid (do_lookup_st1 && is_hit_st1 && ~pipe_stall), + .hit_line (line_idx_st1), + .hit_way (way_idx_st1), + .repl_valid (do_fill_st0 && ~pipe_stall), + .repl_line (line_idx_st0), + .repl_way (victim_way_st0) + ); + + assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0; VX_cache_tags #( - .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)), - .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), - .WRITEBACK (WRITEBACK), - .UUID_WIDTH (UUID_WIDTH) + .WRITEBACK (WRITEBACK) ) cache_tags ( .clk (clk), .reset (reset), - - .req_uuid (req_uuid_st0), - - .stall (pipe_stall), - - // init/flush/fill/write/lookup + // inputs .init (do_init_st0), - .flush (do_flush_st0), - .fill (do_fill_st0), - .write (do_cache_wr_st0), - .lookup (do_lookup_st0), - .line_addr (addr_st0), - .way_sel (flush_way_st0), - .tag_matches(tag_matches_st0), - - // replacement - .evict_dirty(evict_dirty_st0), + .flush (do_flush_st0 && ~pipe_stall), + .fill (do_fill_st0 && ~pipe_stall), + .read (do_read_st0 && ~pipe_stall), + .write (do_write_st0 && ~pipe_stall), + .line_idx (line_idx_st0), + .line_tag (line_tag_st0), .evict_way (evict_way_st0), + // outputs + .tag_matches(tag_matches_st0), + .evict_dirty(is_dirty_st0), .evict_tag (evict_tag_st0) ); - wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0; - - wire is_flush2_st0 = WRITEBACK && is_flush_st0; - - 
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0; + VX_onehot_encoder #( + .N (NUM_WAYS) + ) way_idx_enc ( + .data_in (tag_matches_st0), + .data_out (hit_idx_st0), + `UNUSED_PIN (valid_out) + ); - assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0; + assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0; + assign is_hit_st0 = (| tag_matches_st0); - assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0; + wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; + assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, flags_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, data_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, 
mshr_previd_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, flags_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1}) ); - // we have a tag hit - wire is_hit_st1 = (| way_sel_st1); - if (UUID_WIDTH != 0) begin : g_req_uuid_st1 assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH]; end else begin : g_req_uuid_st1_0 assign req_uuid_st1 = '0; end - wire is_read_st1 = is_creq_st1 && ~rw_st1; - wire is_write_st1 = is_creq_st1 && rw_st1; - - wire do_init_st1 = valid_st1 && is_init_st1; - wire do_fill_st1 = valid_st1 && is_fill_st1; - wire do_flush_st1 = valid_st1 && is_flush_st1; - - wire do_creq_rd_st1 = valid_st1 && is_read_st1; - wire do_creq_wr_st1 = valid_st1 && is_write_st1; - wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1; - wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1; - - wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1; - wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1; - - wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1; - wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1; - - wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; - wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; - - assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0]; - - `UNUSED_VAR (do_write_miss_st1) + assign addr_st1 = {line_tag_st1, line_idx_st1}; // ensure mshr replay always get a hit - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("%t: missed mshr replay", $time)) + `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~is_hit_st1), ("%t: missed mshr replay", $time)) - // both tag and data stores use BRAM with no read-during-write protection. - // we ned to stall the pipeline to prevent read-after-write hazards. 
- assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill - assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write - always @(posedge clk) begin - // stall reads following writes to same line address - rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1) - && ~rdw_hazard3_st1; // release pipeline stall - end - - wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}}; - wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1; - wire [LINE_SIZE-1:0] write_byteen_st1; - - wire [`CS_LINE_WIDTH-1:0] dirty_data_st1; - wire [LINE_SIZE-1:0] dirty_byteen_st1; + assign write_word_st1 = data_st1[`CS_WORD_WIDTH-1:0]; + `UNUSED_VAR (data_st1) - if (`CS_WORDS_PER_LINE > 1) begin : g_write_byteen_st1_wsel - reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen_w; - always @(*) begin - write_byteen_w = '0; - write_byteen_w[wsel_st1] = byteen_st1; - end - assign write_byteen_st1 = write_byteen_w; - end else begin : g_write_byteen_st1 - assign write_byteen_st1 = byteen_st1; - end + wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1; + wire [LINE_SIZE-1:0] evict_byteen_st1; VX_cache_data #( - .INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)), - .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), @@ -458,56 +439,58 @@ module VX_cache_bank #( .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), - .UUID_WIDTH (UUID_WIDTH) + .DIRTY_BYTES (DIRTY_BYTES) ) cache_data ( .clk (clk), .reset (reset), - - .req_uuid (req_uuid_st1), - .stall (pipe_stall), - - .init (do_init_st1), - .read (do_cache_rd_st1), - .fill (do_fill_st1), - .flush (do_flush_st1), - .write (do_cache_wr_st1), - .way_sel (way_sel_st1), - .line_addr (addr_st1), - .wsel (wsel_st1), - .fill_data (fill_data_st1), - .write_data (write_data_st1), - .write_byteen(write_byteen_st1), + // 
inputs + .init (do_init_st0), + .fill (do_fill_st0 && ~pipe_stall), + .flush (do_flush_st0 && ~pipe_stall), + .read (do_read_st0 && ~pipe_stall), + .write (do_write_st0 && ~pipe_stall), + .evict_way (evict_way_st0), + .tag_matches(tag_matches_st0), + .line_idx (line_idx_st0), + .fill_data (data_st0), + .write_word (write_word_st0), + .word_idx (word_idx_st0), + .write_byteen(byteen_st0), + .way_idx_r (way_idx_st1), + // outputs .read_data (read_data_st1), - .dirty_data (dirty_data_st1), - .dirty_byteen(dirty_byteen_st1) + .evict_byteen(evict_byteen_st1) ); - wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0; - wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0; - wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall; - wire mshr_lookup_st0 = mshr_allocate_st0; - wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall; + // only allocate MSHR entries for non-replay core requests + wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0; + wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1; // release allocated mshr entry if we had a hit wire mshr_release_st1; - if (WRITEBACK) begin : g_mshr_release_st1 + if (WRITEBACK) begin : g_mshr_release assign mshr_release_st1 = is_hit_st1; - end else begin : g_mshr_release_st1_ro - // we need to keep missed write requests in MSHR if there is already a pending entry to the same address - // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content - // this can happen when writes are sent late, when the fill was already in flight. + end else begin : g_mshr_release_ro + // we need to keep missed write requests in MSHR if there is already a pending entry to the same address. + // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content. + // this can happen when writes are sent to memory late, when a related fill was already in flight. 
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1); end + wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall; + + wire [1:0] mshr_dequeue; + `POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire}); + VX_pending_size #( - .SIZE (MSHR_SIZE) + .SIZE (MSHR_SIZE), + .DECRW (2) ) mshr_pending_size ( .clk (clk), .reset (reset), .incr (core_req_fire), - .decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)), + .decr (mshr_dequeue), .empty (mshr_empty), `UNUSED_PIN (alm_empty), .full (mshr_alm_full), @@ -516,11 +499,12 @@ module VX_cache_bank #( ); VX_cache_mshr #( - .INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-mshr", INSTANCE_ID))), .BANK_ID (BANK_ID), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .MSHR_SIZE (MSHR_SIZE), + .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH) ) cache_mshr ( @@ -528,7 +512,7 @@ module VX_cache_bank #( .reset (reset), .deq_req_uuid (req_uuid_sel), - .lkp_req_uuid (req_uuid_st0), + .alc_req_uuid (req_uuid_st0), .fin_req_uuid (req_uuid_st1), // memory fill @@ -545,37 +529,23 @@ module VX_cache_bank #( .dequeue_ready (replay_ready), // allocate - .allocate_valid (mshr_allocate_st0), + .allocate_valid (mshr_allocate_st0 && ~pipe_stall), .allocate_addr (addr_st0), .allocate_rw (rw_st0), - .allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}), + .allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0}), .allocate_id (mshr_alloc_id_st0), - .allocate_prev (mshr_prev_st0), + .allocate_pending(mshr_pending_st0), + .allocate_previd(mshr_previd_st0), `UNUSED_PIN (allocate_ready), - // lookup - .lookup_valid (mshr_lookup_st0), - .lookup_addr (addr_st0), - .lookup_pending (mshr_lookup_pending_st0), - .lookup_rw (mshr_lookup_rw_st0), - // finalize - .finalize_valid (mshr_finalize_st1), - .finalize_release(mshr_release_st1), - 
.finalize_pending(mshr_pending_st1), + .finalize_valid (mshr_finalize_st1 && ~pipe_stall), + .finalize_is_release(mshr_release_st1), + .finalize_is_pending(mshr_pending_st1), .finalize_id (mshr_id_st1), - .finalize_prev (mshr_prev_st1) + .finalize_previd(mshr_previd_st1) ); - // check if there are pending requests to same line in the MSHR - wire [MSHR_SIZE-1:0] lookup_matches; - for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_lookup_matches - assign lookup_matches[i] = mshr_lookup_pending_st0[i] - && (i != mshr_alloc_id_st0) // exclude current mshr id - && (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough - end - assign mshr_pending_st0 = (| lookup_matches); - // schedule core response wire crsp_queue_valid, crsp_queue_ready; @@ -583,9 +553,9 @@ module VX_cache_bank #( wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx; wire [TAG_WIDTH-1:0] crsp_queue_tag; - assign crsp_queue_valid = do_cache_rd_st1; + assign crsp_queue_valid = do_read_st1 && is_hit_st1; assign crsp_queue_idx = req_idx_st1; - assign crsp_queue_data = read_data_st1; + assign crsp_queue_data = read_data_st1[word_idx_st1]; assign crsp_queue_tag = tag_st1; VX_elastic_buffer #( @@ -595,7 +565,7 @@ module VX_cache_bank #( ) core_rsp_queue ( .clk (clk), .reset (reset), - .valid_in (crsp_queue_valid && ~rdw_hazard3_st1), + .valid_in (crsp_queue_valid), .ready_in (crsp_queue_ready), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), @@ -613,51 +583,68 @@ module VX_cache_bank #( wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr; wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag; wire mreq_queue_rw; - wire mreq_queue_flush; + wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags; - wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1; + wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK); wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; - wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1; - - 
if (WRITEBACK) begin : g_mreq_queue_push - if (DIRTY_BYTES) begin : g_dirty_bytes - // ensure dirty bytes match the tag info - wire has_dirty_bytes = (| dirty_byteen_st1); - `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID))) - end - assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) - || do_writeback_st1) - && ~rdw_hazard3_st1; - end else begin : g_mreq_queue_push_ro - `UNUSED_VAR (do_writeback_st1) - assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) - || do_creq_wr_st1) - && ~rdw_hazard3_st1; - end - - assign mreq_queue_pop = mem_req_valid && mem_req_ready; - assign mreq_queue_addr = addr_st1; - assign mreq_queue_flush = creq_flush_st1; + wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1; + wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1}; if (WRITE_ENABLE) begin : g_mreq_queue - if (WRITEBACK) begin : g_writeback + if (WRITEBACK) begin : g_wb + if (DIRTY_BYTES) begin : g_dirty_bytes + // ensure dirty bytes match the tag info + wire has_dirty_bytes = (| evict_byteen_st1); + `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, is_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID))) + end + // issue a fill request on a read/write miss + // issue a writeback on a dirty line eviction + assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1) + || do_writeback_st1) + && ~pipe_stall; + assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1; assign mreq_queue_rw = is_fill_or_flush_st1; - assign mreq_queue_data = dirty_data_st1; - assign mreq_queue_byteen = is_fill_or_flush_st1 ? 
dirty_byteen_st1 : '1; - end else begin : g_writethrough + assign mreq_queue_data = read_data_st1; + assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1; + `UNUSED_VAR (write_word_st1) + `UNUSED_VAR (byteen_st1) + end else begin : g_wt + wire [LINE_SIZE-1:0] line_byteen; + VX_demux #( + .N (`CS_WORD_SEL_BITS), + .M (WORD_SIZE) + ) byteen_demux ( + .sel_in (word_idx_st1), + .data_in (byteen_st1), + .data_out (line_byteen) + ); + // issue a fill request on a read miss + // issue a memory write on a write request + assign mreq_queue_push = ((do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) + || do_write_st1) + && ~pipe_stall; + assign mreq_queue_addr = addr_st1; assign mreq_queue_rw = rw_st1; - assign mreq_queue_data = write_data_st1; - assign mreq_queue_byteen = rw_st1 ? write_byteen_st1 : '1; + assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}}; + assign mreq_queue_byteen = rw_st1 ? line_byteen : '1; `UNUSED_VAR (is_fill_or_flush_st1) - `UNUSED_VAR (dirty_data_st1) - `UNUSED_VAR (dirty_byteen_st1) + `UNUSED_VAR (do_writeback_st1) + `UNUSED_VAR (evict_addr_st1) + `UNUSED_VAR (evict_byteen_st1) end end else begin : g_mreq_queue_ro + // issue a fill request on a read miss + assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) + && ~pipe_stall; + assign mreq_queue_addr = addr_st1; assign mreq_queue_rw = 0; assign mreq_queue_data = '0; assign mreq_queue_byteen = '1; - `UNUSED_VAR (dirty_data_st1) - `UNUSED_VAR (dirty_byteen_st1) + `UNUSED_VAR (do_writeback_st1) + `UNUSED_VAR (evict_addr_st1) + `UNUSED_VAR (evict_byteen_st1) + `UNUSED_VAR (write_word_st1) + `UNUSED_VAR (byteen_st1) end if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid @@ -666,18 +653,21 @@ module VX_cache_bank #( assign mreq_queue_tag = mshr_id_st1; end + assign mreq_queue_pop = mem_req_valid && mem_req_ready; + assign mreq_queue_flags = flags_st1; + VX_fifo_queue #( - .DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1), + 
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), .DEPTH (MREQ_SIZE), - .ALM_FULL (MREQ_SIZE-PIPELINE_STAGES), + .ALM_FULL (MREQ_SIZE - PIPELINE_STAGES), .OUT_REG (MEM_OUT_REG) ) mem_req_queue ( .clk (clk), .reset (reset), .push (mreq_queue_push), .pop (mreq_queue_pop), - .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flush}), - .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flush}), + .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flags}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flags}), .empty (mreq_queue_empty), .alm_full (mreq_queue_alm_full), `UNUSED_PIN (full), @@ -687,11 +677,13 @@ module VX_cache_bank #( assign mem_req_valid = ~mreq_queue_empty; + `UNUSED_VAR (do_lookup_st0) + /////////////////////////////////////////////////////////////////////////////// `ifdef PERF_ENABLE - assign perf_read_misses = do_read_miss_st1; - assign perf_write_misses = do_write_miss_st1; + assign perf_read_misses = do_read_st1 && ~is_hit_st1; + assign perf_write_misses = do_write_st1 && ~is_hit_st1; assign perf_mshr_stalls = mshr_alm_full; `endif @@ -701,31 +693,76 @@ module VX_cache_bank #( && ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire); always @(posedge clk) begin if (input_stall || pipe_stall) begin - `TRACE(3, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1)) + `TRACE(4, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, + crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full)) end if (mem_rsp_fire) begin - `TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, 
BANK_ID), mem_rsp_id, mem_rsp_data, req_uuid_sel)) + `TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data, req_uuid_sel)) end if (replay_fire) begin - `TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)) + `TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)) end if (core_req_fire) begin if (core_req_rw) begin - `TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)) + `TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)) end else begin - `TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)) + `TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)) end end + if (do_init_st0) begin + `TRACE(3, ("%t: %s tags-init: addr=0x%0h, line=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), line_idx_st0)) + end + if (do_fill_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + end + if 
(do_flush_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + end + if (do_lookup_st0 && ~pipe_stall) begin + if (is_hit_st0) begin + `TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + end else begin + `TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + end + end + if (do_fill_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), way_idx_st0, line_idx_st0, data_st0, req_uuid_st0)) + end + if (do_flush_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st0, BANK_ID), way_idx_st0, line_idx_st0, req_uuid_st0)) + end + if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1)) + end + if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1)) + end if (crsp_queue_fire) begin - `TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, 
`CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)) + `TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)) end if (mreq_queue_push) begin - if (do_creq_wr_st1 && !WRITEBACK) begin - `TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) - end else if (do_writeback_st1) begin - `TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) + if (!WRITEBACK && do_write_st1) begin + `TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) + end else if (WRITEBACK && do_writeback_st1) begin + `TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) end else begin - `TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mshr_id_st1, req_uuid_st1)) + `TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mshr_id_st1, req_uuid_st1)) end end end diff --git a/hw/rtl/cache/VX_cache_bypass.sv b/hw/rtl/cache/VX_cache_bypass.sv index 4b3b3a59a..8f6234364 100644 --- a/hw/rtl/cache/VX_cache_bypass.sv +++ b/hw/rtl/cache/VX_cache_bypass.sv @@ -268,7 +268,7 @@ module VX_cache_bypass #( for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_valid assign 
core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || (is_mem_rsp_nc && rsp_idx == REQ_SEL_WIDTH'(i)); end - + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_ready assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i]; end diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index 5a8bb9865..fc4afdb0a 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -24,22 +24,22 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( parameter NUM_REQS = 4, // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 32768, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -52,20 +52,26 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_CYCLIC, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // enable bypass for non-cacheable addresses parameter NC_ENABLE = 0, // Core response output buffer - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( input wire clk, input wire reset, @@ -140,22 +146,24 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap VX_cache_wrap 
#( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)), + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, i))), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (ARB_TAG_WIDTH), + .FLAGS_WIDTH (FLAGS_WIDTH), .TAG_SEL_IDX (TAG_SEL_IDX), .CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF), .MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF), diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 04b0ff746..ddc40b1bd 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -14,8 +14,6 @@ `include "VX_cache_define.vh" module VX_cache_data #( - parameter `STRING INSTANCE_ID= "", - parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -31,171 +29,147 @@ module VX_cache_data #( // Enable cache writeback parameter WRITEBACK = 0, // Enable dirty bytes on writeback - parameter DIRTY_BYTES = 0, - // Request debug identifier - parameter UUID_WIDTH = 0 + parameter DIRTY_BYTES = 0 ) ( input wire clk, input wire reset, - -`IGNORE_UNUSED_BEGIN - input wire[`UP(UUID_WIDTH)-1:0] req_uuid, -`IGNORE_UNUSED_END - input wire stall, - + // inputs input wire init, - input wire read, input wire fill, input wire flush, + input wire read, input wire write, - input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, - input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel, + input wire [`CS_LINE_SEL_BITS-1:0] line_idx, + input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way, + input wire [NUM_WAYS-1:0] tag_matches, input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] 
fill_data, - input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data, - input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen, - input wire [NUM_WAYS-1:0] way_sel, - output wire [`CS_WORD_WIDTH-1:0] read_data, - output wire [`CS_LINE_WIDTH-1:0] dirty_data, - output wire [LINE_SIZE-1:0] dirty_byteen + input wire [`CS_WORD_WIDTH-1:0] write_word, + input wire [WORD_SIZE-1:0] write_byteen, + input wire [`UP(`CS_WORD_SEL_BITS)-1:0] word_idx, + input wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_r, + // outputs + output wire [`CS_LINE_WIDTH-1:0] read_data, + output wire [LINE_SIZE-1:0] evict_byteen ); - `UNUSED_SPARAM (INSTANCE_ID) - `UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (WORD_SIZE) `UNUSED_VAR (stall) - `UNUSED_VAR (line_addr) - `UNUSED_VAR (init) - `UNUSED_VAR (read) - `UNUSED_VAR (flush) - - localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1; - - wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; - - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata; - wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; - - if (WRITEBACK) begin : g_dirty_data - wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] transposed_rdata; - VX_transpose #( - .DATAW (`CS_WORD_WIDTH), - .N (`CS_WORDS_PER_LINE), - .M (NUM_WAYS) - ) transpose ( - .data_in (line_rdata), - .data_out (transposed_rdata) - ); - assign dirty_data = transposed_rdata[way_idx]; - end else begin : g_dirty_data_0 - assign dirty_data = '0; - end - if (DIRTY_BYTES) begin : g_dirty_byteen - wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata; - wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata; + if (DIRTY_BYTES != 0) begin : g_dirty_bytes - for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_bs_wdata - wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]); - assign bs_wdata[i] = init ? '0 : (way_sel[i] ? 
wdata : bs_rdata[i]); + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_rdata; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_wdata; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_wren; + + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_byteen_wdata + wire evict = fill || flush; + wire evict_way_en = (NUM_WAYS == 1) || (evict_way == i); + wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask; + for (genvar j = 0; j < `CS_WORDS_PER_LINE; ++j) begin : g_write_mask + wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == j); + assign write_mask[j] = write_byteen & {WORD_SIZE{word_en}}; + end + assign byteen_wdata[i] = {LINE_SIZE{write}}; // only asserted on writes + assign byteen_wren[i] = {LINE_SIZE{init}} + | {LINE_SIZE{evict && evict_way_en}} + | ({LINE_SIZE{write && tag_matches[i]}} & write_mask); end + wire byteen_read = fill || flush; + wire byteen_write = init || write || fill || flush; + VX_sp_ram #( .DATAW (LINE_SIZE * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK) + .WRENW (LINE_SIZE * NUM_WAYS), + .SIZE (`CS_LINES_PER_BANK), + .OUT_REG (1), + .RDW_MODE ("R") ) byteen_store ( .clk (clk), .reset (reset), - .read (write || fill || flush), - .write (init || write || fill || flush), - .wren (1'b1), - .addr (line_sel), - .wdata (bs_wdata), - .rdata (bs_rdata) + .read (byteen_read), + .write (byteen_write), + .wren (byteen_wren), + .addr (line_idx), + .wdata (byteen_wdata), + .rdata (byteen_rdata) ); - assign dirty_byteen = bs_rdata[way_idx]; - end else begin : g_dirty_byteen_0 - assign dirty_byteen = '1; + assign evict_byteen = byteen_rdata[way_idx_r]; + end else begin : g_no_dirty_bytes + `UNUSED_VAR (init) + `UNUSED_VAR (flush) + assign evict_byteen = '1; // update whole line end - // order the data layout to perform ways multiplexing last. - // this allows converting way index to binary in parallel with BRAM readaccess and way selection. 
- - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata; - wire [BYTEENW-1:0] line_wren; - - if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin : g_line_wdata - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w; - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_i - for (genvar j = 0; j < NUM_WAYS; ++j) begin : g_j - assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i]; - assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i]) - & {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}}; + wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_rdata; + + if (WRITE_ENABLE) begin : g_data_store + // create a single write-enable block ram to reduce area overhead + wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_wdata; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] line_wren; + wire line_write; + wire line_read; + + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_wdata + wire fill_way_en = (NUM_WAYS == 1) || (evict_way == i); + wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask; + for (genvar j = 0; j < `CS_WORDS_PER_LINE; ++j) begin : g_write_mask + wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == j); + assign write_mask[j] = write_byteen & {WORD_SIZE{word_en}}; end + assign line_wdata[i] = fill ? 
fill_data : {`CS_WORDS_PER_LINE{write_word}}; + assign line_wren[i] = {LINE_SIZE{fill && fill_way_en}} + | ({LINE_SIZE{write && tag_matches[i]}} & write_mask); end - assign line_wren = wren_w; - end else begin : g_line_wdata_ro - `UNUSED_VAR (write) - `UNUSED_VAR (write_byteen) - `UNUSED_VAR (write_data) - assign line_wdata = fill_data; - assign line_wren = fill; - end - VX_encoder #( - .N (NUM_WAYS) - ) way_enc ( - .data_in (way_sel), - .data_out (way_idx), - `UNUSED_PIN (valid_out) - ); - - wire line_read = (read && ~stall) - || (WRITEBACK && (fill || flush)); - - wire line_write = write || fill; - - VX_sp_ram #( - .DATAW (`CS_LINE_WIDTH * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK), - .WRENW (BYTEENW), - .NO_RWCHECK (1), - .RW_ASSERT (1) - ) data_store ( - .clk (clk), - .reset (reset), - .read (line_read), - .write (line_write), - .wren (line_wren), - .addr (line_sel), - .wdata (line_wdata), - .rdata (line_rdata) - ); - - wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata; - if (`CS_WORDS_PER_LINE > 1) begin : g_per_way_rdata_wsel - assign per_way_rdata = line_rdata[wsel]; - end else begin : g_per_way_rdata - `UNUSED_VAR (wsel) - assign per_way_rdata = line_rdata; - end - assign read_data = per_way_rdata[way_idx]; + assign line_read = read || ((fill || flush) && WRITEBACK); + assign line_write = fill || (write && WRITE_ENABLE); -`ifdef DBG_TRACE_CACHE - always @(posedge clk) begin - if (fill && ~stall) begin - `TRACE(3, ("%t: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)) - end - if (flush && ~stall) begin - `TRACE(3, ("%t: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=0x%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data)) - end - if (read && ~stall) begin - `TRACE(3, ("%t: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, 
`CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)) - end - if (write && ~stall) begin - `TRACE(3, ("%t: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)) + VX_sp_ram #( + .DATAW (NUM_WAYS * `CS_LINE_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .WRENW (NUM_WAYS * LINE_SIZE), + .OUT_REG (1), + .RDW_MODE ("R") + ) data_store ( + .clk (clk), + .reset (reset), + .read (line_read), + .write (line_write), + .wren (line_wren), + .addr (line_idx), + .wdata (line_wdata), + .rdata (line_rdata) + ); + end else begin : g_data_store + `UNUSED_VAR (write) + `UNUSED_VAR (write_byteen) + `UNUSED_VAR (write_word) + `UNUSED_VAR (word_idx) + `UNUSED_VAR (tag_matches) + + // we don't merge the ways into a single block ram due to WREN overhead + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_ways + wire fill_way_en = (NUM_WAYS == 1) || (evict_way == i); + VX_sp_ram #( + .DATAW (`CS_LINE_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .OUT_REG (1), + .RDW_MODE ("R") + ) data_store ( + .clk (clk), + .reset (reset), + .read (read), + .write (fill && fill_way_en), + .wren (1'b1), + .addr (line_idx), + .wdata (fill_data), + .rdata (line_rdata[i]) + ); end end -`endif + + assign read_data = line_rdata[way_idx_r]; endmodule diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 342a40a1b..65b239900 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -22,6 +22,7 @@ `define CS_LINE_WIDTH (8 * LINE_SIZE) `define CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS) `define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS) +`define CS_WAY_SEL_WIDTH `UP(`CS_WAY_SEL_BITS) `define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS)) `define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE) @@ -73,4 +74,10 @@ `PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, 
(count > 1)) \ `PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1)) +/////////////////////////////////////////////////////////////////////////////// + +`define CS_REPL_RANDOM 0 +`define CS_REPL_CYCLIC 1 +`define CS_REPL_PLRU 2 + `endif // VX_CACHE_DEFINE_VH diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index b318dc5af..d10cb5275 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -128,7 +128,8 @@ module VX_cache_flush #( lock_released_n = lock_released; flush_uuid_n = flush_uuid_r; case (state) - STATE_IDLE: begin + //STATE_IDLE: + default: begin if (flush_req_enable) begin state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH; for (integer i = NUM_REQS-1; i >= 0; --i) begin diff --git a/hw/rtl/cache/VX_cache_mshr.sv b/hw/rtl/cache/VX_cache_mshr.sv index 482c110dc..78557e1ce 100644 --- a/hw/rtl/cache/VX_cache_mshr.sv +++ b/hw/rtl/cache/VX_cache_mshr.sv @@ -24,36 +24,23 @@ // arrival and are dequeued in the same order. // Each entry has a next pointer to the next entry pending for the same cache line. // -// During the fill operation, the MSHR will release the MSHR entry at fill_id +// During the fill request, the MSHR will dequeue the MSHR entry at the fill_id location // which represents the first request in the pending list that initiated the memory fill. // -// The dequeue operation directly follows the fill operation and will release +// The dequeue response directly follows the fill request and will release // all the subsequent entries linked to fill_id (pending the same cache line). // -// During the allocation operation, the MSHR will allocate the next free slot +// During the allocation request, the MSHR will allocate the next free slot // for the incoming core request. We return the allocated slot id as well as // the slot id of the previous entry for the same cache line. This is used to -// link the new entry to the pending list during finalization. 
+// link the new entry to the pending list. // -// The lookup operation is used to find all pending entries for a given cache line. -// This is used to by the cache bank to determine if a cache miss is already pending -// and therefore avoid issuing a memory fill request. -// -// The finalize operation is used to release the allocated MSHR entry if we had a hit. -// If we had a miss and finalize_pending is true, we link the allocated entry to -// its corresponding pending list (via finalize_prev). +// The finalize request is used to persist or release the currently allocated MSHR entry +// if we had a cache miss or a hit, respectively. // // Warning: This MSHR implementation is strongly coupled with the bank pipeline // and as such changes to either module requires careful evaluation. // -// This architecture implements three pipeline stages: -// - Arbitration: cache bank arbitration before entering pipeline. -// fill and dequeue operations are executed at this stage. -// - stage 0: cache bank tag access stage. -// allocate and lookup operations are executed at this stage. -// - stage 1: cache bank tdatag access stage. -// finalize operation is executed at this stage. 
-// module VX_cache_mshr #( parameter `STRING INSTANCE_ID= "", @@ -68,6 +55,9 @@ module VX_cache_mshr #( parameter UUID_WIDTH = 0, // MSHR parameters parameter DATA_WIDTH = 1, + // Enable cache writeback + parameter WRITEBACK = 0, + parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE) ) ( input wire clk, @@ -75,7 +65,7 @@ module VX_cache_mshr #( `IGNORE_UNUSED_BEGIN input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid, - input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid, + input wire[`UP(UUID_WIDTH)-1:0] alc_req_uuid, input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid, `IGNORE_UNUSED_END @@ -98,26 +88,21 @@ module VX_cache_mshr #( input wire allocate_rw, input wire [DATA_WIDTH-1:0] allocate_data, output wire [MSHR_ADDR_WIDTH-1:0] allocate_id, - output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev, + output wire allocate_pending, + output wire [MSHR_ADDR_WIDTH-1:0] allocate_previd, output wire allocate_ready, - // lookup - input wire lookup_valid, - input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr, - output wire [MSHR_SIZE-1:0] lookup_pending, - output wire [MSHR_SIZE-1:0] lookup_rw, - // finalize input wire finalize_valid, - input wire finalize_release, - input wire finalize_pending, - input wire [MSHR_ADDR_WIDTH-1:0] finalize_id, - input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev + input wire finalize_is_release, + input wire finalize_is_pending, + input wire [MSHR_ADDR_WIDTH-1:0] finalize_previd, + input wire [MSHR_ADDR_WIDTH-1:0] finalize_id ); `UNUSED_PARAM (BANK_ID) - reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; - reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0]; + reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [0:MSHR_SIZE-1]; + reg [MSHR_ADDR_WIDTH-1:0] next_index [0:MSHR_SIZE-1]; reg [MSHR_SIZE-1:0] valid_table, valid_table_n; reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n; @@ -136,7 +121,7 @@ module VX_cache_mshr #( wire [MSHR_SIZE-1:0] addr_matches; for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches - assign addr_matches[i] = valid_table[i] && 
(addr_table[i] == lookup_addr); + assign addr_matches[i] = valid_table[i] && (addr_table[i] == allocate_addr); end VX_lzc #( @@ -148,11 +133,13 @@ module VX_cache_mshr #( .valid_out (allocate_rdy_n) ); - VX_encoder #( + // find matching tail-entry + VX_priority_encoder #( .N (MSHR_SIZE) ) prev_sel ( .data_in (addr_matches & ~next_table_x), - .data_out (prev_idx), + .index_out (prev_idx), + `UNUSED_PIN (onehot_out), `UNUSED_PIN (valid_out) ); @@ -171,17 +158,22 @@ module VX_cache_mshr #( valid_table_n[dequeue_id] = 0; if (next_table[dequeue_id]) begin dequeue_id_n = next_index[dequeue_id]; + end else if (finalize_valid && finalize_is_pending && (finalize_previd == dequeue_id)) begin + dequeue_id_n = finalize_id; end else begin dequeue_val_n = 0; end end if (finalize_valid) begin - if (finalize_release) begin + if (finalize_is_release) begin valid_table_n[finalize_id] = 0; end - if (finalize_pending) begin - next_table_x[finalize_prev] = 1; + // warning: This code allows 'finalize_is_pending' to be asserted regardless of hit/miss + // to reduce its propagation delay into the MSHR. This is safe because wrong updates + // to 'next_table_n' will be cleared during 'allocate_fire' below. 
+ if (finalize_is_pending) begin + next_table_x[finalize_previd] = 1; end end @@ -204,12 +196,12 @@ module VX_cache_mshr #( end if (allocate_fire) begin - addr_table[allocate_id] <= allocate_addr; + addr_table[allocate_id] <= allocate_addr; write_table[allocate_id] <= allocate_rw; end - if (finalize_valid && finalize_pending) begin - next_index[finalize_prev] <= finalize_id; + if (finalize_valid && finalize_is_pending) begin + next_index[finalize_previd] <= finalize_id; end dequeue_id_r <= dequeue_id_n; @@ -217,20 +209,20 @@ module VX_cache_mshr #( next_table <= next_table_n; end - `RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid)) + `RUNTIME_ASSERT(~(allocate_fire && valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, alc_req_uuid)) - `RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, + `RUNTIME_ASSERT(~(finalize_valid && ~valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid)) - `RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, + `RUNTIME_ASSERT(~(fill_valid && ~valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id)) VX_dp_ram #( - .DATAW (DATA_WIDTH), - .SIZE (MSHR_SIZE), - .LUTRAM (1) - ) entries ( + .DATAW (DATA_WIDTH), + .SIZE (MSHR_SIZE), + .RDW_MODE ("R") + ) mshr_store ( .clk (clk), .reset (reset), .read (1'b1), @@ -245,19 +237,20 @@ module VX_cache_mshr #( assign fill_addr = 
addr_table[fill_id]; assign allocate_ready = allocate_rdy; - assign allocate_id = allocate_id_r; - assign allocate_prev = prev_idx; - - assign dequeue_valid = dequeue_val; - assign dequeue_addr = addr_table[dequeue_id_r]; - assign dequeue_rw = write_table[dequeue_id_r]; - assign dequeue_id = dequeue_id_r; - - // return pending entries for the given cache line - assign lookup_pending = addr_matches; - assign lookup_rw = write_table; + assign allocate_id = allocate_id_r; + assign allocate_previd = prev_idx; + + if (WRITEBACK) begin : g_pending_wb + assign allocate_pending = |addr_matches; + end else begin : g_pending_wt + // exclude write requests if writethrough + assign allocate_pending = |(addr_matches & ~write_table); + end - `UNUSED_VAR (lookup_valid) + assign dequeue_valid = dequeue_val; + assign dequeue_addr = addr_table[dequeue_id_r]; + assign dequeue_rw = write_table[dequeue_id_r]; + assign dequeue_id = dequeue_id_r; `ifdef DBG_TRACE_CACHE reg show_table; @@ -265,23 +258,21 @@ module VX_cache_mshr #( if (reset) begin show_table <= 0; end else begin - show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire; + show_table <= allocate_fire || finalize_valid || fill_valid || dequeue_fire; end if (allocate_fire) begin - `TRACE(3, ("%t: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid)) + `TRACE(3, ("%t: %s allocate: addr=0x%0h, id=%0d, pending=%b, prev=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id, allocate_pending, prev_idx, alc_req_uuid)) end - if (lookup_valid) begin - `TRACE(3, ("%t: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid)) + if (finalize_valid && finalize_is_release) begin + `TRACE(3, ("%t: %s release: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, 
fin_req_uuid)) end - if (finalize_valid) begin - `TRACE(3, ("%t: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, - finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid)) + if (finalize_valid && finalize_is_pending) begin + `TRACE(3, ("%t: %s finalize: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid)) end if (fill_valid) begin - `TRACE(3, ("%t: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id)) + `TRACE(3, ("%t: %s fill: addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, + `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id)) end if (dequeue_fire) begin `TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, diff --git a/hw/rtl/cache/VX_cache_repl.sv b/hw/rtl/cache/VX_cache_repl.sv new file mode 100644 index 000000000..578c87002 --- /dev/null +++ b/hw/rtl/cache/VX_cache_repl.sv @@ -0,0 +1,202 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_cache_define.vh" + +// Fast PLRU encoder and decoder utility +// Adapted from BaseJump STL: http://bjump.org/data_out.html + +module plru_decoder #( + parameter NUM_WAYS = 1, + parameter WAY_IDX_BITS = $clog2(NUM_WAYS), + parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS) +) ( + input wire [WAY_IDX_WIDTH-1:0] way_idx, + output wire [`UP(NUM_WAYS-1)-1:0] lru_data, + output wire [`UP(NUM_WAYS-1)-1:0] lru_mask +); + if (NUM_WAYS > 1) begin : g_dec + wire [`UP(NUM_WAYS-1)-1:0] data; + `IGNORE_UNOPTFLAT_BEGIN + wire [`UP(NUM_WAYS-1)-1:0] mask; + `IGNORE_UNOPTFLAT_END + for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i + if (i == 0) begin : g_i_0 + assign mask[i] = 1'b1; + end else if (i % 2 == 1) begin : g_i_odd + assign mask[i] = mask[(i-1)/2] & ~way_idx[WAY_IDX_BITS-$clog2(i+2)+1]; + end else begin : g_i_even + assign mask[i] = mask[(i-2)/2] & way_idx[WAY_IDX_BITS-$clog2(i+2)+1]; + end + assign data[i] = ~way_idx[WAY_IDX_BITS-$clog2(i+2)]; + end + assign lru_data = data; + assign lru_mask = mask; + end else begin : g_no_dec + `UNUSED_VAR (way_idx) + assign lru_data = '0; + assign lru_mask = '0; + end + +endmodule + +module plru_encoder #( + parameter NUM_WAYS = 1, + parameter WAY_IDX_BITS = $clog2(NUM_WAYS), + parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS) +) ( + input wire [`UP(NUM_WAYS-1)-1:0] lru_in, + output wire [WAY_IDX_WIDTH-1:0] way_idx +); + if (NUM_WAYS > 1) begin : g_enc + wire [WAY_IDX_BITS-1:0] tmp; + for (genvar i = 0; i < WAY_IDX_BITS; ++i) begin : g_i + if (i == 0) begin : g_i_0 + assign tmp[WAY_IDX_WIDTH-1] = lru_in[0]; + end else begin : g_i_n + VX_mux #( + .N (2**i) + ) mux ( + .data_in (lru_in[((2**i)-1)+:(2**i)]), + .sel_in (tmp[WAY_IDX_BITS-1-:i]), + .data_out (tmp[WAY_IDX_BITS-1-i]) + ); + end + end + assign way_idx = tmp; + end else begin : g_no_enc + `UNUSED_VAR (lru_in) + assign way_idx = '0; + end + +endmodule + +module VX_cache_repl #( + parameter CACHE_SIZE = 1024, + // Size of line inside a bank in bytes + parameter LINE_SIZE = 
64, + // Number of banks + parameter NUM_BANKS = 1, + // Number of associative ways + parameter NUM_WAYS = 1, + // replacement policy + parameter REPL_POLICY = `CS_REPL_CYCLIC +) ( + input wire clk, + input wire reset, + input wire stall, + input wire hit_valid, + input wire [`CS_LINE_SEL_BITS-1:0] hit_line, + input wire [`CS_WAY_SEL_WIDTH-1:0] hit_way, + input wire repl_valid, + input wire [`CS_LINE_SEL_BITS-1:0] repl_line, + output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way +); + localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH; + `UNUSED_VAR (stall) + + if (NUM_WAYS > 1) begin : g_enable + if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru + // Pseudo Least Recently Used replacement policy + localparam LRU_WIDTH = `UP(NUM_WAYS-1); + + wire [LRU_WIDTH-1:0] plru_rdata; + wire [LRU_WIDTH-1:0] plru_wdata; + wire [LRU_WIDTH-1:0] plru_wmask; + + VX_dp_ram #( + .DATAW (LRU_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .WRENW (LRU_WIDTH), + .RDW_MODE ("R") + ) plru_store ( + .clk (clk), + .reset (reset), + .read (repl_valid), + .write (hit_valid), + .wren (plru_wmask), + .waddr (hit_line), + .raddr (repl_line), + .wdata (plru_wdata), + .rdata (plru_rdata) + ); + + plru_decoder #( + .NUM_WAYS (NUM_WAYS) + ) plru_dec ( + .way_idx (hit_way), + .lru_data (plru_wdata), + .lru_mask (plru_wmask) + ); + + plru_encoder #( + .NUM_WAYS (NUM_WAYS) + ) plru_enc ( + .lru_in (plru_rdata), + .way_idx (repl_way) + ); + + end else if (REPL_POLICY == `CS_REPL_CYCLIC) begin : g_cyclic + // Cyclic replacement policy + `UNUSED_VAR (hit_valid) + `UNUSED_VAR (hit_line) + `UNUSED_VAR (hit_way) + + wire [WAY_SEL_WIDTH-1:0] ctr_rdata; + wire [WAY_SEL_WIDTH-1:0] ctr_wdata = ctr_rdata + 1; + + VX_sp_ram #( + .DATAW (WAY_SEL_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .RDW_MODE ("R") + ) ctr_store ( + .clk (clk), + .reset (reset), + .read (repl_valid), + .write (repl_valid), + .wren (1'b1), + .addr (repl_line), + .wdata (ctr_wdata), + .rdata (ctr_rdata) + ); + + assign repl_way = ctr_rdata; + end else begin : g_random + 
// Random replacement policy + `UNUSED_VAR (hit_valid) + `UNUSED_VAR (hit_line) + `UNUSED_VAR (hit_way) + `UNUSED_VAR (repl_valid) + `UNUSED_VAR (repl_line) + reg [WAY_SEL_WIDTH-1:0] victim_idx; + always @(posedge clk) begin + if (reset) begin + victim_idx <= 0; + end else if (~stall) begin + victim_idx <= victim_idx + 1; + end + end + assign repl_way = victim_idx; + end + end else begin : g_disable + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + `UNUSED_VAR (hit_valid) + `UNUSED_VAR (hit_line) + `UNUSED_VAR (hit_way) + `UNUSED_VAR (repl_valid) + `UNUSED_VAR (repl_line) + assign repl_way = 1'b0; + end + +endmodule diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 92497b80b..e086ea94f 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -14,8 +14,6 @@ `include "VX_cache_define.vh" module VX_cache_tags #( - parameter `STRING INSTANCE_ID = "", - parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -27,96 +25,61 @@ module VX_cache_tags #( // Size of a word in bytes parameter WORD_SIZE = 1, // Enable cache writeback - parameter WRITEBACK = 0, - // Request debug identifier - parameter UUID_WIDTH = 0 + parameter WRITEBACK = 0 ) ( input wire clk, input wire reset, -`IGNORE_UNUSED_BEGIN - input wire [`UP(UUID_WIDTH)-1:0] req_uuid, -`IGNORE_UNUSED_END - - input wire stall, - - // init/fill/lookup + // inputs input wire init, input wire flush, input wire fill, + input wire read, input wire write, - input wire lookup, - input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, - input wire [NUM_WAYS-1:0] way_sel, - output wire [NUM_WAYS-1:0] tag_matches, + input wire [`CS_LINE_SEL_BITS-1:0] line_idx, + input wire [`CS_TAG_SEL_BITS-1:0] line_tag, + input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way, - // eviction + // outputs + output wire [NUM_WAYS-1:0] tag_matches, output wire evict_dirty, - output wire [NUM_WAYS-1:0] evict_way, output wire [`CS_TAG_SEL_BITS-1:0] 
evict_tag ); - `UNUSED_SPARAM (INSTANCE_ID) - `UNUSED_PARAM (BANK_ID) - `UNUSED_VAR (lookup) - // valid, dirty, tag localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS; - wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; - wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr); - wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag; wire [NUM_WAYS-1:0] read_valid; wire [NUM_WAYS-1:0] read_dirty; - - if (NUM_WAYS > 1) begin : g_evict_way - reg [NUM_WAYS-1:0] evict_way_r; - // cyclic assignment of replacement way - always @(posedge clk) begin - if (reset) begin - evict_way_r <= 1; - end else if (~stall) begin // holding the value on stalls prevents filling different slots twice - evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]}; - end - end - - assign evict_way = fill ? evict_way_r : way_sel; - - VX_onehot_mux #( - .DATAW (`CS_TAG_SEL_BITS), - .N (NUM_WAYS) - ) evict_tag_sel ( - .data_in (read_tag), - .sel_in (evict_way), - .data_out (evict_tag) - ); - end else begin : g_evict_way_0 - `UNUSED_VAR (stall) - assign evict_way = 1'b1; - assign evict_tag = read_tag; + `UNUSED_VAR (read) + + if (WRITEBACK) begin : g_evict_tag_wb + assign evict_dirty = read_dirty[evict_way]; + assign evict_tag = read_tag[evict_way]; + end else begin : g_evict_tag_wt + `UNUSED_VAR (read_dirty) + assign evict_dirty = 1'b0; + assign evict_tag = '0; end - // fill and flush need to also read in writeback mode - wire fill_s = fill && (!WRITEBACK || ~stall); - wire flush_s = flush && (!WRITEBACK || ~stall); - for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store + wire way_en = (NUM_WAYS == 1) || (evict_way == i); + wire do_init = init; // init all ways + wire do_fill = fill && way_en; + wire do_flush = flush && (!WRITEBACK || way_en); // flush the whole line in writethrough mode + wire do_write = WRITEBACK && write && tag_matches[i]; // only write on tag hit - wire do_fill = fill_s && evict_way[i]; - wire do_flush = flush_s && 
(!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode - wire do_write = WRITEBACK && write && tag_matches[i]; - - wire line_read = (WRITEBACK && (fill_s || flush_s)); - wire line_write = init || do_fill || do_flush || do_write; - wire line_valid = ~(init || flush); + wire line_read = read || write || (WRITEBACK && (fill || flush)); + wire line_write = do_init || do_fill || do_flush || do_write; + wire line_valid = fill || write; wire [TAG_WIDTH-1:0] line_wdata; wire [TAG_WIDTH-1:0] line_rdata; - if (WRITEBACK) begin : g_writeback + if (WRITEBACK) begin : g_wdata assign line_wdata = {line_valid, write, line_tag}; assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata; - end else begin : g_writethrough + end else begin : g_wdata assign line_wdata = {line_valid, line_tag}; assign {read_valid[i], read_tag[i]} = line_rdata; assign read_dirty[i] = 1'b0; @@ -125,15 +88,14 @@ module VX_cache_tags #( VX_sp_ram #( .DATAW (TAG_WIDTH), .SIZE (`CS_LINES_PER_BANK), - .NO_RWCHECK (1), - .RW_ASSERT (1) + .RDW_MODE ("W") ) tag_store ( .clk (clk), .reset (reset), .read (line_read), .write (line_write), .wren (1'b1), - .addr (line_sel), + .addr (line_idx), .wdata (line_wdata), .rdata (line_rdata) ); @@ -143,36 +105,4 @@ module VX_cache_tags #( assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]); end - assign evict_dirty = | (read_dirty & evict_way); - -`ifdef DBG_TRACE_CACHE - wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; - always @(posedge clk) begin - if (fill && ~stall) begin - `TRACE(3, ("%t: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID))) - end - if (init) begin - `TRACE(3, ("%t: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)) - end - if (flush && ~stall) 
begin - `TRACE(3, ("%t: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty)) - end - if (lookup && ~stall) begin - if (tag_matches != 0) begin - if (write) begin - `TRACE(3, ("%t: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)) - end else begin - `TRACE(3, ("%t: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)) - end - end else begin - if (write) begin - `TRACE(3, ("%t: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)) - end else begin - `TRACE(3, ("%t: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)) - end - end - end - end -`endif - endmodule diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index 3fa0e5d65..45664af2b 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -20,7 +20,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( parameter NUM_REQS = 4, // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 65536, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks @@ -28,37 +28,37 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 8, // Miss Reserv Queue Knob parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 8, // 
Memory Request Queue Size - parameter MREQ_SIZE = 4, + parameter MREQ_SIZE = 8, // Enable cache writeable parameter WRITE_ENABLE = 1, // Enable cache writeback - parameter WRITEBACK = 0, + parameter WRITEBACK = 1, // Enable dirty bytes on writeback - parameter DIRTY_BYTES = 0, + parameter DIRTY_BYTES = 1, // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size - parameter TAG_WIDTH = 16, + parameter TAG_WIDTH = 32, // Core response output buffer - parameter CORE_OUT_BUF = 2, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 2, + parameter MEM_OUT_BUF = 3, parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS) ) ( diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 0b8a1f3c4..c181fb466 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -27,18 +27,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, // Number of associative ways - parameter NUM_WAYS = 1, + parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -51,12 +51,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_CYCLIC, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // enable bypass for non-cacheable addresses parameter NC_ENABLE = 0, @@ -64,10 +70,10 @@ 
module VX_cache_wrap import VX_gpu_pkg::*; #( parameter PASSTHRU = 0, // Core response output buffer - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( input wire clk, @@ -166,15 +172,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), + .FLAGS_WIDTH (FLAGS_WIDTH), .CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF), .MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF) ) cache ( @@ -232,13 +240,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (core_req_fire) begin if (core_bus_if[i].req_data.rw) begin - `TRACE(1, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)) + `TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)) end else begin - `TRACE(1, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid)) + `TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, 
core_req_uuid)) end end if (core_rsp_fire) begin - `TRACE(1, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)) + `TRACE(2, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)) end end end @@ -260,15 +268,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_req_fire) begin if (mem_bus_if.req_data.rw) begin - `TRACE(1, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", + `TRACE(2, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid)) end else begin - `TRACE(1, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", + `TRACE(2, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid)) end end if (mem_rsp_fire) begin - `TRACE(1, ("%t: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", + `TRACE(2, ("%t: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid)) end end diff --git a/hw/rtl/core/VX_alu_int.sv b/hw/rtl/core/VX_alu_int.sv index 53c7ae57a..8e43d8f3f 100644 --- a/hw/rtl/core/VX_alu_int.sv +++ b/hw/rtl/core/VX_alu_int.sv @@ -194,7 +194,7 @@ module VX_alu_int #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (br_enable) begin - `TRACE(1, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n", + `TRACE(2, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n", $time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid)) end end diff --git 
a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index 951cd811b..e87221709 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -89,7 +89,7 @@ module VX_alu_unit #( ); VX_alu_int #( - .INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)), + .INSTANCE_ID (`SFORMATF(("%s-int%0d", INSTANCE_ID, block_idx))), .BLOCK_IDX (block_idx), .NUM_LANES (NUM_LANES) ) alu_int ( @@ -102,7 +102,7 @@ module VX_alu_unit #( `ifdef EXT_M_ENABLE VX_alu_muldiv #( - .INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)), + .INSTANCE_ID (`SFORMATF(("%s-muldiv%0d", INSTANCE_ID, block_idx))), .NUM_LANES (NUM_LANES) ) muldiv_unit ( .clk (clk), diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 260cedca3..62ed016af 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -87,7 +87,7 @@ module VX_core import VX_gpu_pkg::*; #( `SCOPE_IO_SWITCH (3); VX_schedule #( - .INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-schedule", INSTANCE_ID))), .CORE_ID (CORE_ID) ) schedule ( .clk (clk), @@ -115,7 +115,7 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_fetch #( - .INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-fetch", INSTANCE_ID))) ) fetch ( `SCOPE_IO_BIND (0) .clk (clk), @@ -126,7 +126,7 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_decode #( - .INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-decode", INSTANCE_ID))) ) decode ( .clk (clk), .reset (reset), @@ -136,7 +136,7 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_issue #( - .INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-issue", INSTANCE_ID))) ) issue ( `SCOPE_IO_BIND (1) @@ -153,7 +153,7 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_execute #( - .INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-execute", INSTANCE_ID))), .CORE_ID (CORE_ID) ) execute ( `SCOPE_IO_BIND (2) @@ -181,7 +181,7 @@ 
module VX_core import VX_gpu_pkg::*; #( ); VX_commit #( - .INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-commit", INSTANCE_ID))) ) commit ( .clk (clk), .reset (reset), diff --git a/hw/rtl/core/VX_core_top.sv b/hw/rtl/core/VX_core_top.sv index 9ade1c28b..e16a80259 100644 --- a/hw/rtl/core/VX_core_top.sv +++ b/hw/rtl/core/VX_core_top.sv @@ -144,7 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #( `endif VX_core #( - .INSTANCE_ID ($sformatf("core")), + .INSTANCE_ID (`SFORMATF(("core"))), .CORE_ID (CORE_ID) ) core ( `SCOPE_IO_BIND (0) diff --git a/hw/rtl/core/VX_dcr_data.sv b/hw/rtl/core/VX_dcr_data.sv index 042c87e55..6a13e034a 100644 --- a/hw/rtl/core/VX_dcr_data.sv +++ b/hw/rtl/core/VX_dcr_data.sv @@ -50,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*; ( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (dcr_bus_if.write_valid) begin - `TRACE(1, ("%t: base-dcr: state=", $time)) + `TRACE(2, ("%t: base-dcr: state=", $time)) trace_base_dcr(1, dcr_bus_if.write_addr); - `TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data)) + `TRACE(2, (", data=0x%h\n", dcr_bus_if.write_data)) end end `endif diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index 4f66757f1..b737725ea 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -52,7 +52,7 @@ module VX_execute import VX_gpu_pkg::*; #( `endif VX_alu_unit #( - .INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-alu", INSTANCE_ID))) ) alu_unit ( .clk (clk), .reset (reset), @@ -64,7 +64,7 @@ module VX_execute import VX_gpu_pkg::*; #( `SCOPE_IO_SWITCH (1); VX_lsu_unit #( - .INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-lsu", INSTANCE_ID))) ) lsu_unit ( `SCOPE_IO_BIND (0) .clk (clk), @@ -76,7 +76,7 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_F_ENABLE VX_fpu_unit #( - .INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-fpu", INSTANCE_ID))) ) fpu_unit ( .clk 
(clk), .reset (reset), @@ -87,7 +87,7 @@ module VX_execute import VX_gpu_pkg::*; #( `endif VX_sfu_unit #( - .INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-sfu", INSTANCE_ID))), .CORE_ID (CORE_ID) ) sfu_unit ( .clk (clk), diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index cf862aa06..802effe07 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -51,9 +51,9 @@ module VX_fetch import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (`PC_BITS + `NUM_THREADS), - .SIZE (`NUM_WARPS), - .LUTRAM (1) + .DATAW (`PC_BITS + `NUM_THREADS), + .SIZE (`NUM_WARPS), + .RDW_MODE ("R") ) tag_store ( .clk (clk), .reset (reset), @@ -166,7 +166,9 @@ module VX_fetch import VX_gpu_pkg::*; #( `SCOPE_IO_UNUSED(0) `endif `endif + `ifdef CHIPSCOPE +`ifdef DBG_SCOPE_FETCH ila_fetch ila_fetch_inst ( .clk (clk), .probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}), @@ -174,6 +176,7 @@ module VX_fetch import VX_gpu_pkg::*; #( .probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready}) ); `endif +`endif `ifdef DBG_TRACE_MEM always @(posedge clk) begin diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv index e1a9457de..abb261b7e 100644 --- a/hw/rtl/core/VX_ibuffer.sv +++ b/hw/rtl/core/VX_ibuffer.sv @@ -39,7 +39,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #( VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`IBUF_SIZE), - .OUT_REG (2) // 2-cycle EB for area reduction + .OUT_REG (1) ) instr_buf ( .clk (clk), .reset (reset), diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index ded232f30..6bec14504 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -16,7 +16,6 @@ module VX_ipdom_stack #( parameter WIDTH = 1, parameter DEPTH = 1, - parameter OUT_REG = 0, parameter ADDRW = `LOG2UP(DEPTH) ) ( input wire clk, @@ -31,76 +30,63 @@ module VX_ipdom_stack #( output wire empty, output wire full ); - reg slot_set 
[DEPTH-1:0]; - - reg [ADDRW-1:0] rd_ptr, wr_ptr; + reg [ADDRW-1:0] rd_ptr, rd_ptr_n, wr_ptr; reg empty_r, full_r; wire [WIDTH-1:0] d0, d1; - wire d_set_n = slot_set[rd_ptr]; + wire d_set_r; + + always @(*) begin + rd_ptr_n = rd_ptr; + if (push) begin + rd_ptr_n = wr_ptr; + end else if (pop) begin + rd_ptr_n = rd_ptr - ADDRW'(d_set_r); + end + end always @(posedge clk) begin if (reset) begin - rd_ptr <= '0; wr_ptr <= '0; empty_r <= 1; full_r <= 0; + rd_ptr <= '0; end else begin `ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time)); `ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time)); `ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time)); if (push) begin - rd_ptr <= wr_ptr; wr_ptr <= wr_ptr + ADDRW'(1); empty_r <= 0; full_r <= (ADDRW'(DEPTH-1) == wr_ptr); end else if (pop) begin - wr_ptr <= wr_ptr - ADDRW'(d_set_n); - rd_ptr <= rd_ptr - ADDRW'(d_set_n); - empty_r <= (rd_ptr == 0) && (d_set_n == 1); + wr_ptr <= wr_ptr - ADDRW'(d_set_r); + empty_r <= (rd_ptr == 0) && d_set_r; full_r <= 0; end + rd_ptr <= rd_ptr_n; end end + wire [WIDTH * 2:0] qout = push ? {1'b0, q1, q0} : {1'b1, d1, d0}; + VX_dp_ram #( - .DATAW (WIDTH * 2), - .SIZE (DEPTH), - .OUT_REG (OUT_REG ? 1 : 0), - .LUTRAM (OUT_REG ? 0 : 1) - ) store ( + .DATAW (1 + WIDTH * 2), + .SIZE (DEPTH), + .OUT_REG (1), + .RDW_MODE ("R") + ) ipdom_store ( .clk (clk), .reset (reset), .read (1'b1), - .write (push), + .write (push || pop), .wren (1'b1), - .waddr (wr_ptr), - .wdata ({q1, q0}), - .raddr (rd_ptr), - .rdata ({d1, d0}) - ); - - always @(posedge clk) begin - if (push) begin - slot_set[wr_ptr] <= 0; - end else if (pop) begin - slot_set[rd_ptr] <= 1; - end - end - - wire d_set_r; - - VX_pipe_register #( - .DATAW (1), - .DEPTH (OUT_REG) - ) pipe_reg ( - .clk (clk), - .reset (reset), - .enable (1'b1), - .data_in (d_set_n), - .data_out (d_set_r) + .waddr (push ? 
wr_ptr : rd_ptr), + .wdata (qout), + .raddr (rd_ptr_n), + .rdata ({d_set_r, d1, d0}) ); assign d = d_set_r ? d0 : d1; diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 84bcc0072..924d1a67d 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -52,7 +52,7 @@ module VX_issue import VX_gpu_pkg::*; #( `SCOPE_IO_SWITCH (`ISSUE_WIDTH); - for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_issue_slices + for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_slices VX_decode_if #( .NUM_WARPS (PER_ISSUE_WARPS) ) per_issue_decode_if(); @@ -78,7 +78,7 @@ module VX_issue import VX_gpu_pkg::*; #( `endif VX_issue_slice #( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)), + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, issue_id))), .ISSUE_ID (issue_id) ) issue_slice ( `SCOPE_IO_BIND(issue_id) diff --git a/hw/rtl/core/VX_issue_slice.sv b/hw/rtl/core/VX_issue_slice.sv index f287525c7..5af5f0ef0 100644 --- a/hw/rtl/core/VX_issue_slice.sv +++ b/hw/rtl/core/VX_issue_slice.sv @@ -37,7 +37,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #( VX_operands_if operands_if(); VX_ibuffer #( - .INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID))) ) ibuffer ( .clk (clk), .reset (reset), @@ -49,7 +49,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #( ); VX_scoreboard #( - .INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-scoreboard", INSTANCE_ID))) ) scoreboard ( .clk (clk), .reset (reset), @@ -64,7 +64,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #( ); VX_operands #( - .INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-operands", INSTANCE_ID))) ) operands ( .clk (clk), .reset (reset), @@ -77,7 +77,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #( ); VX_dispatch #( - .INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-dispatch", INSTANCE_ID))) ) 
dispatch ( .clk (clk), .reset (reset), @@ -143,7 +143,9 @@ module VX_issue_slice import VX_gpu_pkg::*; #( `SCOPE_IO_UNUSED(0) `endif `endif + `ifdef CHIPSCOPE +`ifdef DBG_SCOPE_ISSUE ila_issue ila_issue_inst ( .clk (clk), .probe0 ({decode_if.valid, decode_if.data, decode_if.ready}), @@ -152,6 +154,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #( .probe3 ({writeback_if.valid, writeback_if.data}) ); `endif +`endif `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin diff --git a/hw/rtl/core/VX_lsu_slice.sv b/hw/rtl/core/VX_lsu_slice.sv index 1f39ab5a7..333cbfa54 100644 --- a/hw/rtl/core/VX_lsu_slice.sv +++ b/hw/rtl/core/VX_lsu_slice.sv @@ -310,7 +310,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #( wire lsu_mem_rsp_ready; VX_mem_scheduler #( - .INSTANCE_ID ($sformatf("%s-memsched", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-memsched", INSTANCE_ID))), .CORE_REQS (NUM_LANES), .MEM_CHANNELS(NUM_LANES), .WORD_SIZE (LSU_WORD_SIZE), @@ -504,30 +504,30 @@ module VX_lsu_slice import VX_gpu_pkg::*; #( `ifdef DBG_TRACE_MEM always @(posedge clk) begin if (execute_if.valid && fence_lock) begin - `TRACE(1, ("%t: *** %s fence wait\n", $time, INSTANCE_ID)) + `TRACE(2, ("%t: *** %s fence wait\n", $time, INSTANCE_ID)) end if (mem_req_fire) begin if (mem_req_rw) begin - `TRACE(1, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES) - `TRACE(1, (", flags=")) - `TRACE_ARRAY1D(1, "%b", mem_req_flags, NUM_LANES) - `TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen)) - `TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES) - `TRACE(1, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) + `TRACE(2, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES) + 
`TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES) + `TRACE(2, (", byteen=0x%0h, data=", mem_req_byteen)) + `TRACE_ARRAY1D(2, "0x%0h", mem_req_data, NUM_LANES) + `TRACE(2, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) end else begin - `TRACE(1, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES) - `TRACE(1, (", flags=")) - `TRACE_ARRAY1D(1, "%b", mem_req_flags, NUM_LANES) - `TRACE(1, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) + `TRACE(2, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES) + `TRACE(2, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) end end if (mem_rsp_fire) begin - `TRACE(1, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=", + `TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=", $time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop)) - `TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES) - `TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid)) + `TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES) + `TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid)) end end `endif @@ -561,7 +561,9 @@ module VX_lsu_slice import VX_gpu_pkg::*; #( `SCOPE_IO_UNUSED(0) `endif `endif + `ifdef CHIPSCOPE +`ifdef DBG_SCOPE_LSU ila_lsu ila_lsu_inst ( .clk (clk), .probe0 
({execute_if.valid, execute_if.data, execute_if.ready}), @@ -569,5 +571,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #( .probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready}) ); `endif +`endif endmodule diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 6e9e2081c..7a64a849b 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -52,9 +52,9 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) per_block_commit_if[BLOCK_SIZE](); - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_lsus + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_slices VX_lsu_slice #( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx)) + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, block_idx))) ) lsu_slice( `SCOPE_IO_BIND (block_idx) .clk (clk), diff --git a/hw/rtl/core/VX_mem_unit.sv b/hw/rtl/core/VX_mem_unit.sv index c02e99b29..931ad65cd 100644 --- a/hw/rtl/core/VX_mem_unit.sv +++ b/hw/rtl/core/VX_mem_unit.sv @@ -92,7 +92,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #( end VX_local_mem #( - .INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)), + .INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))), .SIZE (1 << `LMEM_LOG_SIZE), .NUM_REQS (LSU_NUM_REQS), .NUM_BANKS (`LMEM_NUM_BANKS), @@ -127,11 +127,11 @@ module VX_mem_unit import VX_gpu_pkg::*; #( .TAG_WIDTH (DCACHE_TAG_WIDTH) ) dcache_coalesced_if[`NUM_LSU_BLOCKS](); - if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin : g_enabled + if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers VX_mem_coalescer #( - .INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)), + .INSTANCE_ID (`SFORMATF(("%s-coalescer%0d", INSTANCE_ID, i))), .NUM_REQS (`NUM_LSU_LANES), .DATA_IN_SIZE (LSU_WORD_SIZE), .DATA_OUT_SIZE (DCACHE_WORD_SIZE), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 42a91e4c2..48b01b4c6 
100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -178,14 +178,14 @@ module VX_operands import VX_gpu_pkg::*; #( wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; VX_pipe_buffer #( - .DATAW (NUM_BANKS + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH) + .DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW) ) pipe_reg2 ( .clk (clk), .reset (reset), .valid_in (pipe_valid2_st1), .ready_in (pipe_ready_st1), - .data_in ({gpr_rd_valid_st1, pipe_data_st1, gpr_rd_req_idx_st1}), - .data_out ({gpr_rd_valid_st2, pipe_data_st2, gpr_rd_req_idx_st2}), + .data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}), + .data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}), .valid_out(pipe_valid_st2), .ready_out(pipe_ready_st2) ); @@ -266,13 +266,12 @@ module VX_operands import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (REGS_DATAW), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS), - .OUT_REG (1), - .READ_ENABLE (1), .WRENW (BYTEENW), `ifdef GPR_RESET .RESET_RAM (1), `endif - .NO_RWCHECK (1) + .OUT_REG (1), + .RDW_MODE ("U") ) gpr_ram ( .clk (clk), .reset (reset), diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 9b49ae268..800b6b63f 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -290,7 +290,7 @@ module VX_schedule import VX_gpu_pkg::*; #( // split/join handling VX_split_join #( - .INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-splitjoin", INSTANCE_ID))) ) split_join ( .clk (clk), .reset (reset), @@ -388,7 +388,7 @@ module VX_schedule import VX_gpu_pkg::*; #( wire no_pending_instr = (& pending_warp_empty); - `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1); + `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1, 1); // export CSRs assign sched_csr_if.cycles = cycles; diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 1fe9a7f44..9ec9a6287 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ 
b/hw/rtl/core/VX_scoreboard.sv @@ -62,8 +62,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( .data_out (perf_sfu_per_cycle) ); - `BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); - `BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); + `BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); + `BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); wire [PER_ISSUE_WARPS-1:0] stg_valid_in; for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in @@ -206,7 +206,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end else begin if (staging_if[w].valid && ~staging_if[w].ready) begin `ifdef DBG_TRACE_PIPELINE - `TRACE(3, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", + `TRACE(4, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", $time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr, operands_busy, staging_if[w].data.uuid)) `endif diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index 5af6211f6..dccfcfe46 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -99,7 +99,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( ); VX_wctl_unit #( - .INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-wctl", INSTANCE_ID))), .NUM_LANES (NUM_LANES) ) wctl_unit ( .clk (clk), @@ -110,7 +110,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( ); VX_csr_unit #( - .INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-csr", INSTANCE_ID))), .CORE_ID (CORE_ID), .NUM_LANES (NUM_LANES) ) csr_unit ( diff --git a/hw/rtl/core/VX_split_join.sv b/hw/rtl/core/VX_split_join.sv index 7955437a6..c3f1f73f3 100644 --- a/hw/rtl/core/VX_split_join.sv +++ b/hw/rtl/core/VX_split_join.sv @@ -48,8 +48,7 @@ module VX_split_join 
import VX_gpu_pkg::*; #( for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks VX_ipdom_stack #( .WIDTH (`NUM_THREADS+`PC_BITS), - .DEPTH (`DV_STACK_SIZE), - .OUT_REG (0) + .DEPTH (`DV_STACK_SIZE) ) ipdom_stack ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_async_ram_patch.sv b/hw/rtl/libs/VX_async_ram_patch.sv new file mode 100644 index 000000000..fd29e881d --- /dev/null +++ b/hw/rtl/libs/VX_async_ram_patch.sv @@ -0,0 +1,158 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_platform.vh" + +`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end + +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`define RAM_BYPASS(__d) \ + reg [DATAW-1:0] bypass_data_r; \ + reg bypass_valid_r; \ + always @(posedge clk) begin \ + bypass_valid_r <= read_s && write && (raddr_s == waddr); \ + bypass_data_r <= wdata; \ + end \ + assign __d = bypass_valid_r ? 
bypass_data_r : rdata_r + +`TRACING_OFF +module VX_async_ram_patch #( + parameter DATAW = 1, + parameter SIZE = 1, + parameter WRENW = 1, + parameter DUAL_PORT = 0, + parameter INIT_ENABLE = 0, + parameter INIT_FILE = "", + parameter [DATAW-1:0] INIT_VALUE = 0, + parameter ADDRW = `LOG2UP(SIZE) +) ( + input wire clk, + input wire reset, + input wire read, + input wire write, + input wire [WRENW-1:0] wren, + input wire [ADDRW-1:0] waddr, + input wire [DATAW-1:0] wdata, + input wire [ADDRW-1:0] raddr, + output wire [DATAW-1:0] rdata +); + localparam WSELW = DATAW / WRENW; + + `UNUSED_VAR (reset) + + (* keep = "true" *) wire [ADDRW-1:0] raddr_w, raddr_s; + (* keep = "true" *) wire read_s, is_raddr_reg; + + assign raddr_w = raddr; + + VX_placeholder #( + .I (ADDRW), + .O (ADDRW + 1 + 1) + ) placeholder ( + .in (raddr_w), + .out ({raddr_s, read_s, is_raddr_reg}) + ); + + // synchronous ram + + wire [DATAW-1:0] rdata_s; + + if (WRENW != 1) begin : g_wren_sync_ram + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + reg [DATAW-1:0] rdata_r; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (read_s || write) begin + if (write) begin + `RAM_WRITE_WREN + end + rdata_r <= ram[raddr_s]; + end + end + `RAM_BYPASS(rdata_s); + end else begin : g_no_wren_sync_ram + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + reg [DATAW-1:0] rdata_r; + `RAM_INITIALIZATION + `UNUSED_VAR (wren) + always @(posedge clk) begin + if (read_s || write) begin + if (write) begin + ram[waddr] <= wdata; + end + rdata_r <= ram[raddr_s]; + end + end + `RAM_BYPASS(rdata_s); + end + + // asynchronous ram (fallback) + + wire [DATAW-1:0] rdata_a; + + if (DUAL_PORT != 0) begin : g_dp_async_ram + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + if (WRENW != 1) begin : g_wren + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + end else begin : g_no_wren + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; + end + end + end + assign rdata_a =
ram[raddr]; + end else begin : g_sp_async_ram + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + if (WRENW != 1) begin : g_wren + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + end else begin : g_no_wren + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; + end + end + end + assign rdata_a = ram[waddr]; + end + + assign rdata = is_raddr_reg ? rdata_s : rdata_a; + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 255789fd7..162b0581a 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -135,7 +135,7 @@ module VX_axi_adapter #( ); end - wire tbuf_full; + wire mem_req_tag_ready; wire [TAG_WIDTH_OUT-1:0] mem_req_tag_out; wire [TAG_WIDTH_OUT-1:0] mem_rsp_tag_out; @@ -143,13 +143,14 @@ module VX_axi_adapter #( if (TAG_WIDTH_IN > TAG_WIDTH_OUT) begin : g_tag_buf localparam TBUF_ADDRW = `CLOG2(TAG_BUFFER_SIZE); wire [TBUF_ADDRW-1:0] tbuf_waddr, tbuf_raddr; + wire tbuf_full; VX_index_buffer #( .DATAW (TAG_WIDTH_IN), .SIZE (TAG_BUFFER_SIZE) ) tag_buf ( .clk (clk), .reset (reset), - .acquire_en (mem_req_valid && !mem_req_rw && mem_req_ready), + .acquire_en (mem_req_valid && ~mem_req_rw && mem_req_ready), .write_addr (tbuf_waddr), .write_data (mem_req_tag), .read_data (mem_rsp_tag), @@ -158,22 +159,24 @@ module VX_axi_adapter #( .full (tbuf_full), `UNUSED_PIN (empty) ); + assign mem_req_tag_ready = mem_req_rw || ~tbuf_full; assign mem_req_tag_out = TAG_WIDTH_OUT'(tbuf_waddr); assign tbuf_raddr = mem_rsp_tag_out[TBUF_ADDRW-1:0]; `UNUSED_VAR (mem_rsp_tag_out) end else begin : g_no_tag_buf - assign tbuf_full = 0; + assign mem_req_tag_ready = 1; assign mem_req_tag_out = TAG_WIDTH_OUT'(mem_req_tag); assign mem_rsp_tag = mem_rsp_tag_out[TAG_WIDTH_IN-1:0]; `UNUSED_VAR (mem_rsp_tag_out) end // request ack - assign mem_req_ready = (mem_req_rw ? 
axi_write_ready[req_bank_sel] : m_axi_arready[req_bank_sel]) && ~tbuf_full; + assign mem_req_ready = mem_req_rw ? axi_write_ready[req_bank_sel] : + (m_axi_arready[req_bank_sel] && mem_req_tag_ready); // AXI write request address channel for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_axi_write_addr - assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~tbuf_full && ~m_axi_aw_ack[i]; + assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i]; assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(req_bank_off) << `CLOG2(DATA_WIDTH/8); assign m_axi_awid[i] = mem_req_tag_out; assign m_axi_awlen[i] = 8'b00000000; @@ -188,7 +191,7 @@ module VX_axi_adapter #( // AXI write request data channel for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_axi_write_data - assign m_axi_wvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~tbuf_full && ~m_axi_w_ack[i]; + assign m_axi_wvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_w_ack[i]; assign m_axi_wdata[i] = mem_req_data; assign m_axi_wstrb[i] = mem_req_byteen; assign m_axi_wlast[i] = 1'b1; @@ -205,7 +208,7 @@ module VX_axi_adapter #( // AXI read request channel for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_axi_read_req - assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i) && ~tbuf_full; + assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i) && mem_req_tag_ready; assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(req_bank_off) << `CLOG2(DATA_WIDTH/8); assign m_axi_arid[i] = mem_req_tag_out; assign m_axi_arlen[i] = 8'b00000000; @@ -228,9 +231,8 @@ module VX_axi_adapter #( assign rsp_arb_valid_in[i] = m_axi_rvalid[i]; assign rsp_arb_data_in[i] = {m_axi_rdata[i], m_axi_rid[i]}; assign m_axi_rready[i] = rsp_arb_ready_in[i]; - `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rlast[i] == 1, ("%t: *** AXI response error", $time)) - `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response 
error", $time)) - `UNUSED_VAR (m_axi_rlast[i]) + `RUNTIME_ASSERT(~(m_axi_rvalid[i] && m_axi_rlast[i] == 0), ("%t: *** AXI response error", $time)) + `RUNTIME_ASSERT(~(m_axi_rvalid[i] && m_axi_rresp[i] != 0), ("%t: *** AXI response error", $time)) end VX_stream_arb #( diff --git a/hw/rtl/libs/VX_cyclic_arbiter.sv b/hw/rtl/libs/VX_cyclic_arbiter.sv index a4dead008..9c28fcc4a 100644 --- a/hw/rtl/libs/VX_cyclic_arbiter.sv +++ b/hw/rtl/libs/VX_cyclic_arbiter.sv @@ -65,12 +65,12 @@ module VX_cyclic_arbiter #( .valid_out (grant_valid) ); - VX_decoder #( + VX_demux #( .N (LOG_NUM_REQS), .D (NUM_REQS) ) grant_decoder ( - .data_in (grant_index), - .valid_in (1'b1), + .sel_in (grant_index), + .data_in (1'b1), .data_out (grant_onehot_w) ); diff --git a/hw/rtl/libs/VX_decoder.sv b/hw/rtl/libs/VX_demux.sv similarity index 60% rename from hw/rtl/libs/VX_decoder.sv rename to hw/rtl/libs/VX_demux.sv index 7c0c760e5..b76ab42aa 100644 --- a/hw/rtl/libs/VX_decoder.sv +++ b/hw/rtl/libs/VX_demux.sv @@ -17,26 +17,31 @@ // Adapted from BaseJump STL: http://bjump.org/data_out.html `TRACING_OFF -module VX_decoder #( - parameter N = 1, +module VX_demux #( + parameter N = 0, parameter M = 1, parameter MODEL = 0, parameter D = 1 << N ) ( - input wire [N-1:0] data_in, - input wire [M-1:0] valid_in, + input wire [`UP(N)-1:0] sel_in, + input wire [M-1:0] data_in, output wire [D-1:0][M-1:0] data_out ); - logic [D-1:0][M-1:0] shift; - if (MODEL == 1) begin : g_model1 - always @(*) begin - shift = '0; - shift[data_in] = {M{1'b1}}; + if (N != 0) begin : g_decoder + logic [D-1:0][M-1:0] shift; + if (MODEL == 1) begin : g_model1 + always @(*) begin + shift = '0; + shift[sel_in] = {M{1'b1}}; + end + end else begin : g_model0 + assign shift = ((D*M)'({M{1'b1}})) << (sel_in * M); end - end else begin : g_model0 - assign shift = ((D*M)'({M{1'b1}})) << (data_in * M); + assign data_out = {D{data_in}} & shift; + end else begin : g_passthru + `UNUSED_VAR (sel_in) + assign data_out = data_in; end - assign 
data_out = {D{valid_in}} & shift; endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 21ab03ad5..0cff67882 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -13,6 +13,35 @@ `include "VX_platform.vh" +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`ifdef QUARTUS + `define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \ + end \ + end +`else + `define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end +`endif + `TRACING_OFF module VX_dp_ram #( parameter DATAW = 1, @@ -20,11 +49,9 @@ module VX_dp_ram #( parameter WRENW = 1, parameter OUT_REG = 0, parameter LUTRAM = 0, - parameter NO_RWCHECK = 0, - parameter RW_ASSERT = 0, + parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, U: undefined + parameter RDW_ASSERT = 0, parameter RESET_RAM = 0, - parameter RESET_OUT = 0, - parameter READ_ENABLE = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -41,284 +68,348 @@ module VX_dp_ram #( output wire [DATAW-1:0] rdata ); localparam WSELW = DATAW / WRENW; - `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter")) - -`define RAM_INITIALIZATION \ - if (INIT_ENABLE != 0) begin : g_init \ - if (INIT_FILE != "") begin : g_file \ - initial $readmemh(INIT_FILE, ram); \ - end else begin : g_value \ - initial begin \ - for (integer i = 0; i < SIZE; ++i) \ - ram[i] = INIT_VALUE; \ - end \ - end 
\ - end - - `UNUSED_PARAM (RW_ASSERT) - `UNUSED_VAR (read) + `UNUSED_PARAM (LUTRAM) - `RUNTIME_ASSERT((((WRENW == 1) ) || ~write) || (| wren), ("%t: invalid write enable mask", $time)) + `STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter")) + `STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "U"), ("invalid parameter")) + `UNUSED_PARAM (RDW_ASSERT) - if (OUT_REG && !READ_ENABLE) begin : g_out_reg - `UNUSED_PARAM (NO_RWCHECK) - reg [DATAW-1:0] rdata_r; - wire cs = read || write; - if (WRENW != 1) begin : g_writeen - `ifdef QUARTUS - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; +`ifdef SYNTHESIS + localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM); + if (OUT_REG) begin : g_sync + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN end + raddr_r <= raddr; end - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin - rdata_r <= ram[raddr]; + end + assign rdata = ram[raddr_r]; + end else begin : g_no_wren + (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[waddr] <= wdata; + end + raddr_r <= raddr; end end + assign rdata = ram[raddr_r]; end - end else begin : g_no_lutram - reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - 
ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN end + rdata_r <= ram[raddr]; end - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[waddr] <= wdata; + end rdata_r <= ram[raddr]; end end + assign rdata = rdata_r; end - end - `else - // default synthesis - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin + end else begin : g_undefined + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + `RAM_WRITE_WREN end - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin + if (read) begin rdata_r <= ram[raddr]; end end - end - end else begin : g_no_lutram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + ram[waddr] <= wdata; end - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin + if (read) begin rdata_r <= ram[raddr]; end end + assign 
rdata = rdata_r; end end - `endif - end else begin : g_no_writeen - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin - if (write) - ram[waddr] <= wdata; - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin - rdata_r <= ram[raddr]; + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + (* rw_addr_collision = "yes" *) `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end + raddr_r <= raddr; end end + assign rdata = ram[raddr_r]; + end else begin : g_no_wren + (* rw_addr_collision = "yes" *) reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[waddr] <= wdata; + end + raddr_r <= raddr; + end + end + assign rdata = ram[raddr_r]; end - - end else begin : g_no_lutram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (cs) begin - if (write) - ram[waddr] <= wdata; - if (RESET_OUT && reset) begin - rdata_r <= '0; - end else begin + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end rdata_r <= ram[raddr]; end end - end - end - end - assign rdata = rdata_r; - end else begin : g_no_out_reg - // OUT_REG==0 || READ_ENABLE=1 - wire [DATAW-1:0] rdata_w; - `ifdef SYNTHESIS - if (WRENW > 1) begin : g_writeen - `ifdef QUARTUS - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= 
wdata[i * WSELW +: WSELW]; + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[waddr] <= wdata; + end + rdata_r <= ram[raddr]; end end + assign rdata = rdata_r; end - assign rdata_w = ram[raddr]; - end else begin : g_no_lutram - if (NO_RWCHECK != 0) begin : g_no_rwcheck - `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + end else begin + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end + `RAM_WRITE_WREN + end + if (read) begin + rdata_r <= ram[raddr]; end end - assign rdata_w = ram[raddr]; - end else begin : g_rwcheck - reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; - end + ram[waddr] <= wdata; + end + if (read) begin + rdata_r <= ram[raddr]; end end - assign rdata_w = ram[raddr]; + assign rdata = rdata_r; end end - `else - // default synthesis - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; + end + end else begin : g_async + `UNUSED_VAR (read) + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "W") begin : g_write_first + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (1), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + 
.INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (waddr), + .wdata (wdata), + .raddr (raddr), + .rdata (rdata) + ); + `else + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[raddr]; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[waddr] <= wdata; end end + assign rdata = ram[raddr]; end - assign rdata_w = ram[raddr]; - end else begin : g_no_lutram - if (NO_RWCHECK != 0) begin : g_no_rwcheck - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `endif + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + `RAM_WRITE_WREN end end - assign rdata_w = ram[raddr]; - end else begin : g_rwcheck - reg [DATAW-1:0] ram [0:SIZE-1]; + assign rdata = ram[raddr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + ram[waddr] <= wdata; end end - assign rdata_w = ram[raddr]; + assign rdata = ram[raddr]; end end - `endif - end else begin : g_no_writeen - // (WRENW == 1) - if (LUTRAM != 0) begin : g_lutram - `USE_FAST_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + 
`RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end end - end - assign rdata_w = ram[raddr]; - end else begin : g_no_lutram - if (NO_RWCHECK != 0) begin : g_no_rwcheck - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + assign rdata = ram[raddr]; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin ram[waddr] <= wdata; end end - assign rdata_w = ram[raddr]; - end else begin : g_rwcheck - reg [DATAW-1:0] ram [0:SIZE-1]; + assign rdata = ram[raddr]; + end + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[raddr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin ram[waddr] <= wdata; end end - assign rdata_w = ram[raddr]; + assign rdata = ram[raddr]; end end end - `else - // simulation - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION + end +`else + // simulation + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION - wire [DATAW-1:0] ram_n; - for (genvar i = 0; i < WRENW; ++i) begin : g_ram_n - assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? 
wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; + always @(posedge clk) begin + if (RESET_RAM && reset) begin + for (integer i = 0; i < SIZE; ++i) begin + ram[i] <= DATAW'(INIT_VALUE); + end + end else if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) begin + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; + end + end end + end - always @(posedge clk) begin - if (RESET_RAM && reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); + if (OUT_REG) begin : g_sync + if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read || write) begin + raddr_r <= raddr; end - end else begin - if (write) begin - ram[waddr] <= ram_n; + end + assign rdata = ram[raddr_r]; + end else if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + rdata_r <= ram[raddr]; end end + assign rdata = rdata_r; + end else begin : g_undefined + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; end - - if (!LUTRAM && NO_RWCHECK) begin : g_rdata_no_bypass + end else begin : g_async + `UNUSED_VAR (read) + if (RDW_MODE == "W") begin : g_write_first + assign rdata = ram[raddr]; + end else begin : g_read_first reg [DATAW-1:0] prev_data; reg [ADDRW-1:0] prev_waddr; reg prev_write; @@ -335,30 +426,13 @@ module VX_dp_ram #( end end - assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; - if (RW_ASSERT) begin : g_rw_assert - `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("%t: read after write hazard", $time)) + assign rdata = (prev_write && (prev_waddr == raddr)) ? 
prev_data : ram[raddr]; + if (RDW_ASSERT) begin : g_rw_asert + `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("%t: read after write hazard", $time)) end - end else begin : g_rdata_with_bypass - assign rdata_w = ram[raddr]; end - `endif - - if (OUT_REG != 0) begin : g_rdata_req - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (READ_ENABLE && reset) begin - rdata_r <= '0; - end else if (!READ_ENABLE || read) begin - rdata_r <= rdata_w; - end - end - assign rdata = rdata_r; - end else begin : g_rdata_comb - assign rdata = rdata_w; - end - end +`endif endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index c5a4bf32e..720a1a2c6 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -15,12 +15,12 @@ `TRACING_OFF module VX_fifo_queue #( - parameter DATAW = 1, - parameter DEPTH = 2, + parameter DATAW = 32, + parameter DEPTH = 32, parameter ALM_FULL = (DEPTH - 1), parameter ALM_EMPTY = 1, parameter OUT_REG = 0, - parameter LUTRAM = 1, + parameter LUTRAM = 0, parameter SIZEW = `CLOG2(DEPTH+1) ) ( input wire clk, @@ -59,6 +59,8 @@ module VX_fifo_queue #( ); if (DEPTH == 1) begin : g_depth_1 + `UNUSED_PARAM (OUT_REG) + `UNUSED_PARAM (LUTRAM) reg [DATAW-1:0] head_r; @@ -74,91 +76,52 @@ module VX_fifo_queue #( localparam ADDRW = `CLOG2(DEPTH); - if (OUT_REG != 0) begin : g_out_reg - - wire [DATAW-1:0] dout; - reg [DATAW-1:0] dout_r; - reg [ADDRW-1:0] wr_ptr_r; - reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] rd_ptr_n_r; + wire [DATAW-1:0] data_out_w; + reg [ADDRW-1:0] rd_ptr_r; + reg [ADDRW-1:0] wr_ptr_r; - always @(posedge clk) begin - if (reset) begin - wr_ptr_r <= '0; - rd_ptr_r <= '0; - rd_ptr_n_r <= 1; - end else begin - wr_ptr_r <= wr_ptr_r + ADDRW'(push); - if (pop) begin - rd_ptr_r <= rd_ptr_n_r; - if (DEPTH > 2) begin - rd_ptr_n_r <= rd_ptr_r + ADDRW'(2); - end else begin // (DEPTH == 2); - rd_ptr_n_r <= ~rd_ptr_n_r; - end - end - end + always @(posedge clk) begin + if (reset) begin + 
wr_ptr_r <= '0; + rd_ptr_r <= (OUT_REG != 0) ? 1 : 0; + end else begin + wr_ptr_r <= wr_ptr_r + ADDRW'(push); + rd_ptr_r <= rd_ptr_r + ADDRW'(pop); end + end - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (DEPTH), - .LUTRAM (LUTRAM) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read (1'b1), - .write (push), - .wren (1'b1), - .waddr (wr_ptr_r), - .wdata (data_in), - .raddr (rd_ptr_n_r), - .rdata (dout) - ); - - wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); + wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); + wire bypass = push && (empty || (going_empty && pop)); + + VX_dp_ram #( + .DATAW (DATAW), + .SIZE (DEPTH), + .LUTRAM (LUTRAM), + .RDW_MODE ("W") + ) dp_ram ( + .clk (clk), + .reset (reset), + .read (~bypass), + .write (push), + .wren (1'b1), + .raddr (rd_ptr_r), + .waddr (wr_ptr_r), + .wdata (data_in), + .rdata (data_out_w) + ); + if (OUT_REG != 0) begin : g_out_reg + reg [DATAW-1:0] data_out_r; always @(posedge clk) begin - if (push && (empty || (going_empty && pop))) begin - dout_r <= data_in; + if (bypass) begin + data_out_r <= data_in; end else if (pop) begin - dout_r <= dout; + data_out_r <= data_out_w; end end - - assign data_out = dout_r; - + assign data_out = data_out_r; end else begin : g_no_out_reg - - reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] wr_ptr_r; - - always @(posedge clk) begin - if (reset) begin - rd_ptr_r <= '0; - wr_ptr_r <= '0; - end else begin - wr_ptr_r <= wr_ptr_r + ADDRW'(push); - rd_ptr_r <= rd_ptr_r + ADDRW'(pop); - end - end - - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (DEPTH), - .LUTRAM (LUTRAM) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read (1'b1), - .write (push), - .wren (1'b1), - .waddr (wr_ptr_r), - .wdata (data_in), - .raddr (rd_ptr_r), - .rdata (data_out) - ); - + assign data_out = data_out_w; end end diff --git a/hw/rtl/libs/VX_generic_arbiter.sv b/hw/rtl/libs/VX_generic_arbiter.sv index 5e090ebdd..2b0d086db 100644 --- a/hw/rtl/libs/VX_generic_arbiter.sv 
+++ b/hw/rtl/libs/VX_generic_arbiter.sv @@ -16,7 +16,7 @@ `TRACING_OFF module VX_generic_arbiter #( parameter NUM_REQS = 1, - parameter `STRING TYPE = "P", + parameter `STRING TYPE = "P", // P: priority, R: round-robin, M: matrix, C: cyclic parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) ) ( input wire clk, @@ -27,6 +27,8 @@ module VX_generic_arbiter #( output wire grant_valid, input wire grant_ready ); + `STATIC_ASSERT((TYPE == "P" || TYPE == "R" || TYPE == "M" || TYPE == "C"), ("invalid parameter")) + if (TYPE == "P") begin : g_priority `UNUSED_VAR (clk) @@ -84,10 +86,6 @@ module VX_generic_arbiter #( .grant_ready (grant_ready) ); - end else begin : g_invalid - - `ERROR(("invalid parameter")); - end `RUNTIME_ASSERT (((~(| requests) != 1) || (grant_valid && (requests[grant_index] != 0) && (grant_onehot == (NUM_REQS'(1) << grant_index)))), ("%t: invalid arbiter grant!", $time)) diff --git a/hw/rtl/libs/VX_index_buffer.sv b/hw/rtl/libs/VX_index_buffer.sv index 4e8439818..8d0320c5d 100644 --- a/hw/rtl/libs/VX_index_buffer.sv +++ b/hw/rtl/libs/VX_index_buffer.sv @@ -15,10 +15,10 @@ `TRACING_OFF module VX_index_buffer #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter LUTRAM = 1, - parameter ADDRW = `LOG2UP(SIZE) + parameter DATAW = 1, + parameter SIZE = 1, + parameter LUTRAM = 0, + parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, input wire reset, @@ -49,9 +49,10 @@ module VX_index_buffer #( ); VX_dp_ram #( - .DATAW (DATAW), - .SIZE (SIZE), - .LUTRAM (LUTRAM) + .DATAW (DATAW), + .SIZE (SIZE), + .LUTRAM (LUTRAM), + .RDW_MODE ("W") ) data_table ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_matrix_arbiter.sv b/hw/rtl/libs/VX_matrix_arbiter.sv index 2840ef43e..b6b88e47a 100644 --- a/hw/rtl/libs/VX_matrix_arbiter.sv +++ b/hw/rtl/libs/VX_matrix_arbiter.sv @@ -72,7 +72,7 @@ module VX_matrix_arbiter #( assign grant_onehot = grant; - VX_encoder #( + VX_onehot_encoder #( .N (NUM_REQS) ) encoder ( .data_in (grant_onehot), diff --git 
a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_adapter.sv index 4ece7cf69..d5efc7d6e 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_adapter.sv @@ -100,21 +100,21 @@ module VX_mem_adapter #( assign mem_req_addr_out_w = mem_req_addr_in_qual; end - VX_decoder #( + VX_demux #( .N (D), .M (SRC_DATA_WIDTH/8) - ) req_be_dec ( - .data_in (req_idx), - .valid_in (mem_req_byteen_in), + ) req_be_demux ( + .sel_in (req_idx), + .data_in (mem_req_byteen_in), .data_out (mem_req_byteen_out_w) ); - VX_decoder #( + VX_demux #( .N (D), .M (SRC_DATA_WIDTH) - ) req_data_dec ( - .data_in (req_idx), - .valid_in (mem_req_data_in), + ) req_data_demux ( + .sel_in (req_idx), + .data_in (mem_req_data_in), .data_out (mem_req_data_out_w) ); diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index c27f04da4..1a7030b86 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -18,7 +18,7 @@ module VX_mem_coalescer #( parameter `STRING INSTANCE_ID = "", parameter NUM_REQS = 1, parameter ADDR_WIDTH = 32, - parameter FLAGS_WIDTH = 1, + parameter FLAGS_WIDTH = 0, parameter DATA_IN_SIZE = 4, parameter DATA_OUT_SIZE = 64, parameter TAG_WIDTH = 8, @@ -43,7 +43,7 @@ module VX_mem_coalescer #( input wire [NUM_REQS-1:0] in_req_mask, input wire [NUM_REQS-1:0][DATA_IN_SIZE-1:0] in_req_byteen, input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] in_req_addr, - input wire [NUM_REQS-1:0][FLAGS_WIDTH-1:0] in_req_flags, + input wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] in_req_flags, input wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_req_data, input wire [TAG_WIDTH-1:0] in_req_tag, output wire in_req_ready, @@ -61,7 +61,7 @@ module VX_mem_coalescer #( output wire [OUT_REQS-1:0] out_req_mask, output wire [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen, output wire [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr, - output wire [OUT_REQS-1:0][FLAGS_WIDTH-1:0] out_req_flags, + output wire [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] out_req_flags, output 
wire [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data, output wire [OUT_TAG_WIDTH-1:0] out_req_tag, input wire out_req_ready, @@ -74,6 +74,7 @@ module VX_mem_coalescer #( output wire out_rsp_ready ); `UNUSED_SPARAM (INSTANCE_ID) + `STATIC_ASSERT ((NUM_REQS > 1), ("invalid parameter")) `STATIC_ASSERT (`IS_DIVISBLE(NUM_REQS * DATA_IN_WIDTH, DATA_OUT_WIDTH), ("invalid parameter")) `STATIC_ASSERT ((NUM_REQS * DATA_IN_WIDTH >= DATA_OUT_WIDTH), ("invalid parameter")) `RUNTIME_ASSERT ((~in_req_valid || in_req_mask != 0), ("%t: invalid request mask", $time)) @@ -92,7 +93,7 @@ module VX_mem_coalescer #( logic out_req_rw_r, out_req_rw_n; logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; - logic [OUT_REQS-1:0][FLAGS_WIDTH-1:0] out_req_flags_r, out_req_flags_n; + logic [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] out_req_flags_r, out_req_flags_n; logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n; logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; @@ -110,7 +111,7 @@ module VX_mem_coalescer #( logic [OUT_REQS-1:0] batch_valid_r, batch_valid_n; logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] seed_addr_r, seed_addr_n; - logic [OUT_REQS-1:0][FLAGS_WIDTH-1:0] seed_flags_r, seed_flags_n; + logic [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] seed_flags_r, seed_flags_n; logic [NUM_REQS-1:0] addr_matches_r, addr_matches_n; logic [NUM_REQS-1:0] req_rem_mask_r, req_rem_mask_n; @@ -139,7 +140,7 @@ module VX_mem_coalescer #( assign addr_base[j] = in_req_addr[DATA_RATIO * i + j][ADDR_WIDTH-1:DATA_RATIO_W]; end - wire [DATA_RATIO-1:0][FLAGS_WIDTH-1:0] req_flags; + wire [DATA_RATIO-1:0][`UP(FLAGS_WIDTH)-1:0] req_flags; for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_req_flags assign req_flags[j] = in_req_flags[DATA_RATIO * i + j]; end @@ -221,7 +222,7 @@ module VX_mem_coalescer #( end VX_pipe_register #( - 
.DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + FLAGS_WIDTH + OUT_ADDR_WIDTH + FLAGS_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), + .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + `UP(FLAGS_WIDTH) + OUT_ADDR_WIDTH + `UP(FLAGS_WIDTH) + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), .RESETW (1 + NUM_REQS + 1), .INIT_VALUE ({1'b0, {NUM_REQS{1'b1}}, 1'b0}) ) pipe_reg ( @@ -270,7 +271,12 @@ module VX_mem_coalescer #( assign out_req_mask = out_req_mask_r; assign out_req_byteen = out_req_byteen_r; assign out_req_addr = out_req_addr_r; - assign out_req_flags = out_req_flags_r; + if (FLAGS_WIDTH != 0) begin : g_out_req_flags + assign out_req_flags = out_req_flags_r; + end else begin : g_out_req_flags_0 + `UNUSED_VAR (out_req_flags_r) + assign out_req_flags = '0; + end assign out_req_data = out_req_data_r; assign out_req_tag = out_req_tag_r; @@ -346,30 +352,30 @@ module VX_mem_coalescer #( always @(posedge clk) begin if (out_req_fire) begin if (out_req_rw) begin - `TRACE(1, ("%t: %s out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS) - `TRACE(1, (", flags=")) - `TRACE_ARRAY1D(1, "%b", out_req_flags, OUT_REQS) - `TRACE(1, (", byteen=")) - `TRACE_ARRAY1D(1, "0x%h", out_req_byteen, OUT_REQS) - `TRACE(1, (", data=")) - `TRACE_ARRAY1D(1, "0x%0h", out_req_data, OUT_REQS) + `TRACE(2, ("%t: %s out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", out_req_addr, OUT_REQS) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", out_req_flags, OUT_REQS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", out_req_byteen, OUT_REQS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", out_req_data, OUT_REQS) end else begin - `TRACE(1, ("%d: %s out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS) - `TRACE(1, (", flags=")) - `TRACE_ARRAY1D(1, "%b", 
out_req_flags, OUT_REQS) + `TRACE(2, ("%d: %s out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", out_req_addr, OUT_REQS) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", out_req_flags, OUT_REQS) end - `TRACE(1, (", offset=")) - `TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS) - `TRACE(1, (", pmask=%b, coalesced=%0d, tag=0x%0h (#%0d)\n", out_req_pmask, $countones(out_req_pmask), out_req_tag, out_req_uuid)) + `TRACE(2, (", offset=")) + `TRACE_ARRAY1D(2, "%0d", out_req_offset, NUM_REQS) + `TRACE(2, (", pmask=%b, coalesced=%0d, tag=0x%0h (#%0d)\n", out_req_pmask, $countones(out_req_pmask), out_req_tag, out_req_uuid)) end if (out_rsp_fire) begin - `TRACE(1, ("%t: %s out-rsp: valid=%b, data=", $time, INSTANCE_ID, out_rsp_mask)) - `TRACE_ARRAY1D(1, "0x%0h", out_rsp_data, OUT_REQS) - `TRACE(1, (", offset=")) - `TRACE_ARRAY1D(1, "%0d", ibuf_dout_offset, NUM_REQS) - `TRACE(1, (", eop=%b, pmask=%b, tag=0x%0h (#%0d)\n", out_rsp_eop, ibuf_dout_pmask, out_rsp_tag, out_rsp_uuid)) + `TRACE(2, ("%t: %s out-rsp: valid=%b, data=", $time, INSTANCE_ID, out_rsp_mask)) + `TRACE_ARRAY1D(2, "0x%0h", out_rsp_data, OUT_REQS) + `TRACE(2, (", offset=")) + `TRACE_ARRAY1D(2, "%0d", ibuf_dout_offset, NUM_REQS) + `TRACE(2, (", eop=%b, pmask=%b, tag=0x%0h (#%0d)\n", out_rsp_eop, ibuf_dout_pmask, out_rsp_tag, out_rsp_uuid)) end end `endif diff --git a/hw/rtl/libs/VX_mem_scheduler.sv b/hw/rtl/libs/VX_mem_scheduler.sv index 4ba8bf147..65a057b80 100644 --- a/hw/rtl/libs/VX_mem_scheduler.sv +++ b/hw/rtl/libs/VX_mem_scheduler.sv @@ -21,7 +21,7 @@ module VX_mem_scheduler #( parameter WORD_SIZE = 4, parameter LINE_SIZE = WORD_SIZE, parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), - parameter FLAGS_WIDTH = 1, + parameter FLAGS_WIDTH = 0, parameter TAG_WIDTH = 8, parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID parameter CORE_QUEUE_SIZE= 8, @@ -32,7 +32,7 @@ module VX_mem_scheduler #( parameter WORD_WIDTH = WORD_SIZE * 8, parameter 
LINE_WIDTH = LINE_SIZE * 8, - parameter COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE), + parameter COALESCE_ENABLE = (CORE_REQS > 1) && (LINE_SIZE != WORD_SIZE), parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE, parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS, parameter MEM_BATCHES = `CDIV(MERGED_REQS, MEM_CHANNELS), @@ -50,7 +50,7 @@ module VX_mem_scheduler #( input wire [CORE_REQS-1:0] core_req_mask, input wire [CORE_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] core_req_addr, - input wire [CORE_REQS-1:0][FLAGS_WIDTH-1:0] core_req_flags, + input wire [CORE_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags, input wire [CORE_REQS-1:0][WORD_WIDTH-1:0] core_req_data, input wire [TAG_WIDTH-1:0] core_req_tag, output wire core_req_ready, @@ -72,7 +72,7 @@ module VX_mem_scheduler #( output wire [MEM_CHANNELS-1:0] mem_req_mask, output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen, output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [MEM_CHANNELS-1:0][FLAGS_WIDTH-1:0] mem_req_flags, + output wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags, output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data, output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, input wire mem_req_ready, @@ -94,6 +94,7 @@ module VX_mem_scheduler #( localparam CORE_BATCHES = COALESCE_ENABLE ? 
1 : MEM_BATCHES; localparam CORE_BATCH_BITS = `CLOG2(CORE_BATCHES); + `STATIC_ASSERT ((MEM_CHANNELS <= CORE_REQS), ("invalid parameter")) `STATIC_ASSERT (`IS_DIVISBLE(CORE_REQS * WORD_SIZE, LINE_SIZE), ("invalid parameter")) `STATIC_ASSERT ((TAG_WIDTH >= UUID_WIDTH), ("invalid parameter")) `RUNTIME_ASSERT((~core_req_valid || core_req_mask != 0), ("%t: invalid request mask", $time)) @@ -112,7 +113,7 @@ module VX_mem_scheduler #( wire reqq_rw; wire [CORE_REQS-1:0][WORD_SIZE-1:0] reqq_byteen; wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] reqq_addr; - wire [CORE_REQS-1:0][FLAGS_WIDTH-1:0] reqq_flags; + wire [CORE_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] reqq_flags; wire [CORE_REQS-1:0][WORD_WIDTH-1:0] reqq_data; wire [REQQ_TAG_WIDTH-1:0] reqq_tag; wire reqq_ready; @@ -122,7 +123,7 @@ module VX_mem_scheduler #( wire reqq_rw_s; wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s; wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s; - wire [MERGED_REQS-1:0][FLAGS_WIDTH-1:0] reqq_flags_s; + wire [MERGED_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] reqq_flags_s; wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s; wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s; wire reqq_ready_s; @@ -132,7 +133,7 @@ module VX_mem_scheduler #( wire mem_req_rw_s; wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s; wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s; - wire [MEM_CHANNELS-1:0][FLAGS_WIDTH-1:0] mem_req_flags_s; + wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags_s; wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s; wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s; wire mem_req_ready_s; @@ -167,7 +168,7 @@ module VX_mem_scheduler #( end VX_elastic_buffer #( - .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + FLAGS_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH), + .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + `UP(FLAGS_WIDTH) + WORD_WIDTH) + REQQ_TAG_WIDTH), .SIZE (CORE_QUEUE_SIZE), .OUT_REG (1) ) req_queue ( @@ -223,7 +224,7 @@ module VX_mem_scheduler #( if (COALESCE_ENABLE) begin : g_coalescer 
VX_mem_coalescer #( - .INSTANCE_ID ($sformatf("%s-coalescer", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-coalescer", INSTANCE_ID))), .NUM_REQS (CORE_REQS), .DATA_IN_SIZE (WORD_SIZE), .DATA_OUT_SIZE (LINE_SIZE), @@ -297,7 +298,7 @@ module VX_mem_scheduler #( wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b; - wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][FLAGS_WIDTH-1:0] mem_req_flags_b; + wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b; wire [BATCH_SEL_WIDTH-1:0] req_batch_idx; @@ -385,8 +386,10 @@ module VX_mem_scheduler #( assign reqq_ready_s = req_sent_all; + wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags_u; + VX_elastic_buffer #( - .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + FLAGS_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH), + .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + `UP(FLAGS_WIDTH) + LINE_WIDTH) + MEM_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( @@ -395,106 +398,128 @@ module VX_mem_scheduler #( .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), .data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_flags_s, mem_req_data_s, mem_req_tag_s}), - .data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_flags, mem_req_data, mem_req_tag}), + .data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_flags_u, mem_req_data, mem_req_tag}), .valid_out (mem_req_valid), .ready_out (mem_req_ready) ); + if (FLAGS_WIDTH != 0) begin : g_mem_req_flags + assign mem_req_flags = mem_req_flags_u; + end else begin : g_mem_req_flags_0 + `UNUSED_VAR (mem_req_flags_u) + assign mem_req_flags = '0; + end + // Handle memory responses 
//////////////////////////////////////////////// - reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask; - wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask; wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx; - if (CORE_BATCHES > 1) begin : g_rsp_batch_idx assign rsp_batch_idx = mem_rsp_tag_s[CORE_BATCH_BITS-1:0]; end else begin : g_rsp_batch_idx_0 assign rsp_batch_idx = '0; end - for (genvar r = 0; r < CORE_REQS; ++r) begin : g_curr_mask - localparam i = r / CORE_CHANNELS; - localparam j = r % CORE_CHANNELS; - assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j]; - end + if (CORE_REQS == 1) begin : g_rsp_1 + `UNUSED_VAR (rsp_batch_idx) - assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask; + assign crsp_valid = mem_rsp_valid_s; + assign crsp_mask = mem_rsp_mask_s; + assign crsp_sop = 1'b1; + assign crsp_eop = 1'b1; + assign crsp_data = mem_rsp_data_s; - wire rsp_complete = ~(| rsp_rem_mask_n); + assign mem_rsp_ready_s = crsp_ready; - wire mem_rsp_fire_s = mem_rsp_valid_s && mem_rsp_ready_s; + end else begin : g_rsp_N - always @(posedge clk) begin - if (ibuf_push) begin - rsp_rem_mask[ibuf_waddr] <= core_req_mask; - end - if (mem_rsp_fire_s) begin - rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n; + reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask; + wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask; + + for (genvar r = 0; r < CORE_REQS; ++r) begin : g_curr_mask + localparam i = r / CORE_CHANNELS; + localparam j = r % CORE_CHANNELS; + assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j]; end - end - if (RSP_PARTIAL != 0 || CORE_REQS == 1) begin : g_rsp_partial + assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask; - reg [CORE_QUEUE_SIZE-1:0] rsp_sop_r; + wire mem_rsp_fire_s = mem_rsp_valid_s && mem_rsp_ready_s; always @(posedge clk) begin if (ibuf_push) begin - rsp_sop_r[ibuf_waddr] <= 1; + rsp_rem_mask[ibuf_waddr] <= core_req_mask; end if (mem_rsp_fire_s) begin - rsp_sop_r[ibuf_raddr] <= 0; + 
rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n; end end - assign crsp_valid = mem_rsp_valid_s; - assign crsp_mask = curr_mask; - assign crsp_sop = rsp_sop_r[ibuf_raddr]; + wire rsp_complete = ~(| rsp_rem_mask_n) || (CORE_REQS == 1); - for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data - localparam j = r % CORE_CHANNELS; - assign crsp_data[r] = mem_rsp_data_s[j]; - end + if (RSP_PARTIAL != 0) begin : g_rsp_partial - assign mem_rsp_ready_s = crsp_ready; + reg [CORE_QUEUE_SIZE-1:0] rsp_sop_r; - end else begin : g_rsp_full + always @(posedge clk) begin + if (ibuf_push) begin + rsp_sop_r[ibuf_waddr] <= 1; + end + if (mem_rsp_fire_s) begin + rsp_sop_r[ibuf_raddr] <= 0; + end + end + + assign crsp_valid = mem_rsp_valid_s; + assign crsp_mask = curr_mask; + assign crsp_sop = rsp_sop_r[ibuf_raddr]; + + for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data + localparam j = r % CORE_CHANNELS; + assign crsp_data[r] = mem_rsp_data_s[j]; + end - wire [CORE_CHANNELS-1:0][CORE_BATCHES-1:0][WORD_WIDTH-1:0] rsp_store_n; - reg [CORE_REQS-1:0] rsp_orig_mask [CORE_QUEUE_SIZE-1:0]; + assign mem_rsp_ready_s = crsp_ready; - for (genvar i = 0; i < CORE_CHANNELS; ++i) begin : g_rsp_store - for (genvar j = 0; j < CORE_BATCHES; ++j) begin : g_j - reg [WORD_WIDTH-1:0] rsp_store [CORE_QUEUE_SIZE-1:0]; - wire rsp_wren = mem_rsp_fire_s - && (BATCH_SEL_WIDTH'(j) == rsp_batch_idx) - && ((CORE_CHANNELS == 1) || mem_rsp_mask_s[i]); - always @(posedge clk) begin - if (rsp_wren) begin - rsp_store[ibuf_raddr] <= mem_rsp_data_s[i]; + end else begin : g_rsp_full + + wire [CORE_CHANNELS-1:0][CORE_BATCHES-1:0][WORD_WIDTH-1:0] rsp_store_n; + reg [CORE_REQS-1:0] rsp_orig_mask [CORE_QUEUE_SIZE-1:0]; + + for (genvar i = 0; i < CORE_CHANNELS; ++i) begin : g_rsp_store + for (genvar j = 0; j < CORE_BATCHES; ++j) begin : g_j + reg [WORD_WIDTH-1:0] rsp_store [0:CORE_QUEUE_SIZE-1]; + wire rsp_wren = mem_rsp_fire_s + && (BATCH_SEL_WIDTH'(j) == rsp_batch_idx) + && ((CORE_CHANNELS == 1) || mem_rsp_mask_s[i]); + 
always @(posedge clk) begin + if (rsp_wren) begin + rsp_store[ibuf_raddr] <= mem_rsp_data_s[i]; + end end + assign rsp_store_n[i][j] = rsp_wren ? mem_rsp_data_s[i] : rsp_store[ibuf_raddr]; end - assign rsp_store_n[i][j] = rsp_wren ? mem_rsp_data_s[i] : rsp_store[ibuf_raddr]; end - end - always @(posedge clk) begin - if (ibuf_push) begin - rsp_orig_mask[ibuf_waddr] <= core_req_mask; + always @(posedge clk) begin + if (ibuf_push) begin + rsp_orig_mask[ibuf_waddr] <= core_req_mask; + end end - end - assign crsp_valid = mem_rsp_valid_s && rsp_complete; - assign crsp_mask = rsp_orig_mask[ibuf_raddr]; - assign crsp_sop = 1'b1; + assign crsp_valid = mem_rsp_valid_s && rsp_complete; + assign crsp_mask = rsp_orig_mask[ibuf_raddr]; + assign crsp_sop = 1'b1; - for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data - localparam i = r / CORE_CHANNELS; - localparam j = r % CORE_CHANNELS; - assign crsp_data[r] = rsp_store_n[j][i]; - end + for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data + localparam i = r / CORE_CHANNELS; + localparam j = r % CORE_CHANNELS; + assign crsp_data[r] = rsp_store_n[j][i]; + end - assign mem_rsp_ready_s = crsp_ready || ~rsp_complete; + assign mem_rsp_ready_s = crsp_ready || ~rsp_complete; + end + assign crsp_eop = rsp_complete; end if (UUID_WIDTH != 0) begin : g_crsp_tag @@ -503,8 +528,6 @@ module VX_mem_scheduler #( assign crsp_tag = ibuf_dout; end - assign crsp_eop = rsp_complete; - // Send response to caller VX_elastic_buffer #( @@ -516,7 +539,7 @@ module VX_mem_scheduler #( .reset (reset), .valid_in (crsp_valid), .ready_in (crsp_ready), - .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), + .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), .data_out ({core_rsp_mask, core_rsp_sop, core_rsp_eop, core_rsp_data, core_rsp_tag}), .valid_out (core_rsp_valid), .ready_out (core_rsp_ready) @@ -584,41 +607,41 @@ module VX_mem_scheduler #( always @(posedge clk) begin if (core_req_fire) begin if (core_req_rw) begin - 
`TRACE(1, ("%t: %s core-req-wr: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", core_req_addr, CORE_REQS) - `TRACE(1, (", byteen=")) - `TRACE_ARRAY1D(1, "0x%h", core_req_byteen, CORE_REQS) - `TRACE(1, (", data=")) - `TRACE_ARRAY1D(1, "0x%0h", core_req_data, CORE_REQS) + `TRACE(2, ("%t: %s core-req-wr: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", core_req_addr, CORE_REQS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", core_req_byteen, CORE_REQS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", core_req_data, CORE_REQS) end else begin - `TRACE(1, ("%t: %s core-req-rd: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) - `TRACE_ARRAY1D(1, "0x%h", core_req_addr, CORE_REQS) + `TRACE(2, ("%t: %s core-req-rd: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", core_req_addr, CORE_REQS) end - `TRACE(1, (", tag=0x%0h (#%0d)\n", core_req_tag, req_dbg_uuid)) + `TRACE(2, (", tag=0x%0h (#%0d)\n", core_req_tag, req_dbg_uuid)) end if (core_rsp_valid && core_rsp_ready) begin - `TRACE(1, ("%t: %s core-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, core_rsp_eop)) - `TRACE_ARRAY1D(1, "0x%0h", core_rsp_data, CORE_REQS) - `TRACE(1, (", tag=0x%0h (#%0d)\n", core_rsp_tag, rsp_dbg_uuid)) + `TRACE(2, ("%t: %s core-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, core_rsp_eop)) + `TRACE_ARRAY1D(2, "0x%0h", core_rsp_data, CORE_REQS) + `TRACE(2, (", tag=0x%0h (#%0d)\n", core_rsp_tag, rsp_dbg_uuid)) end if (| mem_req_fire_s) begin if (| mem_req_rw_s) begin - `TRACE(1, ("%t: %s mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) - `TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS) - `TRACE(1, (", byteen=")) - `TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, CORE_CHANNELS) - `TRACE(1, (", data=")) - `TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, CORE_CHANNELS) + `TRACE(2, ("%t: %s 
mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) + `TRACE_ARRAY1D(2, "0x%h", mem_req_addr_s, CORE_CHANNELS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", mem_req_byteen_s, CORE_CHANNELS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", mem_req_data_s, CORE_CHANNELS) end else begin - `TRACE(1, ("%t: %s mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) - `TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS) + `TRACE(2, ("%t: %s mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) + `TRACE_ARRAY1D(2, "0x%h", mem_req_addr_s, CORE_CHANNELS) end - `TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid)) + `TRACE(2, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid)) end - if (mem_rsp_fire_s) begin - `TRACE(1, ("%t: %s mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s)) - `TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data_s, CORE_CHANNELS) - `TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid)) + if (mem_rsp_valid_s && mem_rsp_ready_s) begin + `TRACE(2, ("%t: %s mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s)) + `TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data_s, CORE_CHANNELS) + `TRACE(2, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid)) end end `endif diff --git a/hw/rtl/libs/VX_encoder.sv b/hw/rtl/libs/VX_onehot_encoder.sv similarity index 97% rename from hw/rtl/libs/VX_encoder.sv rename to hw/rtl/libs/VX_onehot_encoder.sv index 86ccad792..08198e430 100644 --- a/hw/rtl/libs/VX_encoder.sv +++ b/hw/rtl/libs/VX_onehot_encoder.sv @@ -13,11 +13,11 @@ `include "VX_platform.vh" -// Fast encoder using parallel prefix computation +// Fast one-hot encoder using parallel prefix computation // Adapted from BaseJump STL: http://bjump.org/data_out.html `TRACING_OFF -module VX_encoder #( +module VX_onehot_encoder #( parameter N = 1, parameter 
REVERSE = 0, parameter MODEL = 1, diff --git a/hw/rtl/libs/VX_pending_size.sv b/hw/rtl/libs/VX_pending_size.sv index 1e72cef19..b94889e6e 100644 --- a/hw/rtl/libs/VX_pending_size.sv +++ b/hw/rtl/libs/VX_pending_size.sv @@ -66,11 +66,13 @@ module VX_pending_size #( if (INCRW != 1 || DECRW != 1) begin : g_wide_step - localparam SUBW = `MIN(SIZEW, `MAX(INCRW, DECRW)+1); + localparam DELTAW = `MIN(SIZEW, `MAX(INCRW, DECRW)+1); logic [SIZEW-1:0] size_n, size_r; - assign size_n = $signed(size_r) + SIZEW'($signed(SUBW'(incr) - SUBW'(decr))); + wire [DELTAW-1:0] delta = DELTAW'(incr) - DELTAW'(decr); + + assign size_n = $signed(size_r) + SIZEW'($signed(delta)); always @(posedge clk) begin if (reset) begin @@ -80,8 +82,8 @@ module VX_pending_size #( alm_full_r <= 0; size_r <= '0; end else begin - `ASSERT((SIZEW'(incr) >= SIZEW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow")); - `ASSERT((SIZEW'(incr) <= SIZEW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow")); + `ASSERT((DELTAW'(incr) <= DELTAW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow")); + `ASSERT((DELTAW'(incr) >= DELTAW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow")); empty_r <= (size_n == SIZEW'(0)); full_r <= (size_n == SIZEW'(SIZE)); alm_empty_r <= (size_n <= SIZEW'(ALM_EMPTY)); @@ -129,7 +131,7 @@ module VX_pending_size #( wire is_empty_n = (used_r == ADDRW'(1)); wire is_full_n = (used_r == ADDRW'(SIZE-1)); - wire [1:0] push_minus_pop = {~incr & decr, incr ^ decr}; + wire [1:0] delta = {~incr & decr, incr ^ decr}; always @(posedge clk) begin if (reset) begin @@ -148,7 +150,7 @@ module VX_pending_size #( if (is_empty_n) empty_r <= 1; end - used_r <= $signed(used_r) + ADDRW'($signed(push_minus_pop)); + used_r <= $signed(used_r) + ADDRW'($signed(delta)); end end diff --git a/hw/rtl/libs/VX_placeholder.sv b/hw/rtl/libs/VX_placeholder.sv new file mode 100644 index 000000000..738da615b --- /dev/null +++ b/hw/rtl/libs/VX_placeholder.sv @@ -0,0 
+1,27 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_platform.vh" + +`TRACING_OFF +`BLACKBOX_CELL module VX_placeholder #( + parameter I = 0, + parameter O = 0 +) ( + input wire [`UP(I)-1:0] in, + output wire [`UP(O)-1:0] out +); + // empty module + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index efe9838d6..1d3b479bf 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -448,7 +448,7 @@ module VX_rr_arbiter #( end end - VX_encoder #( + VX_onehot_encoder #( .N (NUM_REQS) ) onehot_encoder ( .data_in (grant_onehot), @@ -480,12 +480,12 @@ module VX_rr_arbiter #( end end - VX_decoder #( + VX_demux #( .N (LOG_NUM_REQS), .D (NUM_REQS) ) grant_decoder ( - .data_in (grant_index), - .valid_in (grant_valid), + .sel_in (grant_index), + .data_in (1'b1), .data_out (grant_onehot) ); diff --git a/hw/rtl/libs/VX_scope_tap.sv b/hw/rtl/libs/VX_scope_tap.sv index 6a9b70ff1..6c0914b0c 100644 --- a/hw/rtl/libs/VX_scope_tap.sv +++ b/hw/rtl/libs/VX_scope_tap.sv @@ -113,8 +113,7 @@ module VX_scope_tap #( .DATAW (IDLE_CTRW), .SIZE (DEPTH), .OUT_REG (1), - .READ_ENABLE (0), - .NO_RWCHECK (1) + .RDW_MODE ("R") ) delta_store ( .clk (clk), .reset (reset), @@ -136,8 +135,7 @@ module VX_scope_tap #( .DATAW (DATAW), .SIZE (DEPTH), .OUT_REG (1), - .READ_ENABLE (0), - .NO_RWCHECK (1) + .RDW_MODE ("R") ) data_store ( .clk (clk), .reset (reset), diff --git 
a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index efce4b5f2..88b922384 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -13,6 +13,35 @@ `include "VX_platform.vh" +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`ifdef QUARTUS + `define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[addr][i] <= wdata[i * WSELW +: WSELW]; \ + end \ + end +`else + `define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end +`endif + `TRACING_OFF module VX_sp_ram #( parameter DATAW = 1, @@ -20,11 +49,9 @@ module VX_sp_ram #( parameter WRENW = 1, parameter OUT_REG = 0, parameter LUTRAM = 0, - parameter NO_RWCHECK = 0, - parameter RW_ASSERT = 0, + parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change, U: undefined + parameter RDW_ASSERT = 0, parameter RESET_RAM = 0, - parameter RESET_OUT = 0, - parameter READ_ENABLE = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -39,32 +66,442 @@ module VX_sp_ram #( input wire [DATAW-1:0] wdata, output wire [DATAW-1:0] rdata ); - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (SIZE), - .WRENW (WRENW), - .OUT_REG (OUT_REG), - .LUTRAM (LUTRAM), - .NO_RWCHECK (NO_RWCHECK), - .RW_ASSERT (RW_ASSERT), - .RESET_RAM (RESET_RAM), - .RESET_OUT (RESET_OUT), - .READ_ENABLE(READ_ENABLE), - .INIT_ENABLE(INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE), - .ADDRW (ADDRW) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read 
(read), - .write (write), - .wren (wren), - .waddr (addr), - .wdata (wdata), - .raddr (addr), - .rdata (rdata) - ); + localparam WSELW = DATAW / WRENW; + `UNUSED_PARAM (LUTRAM) + + `STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter")) + `STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N"), ("invalid parameter")) + `UNUSED_PARAM (RDW_ASSERT) + +`ifdef SYNTHESIS + localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM); + if (OUT_REG) begin : g_sync + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + end + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] addr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end + addr_r <= addr; + end + end + assign rdata = ram[addr_r]; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "N") begin : g_no_change + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge 
clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "U") begin : g_unknown + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end + end else begin : g_auto + if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + end + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] addr_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end + addr_r <= addr; + end + end + 
assign rdata = ram[addr_r]; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "N") begin : g_no_change + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + `RAM_WRITE_WREN + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + if (write) begin + ram[addr] <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "U") begin : g_unknown + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end + end + end else begin : g_async + `UNUSED_VAR (read) + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "W") begin : g_write_first + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (0), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + 
.waddr (addr), + .wdata (wdata), + .raddr (addr), + .rdata (rdata) + ); + `else + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + end + assign rdata = ram[addr]; + end + `endif + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + end + assign rdata = ram[addr]; + end + end + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[addr]; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + end + assign rdata = ram[addr]; + end + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + `RAM_WRITE_WREN + end + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + if (write) begin + ram[addr] <= wdata; + end + end + assign rdata = ram[addr]; + end + end + end + end +`else + // simulation + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + + always 
@(posedge clk) begin + if (RESET_RAM && reset) begin + for (integer i = 0; i < SIZE; ++i) begin + ram[i] <= DATAW'(INIT_VALUE); + end + end else if (write) begin + for (integer i = 0; i < WRENW; ++i) begin + if (wren[i]) begin + ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; + end + end + end + end + + if (OUT_REG) begin : g_sync + if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read || write) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] addr_r; + always @(posedge clk) begin + if (read || write) begin + addr_r <= addr; + end + end + assign rdata = ram[addr_r]; + end else if (RDW_MODE == "N") begin : g_no_change + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read && ~write) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else if (RDW_MODE == "U") begin : g_unknown + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else begin : g_async + `UNUSED_VAR (read) + if (RDW_MODE == "W") begin : g_write_first + assign rdata = ram[addr]; + end else begin : g_read_first + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + always @(posedge clk) begin + if (reset) begin + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + prev_write <= write; + prev_data <= ram[addr]; + prev_waddr <= addr; + end + end + assign rdata = (prev_write && (prev_waddr == addr)) ? 
prev_data : ram[addr]; + if (RDW_ASSERT) begin : g_rw_asert + `RUNTIME_ASSERT(~read || (rdata == ram[addr]), ("%t: read after write hazard", $time)) + end + end + end +`endif endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index febfd0465..68a31c4fc 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -64,12 +64,12 @@ module VX_stream_xbar #( ); for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_in_decoders - VX_decoder #( + VX_demux #( .N (OUT_WIDTH), .D (NUM_OUTPUTS) - ) sel_in_decoder ( - .data_in (sel_in[i]), - .valid_in (valid_in[i]), + ) sel_in_demux ( + .sel_in (sel_in[i]), + .data_in (valid_in[i]), .data_out (per_output_valid_in[i]) ); assign ready_in[i] = | per_output_ready_in_w[i]; @@ -137,12 +137,12 @@ module VX_stream_xbar #( wire [NUM_OUTPUTS-1:0] valid_out_w, ready_out_w; wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w; - VX_decoder #( + VX_demux #( .N (OUT_WIDTH), .D (NUM_OUTPUTS) - ) sel_in_decoder ( - .data_in (sel_in[0]), - .valid_in (valid_in[0]), + ) sel_in_demux ( + .sel_in (sel_in[0]), + .data_in (valid_in[0]), .data_out (valid_out_w) ); diff --git a/hw/rtl/mem/VX_gbar_unit.sv b/hw/rtl/mem/VX_gbar_unit.sv index c9707748f..ac4c09349 100644 --- a/hw/rtl/mem/VX_gbar_unit.sv +++ b/hw/rtl/mem/VX_gbar_unit.sv @@ -60,11 +60,11 @@ module VX_gbar_unit #( `ifdef DBG_TRACE_GBAR always @(posedge clk) begin if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin - `TRACE(1, ("%t: %s acquire: bar_id=%0d, size=%0d, core_id=%0d\n", + `TRACE(2, ("%t: %s acquire: bar_id=%0d, size=%0d, core_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.req_id, gbar_bus_if.req_size_m1, gbar_bus_if.req_core_id)) end if (gbar_bus_if.rsp_valid) begin - `TRACE(1, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_id)) + `TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_id)) end end `endif diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 
7131c3f21..fd0694fe3 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -167,9 +167,8 @@ module VX_local_mem import VX_gpu_pkg::*; #( .SIZE (WORDS_PER_BANK), .WRENW (WORD_SIZE), .OUT_REG (1), - .READ_ENABLE (0), - .NO_RWCHECK (1) - ) data_store ( + .RDW_MODE ("R") + ) lmem_store ( .clk (clk), .reset (reset), .read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]), @@ -330,15 +329,15 @@ module VX_local_mem import VX_gpu_pkg::*; #( always @(posedge clk) begin if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin if (mem_bus_if[i].req_data.rw) begin - `TRACE(1, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", + `TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i])) end else begin - `TRACE(1, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", + `TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i])) end end if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin - `TRACE(1, ("%t: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n", + `TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i])) end end diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl new file mode 100644 index 000000000..5af7ba953 --- /dev/null +++ b/hw/scripts/xilinx_async_bram_patch.tcl @@ -0,0 +1,525 @@ +namespace eval vortex { + +variable debug 0 + +proc print_error {msg {do_exit 1}} { + if {$do_exit} { + puts "ERROR: $msg" + exit -1 + } else { + puts "WARNING: $msg" + } +} + +proc str_replace {str match repl} { + set result "" + regsub 
$match $str $repl result + return $result +} + +proc unique_cell_name {name} { + if {[get_cells -quiet $name] == {}} { return $name } + set index 0 + while {[get_cells -quiet ${name}_${index}] != {}} { incr index } + return ${name}_${index} +} + +proc unique_net_name {name} { + if {[get_nets -quiet $name] == {}} { return $name } + set index 0 + while {[get_nets -quiet ${name}_${index}] != {}} { incr index } + return ${name}_${index} +} + +proc find_nested_cells {parent name_match {should_exist 1}} { + set matching_cells {} + foreach cell [get_cells -hierarchical -include_replicated_objects -filter "PARENT == $parent"] { + set name [get_property NAME $cell] + if {[regexp $name_match $name]} { + lappend matching_cells $cell + } + } + if {[llength $matching_cells] == 0} { + print_error "No matching cell found for '$parent' matching '$name_match'." $should_exist + } + return $matching_cells +} + +proc find_nested_cell {parent name_match} { + foreach cell [get_cells -hierarchical -filter "PARENT == $parent"] { + set name [get_property NAME $cell] + if {$name == $name_match} { + return $cell + } + } + puts "ERROR: No matching cell found for '$parent' matching '$name_match'." + exit -1 +} + +proc find_cell_nets {cell name_match {should_exist 1}} { + set matching_nets {} + foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] { + set name [get_property NAME $net] + if {[regexp $name_match $name]} { + lappend matching_nets $net + } + } + if {[llength $matching_nets] == 0} { + print_error "No matching net found for '$cell' matching '$name_match'." $should_exist + } + return $matching_nets +} + +proc get_cell_net {cell name_match} { + foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] { + set name [get_property NAME $net] + if {$name == $name_match} { + return $net + } + } + puts "ERROR: No matching net found for '$cell' matching '$name_match'." 
+ exit -1 +} + +proc find_cell_pins {cell name_match {should_exist 1}} { + set matching_pins {} + foreach pin [get_pins -of_objects $cell] { + set name [get_property NAME $pin] + if {[regexp $name_match $name]} { + lappend matching_pins $pin + } + } + if {[llength $matching_pins] == 0} { + print_error "No matching pin found for '$cell' matching '$name_match'." $should_exist + } + return $matching_pins +} + +proc get_cell_pin {cell name_match} { + foreach pin [get_pins -of_objects $cell] { + set name [get_property NAME $pin] + if {$name == $name_match} { + return $pin + } + } + puts "ERROR: No matching pin found for '$cell' matching '$name_match'." + exit -1 +} + +proc replace_pin_source {pin source_pin} { + variable debug + + # Disconnect existing net from pin + set net [get_nets -of_objects $pin] + if {[llength $net] == 1} { + disconnect_net -net $net -objects $pin + if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."} + } elseif {[llength $net] > 1} { + puts "ERROR: Multiple nets connected to pin '$pin'." + exit -1 + } else { + puts "WARNING: No net connected to pin '$pin'." + } + + set source_net [get_nets -quiet -of_objects $source_pin] + if {[llength $source_net] == 0} { + # Create a new net if none exists + set source_cell [get_cells -of_objects $source_pin] + set net_name [unique_net_name "${source_cell}_net"] + set source_net [create_net $net_name] + if {$debug} {puts "DEBUG: Created source_net: '$source_net'"} + # Connect the source pin to the new net + connect_net -net $source_net -objects $source_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."} + } elseif {[llength $source_net] > 1} { + puts "ERROR: Multiple nets connected to pin '$source_pin'." 
+ exit -1 + } + + # Connect pin to the new source net + connect_net -net $source_net -objects $pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."} +} + +proc create_register_next {reg_cell prefix_name} { + variable debug + + set reg_d_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/D"}] + if {[llength $reg_d_pin] == 0} { + puts "ERROR: No D pin found on register cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_d_pin] > 1} { + puts "ERROR: Multiple D pins found on register cell '$reg_cell'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_d_pin: '$reg_d_pin'"} + + set reg_d_src_pin [find_pin_driver $reg_d_pin] + if {$reg_d_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_d_pin'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_d_src_pin: '$reg_d_src_pin'"} + + set reg_r_src_pin "" + + set register_type [get_property REF_NAME $reg_cell] + if {$register_type == "FDRE"} { + set reg_r_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/R"}] + if {[llength $reg_r_pin] == 0} { + puts "ERROR: No R pin found on FDRE cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_r_pin] > 1} { + puts "ERROR: Multiple R pins found on FDRE cell '$reg_cell'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_r_pin: '$reg_r_pin'"} + + set reg_r_src_pin [find_pin_driver $reg_r_pin] + if {$reg_r_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_r_pin'." + exit -1 + } + } elseif {$register_type == "FDSE"} { + set reg_s_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/S"}] + if {[llength $reg_s_pin] == 0} { + puts "ERROR: No S pin found on FDSE cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_s_pin] > 1} { + puts "ERROR: Multiple S pins found on FDSE cell '$reg_cell'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_s_pin: '$reg_s_pin'"} + + set reg_r_src_pin [find_pin_driver $reg_s_pin] + if {$reg_r_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_s_pin'." 
+ exit -1 + } + } else { + puts "ERROR: Unsupported register type: '$register_type'." + exit 1 + } + + if {$debug} {puts "DEBUG: reg_r_src_pin: '$reg_r_src_pin'"} + + set reg_d_src_net [get_nets -of_objects $reg_d_src_pin] + if {[llength $reg_d_src_net] == 0} { + puts "ERROR: Unable to get source nets for pins." + exit -1 + } elseif {[llength $reg_d_src_net] > 1} { + puts "ERROR: Multiple source nets found for pins." + exit -1 + } + + set reg_r_src_net [get_nets -of_objects $reg_r_src_pin] + if {[llength $reg_r_src_net] == 0} { + puts "ERROR: Unable to get source nets for pins." + exit -1 + } elseif {[llength $reg_r_src_net] > 1} { + puts "ERROR: Multiple source nets found for pins." + exit -1 + } + + # Create a MUX cell to implement register next value + # Use a 2x1 LUT to describe the logic: + # FDRE: O = I1 ? 0 : I0; where I0=D, I1=R (INIT 4'b0010) + # FDSE: O = I1 ? 1 : I0; where I0=D, I1=S (INIT 4'b1110) + set lut_name [unique_cell_name $prefix_name] + set lut_cell [create_cell -reference LUT2 $lut_name] + puts "INFO: Created lut cell: '$lut_cell'" + + if {$register_type == "FDRE"} { + set_property INIT 4'b0010 $lut_cell + } elseif {$register_type == "FDSE"} { + set_property INIT 4'b1110 $lut_cell + } else { + puts "ERROR: Unsupported register type: '$register_type'." + exit 1 + } + + set lut_i0_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I0"}] + if {[llength $lut_i0_pin] == 0} { + puts "ERROR: No I0 pin found on LUT2 cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_i0_pin] > 1} { + puts "ERROR: Multiple I0 pins found on LUT2 cell '$lut_cell'." + exit -1 + } + + set lut_i1_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I1"}] + if {[llength $lut_i1_pin] == 0} { + puts "ERROR: No I1 pin found on LUT2 cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_i1_pin] > 1} { + puts "ERROR: Multiple I1 pins found on LUT2 cell '$lut_cell'."
+ exit -1 + } + + set lut_o_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/O"}] + if {[llength $lut_o_pin] == 0} { + puts "ERROR: No O pin found on LUT2 cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_o_pin] > 1} { + puts "ERROR: Multiple O pins found on LUT2 cell '$lut_cell'." + exit -1 + } + + connect_net -net $reg_d_src_net -objects $lut_i0_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$reg_d_src_net' to pin '$lut_i0_pin'."} + + connect_net -net $reg_r_src_net -objects $lut_i1_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$reg_r_src_net' to pin '$lut_i1_pin'."} + + return $lut_o_pin +} + +proc getOrCreateVCCPin {prefix_name} { + variable debug + + set vcc_cell "" + set vcc_cells [get_cells -quiet -filter {REF_NAME == VCC}] + if {[llength $vcc_cells] == 0} { + set cell_name [unique_cell_name $prefix_name] + set vcc_cell [create_cell -reference VCC $cell_name] + puts "INFO: Created VCC cell: '$vcc_cell'" + } else { + set vcc_cell [lindex $vcc_cells 0] + } + set vcc_pin [get_pins -of_objects $vcc_cell -filter {NAME =~ "*/P"}] + if {[llength $vcc_pin] == 0} { + puts "ERROR: No VCC pin found on VCC cell '$vcc_cell'." + exit -1 + } elseif {[llength $vcc_pin] > 1} { + puts "ERROR: Multiple VCC pins found on VCC cell '$vcc_cell'." + exit -1 + } + return $vcc_pin +} + +proc getOrCreateGNDPin {prefix_name} { + variable debug + + set gnd_cell "" + set gnd_cells [get_cells -quiet -filter {REF_NAME == GND}] + if {[llength $gnd_cells] == 0} { + set cell_name [unique_cell_name $prefix_name] + set gnd_cell [create_cell -reference GND $cell_name] + puts "INFO: Created GND cell: '$gnd_cell'" + } else { + set gnd_cell [lindex $gnd_cells 0] + } + set gnd_pin [get_pins -of_objects $gnd_cell -filter {NAME =~ "*/G"}] + if {[llength $gnd_pin] == 0} { + puts "ERROR: No GND pin found on GND cell '$gnd_cell'." + exit -1 + } elseif {[llength $gnd_pin] > 1} { + puts "ERROR: Multiple GND pins found on GND cell '$gnd_cell'."
+ exit -1 + } + return $gnd_pin +} + +proc find_net_sinks {input_net {should_exist 1}} { + set sink_pins {} + foreach pin [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "IN"}] { + lappend sink_pins $pin + } + foreach port [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "OUT"}] { + lappend sink_pins $port + } + if {[llength $sink_pins] == 0} { + print_error "No sink found for '$input_net'." $should_exist + } + return $sink_pins +} + +proc find_net_driver {input_net {should_exist 1}} { + set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}] + if {[llength $driverPins] == 0} { + set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}] + if {[llength $driverPorts] == 0} { + print_error "No driver found for '$input_net'." $should_exist + } elseif {[llength $driverPorts] > 1} { + puts "WARNING: Multiple driver ports found for '$input_net'." + return [lindex $driverPorts 0] + } + return $driverPorts + } elseif {[llength $driverPins] > 1} { + puts "WARNING: Multiple driver pins found for '$input_net'." + return [lindex $driverPins 0] + } + return $driverPins +} + +proc find_pin_driver {input_pin {should_exist 1}} { + set net [get_nets -quiet -of_objects $input_pin] + if {[llength $net] == 0} { + print_error "No net connected to pin '$input_pin'." $should_exist; return "" ;# only reached when should_exist is 0: no net means no driver + } elseif {[llength $net] > 1} { + puts "ERROR: Multiple nets connected to pin '$input_pin'." + exit -1 + } + return [find_net_driver $net $should_exist] ;# forward the flag so an optional lookup stays non-fatal +} + +proc find_matching_nets {cell nets match repl} { + set matching_nets {} + foreach net $nets { + set net_name [str_replace $net $match $repl] + set matching_net [get_cell_net $cell $net_name] + if {$matching_net != ""} { + lappend matching_nets $matching_net + } + } + if {[llength $matching_nets] == 0} { + puts "ERROR: No matching nets found for '$nets'."
+ exit -1 + } elseif {[llength $matching_nets] != [llength $nets]} { + puts "ERROR: Mismatch in number of matching nets." + exit -1 + } + return $matching_nets +} + +proc replace_net_source {net source_pin} { + foreach pin [find_net_sinks $net 0] { + replace_pin_source $pin $source_pin + } +} + +proc resolve_async_bram {inst} { + variable debug + + puts "INFO: Resolving asynchronous BRAM patch: '$inst'." + + set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"] + set read_s_net [find_cell_nets $inst "read_s$"] + set is_raddr_reg_net [find_cell_nets $inst "is_raddr_reg$"] + + set raddr_s_nets [find_matching_nets $inst $raddr_w_nets "raddr_w(\\\[\\d+\\\])?$" "raddr_s\\1"] + + set reg_next_pins {} + set reg_ce_src_pin "" + + foreach raddr_w_net $raddr_w_nets { + if {$debug} {puts "DEBUG: Processing raddr_w net: '$raddr_w_net'"} + + # Find raddr_w_net's driver pin + set raddr_src_pin [find_net_driver $raddr_w_net] + if {$debug} {puts "DEBUG: raddr_src_pin: '$raddr_src_pin'"} + + # Get the driver cell + set raddr_src_cell [get_cells -of_objects $raddr_src_pin] + if {[llength $raddr_src_cell] == 0} { + puts "ERROR: No source cell found connected to pin '$raddr_src_pin'." + exit -1 + } elseif {[llength $raddr_src_cell] > 1} { + puts "ERROR: Multiple source cells found connected to pin '$raddr_src_pin'." + exit -1 + } + + # Check driver type + set driver_type [get_property REF_NAME $raddr_src_cell] + if {$driver_type == "FDRE" || $driver_type == "FDSE"} { + if {$debug} {puts "DEBUG: Net '$raddr_w_net' is registered, driver_type='$driver_type'"} + } else { + puts "WARNING: Net '$raddr_w_net' is not be registered, driver_type='$driver_type'" + break + } + + # Create register next cell and return output pin + set reg_next_pin [create_register_next $raddr_src_cell "$inst/raddr_next"] + if {$reg_next_pin == ""} { + puts "ERROR: failed to create register next value for '$raddr_src_cell'." 
+ exit -1 + } + if {$debug} {puts "DEBUG: reg_next_pin: '$reg_next_pin'"} + + lappend reg_next_pins $reg_next_pin + + # Find the CE pin on raddr_src_cell + if {$reg_ce_src_pin == ""} { + set reg_ce_pin [get_pins -of_objects $raddr_src_cell -filter {NAME =~ "*/CE"}] + if {[llength $reg_ce_pin] == 0} { + puts "ERROR: No CE pin found on register cell '$raddr_src_cell'." + exit -1 + } elseif {[llength $reg_ce_pin] > 1} { + puts "ERROR: Multiple CE pins found on register cell '$raddr_src_cell'." + exit -1 + } + if {$debug} {puts "DEBUG: reg_ce_pin: '$reg_ce_pin'"} + + set reg_ce_src_pin [find_pin_driver $reg_ce_pin] + if {$reg_ce_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_ce_pin'." + exit -1 + } + if {$debug} {puts "DEBUG: reg_ce_src_pin: '$reg_ce_src_pin'"} + } + } + + # do we have a fully registered read address? + if {[llength $reg_next_pins] == [llength $raddr_w_nets]} { + puts "INFO: Fully registered read address detected." + set addr_width [llength $raddr_w_nets] + for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} { + set raddr_w_net [lindex $raddr_w_nets $addr_idx] + set raddr_s_net [lindex $raddr_s_nets $addr_idx] + set reg_next_pin [lindex $reg_next_pins $addr_idx] + puts "INFO: Connecting pin '$reg_next_pin' to '$raddr_s_net's pins." + # Connect reg_next_pin to all input pins attached to raddr_s_net + replace_net_source $raddr_s_net $reg_next_pin + } + + # Connect reg_ce_src_pin to all input pins attached to read_s_net + puts "INFO: Connecting pin '$reg_ce_src_pin' to '$read_s_net's pins." + replace_net_source $read_s_net $reg_ce_src_pin + + # Create Const<1>'s pin + set vcc_pin [getOrCreateVCCPin "$inst/VCC"] + + # Connect vcc_pin to all input pins attached to is_raddr_reg_net + puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins." + replace_net_source $is_raddr_reg_net $vcc_pin + } else { + puts "WARNING: Not all read addresses are registered!" 
+ + # Create Const<0>'s pin + set gnd_pin [getOrCreateGNDPin "$inst/GND"] + + # Connect gnd_pin to all input pins attached to is_raddr_reg_net + puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins." + replace_net_source $is_raddr_reg_net $gnd_pin + } + + # Remove all placeholder cells + foreach cell [find_nested_cells $inst "placeholder$"] { + remove_cell $cell + if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} + } +} + +proc resolve_async_brams {} { + set bram_patch_cells {} + foreach cell [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] { + puts "INFO: Found async BRAM patch cell: '$cell'." + lappend bram_patch_cells $cell + } + if {[llength $bram_patch_cells] != 0} { + foreach cell $bram_patch_cells { + resolve_async_bram $cell + } + } else { + puts "INFO: No async BRAM patch cells found in the design." + } +} + +} + +# Invoke the procedure to resolve async BRAM +vortex::resolve_async_brams diff --git a/hw/scripts/xilinx_export_netlist.tcl b/hw/scripts/xilinx_export_netlist.tcl new file mode 100644 index 000000000..25a0d17e8 --- /dev/null +++ b/hw/scripts/xilinx_export_netlist.tcl @@ -0,0 +1,71 @@ +# Function to export netlist to a Graphviz DOT file +proc export_netlist {dot_file_name} { + # Open the DOT file for writing + set dot_file [open $dot_file_name "w"] + + # Start the DOT graph definition + puts $dot_file "digraph Netlist {" + puts $dot_file "rankdir=LR;" ;# Set the graph direction from left to right + + # Extract and add cells to the graph + foreach cell [get_cells -hierarchical] { + set cell_name [get_property NAME $cell] + set cell_type [get_property REF_NAME $cell] + puts $dot_file "\"$cell_name\" \[label=\"$cell_name\\n($cell_type)\", shape=box\];" + } + + # Extract and add ports to the graph + foreach port [get_ports] { + set port_name [get_property NAME $port] + set direction [get_property DIRECTION $port] + set shape "ellipse" + + # Color code input and output ports for easier identification 
+ if {$direction == "IN"} { + set color "lightblue" + } else { + set color "lightgreen" + } + puts $dot_file "\"$port_name\" \[label=\"$port_name\", shape=$shape, style=filled, fillcolor=$color\];" + } + + # Traverse nets and create edges between ports and pins + foreach net [get_nets -hierarchical] { + set net_name [get_property NAME $net] + + # Find source and destination pins + set source_pin "" + set sink_pins {} + + foreach pin [get_pins -of_objects $net] { + set direction [get_property DIRECTION $pin] + set cell [get_cells -of_objects $pin] + set pin_name [get_property NAME $pin] + + if {$direction == "OUT"} { + # Set as source pin + set source_pin "$cell/$pin_name" + } else { + # Collect as sink pin + lappend sink_pins "$cell/$pin_name" + } + } + + # Output edges from source to all sinks + if {$source_pin != ""} { + foreach sink_pin $sink_pins { + puts $dot_file "\"$source_pin\" -> \"$sink_pin\" \[label=\"$net_name\"\];" + } + } + } + + # End the DOT graph definition + puts $dot_file "}" + + # Close the DOT file + close $dot_file + puts "Netlist exported to DOT file: $dot_file_name" +} + +# Run the export function +export_netlist "netlist.dot" \ No newline at end of file diff --git a/hw/syn/xilinx/dut/common.mk b/hw/syn/xilinx/dut/common.mk index 933621bef..81946c88f 100644 --- a/hw/syn/xilinx/dut/common.mk +++ b/hw/syn/xilinx/dut/common.mk @@ -31,9 +31,9 @@ project_1/sources.txt: build: $(PROJECT).xpr $(PROJECT).xpr: project_1/sources.txt ifdef FPU_IP - MAX_JOBS=$(JOBS) FPU_IP=project_1/ip $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) project_1/sources.txt $(SRC_DIR)/project.xdc $(SCRIPT_DIR) + MAX_JOBS=$(JOBS) FPU_IP=project_1/ip SCRIPT_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) project_1/sources.txt $(SRC_DIR)/project.xdc else - MAX_JOBS=$(JOBS) $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) 
project_1/sources.txt $(SRC_DIR)/project.xdc $(SCRIPT_DIR) + MAX_JOBS=$(JOBS) SCRIPT_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) project_1/sources.txt $(SRC_DIR)/project.xdc endif clean: diff --git a/hw/syn/xilinx/dut/project.tcl b/hw/syn/xilinx/dut/project.tcl index dcaf883fa..9cb173c22 100644 --- a/hw/syn/xilinx/dut/project.tcl +++ b/hw/syn/xilinx/dut/project.tcl @@ -14,9 +14,9 @@ # Start time set start_time [clock seconds] -if { $::argc != 5 } { - puts "ERROR: Program \"$::argv0\" requires 5 arguments!\n" - puts "Usage: $::argv0 \n" +if { $::argc != 4 } { + puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" + puts "Usage: $::argv0 \n" exit } @@ -27,13 +27,16 @@ set top_module [lindex $::argv 0] set device_part [lindex $::argv 1] set vcs_file [lindex $::argv 2] set xdc_file [lindex $::argv 3] -set tool_dir [lindex $::argv 4] + +set script_dir $::env(SCRIPT_DIR) +set source_dir [file dirname [info script]] puts "Using top_module=$top_module" puts "Using device_part=$device_part" puts "Using vcs_file=$vcs_file" puts "Using xdc_file=$xdc_file" -puts "Using tool_dir=$tool_dir" +puts "Using script_dir=$script_dir" +puts "Using source_dir=$source_dir" # Set the number of jobs based on MAX_JOBS environment variable if {[info exists ::env(MAX_JOBS)]} { @@ -48,10 +51,10 @@ if {[info exists ::env(FPU_IP)]} { set ip_dir $::env(FPU_IP) set argv [list $ip_dir $device_part] set argc 2 - source ${tool_dir}/xilinx_ip_gen.tcl + source ${script_dir}/xilinx_ip_gen.tcl } -source "${tool_dir}/parse_vcs_list.tcl" +source "${script_dir}/parse_vcs_list.tcl" set vlist [parse_vcs_list "${vcs_file}"] set vsources_list [lindex $vlist 0] @@ -84,37 +87,52 @@ if {[info exists ::env(FPU_IP)]} { update_compile_order -fileset sources_1 +# Synthesis set_property top $top_module [current_fileset] + set_property \ -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \ -value {-mode out_of_context -flatten_hierarchy "rebuilt"} \ 
-objects [get_runs synth_1] -# Synthesis +# register compilation hooks +#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1] +set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1] +#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1] +#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1] +#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1] + if {$num_jobs != 0} { - launch_runs synth_1 -jobs $num_jobs + launch_runs synth_1 -verbose -jobs $num_jobs } else { - launch_runs synth_1 + launch_runs synth_1 -verbose } wait_on_run synth_1 open_run synth_1 write_checkpoint -force post_synth.dcp -report_utilization -file utilization.rpt -hierarchical -hierarchical_percentages +report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages # Implementation if {$num_jobs != 0} { - launch_runs impl_1 -jobs $num_jobs + launch_runs impl_1 -verbose -jobs $num_jobs } else { - launch_runs impl_1 + launch_runs impl_1 -verbose } wait_on_run impl_1 open_run impl_1 write_checkpoint -force post_impl.dcp +report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages # Generate the synthesis report report_place_status -file place.rpt report_route_status -file route.rpt report_timing_summary -file timing.rpt + +# Generate timing report (appended: timing.rpt already holds the timing summary written above) +report_timing -nworst 10 -delay_type max -sort_by group -file timing.rpt -append + +# Generate power and drc reports report_power -file power.rpt report_drc -file drc.rpt @@ -125,4 +143,4 @@ set elapsed_time [expr {[clock seconds] - $start_time}] set hours [format "%02d" [expr {$elapsed_time / 3600}]] set minutes [format "%02d" [expr {($elapsed_time % 3600) / 60}]] set seconds [format "%02d" [expr {$elapsed_time % 60}]] -puts "Total
elapsed time: ${hours}h ${minutes}m ${seconds}s" \ No newline at end of file +puts "Total elapsed time: ${hours}h ${minutes}m ${seconds}s" diff --git a/hw/syn/xilinx/dut/unittest/Makefile b/hw/syn/xilinx/dut/unittest/Makefile index 1bc66aa38..3d756562e 100644 --- a/hw/syn/xilinx/dut/unittest/Makefile +++ b/hw/syn/xilinx/dut/unittest/Makefile @@ -1,4 +1,4 @@ -PROJECT = Unittest +PROJECT = VX_fifo_queue TOP_LEVEL_ENTITY = $(PROJECT) SRC_FILE = $(PROJECT).sv diff --git a/hw/syn/xilinx/sandbox/Makefile b/hw/syn/xilinx/sandbox/Makefile index e4def9c4e..074fcb87c 100644 --- a/hw/syn/xilinx/sandbox/Makefile +++ b/hw/syn/xilinx/sandbox/Makefile @@ -24,11 +24,8 @@ FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif -TEX_INCLUDE = -I$(RTL_DIR)/tex -RASTER_INCLUDE = -I$(RTL_DIR)/raster -OM_INCLUDE = -I$(RTL_DIR)/om RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -RTL_INCLUDE += $(FPU_INCLUDE) $(TEX_INCLUDE) $(RASTER_INCLUDE) $(OM_INCLUDE) +RTL_INCLUDE += $(FPU_INCLUDE) RTL_INCLUDE += -I$(SRC_DIR) # compilation flags diff --git a/hw/syn/xilinx/sandbox/project.tcl.in b/hw/syn/xilinx/sandbox/project.tcl.in index 8926b43ad..bb1bf86f2 100644 --- a/hw/syn/xilinx/sandbox/project.tcl.in +++ b/hw/syn/xilinx/sandbox/project.tcl.in @@ -121,8 +121,8 @@ proc run_setup {} { # None # Set 'sim_1' fileset file properties for local files -set file "testbench.v" -set file_obj [get_files -of_objects [get_filesets sim_1] [list "*$file"]] + set file "testbench.v" + set file_obj [get_files -of_objects [get_filesets sim_1] [list "*$file"]] set_property -name "file_type" -value "Verilog" -objects $file_obj set_property -name "is_enabled" -value "1" -objects $file_obj set_property -name 
"is_global_include" -value "0" -objects $file_obj @@ -300,7 +300,7 @@ set file_obj [get_files -of_objects [get_filesets sim_1] [list "*$file"]] CONFIG.Assume_Synchronous_Clk {true} \ CONFIG.Byte_Size {8} \ CONFIG.Load_Init_File {true} \ - CONFIG.Coe_File {@CURRENTDIR@/hw/syn/xilinx/sandbox/kernel.bin.coe} \ + CONFIG.Coe_File {@BUILDDIR@/hw/syn/xilinx/sandbox/kernel.bin.coe} \ CONFIG.EN_SAFETY_CKT {true} \ CONFIG.Enable_32bit_Address {true} \ CONFIG.Fill_Remaining_Memory_Locations {false} \ diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index f5997352c..643724069 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -76,22 +76,21 @@ CONFIGS += $(CONFIGS_$(NUM_CORES)c) # include sources RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv -RTL_PKGS += $(RTL_DIR)/tex/VX_tex_pkg.sv $(RTL_DIR)/raster/VX_raster_pkg.sv $(RTL_DIR)/om/VX_om_pkg.sv FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) RTL_PKGS += $(THIRD_PARTY_DIR)/cvfpu/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif -TEX_INCLUDE = -I$(RTL_DIR)/tex -RASTER_INCLUDE = -I$(RTL_DIR)/raster -OM_INCLUDE = -I$(RTL_DIR)/om RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) -RTL_INCLUDE += $(FPU_INCLUDE) $(TEX_INCLUDE) $(RASTER_INCLUDE) $(OM_INCLUDE) +RTL_INCLUDE += $(FPU_INCLUDE) # Kernel compiler global settings VPP_FLAGS += --link --target $(TARGET) --platform $(PLATFORM) --save-temps --no_ip_cache VPP_FLAGS += --vivado.synth.jobs $(JOBS) --vivado.impl.jobs $(JOBS) +# register compilation hooks +VPP_FLAGS += --xp 
"vivado_prop:run.impl_1.STEPS.OPT_DESIGN.TCL.PRE={$(SCRIPT_DIR)/xilinx_async_bram_patch.tcl}" + # load platform settings include $(SRC_DIR)/platforms.mk @@ -178,6 +177,7 @@ $(BIN_DIR)/emconfig.json: report: $(XCLBIN_CONTAINER) ifeq ($(TARGET), hw) + cp $(BUILD_DIR)/_x/logs/link/vivado.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/syn/ulp_vortex_afu_1_0_synth_1_ulp_vortex_afu_1_0_utilization_synth.rpt $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt $(BUILD_DIR)/bin diff --git a/hw/unittest/Makefile b/hw/unittest/Makefile index f37d6ae1b..d3a74d794 100644 --- a/hw/unittest/Makefile +++ b/hw/unittest/Makefile @@ -1,5 +1,4 @@ all: - $(MAKE) -C cache $(MAKE) -C generic_queue $(MAKE) -C mem_streamer $(MAKE) -C cache_top @@ -9,7 +8,6 @@ all: $(MAKE) -C mem_unit_top run: - $(MAKE) -C cache run $(MAKE) -C generic_queue run $(MAKE) -C mem_streamer run $(MAKE) -C cache_top run @@ -19,7 +17,6 @@ run: $(MAKE) -C mem_unit_top run clean: - $(MAKE) -C cache clean $(MAKE) -C generic_queue clean $(MAKE) -C mem_streamer clean $(MAKE) -C cache_top clean diff --git a/hw/unittest/cache/Makefile b/hw/unittest/cache/Makefile deleted file mode 100644 index b734aaedd..000000000 --- a/hw/unittest/cache/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -ROOT_DIR := $(realpath ../../..) 
-include $(ROOT_DIR)/config.mk - -PROJECT := cache - -RTL_DIR := $(VORTEX_HOME)/hw/rtl -DPI_DIR := $(VORTEX_HOME)/hw/dpi - -SRC_DIR := $(VORTEX_HOME)/hw/unittest/$(PROJECT) - -CXXFLAGS := -I$(SRC_DIR) -I$(VORTEX_HOME)/hw/unittest/common -I$(VORTEX_HOME)/sim/common -CXXFLAGS += -I$(ROOT_DIR)/hw - -SRCS := $(DPI_DIR)/util_dpi.cpp -SRCS += $(SRC_DIR)/cachesim.cpp $(SRC_DIR)/testbench.cpp - -DBG_TRACE_FLAGS := -DDBG_TRACE_CACHE - -RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv - -RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache - -TOP := VX_cache_top - -include ../common.mk \ No newline at end of file diff --git a/hw/unittest/cache/cachesim.cpp b/hw/unittest/cache/cachesim.cpp deleted file mode 100644 index acd68419b..000000000 --- a/hw/unittest/cache/cachesim.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cachesim.h" -#include -#include -#include -#include -#include - -#ifndef TRACE_START_TIME -#define TRACE_START_TIME 0ull -#endif - -#ifndef TRACE_STOP_TIME -#define TRACE_STOP_TIME -1ull -#endif - -static uint64_t timestamp = 0; -static bool trace_enabled = false; -static uint64_t trace_start_time = TRACE_START_TIME; -static uint64_t trace_stop_time = TRACE_STOP_TIME; - -double sc_time_stamp() { - return timestamp; -} - -bool sim_trace_enabled() { - if (timestamp >= trace_start_time - && timestamp < trace_stop_time) - return true; - return trace_enabled; -} - -void sim_trace_enable(bool enable) { - trace_enabled = enable; -} - -CacheSim::CacheSim() { - // force random values for uninitialized signals - Verilated::randReset(2); - - // create RTL module instance - cache_ = new VVX_cache_top(); - -#ifdef VCD_OUTPUT - Verilated::traceEverOn(true); - tfp_ = new VerilatedVcdC; - cache_->trace(tfp_, 99); - tfp_->open("trace.vcd"); -#endif - - ram_ = nullptr; - mem_rsp_active_ = false; - snp_req_active_ = false; -} - -CacheSim::~CacheSim() { -#ifdef VCD_OUTPUT - tfp_->close(); -#endif - delete cache_; - //need to delete the req and rsp vectors -} - -void CacheSim::attach_ram(RAM* ram) { - ram_ = ram; - mem_rsp_vec_.clear(); -} - -void CacheSim::reset() { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] reset()" << std::endl; -#endif - - cache_->reset = 1; - this->step(); - cache_->reset = 0; - this->step(); - - mem_rsp_vec_.clear(); - //clear req and rsp vecs - -} - -void CacheSim::step() { - //std::cout << timestamp << ": [sim] step()" << std::endl; - //toggle clock - cache_->clk = 0; - this->eval(); - - cache_->clk = 1; - this->eval(); - - //handle core and memory reqs and rsps - this->eval_reqs(); - this->eval_rsps(); - this->eval_mem_bus(); - timestamp++; -} - -void CacheSim::eval() { - cache_->eval(); -#ifdef VCD_OUTPUT - tfp_->dump(timestamp); -#endif - ++timestamp; -} - -void CacheSim::run(){ -//#ifndef NDEBUG - -//#endif - this->step(); - - int 
valid = 300; - int stalls = 20 + 10; - - while (valid > -1) { - - this->step(); - display_miss(); - if(cache_->core_rsp_valid){ - get_core_rsp(); - } - - if(!cache_->core_req_valid && !cache_->core_rsp_valid){ - valid--; - - } - stalls--; - if (stalls == 20){ - //stall_mem(); - //send_snoop_req(); - stalls--; - } - } -} - -void CacheSim::clear_req(){ - cache_->core_req_valid = 0; -} - -void CacheSim::send_req(core_req_t *req){ - core_req_vec_.push(req); - unsigned int *data = new unsigned int[4]; - core_rsp_vec_.insert(std::pair(req->tag, data)); -} - -bool CacheSim::get_core_req_ready(){ - return cache_->core_req_ready; -} - -bool CacheSim::get_core_rsp_ready(){ - return cache_->core_rsp_ready; -} - -void CacheSim::eval_reqs(){ - //check to see if cache is accepting reqs - if(!core_req_vec_.empty() && cache_->core_req_ready){ - core_req_t *req = core_req_vec_.front(); - - cache_->core_req_valid = req->valid; - cache_->core_req_rw = req->rw; - cache_->core_req_byteen = req->byteen; - - cache_->core_req_addr[0] = req->addr[0]; - cache_->core_req_addr[1] = req->addr[1]; - cache_->core_req_addr[2] = req->addr[2]; - cache_->core_req_addr[3] = req->addr[3]; - - cache_->core_req_data[0] = req->data[0]; - cache_->core_req_data[1] = req->data[1]; - cache_->core_req_data[2] = req->data[2]; - cache_->core_req_data[3] = req->data[3]; - - cache_->core_req_tag = req->tag; - - core_req_vec_.pop(); - - } else { - clear_req(); - } -} - -void CacheSim::eval_rsps(){ - //check to see if a request has been responded to - if (cache_->core_rsp_valid){ - core_rsp_vec_.at(cache_->core_rsp_tag)[0] = cache_->core_rsp_data[0]; - core_rsp_vec_.at(cache_->core_rsp_tag)[1] = cache_->core_rsp_data[1]; - core_rsp_vec_.at(cache_->core_rsp_tag)[2] = cache_->core_rsp_data[2]; - core_rsp_vec_.at(cache_->core_rsp_tag)[3] = cache_->core_rsp_data[3]; - } -} - -void CacheSim::stall_mem(){ - cache_->mem_req_ready = 0; -} - -void CacheSim::send_snoop_req(){ - /*cache_->snp_req_valid = 1; - 
cache_->snp_req_addr = 0x12222222; - cache_->snp_req_invalidate = 1; - cache_->snp_req_tag = 0xff; */ -} - -void CacheSim::eval_mem_bus() { - if (ram_ == nullptr) { - cache_->mem_req_ready = 0; - return; - } - - // schedule memory responses - int dequeue_index = -1; - for (int i = 0; i < mem_rsp_vec_.size(); i++) { - if (mem_rsp_vec_[i].cycles_left > 0) { - mem_rsp_vec_[i].cycles_left -= 1; - } - if ((dequeue_index == -1) - && (mem_rsp_vec_[i].cycles_left == 0)) { - dequeue_index = i; - } - } - - // send memory response - if (mem_rsp_active_ - && cache_->mem_rsp_valid - && cache_->mem_rsp_ready) { - mem_rsp_active_ = false; - } - if (!mem_rsp_active_) { - if (dequeue_index != -1) { //time to respond to the request - cache_->mem_rsp_valid = 1; - - //copy data from the rsp queue to the cache module - memcpy(cache_->mem_rsp_data.data(), mem_rsp_vec_[dequeue_index].data, MEM_BLOCK_SIZE); - - cache_->mem_rsp_tag = mem_rsp_vec_[dequeue_index].tag; - free(mem_rsp_vec_[dequeue_index].data); //take data out of the queue - mem_rsp_vec_.erase(mem_rsp_vec_.begin() + dequeue_index); - mem_rsp_active_ = true; - } else { - cache_->mem_rsp_valid = 0; - } - } - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (cache_->mem_req_valid) { - if (cache_->mem_req_rw) { //write = 1 - uint64_t byteen = cache_->mem_req_byteen; - uint64_t base_addr = (cache_->mem_req_addr * MEM_BLOCK_SIZE); - uint8_t* data = reinterpret_cast(cache_->mem_req_data.data()); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - } else { - mem_req_t mem_req; - mem_req.cycles_left = MEM_LATENCY; - mem_req.data = (uint8_t*)malloc(MEM_BLOCK_SIZE); - mem_req.tag = cache_->mem_req_tag; - 
ram_->read(cache_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data); - mem_rsp_vec_.push_back(mem_req); - } - } - } - - cache_->mem_req_ready = ~mem_stalled; -} - -bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){ - int check = 0; - unsigned int *rsp = core_rsp_vec_.at(tag); - for (int i = 0; i < 4; ++i){ - for (int j = 0; j < 4; ++j){ - if (data[i] == rsp[j]){ - check++; - } - } - } - - return check; - -} - -//DEBUG - -void CacheSim::display_miss(){ - //int i = (unsigned int)cache_->miss_vec; - //std::bitset<8> x(i); - //if (i) std::cout << "Miss Vec " << x << std::endl; - //std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl; -} - -void CacheSim::get_core_req(unsigned int (&rsp)[4]){ - rsp[0] = cache_->core_rsp_data[0]; - rsp[1] = cache_->core_rsp_data[1]; - rsp[2] = cache_->core_rsp_data[2]; - rsp[3] = cache_->core_rsp_data[3]; - - //std::cout << std::hex << "core_rsp_valid: " << cache_->core_rsp_valid << std::endl; - //std::cout << std::hex << "core_rsp_data: " << cache_->core_rsp_data << std::endl; - //std::cout << std::hex << "core_rsp_tag: " << cache_->core_rsp_tag << std::endl; -} - -void CacheSim::get_core_rsp(){ - //std::cout << cache_->genblk5_BRA_0_KET_->bank->is_fill_in_pipe<< std::endl; - char check = cache_->core_rsp_valid; - std::cout << std::hex << "core_rsp_valid: " << (unsigned int) check << std::endl; - std::cout << std::hex << "core_rsp_data[0]: " << cache_->core_rsp_data[0] << std::endl; - std::cout << std::hex << "core_rsp_data[1]: " << cache_->core_rsp_data[1] << std::endl; - std::cout << std::hex << "core_rsp_data[2]: " << cache_->core_rsp_data[2] << std::endl; - std::cout << std::hex << "core_rsp_data[3]: " << cache_->core_rsp_data[3] << std::endl; - std::cout << std::hex << "core_rsp_tag: " << cache_->core_rsp_tag << std::endl; -} - -void CacheSim::get_mem_req(){ - std::cout << std::hex << "mem_req_valid: " << cache_->mem_req_valid << std::endl; - std::cout << std::hex << "mem_req_rw: " << 
cache_->mem_req_rw << std::endl; - std::cout << std::hex << "mem_req_byteen: " << cache_->mem_req_byteen << std::endl; - std::cout << std::hex << "mem_req_addr: " << cache_->mem_req_addr << std::endl; - std::cout << std::hex << "mem_req_data: " << cache_->mem_req_data << std::endl; - std::cout << std::hex << "mem_req_tag: " << cache_->mem_req_tag << std::endl; -} - -void CacheSim::get_mem_rsp(){ - std::cout << std::hex << "mem_rsp_valid: " << cache_->mem_rsp_valid << std::endl; - std::cout << std::hex << "mem_rsp_data: " << cache_->mem_rsp_data << std::endl; - std::cout << std::hex << "mem_rsp_tag: " << cache_->mem_rsp_tag << std::endl; - std::cout << std::hex << "mem_rsp_ready: " << cache_->mem_rsp_ready << std::endl; -} diff --git a/hw/unittest/cache/cachesim.h b/hw/unittest/cache/cachesim.h deleted file mode 100644 index 5235735d6..000000000 --- a/hw/unittest/cache/cachesim.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "VVX_cache_top.h" -#include "VVX_cache_top__Syms.h" -#include "verilated.h" - -#ifdef VCD_OUTPUT -#include -#endif - -#include -#include "ram.h" -#include -#include -#include - -#define ENABLE_MEM_STALLS -#define MEM_LATENCY 100 -#define MEM_RQ_SIZE 16 -#define MEM_STALLS_MODULO 16 - -typedef struct { - int cycles_left; - uint8_t *data; - unsigned tag; -} mem_req_t; - -typedef struct { - char valid; - char rw; - unsigned byteen; - unsigned *addr; - unsigned *data; - unsigned int tag; -} core_req_t; - -class CacheSim { -public: - - CacheSim(); - virtual ~CacheSim(); - - bool busy(); - - void reset(); - void step(); - void wait(uint32_t cycles); - void attach_ram(RAM* ram); - void run(); //run until all reqs are empty - - //req/rsp - void send_req(core_req_t *req); - void clear_req(); - void stall_mem(); - void send_snoop_req(); - void send_snp_fwd_in(); - - //assert funcs - bool assert_equal(unsigned int* data, unsigned int tag); - - //debug funcs - void get_mem_req(); - void get_core_req(unsigned int (&rsp)[4]); - void get_core_rsp(); - bool get_core_req_ready(); - bool get_core_rsp_ready(); - void get_mem_rsp(); - void display_miss(); - -private: - - void eval(); - void eval_reqs(); - void eval_rsps(); - void eval_mem_bus(); - - std::queue core_req_vec_; - std::vector mem_rsp_vec_; - std::map core_rsp_vec_; - int mem_rsp_active_; - - uint32_t snp_req_active_; - uint32_t snp_req_size_; - uint32_t pending_snp_reqs_; - - VVX_cache_top* cache_; - RAM* ram_; -#ifdef VCD_OUTPUT - VerilatedVcdC* tfp_; -#endif -}; diff --git a/hw/unittest/cache/ram.h b/hw/unittest/cache/ram.h deleted file mode 100644 index d01934a52..000000000 --- a/hw/unittest/cache/ram.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -class RAM { -private: - - mutable uint8_t *mem_[(1 << 12)]; - - uint8_t *get(uint32_t address) const { - uint32_t block_addr = address >> 20; - uint32_t block_offset = address & 0x000FFFFF; - if (mem_[block_addr] == NULL) { - mem_[block_addr] = new uint8_t[(1 << 20)]; - } - return mem_[block_addr] + block_offset; - } - -public: - - RAM() { - for (uint32_t i = 0; i < (1 << 12); i++) { - mem_[i] = NULL; - } - } - - ~RAM() { - this->clear(); - } - - size_t size() const { - return (1ull << 32); - } - - void clear() { - for (uint32_t i = 0; i < (1 << 12); i++) { - if (mem_[i]) { - delete [] mem_[i]; - mem_[i] = NULL; - } - } - } - - void read(uint32_t address, uint32_t length, uint8_t *data) const { - for (unsigned i = 0; i < length; i++) { - data[i] = *this->get(address + i); - } - } - - void write(uint32_t address, uint32_t length, const uint8_t *data) { - for (unsigned i = 0; i < length; i++) { - *this->get(address + i) = data[i]; - } - } - - uint8_t& operator[](uint32_t address) { - return *get(address); - } - - const uint8_t& operator[](uint32_t address) const { - return *get(address); - } -}; \ No newline at end of file diff --git a/hw/unittest/cache/testbench.cpp b/hw/unittest/cache/testbench.cpp deleted file mode 100644 index bf9dfb340..000000000 --- a/hw/unittest/cache/testbench.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "cachesim.h" -#include -#include -#include - -#define VCD_OUTPUT 1 - - -int REQ_RSP(CacheSim *sim){ //verified - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read); - - sim->run(); - - int check = sim->assert_equal(data, write->tag); - - if (check == 4) return 1; - - return 0; -} - -int HIT_1(CacheSim *sim){ - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0x11; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0x22; - - // reset the device - sim->reset(); - - //queue reqs - 
sim->send_req(write); - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - -int MISS_1(CacheSim *sim){ - unsigned int addr1[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int addr2[4] = {0x12229222, 0xabbbb4bb, 0xcddd47dd, 0xe4423544}; - unsigned int addr3[4] = {0x12223332, 0xabb454bb, 0xcdddeefd, 0xe4447744}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr1; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read1 = new core_req_t; - read1->valid = 0xf; - read1->rw = 0; - read1->byteen = 0xffff; - read1->addr = addr1; - read1->data = data; - read1->tag = 0xff; - - core_req_t* read2 = new core_req_t; - read2->valid = 0xf; - read2->rw = 0; - read2->byteen = 0xffff; - read2->addr = addr2; - read2->data = data; - read2->tag = 0xff; - - core_req_t* read3 = new core_req_t; - read3->valid = 0xf; - read3->rw = 0; - read3->byteen = 0xffff; - read3->addr = addr3; - read3->data = data; - read3->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read1); - sim->send_req(read2); - sim->send_req(read3); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} -int FLUSH(CacheSim *sim){ - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - 
read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - - -int BACK_PRESSURE(CacheSim *sim){ - //happens whenever the core is stalled or memory is stalled - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - for (int i = 0; i < 10; i++){ - sim->send_req(write); - } - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - - -int main(int argc, char **argv) -{ - //init - RAM ram; - CacheSim cachesim; - cachesim.attach_ram(&ram); - int check = REQ_RSP(&cachesim); - if(check){ - std::cout << "PASSED" << std::endl; - } else { - std::cout << "FAILED" << std::endl; - } - - return 0; -} diff --git a/hw/unittest/generic_queue/Makefile b/hw/unittest/generic_queue/Makefile index 0adf78fae..ad79c6f94 100644 --- a/hw/unittest/generic_queue/Makefile +++ b/hw/unittest/generic_queue/Makefile @@ -21,4 +21,6 @@ RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs TOP := VX_fifo_queue +PARAMS := -GDATAW=32 -GDEPTH=8 + include ../common.mk \ No newline at end of file diff --git a/miscs/docker/Dockerfile.prod b/miscs/docker/Dockerfile.prod index e1a8d94b5..20c9c033b 100644 --- a/miscs/docker/Dockerfile.prod +++ 
b/miscs/docker/Dockerfile.prod @@ -18,41 +18,32 @@ FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive # Install necessary dependencies and upgrade installed components -RUN apt-get update -y && \ - apt-get install -y \ +# Update and install necessary dependencies +RUN apt-get update && apt-get install -y \ software-properties-common \ build-essential \ python3 \ git \ wget \ curl \ - ca-certificates \ - valgrind \ - libstdc++6 \ - binutils \ - uuid-dev \ - ccache \ - cmake && \ - apt-get upgrade -y && \ - gcc_version=$(gcc -dumpversion) && \ - if dpkg --compare-versions "$gcc_version" lt 11; then \ - echo "GCC version is less than 11. Installing GCC 11..." && \ - add-apt-repository -y ppa:ubuntu-toolchain-r/test && \ - apt-get update -y && \ - apt-get install -y g++-11 gcc-11 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100; \ - else \ - echo "GCC version is 11 or greater. No need to install GCC 11."; \ - fi && \ + ca-certificates && \ rm -rf /var/lib/apt/lists/* +# upgrade installed components +RUN apt-get upgrade && apt-get update + +# temporary until remote dependency script gets updated +RUN apt-get install -y cmake + # Clone the Vortex repository RUN git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git /vortex # Set the initial working directory WORKDIR /vortex +# install system dependencies +RUN ./ci/install_dependencies.sh + # Configure the build folder RUN mkdir build && cd build && ../configure diff --git a/miscs/docker/README.md b/miscs/docker/README.md index 897f8f9fb..c077102da 100644 --- a/miscs/docker/README.md +++ b/miscs/docker/README.md @@ -4,17 +4,32 @@ You can install Docker desktop on MAC or PC or Ubuntu. 
- MAC: https://docs.docker.com/desktop/install/mac-install - Ubuntu: https://docs.docker.com/desktop/install/ubuntu -### 1- Create a Docker image from the Dockerfile - $ docker build -f Dockerfile.ubuntu -t vortex +### 1- Build a Docker Image from the Dockerfile + $ docker build --platform=linux/amd64 -t vortex-packaged -f Dockerfile.prod . -### 2- Build the Docker image - $ docker docker run -it vortex /bin/bash +### 2- Construct and run a Container from the Docker Image + $ docker run -it --name vortex --privileged=true --platform=linux/amd64 vortex-packaged -### 3- Build the project +### 3- Build the Project One you login the Docker terminal, you will be in the build directory. $ make -s -### 4- Run a simple test +### 4- Run a Simple Test +See `docs/` to learn more! - $ ./ci/blackbox.sh --cores=2 --app=vecadd \ No newline at end of file + $ ./ci/blackbox.sh --cores=2 --app=vecadd + +### 5- Exit the Container + + $ exit + $ docker stop vortex + +### 6- Restart and Re-Enter the Container +If you ran step `2` and then step `5`, you have to start and re-enter the container: + + $ docker start vortex + $ docker exec -it vortex /bin/bash + +--- +Note: Apple Silicon Macs will run the container in emulation mode, so compiling and running will take a considerable amount of time -- but it still works! 
\ No newline at end of file diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index 32f4b4e1e..1807e5630 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -241,8 +241,6 @@ class Processor::Impl { #ifdef VCD_OUTPUT if (sim_trace_enabled()) { tfp_->dump(timestamp); - } else { - exit(-1); } #endif ++timestamp; diff --git a/sim/xrtsim/xrt_sim.cpp b/sim/xrtsim/xrt_sim.cpp index d572b9479..8dd800931 100644 --- a/sim/xrtsim/xrt_sim.cpp +++ b/sim/xrtsim/xrt_sim.cpp @@ -333,14 +333,27 @@ class xrt_sim::Impl { } device_->ap_rst_n = 1; + + // this AXI device is always ready to accept new requests for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { *m_axi_mem_[i].arready = 1; *m_axi_mem_[i].awready = 1; + *m_axi_mem_[i].wready = 1; } } void tick() { - this->axi_mem_bus_eval(); + device_->ap_clk = 0; + this->eval(); + + this->axi_mem_bus_eval(0); + + device_->ap_clk = 1; + this->eval(); + + this->axi_mem_bus_eval(1); + + dram_sim_.tick(); for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { if (!dram_queues_[i].empty()) { @@ -358,13 +371,6 @@ class xrt_sim::Impl { } } - dram_sim_.tick(); - - device_->ap_clk = 0; - this->eval(); - device_->ap_clk = 1; - this->eval(); - #ifndef NDEBUG fflush(stdout); #endif @@ -381,162 +387,175 @@ class xrt_sim::Impl { } void axi_ctrl_bus_reset() { - // address read request + // read request address device_->s_axi_ctrl_arvalid = 0; device_->s_axi_ctrl_araddr = 0; - // data read response + // read response device_->s_axi_ctrl_rready = 0; - // address write request + // write request address device_->s_axi_ctrl_awvalid = 0; device_->s_axi_ctrl_awaddr = 0; - // data write request + // write request data device_->s_axi_ctrl_wvalid = 0; device_->s_axi_ctrl_wdata = 0; device_->s_axi_ctrl_wstrb = 0; - // data write response + // write response device_->s_axi_ctrl_bready = 0; } void axi_mem_bus_reset() { - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { - // address read request - *m_axi_mem_[i].arready = 0; + for (int b 
= 0; b < PLATFORM_MEMORY_BANKS; ++b) { + // read request address + *m_axi_mem_[b].arready = 0; - // address write request - *m_axi_mem_[i].awready = 0; + // write request address + *m_axi_mem_[b].awready = 0; - // data write request - *m_axi_mem_[i].wready = 0; + // write request data + *m_axi_mem_[b].wready = 0; - // data read response - *m_axi_mem_[i].rvalid = 0; + // read response + *m_axi_mem_[b].rvalid = 0; - // data write response - *m_axi_mem_[i].bvalid = 0; + // write response + *m_axi_mem_[b].bvalid = 0; // states - m_axi_states_[i].write_req_pending = false; + m_axi_states_[b].write_req_addr_ack = false; + m_axi_states_[b].write_req_data_ack = false; } } - void axi_mem_bus_eval() { - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { + void axi_mem_bus_eval(bool clk) { + if (!clk) { + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { + m_axi_states_[b].read_rsp_ready = *m_axi_mem_[b].rready; + m_axi_states_[b].write_rsp_ready = *m_axi_mem_[b].bready; + } + return; + } + + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { // handle read responses - if (*m_axi_mem_[i].rvalid && *m_axi_mem_[i].rready) { - *m_axi_mem_[i].rvalid = 0; + if (*m_axi_mem_[b].rvalid && m_axi_states_[b].read_rsp_ready) { + *m_axi_mem_[b].rvalid = 0; } - if (!*m_axi_mem_[i].rvalid) { - if (!pending_mem_reqs_[i].empty() - && (*pending_mem_reqs_[i].begin())->ready - && !(*pending_mem_reqs_[i].begin())->write) { - auto mem_rsp_it = pending_mem_reqs_[i].begin(); + if (!*m_axi_mem_[b].rvalid) { + if (!pending_mem_reqs_[b].empty() + && (*pending_mem_reqs_[b].begin())->ready + && !(*pending_mem_reqs_[b].begin())->write) { + auto mem_rsp_it = pending_mem_reqs_[b].begin(); auto mem_rsp = *mem_rsp_it; - *m_axi_mem_[i].rvalid = 1; - *m_axi_mem_[i].rid = mem_rsp->tag; - *m_axi_mem_[i].rresp = 0; - *m_axi_mem_[i].rlast = 1; - memcpy(m_axi_mem_[i].rdata->data(), mem_rsp->data.data(), PLATFORM_MEMORY_DATA_SIZE); - pending_mem_reqs_[i].erase(mem_rsp_it); + *m_axi_mem_[b].rvalid = 1; + 
*m_axi_mem_[b].rid = mem_rsp->tag; + *m_axi_mem_[b].rresp = 0; + *m_axi_mem_[b].rlast = 1; + memcpy(m_axi_mem_[b].rdata->data(), mem_rsp->data.data(), PLATFORM_MEMORY_DATA_SIZE); + pending_mem_reqs_[b].erase(mem_rsp_it); delete mem_rsp; } } // handle write responses - if (*m_axi_mem_[i].bvalid && *m_axi_mem_[i].bready) { - *m_axi_mem_[i].bvalid = 0; + if (*m_axi_mem_[b].bvalid && m_axi_states_[b].write_rsp_ready) { + *m_axi_mem_[b].bvalid = 0; } - if (!*m_axi_mem_[i].bvalid) { - if (!pending_mem_reqs_[i].empty() - && (*pending_mem_reqs_[i].begin())->ready - && (*pending_mem_reqs_[i].begin())->write) { - auto mem_rsp_it = pending_mem_reqs_[i].begin(); + if (!*m_axi_mem_[b].bvalid) { + if (!pending_mem_reqs_[b].empty() + && (*pending_mem_reqs_[b].begin())->ready + && (*pending_mem_reqs_[b].begin())->write) { + auto mem_rsp_it = pending_mem_reqs_[b].begin(); auto mem_rsp = *mem_rsp_it; - *m_axi_mem_[i].bvalid = 1; - *m_axi_mem_[i].bid = mem_rsp->tag; - *m_axi_mem_[i].bresp = 0; - pending_mem_reqs_[i].erase(mem_rsp_it); + *m_axi_mem_[b].bvalid = 1; + *m_axi_mem_[b].bid = mem_rsp->tag; + *m_axi_mem_[b].bresp = 0; + pending_mem_reqs_[b].erase(mem_rsp_it); delete mem_rsp; } } // handle read requests - if (*m_axi_mem_[i].arvalid && *m_axi_mem_[i].arready) { + if (*m_axi_mem_[b].arvalid && *m_axi_mem_[b].arready) { auto mem_req = new mem_req_t(); - mem_req->tag = *m_axi_mem_[i].arid; - mem_req->addr = uint64_t(*m_axi_mem_[i].araddr); + mem_req->tag = *m_axi_mem_[b].arid; + mem_req->addr = uint64_t(*m_axi_mem_[b].araddr); ram_->read(mem_req->data.data(), mem_req->addr, PLATFORM_MEMORY_DATA_SIZE); mem_req->write = false; mem_req->ready = false; - pending_mem_reqs_[i].emplace_back(mem_req); + pending_mem_reqs_[b].emplace_back(mem_req); - /*printf("%0ld: [sim] axi-mem-read: bank=%d, addr=0x%lx, tag=0x%x, data=0x", timestamp, i, mem_req->addr, mem_req->tag); + /*printf("%0ld: [sim] axi-mem-read: bank=%d, addr=0x%lx, tag=0x%x, data=0x", timestamp, b, mem_req->addr, mem_req->tag); 
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { - printf("%02x", mem_req->data[i]); + printf("%02x", mem_req->data[i]); } printf("\n");*/ // send dram request - dram_queues_[i].push(mem_req); + dram_queues_[b].push(mem_req); } - if (*m_axi_mem_[i].wready && !m_axi_states_[i].write_req_pending) { - *m_axi_mem_[i].wready = 0; + // handle write address requests + if (*m_axi_mem_[b].awvalid && *m_axi_mem_[b].awready && !m_axi_states_[b].write_req_addr_ack) { + m_axi_states_[b].write_req_addr = *m_axi_mem_[b].awaddr; + m_axi_states_[b].write_req_tag = *m_axi_mem_[b].awid; + m_axi_states_[b].write_req_addr_ack = true; } - // handle address write requestsls - if (*m_axi_mem_[i].awvalid && *m_axi_mem_[i].awready && !*m_axi_mem_[i].wready) { - m_axi_states_[i].write_req_addr = *m_axi_mem_[i].awaddr; - m_axi_states_[i].write_req_tag = *m_axi_mem_[i].awid; - // activate data channel - *m_axi_mem_[i].wready = 1; - m_axi_states_[i].write_req_pending = !*m_axi_mem_[i].wvalid; + // handle write data requests + if (*m_axi_mem_[b].wvalid && *m_axi_mem_[b].wready && !m_axi_states_[b].write_req_data_ack) { + m_axi_states_[b].write_req_byteen = *m_axi_mem_[b].wstrb; + auto data = (const uint8_t*)m_axi_mem_[b].wdata->data(); + for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; ++i) { + m_axi_states_[b].write_req_data[i] = data[i]; + } + m_axi_states_[b].write_req_data_ack = true; } - // handle data write requests - if (*m_axi_mem_[i].wvalid && *m_axi_mem_[i].wready) { - auto byteen = *m_axi_mem_[i].wstrb; - auto data = (uint8_t*)m_axi_mem_[i].wdata->data(); - auto byte_addr = m_axi_states_[i].write_req_addr; - - for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) { + // handle write requests + if (m_axi_states_[b].write_req_addr_ack && m_axi_states_[b].write_req_data_ack) { + auto byteen = m_axi_states_[b].write_req_byteen; + auto byte_addr = m_axi_states_[b].write_req_addr; + for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; ++i) { if ((byteen >> i) & 0x1) { - (*ram_)[byte_addr + i] 
= data[i]; + (*ram_)[byte_addr + i] = m_axi_states_[b].write_req_data[i]; } } - auto mem_req = new mem_req_t(); - mem_req->tag = m_axi_states_[i].write_req_tag; + mem_req->tag = m_axi_states_[b].write_req_tag; mem_req->addr = byte_addr; mem_req->write = true; mem_req->ready = false; - pending_mem_reqs_[i].emplace_back(mem_req); + pending_mem_reqs_[b].emplace_back(mem_req); - /*printf("%0ld: [sim] axi-mem-write: bank=%d, addr=0x%lx, byteen=0x%lx, tag=0x%x, data=0x", timestamp, i, mem_req->addr, byteen, mem_req->tag); + /*printf("%0ld: [sim] axi-mem-write: bank=%d, addr=0x%lx, byteen=0x%lx, tag=0x%x, data=0x", timestamp, b, mem_req->addr, byteen, mem_req->tag); for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { - printf("%02x", data[i]); + printf("%02x", m_axi_states_[b].write_req_data[i]]); } printf("\n");*/ // send dram request - dram_queues_[i].push(mem_req); + dram_queues_[b].push(mem_req); - // deactivate data channel - if (m_axi_states_[i].write_req_pending) { - *m_axi_mem_[i].wready = 0; - m_axi_states_[i].write_req_pending = false; - } + // clear acks + m_axi_states_[b].write_req_addr_ack = false; + m_axi_states_[b].write_req_data_ack = false; } } } typedef struct { + std::array write_req_data; + uint64_t write_req_byteen; uint64_t write_req_addr; uint32_t write_req_tag; - bool write_req_pending; + bool read_rsp_ready; + bool write_rsp_ready; + bool write_req_addr_ack; + bool write_req_data_ack; } m_axi_state_t; typedef struct { diff --git a/tests/regression/dogfood/testcases.h b/tests/regression/dogfood/testcases.h index f5760ec06..f3562bb17 100644 --- a/tests/regression/dogfood/testcases.h +++ b/tests/regression/dogfood/testcases.h @@ -141,7 +141,7 @@ class Test_IADD : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] + b[i]; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i 
<< std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -171,7 +171,7 @@ class Test_IMUL : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] * b[i]; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -201,7 +201,7 @@ class Test_IDIV : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] / b[i]; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -233,7 +233,7 @@ class Test_IDIV_MUL : public ITestCase { auto y = a[i] * b[i]; auto ref = x + y; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -263,7 +263,7 @@ class Test_FADD : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] + b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -293,7 +293,7 @@ class Test_FSUB : public ITestCase { for (uint32_t i = 0; i < n; ++i) 
{ auto ref = a[i] - b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -323,7 +323,7 @@ class Test_FMUL : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] * b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -353,7 +353,7 @@ class Test_FMADD : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] * b[i] + b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -383,7 +383,7 @@ class Test_FMSUB : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] * b[i] - b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -413,7 +413,7 @@ class Test_FNMADD : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = -a[i] * b[i] - b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", 
actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -443,7 +443,7 @@ class Test_FNMSUB : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = -a[i] * b[i] + b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -475,7 +475,7 @@ class Test_FNMADD_MADD : public ITestCase { auto y = a[i] * b[i] + b[i]; auto ref = x + y; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -505,7 +505,7 @@ class Test_FDIV : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = a[i] / b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -537,7 +537,7 @@ class Test_FDIV2 : public ITestCase { auto y = b[i] / a[i]; auto ref = x + y; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << 
c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -568,7 +568,7 @@ class Test_FSQRT : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = sqrt(a[i] * b[i]); if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -600,7 +600,7 @@ class Test_FTOI : public ITestCase { auto x = a[i] + b[i]; auto ref = (int32_t)x; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -632,7 +632,7 @@ class Test_FTOU : public ITestCase { auto x = a[i] + b[i]; auto ref = (uint32_t)x; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -663,7 +663,7 @@ class Test_ITOF : public ITestCase { auto x = a[i] + b[i]; auto ref = (float)x; if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -694,7 +694,7 @@ class Test_UTOF : public ITestCase { auto x = a[i] + b[i]; auto ref = (float)x; if (!almost_equal(c[i], ref)) { - std::cout << 
"error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -724,7 +724,7 @@ class Test_FCLAMP : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = fmin(fmax(1.0f, a[i]), b[i]); if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -754,7 +754,7 @@ class Test_ICLAMP : public ITestCase { for (uint32_t i = 0; i < n; ++i) { auto ref = std::min(std::max(1, a[i]), b[i]); if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -787,7 +787,7 @@ class Test_TRIGO : public ITestCase { ref = sinf(ref); } if (!almost_equal(c[i], ref)) { - std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -820,7 +820,7 @@ class Test_BAR : public ITestCase { for (uint32_t i = 0; i < n; ++i) { uint32_t ref = a[i] + 1; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << 
std::hex << ref << ", actual=" << c[i] << std::endl; ++errors; } } @@ -857,7 +857,7 @@ class Test_GBAR : public ITestCase { for (uint32_t i = 0; i < n; ++i) { uint32_t ref = a[i] + 1; if (c[i] != ref) { - std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl; + std::cout << "error at result #" << std::dec << i << std::hex << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl; ++errors; } }