Iss27 future trends additions #116

Merged
113 changes: 113 additions & 0 deletions contents/hw_acceleration/hw_acceleration.bib

@article{yu2023rl,
author = {Qian, Yu and Zhou, Xuegong and Zhou, Hao and Wang, Lingli},
title = {An Efficient Reinforcement Learning Based Framework for Exploring Logic Synthesis},
year = {2023},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
issn = {1084-4309},
url = {https://doi.org/10.1145/3632174},
doi = {10.1145/3632174},
abstract = {Logic synthesis is a crucial step in electronic design automation tools. The rapid developments of reinforcement learning (RL) have enabled the automated exploration of logic synthesis. Existing RL based methods may lead to data inefficiency, and the exploration approaches for FPGA and ASIC technology mapping in recent works lack the flexibility of the learning process. This work proposes ESE, a reinforcement learning based framework to efficiently learn the logic synthesis process. The framework supports the modeling of logic optimization and technology mapping for FPGA and ASIC. The optimization for the execution time of the synthesis script is also considered. For the modeling of FPGA mapping, the logic optimization and technology mapping are combined to be learned in a flexible way. For the modeling of ASIC mapping, the standard cell based optimization and LUT optimization operations are incorporated into the ASIC synthesis flow. To improve the utilization of samples, the Proximal Policy Optimization model is adopted. Furthermore, the framework is enhanced by supporting MIG based synthesis exploration. Experiments show that for FPGA technology mapping on the VTR benchmark, the average LUT-Level-Product and script runtime are improved by more than 18.3\% and 12.4\% respectively than previous works. For ASIC mapping on the EPFL benchmark, the average Area-Delay-Product is improved by 14.5\%.},
note = {Just Accepted},
journal = {ACM Trans. Des. Autom. Electron. Syst.},
month = {nov},
keywords = {technology mapping, Majority-Inverter Graph, And-Inverter Graph, Reinforcement learning, logic optimization}
}

@inproceedings{zhou2023area,
title = {Area-Driven {FPGA} Logic Synthesis Using Reinforcement Learning},
author = {Zhou, Guanglei and Anderson, Jason H.},
booktitle = {Proceedings of the 28th Asia and South Pacific Design Automation Conference},
pages = {159--165},
year = {2023}
}

@inproceedings{valenzuela2000genetic,
title = {A Genetic Algorithm for {VLSI} Floorplanning},
author = {Valenzuela, Christine L. and Wang, Pearl Y.},
booktitle = {Parallel Problem Solving from Nature ({PPSN} VI)},
pages = {671--680},
year = {2000},
organization = {Springer}
}

@inproceedings{agnesina2023autodmp,
title = {{AutoDMP}: Automated {DREAMPlace}-Based Macro Placement},
author = {Agnesina, Anthony and Rajvanshi, Puranjay and Yang, Tian and Pradipta, Geraldo and Jiao, Austin and Keller, Ben and Khailany, Brucek and Ren, Haoxing},
booktitle = {Proceedings of the 2023 International Symposium on Physical Design},
pages = {149--157},
year = {2023}
}

@article{mirhoseini2021graph,
title = {A Graph Placement Methodology for Fast Chip Design},
author = {Mirhoseini, Azalia and Goldie, Anna and Yazgan, Mustafa and Jiang, Joe Wenjie and Songhori, Ebrahim and Wang, Shen and Lee, Young-Joon and Johnson, Eric and Pathak, Omkar and Nazi, Azade and others},
journal = {Nature},
volume = {594},
number = {7862},
pages = {207--212},
year = {2021},
publisher = {Nature Publishing Group}
}

@inproceedings{kao2020gamma,
title = {{GAMMA}: Automating the {HW} Mapping of {DNN} Models on Accelerators via Genetic Algorithm},
author = {Kao, Sheng-Chun and Krishna, Tushar},
booktitle = {Proceedings of the 39th International Conference on Computer-Aided Design},
pages = {1--9},
year = {2020}
}

@inproceedings{reagen2017case,
title = {A Case for Efficient Accelerator Design Space Exploration via {B}ayesian Optimization},
author = {Reagen, Brandon and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Adolf, Robert and Gelbart, Michael and Whatmough, Paul and Wei, Gu-Yeon and Brooks, David},
booktitle = {2017 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)},
pages = {1--6},
year = {2017},
organization = {IEEE}
}

@inproceedings{bhardwaj2020comprehensive,
title = {A Comprehensive Methodology to Determine Optimal Coherence Interfaces for Many-Accelerator {SoCs}},
author = {Bhardwaj, Kshitij and Havasi, Marton and Yao, Yuan and Brooks, David M. and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Wei, Gu-Yeon},
booktitle = {Proceedings of the ACM/IEEE International Symposium on Low Power Electronics and Design},
pages = {145--150},
year = {2020}
}

@inproceedings{kao2020confuciux,
title = {{ConfuciuX}: Autonomous Hardware Resource Assignment for {DNN} Accelerators Using Reinforcement Learning},
author = {Kao, Sheng-Chun and Jeong, Geonhwa and Krishna, Tushar},
booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
pages = {622--636},
year = {2020},
organization = {IEEE}
}

@misc{krishnan2022multiagent,
title = {Multi-Agent Reinforcement Learning for Microprocessor Design Space Exploration},
author = {Krishnan, Srivatsan and Jaques, Natasha and Omidshafiei, Shayegan and Zhang, Dan and Gur, Izzeddin and Reddi, Vijay Janapa and Faust, Aleksandra},
year = {2022},
eprint = {2211.16385},
archivePrefix = {arXiv},
primaryClass = {cs.AR}
}

@inproceedings{zhangfast,
author = {Zhang, Dan and Huda, Safeen and Songhori, Ebrahim and Prabhu, Kartik and Le, Quoc and Goldie, Anna and Mirhoseini, Azalia},
title = {A Full-Stack Search Technique for Domain Optimized Deep Learning Accelerators},
year = {2022},
isbn = {9781450392051},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3503222.3507767},
doi = {10.1145/3503222.3507767},
abstract = {The rapidly-changing deep learning landscape presents a unique opportunity for building inference accelerators optimized for specific datacenter-scale workloads. We propose Full-stack Accelerator Search Technique (FAST), a hardware accelerator search framework that defines a broad optimization environment covering key design decisions within the hardware-software stack, including hardware datapath, software scheduling, and compiler passes such as operation fusion and tensor padding. In this paper, we analyze bottlenecks in state-of-the-art vision and natural language processing (NLP) models, including EfficientNet and BERT, and use FAST to design accelerators capable of addressing these bottlenecks. FAST-generated accelerators optimized for single workloads improve Perf/TDP by 3.7\texttimes{} on average across all benchmarks compared to TPU-v3. A FAST-generated accelerator optimized for serving a suite of workloads improves Perf/TDP by 2.4\texttimes{} on average compared to TPU-v3. Our return on investment analysis shows that FAST-generated accelerators can potentially be practical for moderate-sized datacenter deployments.},
booktitle = {Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {27--42},
numpages = {16},
keywords = {design space exploration, hardware-software codesign, tensor processing unit, machine learning, operation fusion},
location = {Lausanne, Switzerland},
series = {ASPLOS '22}
}
25 changes: 12 additions & 13 deletions contents/hw_acceleration/hw_acceleration.qmd

## Future Trends

In this chapter, the primary focus has been on the design of specialized hardware optimized for machine learning workloads and algorithms, encompassing the tailored architectures of GPUs and TPUs for neural network training and inference. However, an emerging and exciting research direction is leveraging machine learning to facilitate the hardware design process itself.

The hardware design process involves many complex stages, including specification, high-level modeling, simulation, synthesis, verification, prototyping, and fabrication. Traditionally, much of this process requires extensive human expertise, effort, and time. However, recent advances in machine learning are enabling parts of the hardware design workflow to be automated and enhanced using ML techniques.

Some examples of how ML is transforming hardware design include:

* **Automated circuit synthesis using reinforcement learning:** Rather than hand-crafting transistor-level designs, reinforcement learning agents can learn to connect logic gates and generate circuit layouts automatically, accelerating the time-consuming synthesis process (a minimal sketch follows this list).
* **ML-based hardware simulation and emulation:** Deep neural network models can be trained to predict how a hardware design will perform under different conditions. For instance, deep learning models can be trained to predict the cycle count of a given workload. This enables fast yet accurate simulation compared to traditional RTL simulations.
* **Automated chip floorplanning using ML algorithms:** Chip floorplanning involves optimally placing different components on a die. Evolutionary methods such as genetic algorithms, along with other ML approaches such as reinforcement learning, can be used to explore floorplan options. Compared to manual floorplanning, this can significantly improve both turnaround time and placement quality.
* **ML-driven architecture optimization:** Novel hardware architectures, like those for efficient ML accelerators, can be automatically generated and optimized by searching the architectural design space, and machine learning algorithms can search such large design spaces effectively.
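
To make the first bullet concrete, the sketch below shows a bandit-style reinforcement learning loop that learns which logic-optimization pass to apply at each step of a fixed-length synthesis script. It is a minimal sketch, not a real flow: the pass names echo common ABC commands, and `estimate_lut_count` is a hypothetical stand-in for invoking an actual synthesis and mapping tool, so the numbers are illustrative only.

```python
import random

# Toy action space: logic-optimization passes the agent can schedule.
# In a real flow each pass would invoke a synthesis tool (e.g., ABC).
PASSES = ["balance", "rewrite", "refactor", "resub"]
SEQ_LEN = 5

def estimate_lut_count(sequence):
    """Hypothetical stand-in for running synthesis and technology mapping.

    A real framework would execute the pass sequence in a tool and report
    the post-mapping LUT count; here we fabricate a noisy cost surface.
    """
    cost = 1000
    for i, p in enumerate(sequence):
        cost -= 40 if (i % 2 == 0) == (p == "balance") else 5
    return cost + random.randint(-10, 10)

# Value estimates for choosing pass p at schedule position i.
q = {(i, p): 0.0 for i in range(SEQ_LEN) for p in PASSES}
epsilon, alpha = 0.2, 0.1  # exploration rate, learning rate

for episode in range(2000):
    # Epsilon-greedy construction of a candidate pass schedule.
    seq = []
    for i in range(SEQ_LEN):
        if random.random() < epsilon:
            seq.append(random.choice(PASSES))
        else:
            seq.append(max(PASSES, key=lambda p: q[(i, p)]))
    reward = -estimate_lut_count(seq)      # fewer LUTs -> higher reward
    for i, p in enumerate(seq):            # bandit-style credit assignment
        q[(i, p)] += alpha * (reward - q[(i, p)])

best = [max(PASSES, key=lambda p: q[(i, p)]) for i in range(SEQ_LEN)]
print("Learned pass schedule:", best)
```

Production frameworks such as ESE [@yu2023rl] replace the tabular values with learned policies (e.g., Proximal Policy Optimization) and score candidates with real tool runs, but the explore-evaluate-update loop has the same shape.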

Applying ML to hardware design automation holds enormous promise to make the process faster, cheaper, and more efficient. It opens up design possibilities that would be extremely difficult to reach through manual effort alone. The use of ML in hardware design is an area of active research and early deployment, and we will study the techniques involved and their transformative potential.

### ML for Hardware Design Automation

A major opportunity for machine learning in hardware design is automating parts of the complex and tedious design workflow. Hardware design automation (HDA) broadly refers to using ML techniques like reinforcement learning, genetic algorithms, and neural networks to automate tasks like synthesis, verification, floorplanning, and more. A few examples of where ML for HDA shows real promise:

* **Automated circuit synthesis:** Circuit synthesis involves converting a high-level description of desired logic into an optimized gate-level netlist implementation. This complex process has many design considerations and tradeoffs. ML agents can be trained through reinforcement learning [@yu2023rl; @zhou2023area] to explore the design space and output optimized synthesis results automatically. Startups like [Symbiotic EDA](https://www.symbioticeda.com/) are bringing this technology to market.
* **Automated chip floorplanning:** Floorplanning refers to strategically placing different components on a chip die area. Search algorithms such as genetic algorithms [@valenzuela2000genetic] and reinforcement learning [@mirhoseini2021graph; @agnesina2023autodmp] can automate floorplan optimization to minimize wire length, power consumption, and other objectives (a minimal sketch follows this list). These automated, ML-assisted floorplanners become increasingly valuable as chip complexity grows.
* **ML hardware simulators:** Deep neural network models trained to predict how hardware designs will perform can accelerate the simulation process by over 100x compared to traditional architectural and RTL simulations.
* **Automated code translation:** Translating high-level design descriptions into optimized RTL implementations in hardware description languages like Verilog is critical but time-consuming. ML models can be trained to act as translator agents and automate parts of this process.
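
To illustrate the floorplanning bullet above, here is a minimal mutation-only genetic algorithm that evolves placements of unit-sized blocks on a small grid, scoring candidates by total Manhattan wirelength over a fixed netlist. The block count, grid size, and netlist are invented for illustration; a real floorplanner must also handle rectangular macros, overlap constraints, and multi-objective costs.

```python
import random

# Toy floorplanning: place unit blocks on a 2D grid so that connected
# blocks end up close together (minimizing total Manhattan wirelength).
BLOCKS = 8
GRID = 6
NETS = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (0, 7)]

def random_placement():
    cells = random.sample(range(GRID * GRID), BLOCKS)  # distinct cells
    return [(c // GRID, c % GRID) for c in cells]

def wirelength(placement):
    total = 0
    for a, b in NETS:
        (xa, ya), (xb, yb) = placement[a], placement[b]
        total += abs(xa - xb) + abs(ya - yb)
    return total

def mutate(placement):
    # Move one randomly chosen block to a random unoccupied cell.
    child = list(placement)
    i = random.randrange(BLOCKS)
    used = set(child)
    free = [(r, c) for r in range(GRID) for c in range(GRID)
            if (r, c) not in used]
    child[i] = random.choice(free)
    return child

population = [random_placement() for _ in range(50)]
for generation in range(200):
    population.sort(key=wirelength)        # fittest (shortest wires) first
    survivors = population[:10]
    population = survivors + [mutate(random.choice(survivors))
                              for _ in range(40)]

best = min(population, key=wirelength)
print("Best wirelength:", wirelength(best))
```

RL-based placers such as the graph placement method of @mirhoseini2021graph swap the random mutation for a learned policy, but wirelength-style objectives play the same scoring role.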

The benefits of HDA using ML are reduced design time, superior optimizations, and exploration of design spaces too complex for manual approaches. This can accelerate hardware development and lead to better designs.

Simulating and verifying hardware designs is critical before manufacturing to ensure the design behaves as intended. Traditional approaches like register-transfer level (RTL) simulation are complex and time-consuming. ML introduces new opportunities to enhance hardware simulation and verification. Some examples include:

* **Surrogate modeling for simulation:** Highly accurate surrogate models of a design can be built using neural networks. These models predict outputs from inputs much faster than RTL simulation, enabling fast design space exploration (a minimal sketch follows this list). Companies like Ansys use this technique.
* **ML simulators:** Large neural network models can be trained on RTL simulations to learn to mimic the functionality of a hardware design. Once trained, the NN model can act as a highly efficient simulator to use for regression testing and other tasks. [Graphcore](https://www.graphcore.ai/posts/ai-for-simulation-how-graphcore-is-helping-transform-traditional-hpc) has demonstrated over 100x speedup with this approach.
* **Formal verification using ML:** Formal verification mathematically proves properties about a design. ML techniques can help generate verification properties and can learn to solve the complex formal proofs needed. This automates parts of this challenging process. Startups like Cortical.io are bringing ML formal verification solutions to market.
* **Bug detection:** ML models can be trained to process hardware designs and identify potential issues. This assists human designers in inspecting complex designs and finding bugs. Facebook has shown bug detection models for their server hardware.
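
As a concrete sketch of the surrogate-modeling bullet above, the code below fits a fast surrogate to a small set of expensive simulations. Everything here is assumed for illustration: `run_rtl_simulation` is a hypothetical stand-in for a real RTL or architectural simulator, the three design parameters are invented, and a random forest (via scikit-learn) stands in for the neural-network surrogates used in practice.

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)

def run_rtl_simulation(params):
    """Hypothetical stand-in for a slow RTL simulation.

    params = [cache_kb, issue_width, pipeline_depth]; returns cycles.
    A real flow would invoke the simulator here (minutes to hours each).
    """
    cache_kb, issue_width, depth = params
    return (1e6 / (np.log2(cache_kb) * issue_width)
            + 500 * depth + rng.normal(0, 50))

# Run the expensive simulator on a modest sample of design points...
X = np.column_stack([
    rng.choice([16, 32, 64, 128, 256], size=200),  # cache size (KB)
    rng.integers(1, 9, size=200),                  # issue width
    rng.integers(5, 21, size=200),                 # pipeline depth
])
y = np.array([run_rtl_simulation(x) for x in X])

# ...then fit a surrogate that predicts cycle counts in microseconds.
surrogate = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

candidate = np.array([[128, 4, 10]])
print("Predicted cycles:", surrogate.predict(candidate)[0])
```

Once fitted, the surrogate answers what-if queries almost instantly, so a design space explorer can sweep thousands of candidates and reserve the true simulator for verifying the few winners.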


Designing hardware architectures optimized for performance, power, and efficiency is a key goal. ML introduces new techniques to automate and enhance architecture design space exploration for both general-purpose and specialized hardware like ML accelerators. Some promising examples include:

* **Architecture search for hardware:** Search techniques like evolutionary algorithms [@kao2020gamma], Bayesian optimization [@reagen2017case; @bhardwaj2020comprehensive], and reinforcement learning [@kao2020confuciux; @krishnan2022multiagent] can automatically generate novel hardware architectures by mutating and mixing design attributes like cache size, number of parallel units, and memory bandwidth. This allows efficient navigation of large design spaces (a minimal sketch follows this list).
* **Predictive modeling for optimization:** ML models can be trained to predict hardware performance, power, and efficiency metrics for a given architecture. These become "surrogate models" [@krishnan2023archgym] for fast optimization and design space exploration by substituting for lengthy simulations.
* **Specialized accelerator optimization:** For specialized chips like tensor processing units for AI, automated architecture search techniques based on ML algorithms [@zhangfast] show promise for finding fast, efficient designs.
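
As a sketch of the architecture-search bullet above, the following evolutionary loop searches a toy accelerator design space (PE array shape and on-chip buffer size). The `cost_model` is a hypothetical analytical stand-in for a real simulator or measured silicon, and the parameter ranges are invented; frameworks like GAMMA [@kao2020gamma] apply the same evolve-evaluate-select pattern with real mappings and cost models.

```python
import random

# Toy accelerator design space: PE array dims and on-chip buffer size.
SPACE = {
    "pe_rows": [4, 8, 16, 32],
    "pe_cols": [4, 8, 16, 32],
    "buffer_kb": [64, 128, 256, 512],
}

def cost_model(cfg):
    """Hypothetical analytical cost (stand-in for a real simulator).

    Rewards compute throughput but penalizes the energy of large arrays
    and buffers; returns a lower-is-better energy-delay-like score.
    """
    pes = cfg["pe_rows"] * cfg["pe_cols"]
    delay = 1e9 / pes
    stall_penalty = 2e8 / cfg["buffer_kb"]  # small buffers starve the array
    energy = 0.5 * pes + 0.1 * cfg["buffer_kb"]
    return (delay + stall_penalty) * energy

def random_cfg():
    return {k: random.choice(v) for k, v in SPACE.items()}

def mutate(cfg):
    # Resample one randomly chosen design attribute.
    child = dict(cfg)
    k = random.choice(list(SPACE))
    child[k] = random.choice(SPACE[k])
    return child

population = [random_cfg() for _ in range(20)]
for generation in range(100):
    population.sort(key=cost_model)         # best (lowest cost) first
    parents = population[:5]
    population = parents + [mutate(random.choice(parents))
                            for _ in range(15)]

print("Best config:", min(population, key=cost_model))
```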

The benefits of using ML include superior design space exploration, automated optimization, and reduced manual effort. Challenges include long training times for some techniques and susceptibility to local optima. Even so, ML for hardware architecture holds great potential for unlocking performance and efficiency gains.
