diff --git a/.circleci/test.yml b/.circleci/test.yml
index 7fe2dd71ce..dc02cb4ffa 100644
--- a/.circleci/test.yml
+++ b/.circleci/test.yml
@@ -134,6 +134,57 @@ jobs:
         command: |
           docker exec mmengine python -m pytest tests/
+  build_integration_test:
+    parameters:
+      torch:
+        type: string
+      cuda:
+        type: string
+      cudnn:
+        type: integer
+        default: 7
+    machine:
+      image: ubuntu-2004-cuda-11.4:202110-01
+      docker_layer_caching: true
+    resource_class: gpu.nvidia.small
+    steps:
+      - checkout
+      - run:
+          name: Build Docker image
+          command: |
+            docker build .circleci/docker -t mmengine:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
+            docker run --gpus all -t -d -v /home/circleci/project:/mmengine -w /mmengine --name mmengine mmengine:gpu
+      - run:
+          name: Build MMEngine from source
+          command: |
+            docker exec mmengine pip install -e . -v
+      - run:
+          name: Install unit test dependencies
+          command: |
+            docker exec mmengine pip install -r requirements/tests.txt
+            docker exec mmengine pip install openmim
+            docker exec mmengine mim install 'mmcv>=2.0.0rc1'
+      - run:
+          name: Install downstream repositories
+          command: |
+            docker exec mmengine mim install 'mmdet>=3.0.0rc0'
+      - run:
+          name: Run integration tests
+          command: |
+            docker exec mmengine pytest tests/test_infer/test_infer.py
+      - run:
+          name: Install downstream repositories from source
+          # TODO: Switch to master branch
+          command: |
+            docker exec mmengine pip uninstall mmdet -y
+            docker exec mmengine apt install git -y
+            docker exec mmengine mkdir downstream_repos
+            docker exec mmengine git clone -b 3.x https://github.com/open-mmlab/mmdetection.git ./downstream_repos/mmdetection
+      - run:
+          name: Run inferencer tests
+          command: |
+            docker exec -e PYTHONPATH=./downstream_repos/mmdetection mmengine pytest tests/test_infer/test_infer.py
+
 workflows:
   pr_stage_lint:
     when: << pipeline.parameters.lint_only >>
@@ -173,10 +224,20 @@ workflows:
           python: 3.9.0
           requires:
             - minimum_version_cpu
+      - hold_integration_test:
+          type: approval
+          requires:
+            - lint
+      - build_integration_test:
+          name: integration_test
+          torch: 1.8.1
+          cuda: "10.2"
+          requires:
+            - hold_integration_test
       - hold:
           type: approval
           requires:
-            - maximum_version_cpu
+            - lint
       - build_cuda:
           name: mainstream_version_gpu
           torch: 1.8.1
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000..9c0b5acd9f
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - name: "MMEngine Contributors"
+title: "OpenMMLab Foundational Library for Training Deep Learning Models"
+date-released: 2022-09-01
+url: "https://github.com/open-mmlab/mmengine"
+license: Apache-2.0
diff --git a/README.md b/README.md
index 2e1815b7a6..5c69c8eb98 100644
--- a/README.md
+++ b/README.md
@@ -74,16 +74,14 @@ Major features:

 ## What's New

-v0.6.0 was released on 2023-02-24.
+v0.7.0 was released on 2023-03-16.

 Highlights:

-- Support `Apex` with `ApexOptimWrapper`
-- Support analyzing model complexity
-- Add `Lion` optimizer
-- Support using environment variables in the config file
+- Support PyTorch 2.0! Accelerate training by compiling models. See the tutorial [Model Compilation](https://mmengine.readthedocs.io/en/latest/common_usage/speed_up_training.html#model-compilation) for details
+- Add `EarlyStoppingHook` to stop training when the metric does not improve

-Read [Changelog](./docs/en/notes/changelog.md#v060-02242023) for more details.
+Read [Changelog](./docs/en/notes/changelog.md#v070-03162023) for more details.

 ## Installation

@@ -308,6 +306,19 @@ runner.train()

 We appreciate all contributions to improve MMEngine. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline.

+## Citation
+
+If you find this project useful in your research, please consider citing:
+
+```
+@article{mmengine2022,
+  title = {{MMEngine}: OpenMMLab Foundational Library for Training Deep Learning Models},
+  author = {MMEngine Contributors},
+  howpublished = {\url{https://github.com/open-mmlab/mmengine}},
+  year={2022}
+}
+```
+
 ## License

 This project is released under the [Apache 2.0 license](LICENSE).
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 1d31a17f95..f672ae25aa 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -60,16 +60,14 @@ MMEngine 是一个基于 PyTorch 实现的,用于训练深度学习模型的

 ## 最近进展

-最新版本 v0.6.0 在 2023.02.24 发布。
+最新版本 v0.7.0 在 2023.03.16 发布。

 亮点:

-- 新增 `ApexOptimWrapper` 支持 `Apex` 的混合精度训练功能
-- 支持计算模型复杂度
-- 新增 Lion 优化器
-- 支持在配置文件使用环境变量
+- 支持 PyTorch 2.0!通过编译模型实现训练加速,参考[编译模型文档](https://mmengine.readthedocs.io/en/latest/common_usage/speed_up_training.html#model-compilation)抢先体验
+- 新增 `EarlyStoppingHook`,当监控的指标不再提升时,自动停止训练

-如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v060-02242023)
+如果想了解更多版本更新细节和历史信息,请阅读[更新日志](./docs/en/notes/changelog.md#v070-03162023)

 ## 安装

@@ -294,6 +292,19 @@ runner.train()

 我们感谢所有的贡献者为改进和提升 MMEngine 所作出的努力。请参考[贡献指南](CONTRIBUTING_zh-CN.md)来了解参与项目贡献的相关指引。

+## 引用
+
+如果您觉得 MMEngine 对您的研究有所帮助,请考虑引用它:
+
+```
+@article{mmengine2022,
+  title = {{MMEngine}: OpenMMLab Foundational Library for Training Deep Learning Models},
+  author = {MMEngine Contributors},
+  howpublished = {\url{https://github.com/open-mmlab/mmengine}},
+  year={2022}
+}
+```
+
 ## 开源许可证

 该项目采用 [Apache 2.0 license](LICENSE) 开源许可证。
diff --git a/docs/en/advanced_tutorials/model_analysis.md b/docs/en/advanced_tutorials/model_analysis.md
index 77d436765e..767fc18871 100644
--- a/docs/en/advanced_tutorials/model_analysis.md
+++ b/docs/en/advanced_tutorials/model_analysis.md
@@ -1,29 +1,36 @@
 # Model Complexity Analysis

-We provide a tool to help with the complexity analysis for the network. We borrow the idea from the implementation of [fvcore](https://github.com/facebookresearch/fvcore) to build this tool, and plan to support more custom operators in the future. Currently, it provides the interfaces to compute "parameter", "activation" and "flops" of the given model, and supports printing the related information layer-by-layer in terms of network structure or table. The analysis tool provides both operator-level and module-level flop counts simultaneously. Please refer to [Flop Count](https://github.com/facebookresearch/fvcore/blob/main/docs/flop_count.md) for implementation details of how to accurately measure the flops of one operator if interested.
+We provide a tool to help with the complexity analysis for the network. We borrow the idea from the implementation of [fvcore](https://github.com/facebookresearch/fvcore) to build this tool, and plan to support more custom operators in the future. Currently, it provides the interfaces to compute the "FLOPs", "Activations" and "Parameters" of the given model, and supports printing the related information layer-by-layer in terms of network structure or table. The analysis tool provides both operator-level and module-level flop counts simultaneously.
Please refer to [Flop Count](https://github.com/facebookresearch/fvcore/blob/main/docs/flop_count.md) for implementation details of how to accurately measure the flops of one operator if interested. -## What's FLOPs +## Definition -Flop is not a well-defined metric in complexity analysis, we follow [detectron2](https://detectron2.readthedocs.io/en/latest/modules/fvcore.html#fvcore.nn.FlopCountAnalysis) to use one fused multiple-add as one flop. +The model complexity has three indicators, namely floating-point operations (FLOPs), activations, and parameters. Their definitions are as follows: -## What's Activation +- FLOPs -Activation is used to measure the feature quantity produced from one layer. + Floating-point operations (FLOPs) is not a clearly defined indicator. Here, we refer to the description in [detectron2](https://detectron2.readthedocs.io/en/latest/modules/fvcore.html#fvcore.nn.FlopCountAnalysis), which defines a set of multiply-accumulate operations as 1 FLOP. -For example, given the inputs with shape `inputs = torch.randn((1, 3, 10, 10))`, and one linear layer with `conv = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=1)`. +- Activations -We get the `output` with shape `(1, 10, 10, 10)` after feeding the `inputs` into `conv`. The activation quantity of `output` of this `conv` layer is `1000=10*10*10` + Activation is used to measure the feature quantity produced from one layer. -Let's start with the following examples. +- Parameters -## Usage Example 1: Model built with native nn.Module + The parameter count of a model. -### Code +For example, given an input size of `inputs = torch.randn((1, 3, 10, 10))` and a convolutional layer `conv = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)`, if the output feature map size is `(1, 10, 8, 8)`, then its FLOPs are `17280 = 10*8*8*3*3*3` (where `10*8*8` represents the output feature map size, and `3*3*3` represents the computation for each output), activations are `640 = 10*8*8`, and the parameter count is `280 = 3*10*3*3 + 10` (where `3*10*3*3` represents the size of weights, and 10 represents the size of bias). 
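+
+As a quick sanity check, the numbers above can be reproduced with the `get_model_complexity_info` interface introduced below. This is a minimal sketch; the expected values in the comments come directly from the worked example:
+
+```python
+from torch import nn
+
+from mmengine.analysis import get_model_complexity_info
+
+conv = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)
+# a batch dimension of 1 is prepended automatically, so pass (3, 10, 10)
+analysis_results = get_model_complexity_info(conv, input_shape=(3, 10, 10))
+print(analysis_results['flops'])        # 17280 = 10*8*8*3*3*3
+print(analysis_results['params'])       # 280 = 3*10*3*3 + 10
+print(analysis_results['activations'])  # 640 = 10*8*8
+```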
+ +## Usage + +### Model built with native nn.Module + +Build a model ```python -import torch from torch import nn from mmengine.analysis import get_model_complexity_info + + # return a dict of analysis results, including: # ['flops', 'flops_str', 'activations', 'activations_str', 'params', 'params_str', 'out_table', 'out_arch'] @@ -49,17 +56,9 @@ input_shape = (1, 10) model = TestNet() analysis_results = get_model_complexity_info(model, input_shape) - -print(analysis_results['out_table']) -print(analysis_results['out_arch']) - -print("Model Flops:{}".format(analysis_results['flops_str'])) -print("Model Parameters:{}".format(analysis_results['params_str'])) ``` -### Description of Results - -The return outputs is dict, which contains the following keys: +The `analysis_results` returned by `get_model_complexity_info` is a dict, which contains the following keys: - `flops`: number of total flops, e.g., 10000, 10000 - `flops_str`: with formatted string, e.g., 1.0G, 100M @@ -69,42 +68,44 @@ The return outputs is dict, which contains the following keys: - `activations_str`: with formatted string, e.g., 1.0G, 100M - `out_table`: print related information by table -``` -+---------------------+----------------------+--------+--------------+ -| module | #parameters or shape | #flops | #activations | -+---------------------+----------------------+--------+--------------+ -| model | 0.44K | 0.4K | 40 | -| fc1 | 0.11K | 100 | 10 | -| fc1.weight | (10, 10) | | | -| fc1.bias | (10,) | | | -| fc2 | 0.11K | 100 | 10 | -| fc2.weight | (10, 10) | | | -| fc2.bias | (10,) | | | -| inner | 0.22K | 0.2K | 20 | -| inner.fc1 | 0.11K | 100 | 10 | -| inner.fc1.weight | (10, 10) | | | -| inner.fc1.bias | (10,) | | | -| inner.fc2 | 0.11K | 100 | 10 | -| inner.fc2.weight | (10, 10) | | | -| inner.fc2.bias | (10,) | | | -+---------------------+----------------------+--------+--------------+ -``` - -- `out_arch`: print related information by network layers - -```bash -TestNet( - #params: 0.44K, #flops: 0.4K, #acts: 40 - (fc1): Linear( - in_features=10, out_features=10, bias=True - #params: 0.11K, #flops: 100, #acts: 10 - ) - (fc2): Linear( - in_features=10, out_features=10, bias=True - #params: 0.11K, #flops: 100, #acts: 10 - ) - (inner): InnerNet( - #params: 0.22K, #flops: 0.2K, #acts: 20 +Print the results + +- print related information by table + + ```python + print(analysis_results['out_table']) + ``` + + ```text + +---------------------+----------------------+--------+--------------+ + | module | #parameters or shape | #flops | #activations | + +---------------------+----------------------+--------+--------------+ + | model | 0.44K | 0.4K | 40 | + | fc1 | 0.11K | 100 | 10 | + | fc1.weight | (10, 10) | | | + | fc1.bias | (10,) | | | + | fc2 | 0.11K | 100 | 10 | + | fc2.weight | (10, 10) | | | + | fc2.bias | (10,) | | | + | inner | 0.22K | 0.2K | 20 | + | inner.fc1 | 0.11K | 100 | 10 | + | inner.fc1.weight | (10, 10) | | | + | inner.fc1.bias | (10,) | | | + | inner.fc2 | 0.11K | 100 | 10 | + | inner.fc2.weight | (10, 10) | | | + | inner.fc2.bias | (10,) | | | + +---------------------+----------------------+--------+--------------+ + ``` + +- print related information by network layers + + ```python + print(analysis_results['out_arch']) + ``` + + ```bash + TestNet( + #params: 0.44K, #flops: 0.4K, #acts: 40 (fc1): Linear( in_features=10, out_features=10, bias=True #params: 0.11K, #flops: 100, #acts: 10 @@ -113,13 +114,30 @@ TestNet( in_features=10, out_features=10, bias=True #params: 0.11K, #flops: 100, #acts: 10 ) + 
   (inner): InnerNet(
+      #params: 0.22K, #flops: 0.2K, #acts: 20
+      (fc1): Linear(
+        in_features=10, out_features=10, bias=True
+        #params: 0.11K, #flops: 100, #acts: 10
+      )
+      (fc2): Linear(
+        in_features=10, out_features=10, bias=True
+        #params: 0.11K, #flops: 100, #acts: 10
+      )
+    )
 )
-)
-```
+  ```

-## Usage Example 2: Model built with mmengine
+- print results with formatted string

-### Code
+  ```python
+  print("Model Flops:{}".format(analysis_results['flops_str']))
+  # Model Flops:0.4K
+  print("Model Parameters:{}".format(analysis_results['params_str']))
+  # Model Parameters:0.44K
+  ```
+
+### Model built with mmengine

 ```python
 import torch.nn.functional as F
@@ -148,16 +166,10 @@ model = MMResNet50()

 analysis_results = get_model_complexity_info(model, input_shape)

-
 print("Model Flops:{}".format(analysis_results['flops_str']))
+# Model Flops:4.145G
 print("Model Parameters:{}".format(analysis_results['params_str']))
-```
-
-### Output
-
-```bash
-Model Flops:4.145G
-Model Parameters:25.557M
+# Model Parameters:25.557M
 ```

 ## Interface

@@ -168,4 +180,4 @@ We provide more options to support custom output

 - `input_shape`: (tuple) the shape of the input, e.g., (3, 224, 224)
 - `inputs`: (optional: torch.Tensor), if given, `input_shape` will be ignored
 - `show_table`: (bool) whether return the statistics in the form of table, default: True
-- `show_arch`: (bool) whether return the statistics in the form of table, default: True
+- `show_arch`: (bool) whether to return the statistics by network layers, default: True
diff --git a/docs/en/advanced_tutorials/registry.md b/docs/en/advanced_tutorials/registry.md
index 76cc9f9091..784ca50607 100644
--- a/docs/en/advanced_tutorials/registry.md
+++ b/docs/en/advanced_tutorials/registry.md
@@ -169,7 +169,7 @@ func_res = FUNCTION.build(func_cfg)

 The registry in MMEngine supports hierarchical registration, which enables cross-project calls, meaning that modules from one project can be used in another project. Though there are other ways to implement this, the registry provides a much easier solution.

-To easily make cross-library calls, MMEngine provides twenty root registries, including:
+To easily make cross-library calls, MMEngine provides twenty-two root registries, including:

 - RUNNERS: the registry for Runner.
 - RUNNER_CONSTRUCTORS: the constructors for Runner.
@@ -191,6 +191,8 @@ To easily make cross-library calls, MMEngine provides twenty root registries, in
 - VISUALIZERS: the management drawing module that draws prediction boxes on images, such as `DetVisualizer`.
 - VISBACKENDS: the backend for storing training logs, such as `LocalVisBackend`, and `TensorboardVisBackend`.
 - LOG_PROCESSORS: controls the log statistics window and statistics methods, by default we use `LogProcessor`. You may customize `LogProcessor` if you have special needs.
+- FUNCTIONS: registers various functions, such as `collate_fn` in `DataLoader` (see the sketch after this list).
+- INFERENCERS: registers inferencers of different tasks, such as `DetInferencer`, which is used to perform inference on the detection task.
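+
+As a minimal sketch (the function name `my_collate_func` is only an illustration), a callable can be registered to the root `FUNCTIONS` registry and then referenced by name from a dataloader config, e.g. `collate_fn=dict(type='my_collate_func')`:
+
+```python
+from typing import Any, Sequence
+
+from mmengine.registry import FUNCTIONS
+
+
+@FUNCTIONS.register_module()
+def my_collate_func(data_batch: Sequence) -> Any:
+    # a pass-through collate function, similar in spirit to pseudo_collate
+    return data_batch
+```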
 ### Use the module of the parent node
diff --git a/docs/en/common_usage/set_random_seed.md b/docs/en/common_usage/set_random_seed.md
new file mode 100644
index 0000000000..5b2a1749eb
--- /dev/null
+++ b/docs/en/common_usage/set_random_seed.md
@@ -0,0 +1,35 @@
+# How to Set Random Seed
+
+As described in [PyTorch REPRODUCIBILITY](https://pytorch.org/docs/stable/notes/randomness.html), there are two factors affecting the reproducibility of an experiment, namely random numbers and nondeterministic algorithms.
+
+MMEngine provides the functionality to set the random seed and select a deterministic algorithm. Users can simply set the `randomness` argument of the `Runner`. The argument is eventually consumed in [set_random_seed](mmengine.runner.set_random_seed) and it has the following three fields:
+
+- seed (int): The random seed. If this argument is not set, a random number will be used.
+- diff_rank_seed (bool): Whether to set different seeds for different processes by adding the `rank` (process index) to the seed.
+- deterministic (bool): Whether to set deterministic options for the CUDNN backend.
+
+Let's take the [Get Started in 15 Minutes](../get_started/15_minutes.md) as an example to demonstrate how to set `randomness` in MMEngine.
+
+```python
+runner = Runner(
+    model=MMResNet50(),
+    work_dir='./work_dir',
+    train_dataloader=train_dataloader,
+    optim_wrapper=dict(optimizer=dict(type=SGD, lr=0.001, momentum=0.9)),
+    train_cfg=dict(by_epoch=True, max_epochs=5, val_interval=1),
+    val_dataloader=val_dataloader,
+    val_cfg=dict(),
+    val_evaluator=dict(type=Accuracy),
+    # adding randomness setting
+    randomness=dict(seed=0),
+)
+runner.train()
+```
+
+However, there may still be some differences between two experiments, even with the random seed set and a deterministic algorithm chosen. The core reason is that atomic operations in CUDA are unordered and random during parallel training.
+
+The CUDA implementations of some operators sometimes inevitably perform atomic operations, such as addition, subtraction, multiplication, and division, on the same memory address multiple times in different CUDA kernels. In particular, the use of `atomicAdd` is very common during the `backward` pass. These atomic operations are unordered when computed, so when the same memory address is updated multiple times, say by adding multiple gradients to the same address, the order in which they are added is uncertain, even though the numbers themselves are identical.
+
+The randomness of the summing order leads to another problem: since the summed values are generally floating-point numbers, which suffer from precision loss, the final results will differ slightly.
+
+Therefore, by setting the random seed and setting `deterministic` to `True`, we can ensure that the initialization weights, and even the forward outputs and loss values of the model, are identical across experiments. However, subtle differences may appear after one back-propagation, and the final performance of the trained models will differ slightly.
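+
+If you want every knob at once, a minimal sketch based on the example above is to set all three fields together (note that `deterministic=True` may slow training down):
+
+```python
+runner = Runner(
+    model=MMResNet50(),
+    work_dir='./work_dir',
+    train_dataloader=train_dataloader,
+    optim_wrapper=dict(optimizer=dict(type=SGD, lr=0.001, momentum=0.9)),
+    train_cfg=dict(by_epoch=True, max_epochs=5, val_interval=1),
+    val_dataloader=val_dataloader,
+    val_cfg=dict(),
+    val_evaluator=dict(type=Accuracy),
+    # fix the seed, give each rank its own seed, and prefer
+    # deterministic CUDNN kernels
+    randomness=dict(seed=0, diff_rank_seed=True, deterministic=True),
+)
+```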
diff --git a/docs/en/common_usage/speed_up_training.md b/docs/en/common_usage/speed_up_training.md
index 96b9e00e85..85264c510f 100644
--- a/docs/en/common_usage/speed_up_training.md
+++ b/docs/en/common_usage/speed_up_training.md
@@ -84,3 +84,32 @@ runner.train()
 ```{warning}
 Up till PyTorch 1.13, `torch.bfloat16` performance on `Convolution` is bad unless manually set environment variable `TORCH_CUDNN_V8_API_ENABLED=1`. More context at [PyTorch issue](https://github.com/pytorch/pytorch/issues/57707#issuecomment-1166656767)
 ```
+
+## Model Compilation
+
+PyTorch introduced [torch.compile](https://pytorch.org/docs/2.0/dynamo/get-started.html) in its 2.0 release. It compiles your model to speed up training and validation. This feature can be enabled since MMEngine v0.7.0 by passing an extra `cfg` dict with the `compile` keyword to `Runner`:
+
+```python
+runner = Runner(
+    model=ResNet18(),
+    ...  # other arguments you want
+    cfg=dict(compile=True)
+)
+```
+
+For advanced usage, you can also change compile options as illustrated in [torch.compile API Documentation](https://pytorch.org/docs/2.0/generated/torch.compile.html#torch-compile). For example:
+
+```python
+compile_options = dict(backend='inductor', mode='max-autotune')
+runner = Runner(
+    model=ResNet18(),
+    ...  # other arguments you want
+    cfg=dict(compile=compile_options)
+)
+```
+
+This feature is only available for PyTorch >= 2.0.0.
+
+```{warning}
+`torch.compile` is still under development by the PyTorch team. Some models may fail compilation. If you encounter errors during compilation, you can refer to [PyTorch Dynamo FAQ](https://pytorch.org/docs/2.0/dynamo/faq.html) for quick fixes, or to [TorchDynamo Troubleshooting](https://pytorch.org/docs/2.0/dynamo/troubleshooting.html) to post an issue in PyTorch.
+```
diff --git a/docs/en/design/logging.md b/docs/en/design/logging.md
index 0029d5d49b..8110ef8579 100644
--- a/docs/en/design/logging.md
+++ b/docs/en/design/logging.md
@@ -402,27 +402,29 @@ Since distributed applications will create multiple log files, we add a director

 ### Export logs in distributed training

-When training with pytorch distributed methods, users can set `distributed=True` in config file to export multiple logs from all processes. If not specified, only master process will export log file.
+When training with PyTorch distributed methods, users can set `distributed=True` or `log_level='DEBUG'` in the config file to export multiple logs from all processes. If not specified, only the master process will export a log file.

 ```python
 logger = MMLogger.get_instance('mmengine', log_file='tmp.log', distributed=True, log_level='INFO')
+# or
+# logger = MMLogger.get_instance('mmengine', log_file='tmp.log', log_level='DEBUG')
 ```

 In the case of multiple processes in a single node, or multiple processes in multiple nodes with shared storage, the exported log files have the following hierarchy

 ```text
 # shared storage case
-./tmp
-├── tmp.log
-├── tmp_rank1.log
-├── tmp_rank2.log
-├── tmp_rank3.log
-├── tmp_rank4.log
-├── tmp_rank5.log
-├── tmp_rank6.log
-└── tmp_rank7.log
+work_dir/20230228_141908
+├── 20230306_183634_${hostname}_device0_rank0.log
+├── 20230306_183634_${hostname}_device1_rank1.log
+├── 20230306_183634_${hostname}_device2_rank2.log
+├── 20230306_183634_${hostname}_device3_rank3.log
+├── 20230306_183634_${hostname}_device4_rank4.log
+├── 20230306_183634_${hostname}_device5_rank5.log
+├── 20230306_183634_${hostname}_device6_rank6.log
+├── 20230306_183634_${hostname}_device7_rank7.log
 ...
-└── tmp_rank63.log +├── 20230306_183634_${hostname}_device7_rank63.log ``` In the case of multiple processes in multiple nodes without storage, logs are organized as follows @@ -430,21 +432,24 @@ In the case of multiple processes in multiple nodes without storage, logs are or ```text # without shared storage # node 0: -work_dir/ -└── exp_name_logs - ├── exp_name.log - ├── exp_name_rank1.log - ├── exp_name_rank2.log - ├── exp_name_rank3.log - ... - └── exp_name_rank7.log +work_dir/20230228_141908 +├── 20230306_183634_${hostname}_device0_rank0.log +├── 20230306_183634_${hostname}_device1_rank1.log +├── 20230306_183634_${hostname}_device2_rank2.log +├── 20230306_183634_${hostname}_device3_rank3.log +├── 20230306_183634_${hostname}_device4_rank4.log +├── 20230306_183634_${hostname}_device5_rank5.log +├── 20230306_183634_${hostname}_device6_rank6.log +├── 20230306_183634_${hostname}_device7_rank7.log # node 7: -work_dir/ -└── exp_name_logs - ├── exp_name_rank56.log - ├── exp_name_rank57.log - ├── exp_name_rank58.log - ... - └── exp_name_rank63.log +work_dir/20230228_141908 +├── 20230306_183634_${hostname}_device0_rank56.log +├── 20230306_183634_${hostname}_device1_rank57.log +├── 20230306_183634_${hostname}_device2_rank58.log +├── 20230306_183634_${hostname}_device3_rank59.log +├── 20230306_183634_${hostname}_device4_rank60.log +├── 20230306_183634_${hostname}_device5_rank61.log +├── 20230306_183634_${hostname}_device6_rank62.log +├── 20230306_183634_${hostname}_device7_rank63.log ``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 5d5650f696..d018107986 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -23,6 +23,7 @@ You can switch between Chinese and English documents in the lower-left corner of common_usage/resume_training.md common_usage/speed_up_training.md common_usage/save_gpu_memory.md + common_usage/set_random_seed.md common_usage/debug_tricks.md common_usage/model_analysis.md common_usage/set_interval.md diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index f6e0476403..44c0006057 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,48 @@ # Changelog of v0.x +## v0.7.0 (03/16/2023) + +### Highlights + +- Support PyTorch 2.0! Accelerate training by compiling models. 
See the tutorial [Model Compilation](https://mmengine.readthedocs.io/en/latest/common_usage/speed_up_training.html#model-compilation) for details +- Add `EarlyStoppingHook` to stop training when the metric does not improve + +### New Features & Enhancements + +- Add configurations to support `torch.compile` in Runner by [@C1rN09](https://github.com/C1rN09) in https://github.com/open-mmlab/mmengine/pull/976 +- Support `EarlyStoppingHook` by [@nijkah](https://github.com/nijkah) in https://github.com/open-mmlab/mmengine/pull/739 +- Disable duplicated warning during distributed training by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/961 +- Add `FUNCTIONS` root Registry by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/983 +- Save the "memory" field to visualization backends by [@enkilee](https://github.com/enkilee) in https://github.com/open-mmlab/mmengine/pull/974 +- Enable bf16 in `AmpOptimWrapper` by [@C1rN09](https://github.com/C1rN09) in https://github.com/open-mmlab/mmengine/pull/960 +- Support writing data to `vis_backend` with prefix by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/972 +- Support exporting logs of different ranks in debug mode by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/968 +- Silence error when `ManagerMixin` built instance with duplicate name. by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/990 + +### Bug fixes + +- Fix optim_wrapper unittest for `pytorch < 1.10.0` by [@C1rN09](https://github.com/C1rN09) in https://github.com/open-mmlab/mmengine/pull/975 +- Support calculating the flops of `matmul` with single dimension matrix by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/970 +- Fix repeated warning by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/992 +- Fix lint by [@zhouzaida](https://github.com/zhouzaida) in https://github.com/open-mmlab/mmengine/pull/993 +- Fix AMP in Ascend and support using NPUJITCompile environment by [@luomaoling](https://github.com/luomaoling) in https://github.com/open-mmlab/mmengine/pull/994 +- Fix inferencer gets wrong configs path by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/996 + +### Docs + +- Translate "Debug Tricks" to English by [@enkilee](https://github.com/enkilee) in https://github.com/open-mmlab/mmengine/pull/953 +- Translate "Model Analysis" document to English by [@enkilee](https://github.com/enkilee) in https://github.com/open-mmlab/mmengine/pull/956 +- Translate "Model Complexity Analysis" to Chinese. 
by [@VoyagerXvoyagerx](https://github.com/VoyagerXvoyagerx) in https://github.com/open-mmlab/mmengine/pull/969 +- Add a document about setting interval by [@YuetianW](https://github.com/YuetianW) in https://github.com/open-mmlab/mmengine/pull/964 +- Translate "how to set random seed" by [@xin-li-67](https://github.com/xin-li-67) in https://github.com/open-mmlab/mmengine/pull/930 +- Fix typo by [@zhouzaida](https://github.com/zhouzaida) in https://github.com/open-mmlab/mmengine/pull/965 +- Fix typo in hook document by [@acdart](https://github.com/acdart) in https://github.com/open-mmlab/mmengine/pull/980 +- Fix changelog date by [@HAOCHENYE](https://github.com/HAOCHENYE) in https://github.com/open-mmlab/mmengine/pull/986 + +### Contributors + +A total of 10 developers contributed to this release. Thanks [@xin-li-67](https://github.com/xin-li-67), [@acdart](https://github.com/acdart), [@enkilee](https://github.com/enkilee), [@YuetianW](https://github.com/YuetianW), [@luomaoling](https://github.com/luomaoling), [@nijkah](https://github.com/nijkah), [@VoyagerXvoyagerx](https://github.com/VoyagerXvoyagerx), [@zhouzaida](https://github.com/zhouzaida), [@HAOCHENYE](https://github.com/HAOCHENYE), [@C1rN09](https://github.com/C1rN09) + ## v0.6.0 (02/24/2023) ### Highlights @@ -251,7 +294,7 @@ A total of 16 developers contributed to this release. Thanks [@BayMaxBHL](https: - [@BIGWangYuDong](https://github.com/BIGWangYuDong) made their first contribution in https://github.com/open-mmlab/mmengine/pull/556 - [@zengyh1900](https://github.com/zengyh1900) made their first contribution in https://github.com/open-mmlab/mmengine/pull/659 -## v0.2.0 (11/10/2022) +## v0.2.0 (10/11/2022) ### New Features & Enhancements diff --git a/docs/en/tutorials/dataset.md b/docs/en/tutorials/dataset.md index b1a1c8ff94..27706db478 100644 --- a/docs/en/tutorials/dataset.md +++ b/docs/en/tutorials/dataset.md @@ -119,10 +119,10 @@ MMEngine provides 2 built-in `collate_fn`: - `pseudo_collate`: Default value in MMEngine. It won't concatenate data through `batch` index. Detailed explanations can be found in [pseudo_collate API doc](mmengine.dataset.pseudo_collate) - `default_collate`: It behaves almost identically to PyTorch's `default_collate`. It will transfer data into `Tensor` and concatenate them through `batch` index. More details and slight differences from PyTorch can be found in [default_collate API doc](mmengine.dataset.default_collate) -If you want to use a custom `collate_fn`, you can register it to `COLLATE_FUNCTIONS` registry. +If you want to use a custom `collate_fn`, you can register it to `FUNCTIONS` registry. ```python -@COLLATE_FUNCTIONS.register_module() +@FUNCTIONS.register_module() def my_collate_func(data_batch: Sequence) -> Any: pass diff --git a/docs/en/tutorials/hook.md b/docs/en/tutorials/hook.md index bec68a9378..fdb81f938f 100644 --- a/docs/en/tutorials/hook.md +++ b/docs/en/tutorials/hook.md @@ -232,9 +232,9 @@ We simply pass the hook config to the `custom_hooks` parameter of the Runner, wh ```python from mmengine.runner import Runner -custom_hooks = dict( +custom_hooks = [ dict(type='CheckInvalidLossHook', interval=50) -) +] runner = Runner(custom_hooks=custom_hooks, ...) runner.train() # start training ``` @@ -244,9 +244,9 @@ Then the loss value are checked after iteration. Note that the priority of the custom hook is `NORMAL (50)` by default, if you want to change the priority of the hook, then you can set the priority key in the config. 
```python -custom_hooks = dict( +custom_hooks = [ dict(type='CheckInvalidLossHook', interval=50, priority='ABOVE_NORMAL') -) +] ``` You can also set priority when defining classes. diff --git a/docs/zh_cn/advanced_tutorials/model_analysis.md b/docs/zh_cn/advanced_tutorials/model_analysis.md index 34a7a57ddf..c3867933d1 100644 --- a/docs/zh_cn/advanced_tutorials/model_analysis.md +++ b/docs/zh_cn/advanced_tutorials/model_analysis.md @@ -1,3 +1,184 @@ # 模型复杂度分析 -翻译中,请暂时阅读英文文档 [Model Complexity Analysis](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/model_analysis.html)。 +我们提供了一个工具来帮助分析网络的复杂性。我们借鉴了 [fvcore](https://github.com/facebookresearch/fvcore) 的实现思路来构建这个工具,并计划在未来支持更多的自定义算子。目前的工具提供了用于计算给定模型的浮点运算量(FLOPs)、激活量(Activations)和参数量(Parameters)的接口,并支持以网络结构或表格的形式逐层打印相关信息,同时提供了算子级别(operator)和模块级别(Module)的统计。如果您对统计浮点运算量的实现细节感兴趣,请参考 [Flop Count](https://github.com/facebookresearch/fvcore/blob/main/docs/flop_count.md)。 + +## 定义 + +模型复杂度有 3 个指标,分别是浮点运算量(FLOPs)、激活量(Activations)以及参数量(Parameters),它们的定义如下: + +- 浮点运算量 + + 浮点运算量不是一个定义非常明确的指标,在这里参考 [detectron2](https://detectron2.readthedocs.io/en/latest/modules/fvcore.html#fvcore.nn.FlopCountAnalysis) 的描述,将一组乘加运算定义为 1 个 flop。 + +- 激活量 + + 激活量用于衡量某一层产生的特征数量。 + +- 参数量 + + 模型的参数量。 + +例如,给定输入尺寸 `inputs = torch.randn((1, 3, 10, 10))`,和一个卷积层 `conv = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)`,那么它输出的特征图尺寸为 `(1, 10, 8, 8)`,则它的浮点运算量是 `17280 = 10*8*8*3*3*3`(10*8*8 表示输出的特征图大小、3*3*3 表示每一个输出需要的计算量)、激活量是 `640 = 10*8*8`、参数量是 `280 = 3*10*3*3 + 10`(3*10*3\*3 表示权重的尺寸、10 表示偏置值的尺寸)。 + +## 用法 + +### 基于 `nn.Module` 构建的模型 + +构建模型 + +```python +from torch import nn + +from mmengine.analysis import get_model_complexity_info + + +# 以字典的形式返回分析结果,包括: +# ['flops', 'flops_str', 'activations', 'activations_str', 'params', 'params_str', 'out_table', 'out_arch'] +class InnerNet(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(10, 10) + self.fc2 = nn.Linear(10, 10) + + def forward(self, x): + return self.fc1(self.fc2(x)) + + +class TestNet(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(10, 10) + self.fc2 = nn.Linear(10, 10) + self.inner = InnerNet() + + def forward(self, x): + return self.fc1(self.fc2(self.inner(x))) + + +input_shape = (1, 10) +model = TestNet() +``` + +`get_model_complexity_info` 返回的 `analysis_results` 是一个包含 7 个值的字典: + +- `flops`: flop 的总数, 例如, 1000, 1000000 +- `flops_str`: 格式化的字符串, 例如, 1.0G, 1.0M +- `params`: 全部参数的数量, 例如, 1000, 1000000 +- `params_str`: 格式化的字符串, 例如, 1.0K, 1M +- `activations`: 激活量的总数, 例如, 1000, 1000000 +- `activations_str`: 格式化的字符串, 例如, 1.0G, 1M +- `out_table`: 以表格形式打印相关信息 + +打印结果 + +- 以表格形式打印相关信息 + + ```python + print(analysis_results['out_table']) + ``` + + ```text + +---------------------+----------------------+--------+--------------+ + | module | #parameters or shape | #flops | #activations | + +---------------------+----------------------+--------+--------------+ + | model | 0.44K | 0.4K | 40 | + | fc1 | 0.11K | 100 | 10 | + | fc1.weight | (10, 10) | | | + | fc1.bias | (10,) | | | + | fc2 | 0.11K | 100 | 10 | + | fc2.weight | (10, 10) | | | + | fc2.bias | (10,) | | | + | inner | 0.22K | 0.2K | 20 | + | inner.fc1 | 0.11K | 100 | 10 | + | inner.fc1.weight | (10, 10) | | | + | inner.fc1.bias | (10,) | | | + | inner.fc2 | 0.11K | 100 | 10 | + | inner.fc2.weight | (10, 10) | | | + | inner.fc2.bias | (10,) | | | + +---------------------+----------------------+--------+--------------+ + ``` + +- 以网络层级结构打印相关信息 + + ```python + print(analysis_results['out_arch']) + ``` + + 
```bash + TestNet( + #params: 0.44K, #flops: 0.4K, #acts: 40 + (fc1): Linear( + in_features=10, out_features=10, bias=True + #params: 0.11K, #flops: 100, #acts: 10 + ) + (fc2): Linear( + in_features=10, out_features=10, bias=True + #params: 0.11K, #flops: 100, #acts: 10 + ) + (inner): InnerNet( + #params: 0.22K, #flops: 0.2K, #acts: 20 + (fc1): Linear( + in_features=10, out_features=10, bias=True + #params: 0.11K, #flops: 100, #acts: 10 + ) + (fc2): Linear( + in_features=10, out_features=10, bias=True + #params: 0.11K, #flops: 100, #acts: 10 + ) + ) + ) + ``` + +- 以字符串的形式打印结果 + + ```python + print("Model Flops:{}".format(analysis_results['flops_str'])) + # Model Flops:0.4K + print("Model Parameters:{}".format(analysis_results['params_str'])) + # Model Parameters:0.44K + ``` + +### 基于 BaseModel(来自 MMEngine)构建的模型 + +```python +import torch.nn.functional as F +import torchvision +from mmengine.model import BaseModel +from mmengine.analysis import get_model_complexity_info + + +class MMResNet50(BaseModel): + def __init__(self): + super().__init__() + self.resnet = torchvision.models.resnet50() + + def forward(self, imgs, labels=None, mode='tensor'): + x = self.resnet(imgs) + if mode == 'loss': + return {'loss': F.cross_entropy(x, labels)} + elif mode == 'predict': + return x, labels + elif mode == 'tensor': + return x + + +input_shape = (3, 224, 224) +model = MMResNet50() + +analysis_results = get_model_complexity_info(model, input_shape) + +print("Model Flops:{}".format(analysis_results['flops_str'])) +# Model Flops:4.145G +print("Model Parameters:{}".format(analysis_results['params_str'])) +# Model Parameters:25.557M +``` + +## 其他接口 + +除了上述基本用法,`get_model_complexity_info` 还能接受以下参数,输出定制化的统计结果: + +- `model`: (nn.Module) 待分析的模型 +- `input_shape`: (tuple) 输入尺寸,例如 (3, 224, 224) +- `inputs`: (optional: torch.Tensor), 如果传入该参数, `input_shape` 会被忽略 +- `show_table`: (bool) 是否以表格形式返回统计结果,默认值:True +- `show_arch`: (bool) 是否以网络结构形式返回统计结果,默认值:True diff --git a/docs/zh_cn/advanced_tutorials/registry.md b/docs/zh_cn/advanced_tutorials/registry.md index 4cd42ddcdd..75b71b22cb 100644 --- a/docs/zh_cn/advanced_tutorials/registry.md +++ b/docs/zh_cn/advanced_tutorials/registry.md @@ -167,7 +167,7 @@ func_res = FUNCTION.build(func_cfg) MMEngine 的注册器支持层级注册,利用该功能可实现跨项目调用,即可以在一个项目中使用另一个项目的模块。虽然跨项目调用也有其他方法的可以实现,但 MMEngine 注册器提供了更为简便的方法。 -为了方便跨库调用,MMEngine 提供了 20 个根注册器: +为了方便跨库调用,MMEngine 提供了 22 个根注册器: - RUNNERS: Runner 的注册器 - RUNNER_CONSTRUCTORS: Runner 的构造器 @@ -189,6 +189,8 @@ MMEngine 的注册器支持层级注册,利用该功能可实现跨项目调 - VISUALIZERS: 管理绘制模块,如 `DetVisualizer` 可在图片上绘制预测框 - VISBACKENDS: 存储训练日志的后端,如 `LocalVisBackend`, `TensorboardVisBackend` - LOG_PROCESSORS: 控制日志的统计窗口和统计方法,默认使用 `LogProcessor`,如有特殊需求可自定义 `LogProcessor` +- FUNCTIONS: 注册了各种函数,如 Dataloader 中传入的 `collate_fn` +- INFERENCERS: 注册了各种任务的推理器,如 `DetInferencer`,负责检测任务的推理 ### 调用父节点的模块 diff --git a/docs/zh_cn/common_usage/speed_up_training.md b/docs/zh_cn/common_usage/speed_up_training.md index 7f7b86b778..0ff3680dd3 100644 --- a/docs/zh_cn/common_usage/speed_up_training.md +++ b/docs/zh_cn/common_usage/speed_up_training.md @@ -85,3 +85,32 @@ runner.train() ```{warning} 截止到 PyTorch 1.13 版本,在 `Convolution` 中直接使用 `torch.bfloat16` 性能低下,必须手动设置环境变量 `TORCH_CUDNN_V8_API_ENABLED=1` 以启用 CuDNN 版本的 BF16 Convolution。相关讨论见 [PyTorch Issue](https://github.com/pytorch/pytorch/issues/57707#issuecomment-1166656767) ``` + +## 模型编译 + +PyTorch 2.0 版本引入了 [torch.compile](https://pytorch.org/docs/2.0/dynamo/get-started.html) 新特性,通过对模型进行编译来加速训练、验证。MMEngine 从 v0.7.0 版本开始支持这一特性,你可以通过向 `Runner` 的 `cfg` 
参数传入一个带有 `compile` 关键词的字典来开启模型编译: + +```python +runner = Runner( + model=ResNet18(), + ... # 你的其他 Runner 配置参数 + cfg=dict(compile=True) +) +``` + +此外,你也可以传入更多的编译配置选项,所有编译配置选项可以参考 [torch.compile API 文档](https://pytorch.org/docs/2.0/generated/torch.compile.html#torch-compile) + +```python +compile_options = dict(backend='inductor', mode='max-autotune') +runner = Runner( + model=ResNet18(), + ... # 你的其他 Runner 配置参数 + cfg=dict(compile=compile_options) +) +``` + +这一特性只有在你安装 PyTorch >= 2.0.0 版本时才可用。 + +```{warning} +`torch.compile` 目前仍然由 PyTorch 团队持续开发中,一些模型可能会编译失败。如果遇到了类似问题,你可以查阅 [PyTorch Dynamo FAQ](https://pytorch.org/docs/2.0/dynamo/faq.html) 解决常见问题,或参考 [TorchDynamo Troubleshooting](https://pytorch.org/docs/2.0/dynamo/troubleshooting.html) 向 PyTorch 提 issue. +``` diff --git a/docs/zh_cn/design/logging.md b/docs/zh_cn/design/logging.md index f47dcf98b0..7f6ac21a54 100644 --- a/docs/zh_cn/design/logging.md +++ b/docs/zh_cn/design/logging.md @@ -412,17 +412,17 @@ logger = MMLogger.get_instance('mmengine', log_file='tmp.log', distributed=True, ```text # 共享存储 -./tmp -├── tmp.log -├── tmp_rank1.log -├── tmp_rank2.log -├── tmp_rank3.log -├── tmp_rank4.log -├── tmp_rank5.log -├── tmp_rank6.log -└── tmp_rank7.log +work_dir/20230228_141908 +├── 20230306_183634_${hostname}_device0_rank0.log +├── 20230306_183634_${hostname}_device1_rank1.log +├── 20230306_183634_${hostname}_device2_rank2.log +├── 20230306_183634_${hostname}_device3_rank3.log +├── 20230306_183634_${hostname}_device4_rank4.log +├── 20230306_183634_${hostname}_device5_rank5.log +├── 20230306_183634_${hostname}_device6_rank6.log +├── 20230306_183634_${hostname}_device7_rank7.log ... -└── tmp_rank63.log +├── 20230306_183634_${hostname}_device7_rank63.log ``` 多机多卡,独立存储的情况: @@ -430,21 +430,24 @@ logger = MMLogger.get_instance('mmengine', log_file='tmp.log', distributed=True, ```text # 独立存储 # 设备0: -work_dir/ -└── exp_name_logs - ├── exp_name.log - ├── exp_name_rank1.log - ├── exp_name_rank2.log - ├── exp_name_rank3.log - ... - └── exp_name_rank7.log +work_dir/20230228_141908 +├── 20230306_183634_${hostname}_device0_rank0.log +├── 20230306_183634_${hostname}_device1_rank1.log +├── 20230306_183634_${hostname}_device2_rank2.log +├── 20230306_183634_${hostname}_device3_rank3.log +├── 20230306_183634_${hostname}_device4_rank4.log +├── 20230306_183634_${hostname}_device5_rank5.log +├── 20230306_183634_${hostname}_device6_rank6.log +├── 20230306_183634_${hostname}_device7_rank7.log # 设备7: -work_dir/ -└── exp_name_logs - ├── exp_name_rank56.log - ├── exp_name_rank57.log - ├── exp_name_rank58.log - ... 
- └── exp_name_rank63.log +work_dir/20230228_141908 +├── 20230306_183634_${hostname}_device0_rank56.log +├── 20230306_183634_${hostname}_device1_rank57.log +├── 20230306_183634_${hostname}_device2_rank58.log +├── 20230306_183634_${hostname}_device3_rank59.log +├── 20230306_183634_${hostname}_device4_rank60.log +├── 20230306_183634_${hostname}_device5_rank61.log +├── 20230306_183634_${hostname}_device6_rank62.log +├── 20230306_183634_${hostname}_device7_rank63.log ``` diff --git a/docs/zh_cn/tutorials/dataset.md b/docs/zh_cn/tutorials/dataset.md index 18e086f500..5c5adf3386 100644 --- a/docs/zh_cn/tutorials/dataset.md +++ b/docs/zh_cn/tutorials/dataset.md @@ -117,10 +117,10 @@ MMengine 中提供了 2 种内置的 `collate_fn`: - `pseudo_collate`,缺省时的默认参数。它不会将数据沿着 `batch` 的维度合并。详细说明可以参考 [pseudo_collate](mmengine.dataset.pseudo_collate) - `default_collate`,与 PyTorch 中的 `default_collate` 行为几乎完全一致,会将数据转化为 `Tensor` 并沿着 `batch` 维度合并。一些细微不同和详细说明可以参考 [default_collate](mmengine.dataset.default_collate) -如果你想要使用自定义的 `collate_fn`,你也可以将它注册到 `COLLATE_FUNCTIONS` 根注册器中来使用 +如果你想要使用自定义的 `collate_fn`,你也可以将它注册到 `FUNCTIONS` 根注册器中来使用 ```python -@COLLATE_FUNCTIONS.register_module() +@FUNCTIONS.register_module() def my_collate_func(data_batch: Sequence) -> Any: pass diff --git a/docs/zh_cn/tutorials/hook.md b/docs/zh_cn/tutorials/hook.md index db1f206425..cc9b151f3d 100644 --- a/docs/zh_cn/tutorials/hook.md +++ b/docs/zh_cn/tutorials/hook.md @@ -236,9 +236,9 @@ class CheckInvalidLossHook(Hook): ```python from mmengine.runner import Runner -custom_hooks = dict( +custom_hooks = [ dict(type='CheckInvalidLossHook', interval=50) -) +] runner = Runner(custom_hooks=custom_hooks, ...) # 实例化执行器,主要完成环境的初始化以及各种模块的构建 runner.train() # 执行器开始训练 ``` @@ -248,9 +248,9 @@ runner.train() # 执行器开始训练 注意,自定义钩子的优先级默认为 `NORMAL (50)`,如果想改变钩子的优先级,则可以在配置中设置 priority 字段。 ```python -custom_hooks = dict( +custom_hooks = [ dict(type='CheckInvalidLossHook', interval=50, priority='ABOVE_NORMAL') -) +] ``` 也可以在定义类时给定优先级 diff --git a/mmengine/analysis/jit_handles.py b/mmengine/analysis/jit_handles.py index 4f3dd696f2..917509d7e3 100644 --- a/mmengine/analysis/jit_handles.py +++ b/mmengine/analysis/jit_handles.py @@ -209,13 +209,16 @@ def einsum_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: """Count flops for matmul.""" - # Inputs should be a list of length 2. - # Inputs contains the shapes of two matrices. - input_shapes = [get_shape(v) for v in inputs] - assert len(input_shapes) == 2, input_shapes - assert input_shapes[0][-1] == input_shapes[1][ # type: ignore - -2], input_shapes # type: ignore - flop = prod(input_shapes[0]) * input_shapes[-1][-1] # type: ignore + # input_shapes is a list of length 2. 
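+    # 1-D operands are promoted to matrix shapes below (mirroring
+    # torch.matmul semantics) so that the generic [m, k] @ [k, n]
+    # flop formula prod(input1) * input2[-1] also covers vector inputs.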
+ input_shapes: list = [get_shape(v) for v in inputs] + input1, input2 = input_shapes + if len(input1) == 1: + input1 = [1, input1[0]] + if len(input2) == 1: + input2 = [input2[0], 1] + + assert input1[-1] == input2[-2], input_shapes + flop = prod(input1) * input2[-1] return flop diff --git a/mmengine/config/utils.py b/mmengine/config/utils.py index 830ab73eb7..6e03a93ba5 100644 --- a/mmengine/config/utils.py +++ b/mmengine/config/utils.py @@ -26,6 +26,7 @@ 'mmrotate': 'mmrotate', 'mmselfsup': 'mmselfsup', 'mmyolo': 'mmyolo', + 'mmpretrain': 'mmpretrain', } # PKG2PROJECT is not a proper name to represent the mapping between module name diff --git a/mmengine/dataset/utils.py b/mmengine/dataset/utils.py index 00867b14f9..2c9cf96497 100644 --- a/mmengine/dataset/utils.py +++ b/mmengine/dataset/utils.py @@ -8,10 +8,12 @@ from torch.utils.data._utils.collate import \ default_collate as torch_default_collate -from mmengine.registry import Registry +from mmengine.registry import FUNCTIONS from mmengine.structures import BaseDataElement -COLLATE_FUNCTIONS = Registry('Collate Functions') +# FUNCTIONS is new in MMEngine v0.7.0. Reserve the `COLLATE_FUNCTIONS` to keep +# the compatibility. +COLLATE_FUNCTIONS = FUNCTIONS def worker_init_fn(worker_id: int, @@ -39,7 +41,7 @@ def worker_init_fn(worker_id: int, warnings.simplefilter('ignore') -@COLLATE_FUNCTIONS.register_module() +@FUNCTIONS.register_module() def pseudo_collate(data_batch: Sequence) -> Any: """Convert list of data sampled from dataset into a batch of data, of which type consistent with the type of each data_itement in ``data_batch``. @@ -97,7 +99,7 @@ def pseudo_collate(data_batch: Sequence) -> Any: return data_batch -@COLLATE_FUNCTIONS.register_module() +@FUNCTIONS.register_module() def default_collate(data_batch: Sequence) -> Any: """Convert list of data sampled from dataset into a batch of data, of which type consistent with the type of each data_itement in ``data_batch``. diff --git a/mmengine/device/utils.py b/mmengine/device/utils.py index 44e92f715b..a6575a4267 100644 --- a/mmengine/device/utils.py +++ b/mmengine/device/utils.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os from typing import Optional import torch @@ -39,7 +40,8 @@ def is_npu_available() -> bool: # Enable operator support for dynamic shape and # binary operator support on the NPU. - torch.npu.set_compile_mode(jit_compile=False) + npu_jit_compile = bool(os.getenv('NPUJITCompile', False)) + torch.npu.set_compile_mode(jit_compile=npu_jit_compile) except Exception: return False return hasattr(torch, 'npu') and torch.npu.is_available() diff --git a/mmengine/hooks/logger_hook.py b/mmengine/hooks/logger_hook.py index 13ab926067..ddf0227ea9 100644 --- a/mmengine/hooks/logger_hook.py +++ b/mmengine/hooks/logger_hook.py @@ -234,14 +234,8 @@ def after_val_epoch(self, runner, len(runner.val_dataloader), 'val') runner.logger.info(log_str) if self.log_metric_by_epoch: - # when `log_metric_by_epoch` is set to True, it's expected - # that validation metric can be logged by epoch rather than - # by iter. At the same time, scalars related to time should - # still be logged by iter to avoid messy visualized result. - # see details in PR #278. 
- metric_tags = {k: v for k, v in tag.items() if 'time' not in k} runner.visualizer.add_scalars( - metric_tags, step=runner.epoch, file_path=self.json_log_path) + tag, step=runner.epoch, file_path=self.json_log_path) else: runner.visualizer.add_scalars( tag, step=runner.iter, file_path=self.json_log_path) diff --git a/mmengine/infer/infer.py b/mmengine/infer/infer.py index e27d1b233d..95c4f4c9e0 100644 --- a/mmengine/infer/infer.py +++ b/mmengine/infer/infer.py @@ -16,12 +16,12 @@ from mmengine.config import Config, ConfigDict from mmengine.config.utils import MODULE2PACKAGE -from mmengine.dataset import COLLATE_FUNCTIONS, pseudo_collate +from mmengine.dataset import pseudo_collate from mmengine.device import get_device from mmengine.fileio import (get_file_backend, isdir, join_path, list_dir_or_file, load) from mmengine.logging import print_log -from mmengine.registry import MODELS, VISUALIZERS, DefaultScope +from mmengine.registry import FUNCTIONS, MODELS, VISUALIZERS, DefaultScope from mmengine.runner.checkpoint import (_load_checkpoint, _load_checkpoint_to_model) from mmengine.structures import InstanceData @@ -419,9 +419,9 @@ def _get_repo_or_mim_dir(scope): return repo_dir else: mim_dir = osp.join(package_path, '.mim') - if not osp.exists(osp.join(mim_dir, 'Configs')): + if not osp.exists(osp.join(mim_dir, 'configs')): raise FileNotFoundError( - f'Cannot find Configs directory in {package_path}!, ' + f'Cannot find `configs` directory in {package_path}!, ' f'please check the completeness of the {scope}.') return mim_dir @@ -522,8 +522,7 @@ def preprocess(self, inputs, batch_size, **kwargs): Callable: Collate function. """ try: - with COLLATE_FUNCTIONS.switch_scope_and_registry( - self.scope) as registry: + with FUNCTIONS.switch_scope_and_registry(self.scope) as registry: collate_fn = registry.get(cfg.test_dataloader.collate_fn) except AttributeError: collate_fn = pseudo_collate diff --git a/mmengine/logging/logger.py b/mmengine/logging/logger.py index 442bef2fc4..4119d0499c 100644 --- a/mmengine/logging/logger.py +++ b/mmengine/logging/logger.py @@ -1,8 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging import os +import os.path as osp import sys +import warnings +from getpass import getuser from logging import Logger, LogRecord +from socket import gethostname from typing import Optional, Union from termcolor import colored @@ -11,6 +15,35 @@ from mmengine.utils.manager import _accquire_lock, _release_lock +class FilterDuplicateWarning(logging.Filter): + """Filter the repeated warning message. + + Args: + name (str): name of the filter. + """ + + def __init__(self, name: str = 'mmengine'): + super().__init__(name) + self.seen: set = set() + + def filter(self, record: LogRecord) -> bool: + """Filter the repeated warning message. + + Args: + record (LogRecord): The log record. + + Returns: + bool: Whether to output the log record. + """ + if record.levelno != logging.WARNING: + return True + + if record.msg not in self.seen: + self.seen.add(record.msg) + return True + return False + + class MMFormatter(logging.Formatter): """Colorful format for MMLogger. If the log level is error, the logger will additionally output the location of the code. @@ -134,8 +167,9 @@ class MMLogger(Logger, ManagerMixin): If `logger_name` is not defined, defaults to 'mmengine'. log_file (str, optional): The log filename. If specified, a ``FileHandler`` will be added to the logger. Defaults to None. - log_level (str): The log level of the handler and logger. Defaults to - "NOTSET". 
+ log_level (str): The log level of the handler. Defaults to + 'INFO'. If log level is 'DEBUG', distributed logs will be saved + during distributed training. file_mode (str): The file mode used to open log file. Defaults to 'w'. distributed (bool): Whether to save distributed logs, Defaults to false. @@ -145,14 +179,16 @@ def __init__(self, name: str, logger_name='mmengine', log_file: Optional[str] = None, - log_level: str = 'INFO', + log_level: Union[int, str] = 'INFO', file_mode: str = 'w', distributed=False): Logger.__init__(self, logger_name) ManagerMixin.__init__(self, name) # Get rank in DDP mode. - - rank = _get_rank() + if isinstance(log_level, str): + log_level = logging._nameToLevel[log_level] + global_rank = _get_rank() + device_id = _get_device_id() # Config stream_handler. If `rank != 0`. stream_handler can only # export ERROR logs. @@ -162,24 +198,31 @@ def __init__(self, stream_handler.setFormatter( MMFormatter(color=True, datefmt='%m/%d %H:%M:%S')) # Only rank0 `StreamHandler` will log messages below error level. - stream_handler.setLevel(log_level) if rank == 0 else \ + if global_rank == 0: + stream_handler.setLevel(log_level) + else: stream_handler.setLevel(logging.ERROR) + stream_handler.addFilter(FilterDuplicateWarning(logger_name)) self.handlers.append(stream_handler) if log_file is not None: - if rank != 0: - # rename `log_file` with rank suffix. - path_split = log_file.split(os.sep) - if '.' in path_split[-1]: - filename_list = path_split[-1].split('.') - filename_list[-2] = f'{filename_list[-2]}_rank{rank}' - path_split[-1] = '.'.join(filename_list) + world_size = _get_world_size() + is_distributed = (log_level <= logging.DEBUG + or distributed) and world_size > 1 + if is_distributed: + filename, suffix = osp.splitext(osp.basename(log_file)) + hostname = _get_host_info() + if hostname: + filename = (f'{filename}_{hostname}_device{device_id}_' + f'rank{global_rank}{suffix}') else: - path_split[-1] = f'{path_split[-1]}_rank{rank}' - log_file = os.sep.join(path_split) + # Omit hostname if it is empty + filename = (f'{filename}_device{device_id}_' + f'rank{global_rank}{suffix}') + log_file = osp.join(osp.dirname(log_file), filename) # Save multi-ranks logs if distributed is True. The logs of rank0 # will always be saved. - if rank == 0 or distributed: + if global_rank == 0 or is_distributed: # Here, the default behaviour of the official logger is 'a'. # Thus, we provide an interface to change the file mode to # the default behaviour. 
`FileHandler` is not supported to @@ -191,7 +234,13 @@ def __init__(self, file_handler.setFormatter( MMFormatter(color=False, datefmt='%Y/%m/%d %H:%M:%S')) file_handler.setLevel(log_level) + file_handler.addFilter(FilterDuplicateWarning(logger_name)) self.handlers.append(file_handler) + self._log_file = log_file + + @property + def log_file(self): + return self._log_file @classmethod def get_current_instance(cls) -> 'MMLogger': @@ -288,6 +337,17 @@ def print_log(msg, f'"silent", "current" or None, but got {type(logger)}') +def _get_world_size(): + """Support using logging module without torch.""" + try: + # requires torch + from mmengine.dist import get_world_size + except ImportError: + return 1 + else: + return get_world_size() + + def _get_rank(): """Support using logging module without torch.""" try: @@ -297,3 +357,38 @@ def _get_rank(): return 0 else: return get_rank() + + +def _get_device_id(): + """Get device id of current machine.""" + try: + import torch + except ImportError: + return 0 + else: + local_rank = int(os.getenv('LOCAL_RANK', '0')) + # TODO: return device id of npu and mlu. + if not torch.cuda.is_available(): + return local_rank + cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + if cuda_visible_devices is None: + num_device = torch.cuda.device_count() + cuda_visible_devices = list(range(num_device)) + else: + cuda_visible_devices = cuda_visible_devices.split(',') + return int(cuda_visible_devices[local_rank]) + + +def _get_host_info() -> str: + """Get hostname and username. + + Return empty string if exception raised, e.g. ``getpass.getuser()`` will + lead to error in docker container + """ + host = '' + try: + host = f'{getuser()}@{gethostname()}' + except Exception as e: + warnings.warn(f'Host or user not found: {str(e)}') + finally: + return host diff --git a/mmengine/model/base_module.py b/mmengine/model/base_module.py index e742d15886..1167bdf201 100644 --- a/mmengine/model/base_module.py +++ b/mmengine/model/base_module.py @@ -95,15 +95,12 @@ def init_weights(self): for sub_module in self.modules(): sub_module._params_init_info = self._params_init_info - logger = MMLogger.get_current_instance() - logger_name = logger.instance_name - module_name = self.__class__.__name__ if not self._is_init: if self.init_cfg: print_log( f'initialize {module_name} with init_cfg {self.init_cfg}', - logger=logger_name, + logger='current', level=logging.DEBUG) init_cfgs = self.init_cfg @@ -145,7 +142,6 @@ def init_weights(self): level=logging.WARNING) if is_top_level_module: - # self._dump_init_info(logger_name) self._dump_init_info() for sub_module in self.modules(): @@ -154,14 +150,9 @@ def init_weights(self): @master_only def _dump_init_info(self): """Dump the initialization information to a file named - `initialization.log.json` in workdir. - - Args: - logger_name (str): The name of logger. 
- """ + `initialization.log.json` in workdir.""" logger = MMLogger.get_current_instance() - logger_name = logger.instance_name with_file_handler = False # dump the information to the logger file if there is a `FileHandler` for handler in logger.handlers: @@ -176,10 +167,9 @@ def _dump_init_info(self): with_file_handler = True if not with_file_handler: for name, param in self.named_parameters(): - print_log( + logger.info( f'\n{name} - {param.shape}: ' - f"\n{self._params_init_info[param]['init_info']} \n ", - logger=logger_name) + f"\n{self._params_init_info[param]['init_info']} \n ") def __repr__(self): s = super().__repr__() diff --git a/mmengine/model/utils.py b/mmengine/model/utils.py index 19c290eaeb..c78ea3134d 100644 --- a/mmengine/model/utils.py +++ b/mmengine/model/utils.py @@ -84,14 +84,13 @@ def traverse(grad_fn): traverse(grad_fn) traverse(loss.grad_fn) - from mmengine.logging import MMLogger - logger = MMLogger.get_current_instance() for n, p in model.named_parameters(): if p not in parameters_in_graph and p.requires_grad: - logger.log( - level=logging.ERROR, - msg=f'{n} with shape {p.size()} is not ' - f'in the computational graph \n') + print_log( + f'{n} with shape {p.size()} is not ' + f'in the computational graph \n', + logger='current', + level=logging.ERROR) def merge_dict(*args): diff --git a/mmengine/registry/__init__.py b/mmengine/registry/__init__.py index a326750b0c..7b10890e88 100644 --- a/mmengine/registry/__init__.py +++ b/mmengine/registry/__init__.py @@ -3,11 +3,12 @@ build_runner_from_cfg, build_scheduler_from_cfg) from .default_scope import DefaultScope from .registry import Registry -from .root import (DATA_SAMPLERS, DATASETS, EVALUATOR, HOOKS, INFERENCERS, - LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS, MODELS, - OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, OPTIMIZERS, - PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, RUNNERS, TASK_UTILS, - TRANSFORMS, VISBACKENDS, VISUALIZERS, WEIGHT_INITIALIZERS) +from .root import (DATA_SAMPLERS, DATASETS, EVALUATOR, FUNCTIONS, HOOKS, + INFERENCERS, LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS, + MODELS, OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS, PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, RUNNERS, + TASK_UTILS, TRANSFORMS, VISBACKENDS, VISUALIZERS, + WEIGHT_INITIALIZERS) from .utils import (count_registered_modules, init_default_scope, traverse_registry_tree) @@ -19,5 +20,5 @@ 'VISBACKENDS', 'VISUALIZERS', 'LOG_PROCESSORS', 'EVALUATOR', 'INFERENCERS', 'DefaultScope', 'traverse_registry_tree', 'count_registered_modules', 'build_model_from_cfg', 'build_runner_from_cfg', 'build_from_cfg', - 'build_scheduler_from_cfg', 'init_default_scope' + 'build_scheduler_from_cfg', 'init_default_scope', 'FUNCTIONS' ] diff --git a/mmengine/registry/root.py b/mmengine/registry/root.py index db2e5206d5..3f01e0e2d9 100644 --- a/mmengine/registry/root.py +++ b/mmengine/registry/root.py @@ -59,3 +59,6 @@ # manage inferencer INFERENCERS = Registry('inferencer') + +# manage function +FUNCTIONS = Registry('function') diff --git a/mmengine/runner/amp.py b/mmengine/runner/amp.py index 8acf072ac3..33ab6bd25d 100644 --- a/mmengine/runner/amp.py +++ b/mmengine/runner/amp.py @@ -126,6 +126,10 @@ def autocast(device_type: Optional[str] = None, elif device_type == 'mlu': pass + + elif device_type == 'npu': + pass + else: # Device like MPS does not support fp16 training or testing. 
# If an inappropriate device is set and fp16 is enabled, an error diff --git a/mmengine/runner/log_processor.py b/mmengine/runner/log_processor.py index 41fc3bc546..b278cf06fd 100644 --- a/mmengine/runner/log_processor.py +++ b/mmengine/runner/log_processor.py @@ -52,6 +52,13 @@ class LogProcessor: `epoch` to statistics log value by epoch. num_digits (int): The number of significant digit shown in the logging message. + log_with_hierarchy (bool): Whether to log with hierarchy. If it is + True, the information is written to visualizer backend such as + :obj:`LocalVisBackend` and :obj:`TensorboardBackend` + with hierarchy. For example, ``loss`` will be saved as + ``train/loss``, and accuracy will be saved as ``val/accuracy``. + Defaults to False. + `New in version 0.7.0.` Examples: >>> # `log_name` is defined, `loss_large_window` will be an additional @@ -98,11 +105,13 @@ def __init__(self, window_size=10, by_epoch=True, custom_cfg: Optional[List[dict]] = None, - num_digits: int = 4): + num_digits: int = 4, + log_with_hierarchy: bool = False): self.window_size = window_size self.by_epoch = by_epoch self.custom_cfg = custom_cfg if custom_cfg else [] self.num_digits = num_digits + self.log_with_hierarchy = log_with_hierarchy self._check_custom_cfg() def get_log_after_iter(self, runner, batch_idx: int, @@ -120,18 +129,26 @@ def get_log_after_iter(self, runner, batch_idx: int, recorded by :obj:`runner.message_hub` and :obj:`runner.visualizer`. """ assert mode in ['train', 'test', 'val'] - current_loop = self._get_cur_loop(runner, mode) cur_iter = self._get_iter(runner, batch_idx=batch_idx) # Overwrite ``window_size`` defined in ``custom_cfg`` to int value. - custom_cfg_copy = self._parse_windows_size(runner, batch_idx) - # tag is used to write log information to different backends. - tag = self._collect_scalars(custom_cfg_copy, runner, mode) - # `log_tag` will pop 'lr' and loop other keys to `log_str`. - log_tag = copy.deepcopy(tag) + parsed_cfg = self._parse_windows_size(runner, batch_idx, + self.custom_cfg) + # log_tag is used to write log information to terminal + # If `self.log_with_hierarchy` is False, the tag is the same as + # log_tag. Otherwise, each key in tag starts with prefix `train`, + # `test` or `val` + log_tag = self._collect_scalars(parsed_cfg, runner, mode) + + if not self.log_with_hierarchy: + tag = copy.deepcopy(log_tag) + else: + tag = self._collect_scalars(parsed_cfg, runner, mode, True) + # Record learning rate. lr_str_list = [] for key, value in tag.items(): if key.endswith('lr'): + key = self._remove_prefix(key, f'{mode}/') log_tag.pop(key) lr_str_list.append(f'{key}: ' f'{value:.{self.num_digits}e}') @@ -148,7 +165,7 @@ def get_log_after_iter(self, runner, batch_idx: int, # Epoch(train) [ 9][010/270] # ... ||| ||| # Epoch(train) [ 10][100/270] - dataloader_len = len(current_loop.dataloader) + dataloader_len = self._get_dataloader_size(runner, mode) cur_iter_str = str(cur_iter).rjust(len(str(dataloader_len))) if mode in ['train', 'val']: @@ -174,30 +191,31 @@ def get_log_after_iter(self, runner, batch_idx: int, log_str = (f'Iter({mode}) ' f'[{cur_iter_str}/{runner.max_iters}] ') else: - dataloader_len = len(current_loop.dataloader) + dataloader_len = self._get_dataloader_size(runner, mode) cur_iter_str = str(batch_idx + 1).rjust( len(str(dataloader_len))) - log_str = (f'Iter({mode}) [{cur_iter_str}' - f'/{len(current_loop.dataloader)}] ') + log_str = (f'Iter({mode}) [{cur_iter_str}/{dataloader_len}] ') # Concatenate lr, momentum string with log header. 
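For reference, a sketch of how the new `log_with_hierarchy` switch would be enabled from a config; the field names follow the `LogProcessor` docstring above, and the surrounding runner config is assumed:

```python
# Scalars written to visualizer backends keep their mode prefix,
# e.g. ``train/loss`` and ``val/accuracy``, while the terminal log
# keeps the short, unprefixed keys.
log_processor = dict(
    window_size=10,
    by_epoch=True,
    log_with_hierarchy=True,  # new in v0.7.0, defaults to False
)
```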
log_str += f'{lr_str} ' # If IterTimerHook used in runner, eta, time, and data_time should be # recorded. - if (all(item in tag for item in ['time', 'data_time']) + if (all(item in log_tag for item in ['time', 'data_time']) and 'eta' in runner.message_hub.runtime_info): eta = runner.message_hub.get_info('eta') eta_str = str(datetime.timedelta(seconds=int(eta))) log_str += f'eta: {eta_str} ' - log_str += (f'time: {tag["time"]:.{self.num_digits}f} ' + log_str += (f'time: {log_tag["time"]:.{self.num_digits}f} ' f'data_time: ' - f'{tag["data_time"]:.{self.num_digits}f} ') + f'{log_tag["data_time"]:.{self.num_digits}f} ') # Pop recorded keys log_tag.pop('time') log_tag.pop('data_time') # If cuda is available, the max memory occupied should be calculated. if is_cuda_available(): - log_str += f'memory: {self._get_max_memory(runner)} ' + max_memory = self._get_max_memory(runner) + log_str += f'memory: {max_memory} ' + tag['memory'] = max_memory # Loop left keys to fill `log_str`. if mode in ('train', 'val'): log_items = [] @@ -233,15 +251,8 @@ def get_log_after_epoch(self, 'test', 'val' ], ('`_get_metric_log_str` only accept val or test mode, but got ' f'{mode}') - cur_loop = self._get_cur_loop(runner, mode) - dataloader_len = len(cur_loop.dataloader) + dataloader_len = self._get_dataloader_size(runner, mode) - custom_cfg_copy = self._parse_windows_size(runner, batch_idx) - # tag is used to write log information to different backends. - tag = self._collect_scalars(custom_cfg_copy, runner, mode) - non_scalar_tag = self._collect_non_scalars(runner, mode) - tag.pop('time', None) - tag.pop('data_time', None) # By epoch: # Epoch(val) [10][1000/1000] ... # Epoch(test) [1000/1000] ... @@ -259,8 +270,42 @@ def get_log_after_epoch(self, else: log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ') - # `time` and `data_time` will not be recorded in after epoch log - # message. + + custom_cfg_copy = copy.deepcopy(self.custom_cfg) + # remove prefix + custom_keys = [ + self._remove_prefix(cfg['data_src'], f'{mode}/') + for cfg in custom_cfg_copy + ] + # Count the averaged time and data_time by epoch + if 'time' not in custom_keys: + custom_cfg_copy.append( + dict( + data_src=f'{mode}/time', + window_size='epoch', + method_name='mean')) + if 'data_time' not in custom_keys: + custom_cfg_copy.append( + dict( + data_src=f'{mode}/data_time', + window_size='epoch', + method_name='mean')) + parsed_cfg = self._parse_windows_size(runner, batch_idx, + custom_cfg_copy) + # tag is used to write log information to different backends. + ori_tag = self._collect_scalars(parsed_cfg, runner, mode, + self.log_with_hierarchy) + non_scalar_tag = self._collect_non_scalars(runner, mode) + # move `time` or `data_time` to the end of the log + tag = OrderedDict() + time_tag = OrderedDict() + for key, value in ori_tag.items(): + if key in (f'{mode}/time', f'{mode}/data_time', 'time', + 'data_time'): + time_tag[key] = value + else: + tag[key] = value + # Log other messages. 
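A toy illustration of the reordering introduced just above: `time` and `data_time` are split into a separate ordered dict so they can be appended after the metrics when the epoch log string is composed.

```python
from collections import OrderedDict

ori_tag = OrderedDict(accuracy=0.9, time=0.2, data_time=0.05)
tag, time_tag = OrderedDict(), OrderedDict()
for key, value in ori_tag.items():
    # Time statistics go to the trailing dict, everything else stays put.
    (time_tag if key in ('time', 'data_time') else tag)[key] = value
assert list(tag) == ['accuracy']
assert list(time_tag) == ['time', 'data_time']
```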
log_items = [] for name, val in chain(tag.items(), non_scalar_tag.items()): if isinstance(val, float): @@ -271,12 +316,19 @@ def get_log_after_epoch(self, log_items.append(f'{name}: {val}') log_str += ' '.join(log_items) + for name, val in time_tag.items(): + log_str += f'{name}: {val:.{self.num_digits}f} ' + if with_non_scalar: tag.update(non_scalar_tag) + tag.update(time_tag) return tag, log_str - def _collect_scalars(self, custom_cfg: List[dict], runner, - mode: str) -> dict: + def _collect_scalars(self, + custom_cfg: List[dict], + runner, + mode: str, + reserve_prefix: bool = False) -> dict: """Collect log information to compose a dict according to mode. Args: @@ -285,6 +337,7 @@ def _collect_scalars(self, custom_cfg: List[dict], runner, runner (Runner): The runner of the training/testing/validation process. mode (str): Current mode of runner. + reserve_prefix (bool): Whether to reserve the prefix of the key. Returns: dict: Statistical values of logs. @@ -298,7 +351,10 @@ def _collect_scalars(self, custom_cfg: List[dict], runner, # according to mode. for prefix_key, log_buffer in history_scalars.items(): if prefix_key.startswith(mode): - key = prefix_key.partition('/')[-1] + if not reserve_prefix: + key = self._remove_prefix(prefix_key, f'{mode}/') + else: + key = prefix_key mode_history_scalars[key] = log_buffer for key in mode_history_scalars: # Update the latest learning rate and smoothed time logs. @@ -339,10 +395,20 @@ def _collect_non_scalars(self, runner, mode: str) -> dict: # extract log info and remove prefix to `mode_infos` according to mode. for prefix_key, value in infos.items(): if prefix_key.startswith(mode): - key = prefix_key.partition('/')[-1] + if self.log_with_hierarchy: + key = prefix_key + else: + key = self._remove_prefix(prefix_key, f'{mode}/') mode_infos[key] = value return mode_infos + def _remove_prefix(self, string: str, prefix: str): + """Remove the prefix ``train``, ``val`` and ``test`` of the key.""" + if string.startswith(prefix): + return string[len(prefix):] + else: + return string + def _check_custom_cfg(self) -> None: """Check the legality of ``self.custom_cfg``.""" @@ -375,16 +441,24 @@ def _check_repeated_log_name(): _check_repeated_log_name() _check_window_size() - def _parse_windows_size(self, runner, batch_idx: int) -> list: + def _parse_windows_size(self, + runner, + batch_idx: int, + custom_cfg: Optional[list] = None) -> list: """Parse window_size defined in custom_cfg to int value. Args: runner (Runner): The runner of the training/testing/validation process. batch_idx (int): The iteration index of current dataloader. + custom_cfg (list): A copy of ``self.custom_cfg``. Defaults to None + to keep backward compatibility. 
""" - custom_cfg_copy = copy.deepcopy(self.custom_cfg) - for log_cfg in custom_cfg_copy: + if custom_cfg is None: + custom_cfg = copy.deepcopy(self.custom_cfg) + else: + custom_cfg = copy.deepcopy(custom_cfg) + for log_cfg in custom_cfg: window_size = log_cfg.get('window_size', None) if window_size is None or isinstance(window_size, int): continue @@ -396,7 +470,7 @@ def _parse_windows_size(self, runner, batch_idx: int) -> list: raise TypeError( 'window_size should be int, epoch or global, but got ' f'invalid {window_size}') - return custom_cfg_copy + return custom_cfg def _get_max_memory(self, runner) -> int: """Returns the maximum GPU memory occupied by tensors in megabytes (MB) @@ -472,3 +546,15 @@ def _get_cur_loop(self, runner, mode: str): return runner.val_loop else: return runner.test_loop + + def _get_dataloader_size(self, runner, mode) -> int: + """Get dataloader size of current loop. + + Args: + runner (Runner): The runner of the training/validation/testing + mode (str): Current mode of runner. + + Returns: + int: The dataloader size of current loop. + """ + return len(self._get_cur_loop(runner=runner, mode=mode).dataloader) diff --git a/mmengine/runner/runner.py b/mmengine/runner/runner.py index 8adc0360f4..9f30da624a 100644 --- a/mmengine/runner/runner.py +++ b/mmengine/runner/runner.py @@ -19,7 +19,7 @@ import mmengine from mmengine.config import Config, ConfigDict -from mmengine.dataset import COLLATE_FUNCTIONS, worker_init_fn +from mmengine.dataset import worker_init_fn from mmengine.device import get_device from mmengine.dist import (broadcast, get_dist_info, get_rank, init_dist, is_distributed, master_only) @@ -31,10 +31,10 @@ is_model_wrapper, revert_sync_batchnorm) from mmengine.optim import (OptimWrapper, OptimWrapperDict, _ParamScheduler, build_optim_wrapper) -from mmengine.registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, HOOKS, - LOG_PROCESSORS, LOOPS, MODEL_WRAPPERS, MODELS, - OPTIM_WRAPPERS, PARAM_SCHEDULERS, RUNNERS, - VISUALIZERS, DefaultScope) +from mmengine.registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, FUNCTIONS, + HOOKS, LOG_PROCESSORS, LOOPS, MODEL_WRAPPERS, + MODELS, OPTIM_WRAPPERS, PARAM_SCHEDULERS, + RUNNERS, VISUALIZERS, DefaultScope) from mmengine.utils import digit_version, get_git_hash, is_seq_of from mmengine.utils.dl_utils import (TORCH_VERSION, collect_env, set_multi_processing) @@ -180,6 +180,14 @@ class Runner: cfg (dict or Configdict or :obj:`Config`, optional): Full config. Defaults to None. + Note: + Since PyTorch 2.0.0, you can enable ``torch.compile`` by passing in + `cfg.compile = True`. If you want to control compile options, you + can pass a dict, e.g. ``cfg.compile = dict(backend='eager')``. + Refer to `PyTorch API Documentation `_ for more valid + options. + Examples: >>> from mmengine.runner import Runner >>> cfg = dict( @@ -1402,7 +1410,7 @@ def build_dataloader(dataloader: Union[DataLoader, Dict], collate_fn_cfg = dataloader_cfg.pop('collate_fn', dict(type='pseudo_collate')) collate_fn_type = collate_fn_cfg.pop('type') - collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type) + collate_fn = FUNCTIONS.get(collate_fn_type) collate_fn = partial(collate_fn, **collate_fn_cfg) # type: ignore data_loader = DataLoader( dataset=dataset, @@ -1686,6 +1694,10 @@ def train(self) -> nn.Module: self._train_loop.iter, # type: ignore self._train_loop.max_iters) # type: ignore + # Maybe compile the model according to options in self.cfg.compile + # This must be called **AFTER** model has been wrapped. 
+ self._maybe_compile('train_step') + model = self.train_loop.run() # type: ignore self.call_hook('after_run') return model @@ -1959,13 +1971,10 @@ def resume(self, current_seed = self._randomness_cfg.get('seed') if resumed_seed is not None and resumed_seed != current_seed: if current_seed is not None: - print_log( - f'The value of random seed in the ' - f'checkpoint "{resumed_seed}" is ' - f'different from the value in ' - f'`randomness` config "{current_seed}"', - logger='current', - level=logging.WARNING) + self.logger.warning(f'The value of random seed in the ' + f'checkpoint "{resumed_seed}" is ' + f'different from the value in ' + f'`randomness` config "{current_seed}"') self._randomness_cfg.update(seed=resumed_seed) self.set_randomness(**self._randomness_cfg) @@ -1976,13 +1985,11 @@ def resume(self, # np.ndarray, which cannot be directly judged as equal or not, # therefore we just compared their dumped results. if pickle.dumps(resumed_dataset_meta) != pickle.dumps(dataset_meta): - print_log( + self.logger.warning( 'The dataset metainfo from the resumed checkpoint is ' 'different from the current training dataset, please ' 'check the correctness of the checkpoint or the training ' - 'dataset.', - logger='current', - level=logging.WARNING) + 'dataset.') self.message_hub.load_state_dict(checkpoint['message_hub']) @@ -1994,11 +2001,9 @@ def resume(self, # resume param scheduler if resume_param_scheduler and self.param_schedulers is None: - print_log( + self.logger.warning( '`resume_param_scheduler` is True but `self.param_schedulers` ' - 'is None, so skip resuming parameter schedulers', - logger='current', - level=logging.WARNING) + 'is None, so skip resuming parameter schedulers') resume_param_scheduler = False if 'param_schedulers' in checkpoint and resume_param_scheduler: self.param_schedulers = self.build_param_scheduler( # type: ignore @@ -2155,11 +2160,9 @@ def save_checkpoint( # save param scheduler state dict if save_param_scheduler and self.param_schedulers is None: - print_log( + self.logger.warning( '`save_param_scheduler` is True but `self.param_schedulers` ' - 'is None, so skip saving parameter schedulers', - logger='current', - level=logging.WARNING) + 'is None, so skip saving parameter schedulers') save_param_scheduler = False if save_param_scheduler: if isinstance(self.param_schedulers, dict): @@ -2288,3 +2291,28 @@ def _log_env(self, env_cfg: dict) -> None: '\nRuntime environment:' + runtime_env_info + '\n' + dash_line + '\n') self.logger.info(f'Config:\n{self.cfg.pretty_text}') + + def _maybe_compile(self, target: str) -> None: + """Use `torch.compile` to optimize model/wrapped_model.""" + compile_cfg = self.cfg.get('compile', None) + if compile_cfg is None: + # no compile options given, won't compile + return + + if isinstance(compile_cfg, bool): + if not compile_cfg: + # compile=False, compilation is disabled + return + # compile=True, use default configurations + compile_cfg = dict() + + assert digit_version(TORCH_VERSION) >= digit_version('2.0.0'), ( + 'PyTorch >= 2.0.0 is required to enable torch.compile') + assert isinstance(compile_cfg, dict), ( + f'`compile` should be a dict or bool, got {type(compile_cfg)}') + + func = getattr(self.model, target) + compiled_func = torch.compile(func, **compile_cfg) + setattr(self.model, target, compiled_func) + self.logger.info('Model has been "compiled". 
The first few iterations' + ' will be slow, please be patient.') diff --git a/mmengine/structures/base_data_element.py b/mmengine/structures/base_data_element.py index 7be1ef9044..46c4c886e6 100644 --- a/mmengine/structures/base_data_element.py +++ b/mmengine/structures/base_data_element.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -import sys from typing import Any, Iterator, Optional, Tuple, Type, Union import numpy as np @@ -309,7 +308,16 @@ def keys(self) -> list: Returns: list: Contains all keys in data_fields. """ - return list(self._data_fields) + # We assume that the name of the attribute related to property is + # '_' + the name of the property. We use this rule to filter out + # private keys. + # TODO: Use a more robust way to solve this problem + private_keys = { + '_' + key + for key in self._data_fields + if isinstance(getattr(type(self), key, None), property) + } + return list(self._data_fields - private_keys) def metainfo_keys(self) -> list: """ @@ -466,12 +474,7 @@ def set_field(self, raise AttributeError( f'Cannot set {name} to be a field of data ' f'because {name} is already a metainfo field') - # The name only added to `data_fields` when it is not the - # attribute related to property(methods decorated by @property). - if not isinstance( - getattr(type(self), - sys._getframe(1).f_code.co_name, None), property): - self._data_fields.add(name) + self._data_fields.add(name) super().__setattr__(name, value) # Tensor-like methods diff --git a/mmengine/utils/manager.py b/mmengine/utils/manager.py index 68f3409a6a..70b45f2d8e 100644 --- a/mmengine/utils/manager.py +++ b/mmengine/utils/manager.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import inspect import threading +import warnings from collections import OrderedDict from typing import Type, TypeVar @@ -108,10 +109,11 @@ def get_instance(cls: Type[T], name: str, **kwargs) -> T: if name not in instance_dict: instance = cls(name=name, **kwargs) # type: ignore instance_dict[name] = instance # type: ignore - else: - assert not kwargs, ( - f'{cls} instance named of {name} has been created, the method ' - '`get_instance` should not access any other arguments') + elif kwargs: + warnings.warn( + f'{cls} instance named of {name} has been created, ' + 'the method `get_instance` should not accept any other ' + 'arguments') # Get latest instantiated instance or root instance. _release_lock() return instance_dict[name] diff --git a/mmengine/version.py b/mmengine/version.py index bb11ba7482..9edb382144 100644 --- a/mmengine/version.py +++ b/mmengine/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -__version__ = '0.6.0' +__version__ = '0.7.0' def parse_version_info(version_str): diff --git a/mmengine/visualization/vis_backend.py b/mmengine/visualization/vis_backend.py index fad5232686..569188e685 100644 --- a/mmengine/visualization/vis_backend.py +++ b/mmengine/visualization/vis_backend.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy import functools +import logging import os import os.path as osp import warnings @@ -15,7 +16,7 @@ from mmengine.config import Config from mmengine.fileio import dump from mmengine.hooks.logger_hook import SUFFIX_TYPE -from mmengine.logging import MMLogger +from mmengine.logging import MMLogger, print_log from mmengine.registry import VISBACKENDS from mmengine.utils import scandir from mmengine.utils.dl_utils import TORCH_VERSION @@ -45,12 +46,13 @@ def wrapper(obj: object, *args, **kwargs): # `_env_initialized` is False, call `_init_env` and set # `_env_initialized` to True if not getattr(obj, '_env_initialized', False): - logger = MMLogger.get_current_instance() - logger.debug('Attribute `_env_initialized` is not defined in ' - f'{type(obj)} or `{type(obj)}._env_initialized is ' - 'False, `_init_env` will be called and ' - f'{type(obj)}._env_initialized will be set to ' - 'True') + print_log( + 'Attribute `_env_initialized` is not defined in ' + f'{type(obj)} or `{type(obj)}._env_initialized is ' + 'False, `_init_env` will be called and ' + f'{type(obj)}._env_initialized will be set to True', + logger='current', + level=logging.DEBUG) obj._init_env() # type: ignore obj._env_initialized = True # type: ignore diff --git a/tests/test_analysis/test_flop_count.py b/tests/test_analysis/test_flop_count.py index 0c0e8943cd..20749a0bab 100644 --- a/tests/test_analysis/test_flop_count.py +++ b/tests/test_analysis/test_flop_count.py @@ -580,8 +580,6 @@ def _test_conv( transpose=True, output_padding=output_padding9, ) - - def test_matmul(self) -> None: """Test flop count for operation matmul.""" m = 20 n = 10 @@ -596,6 +594,13 @@ def test_matmul(self) -> None: self.assertDictEqual( flop_dict, gt_dict, 'Matmul operation failed to pass the flop count test.') + # Test with single dimension y + y = torch.randn(n) + gt_dict['matmul'] = m * n * 1 / 1e9 + flop_dict, _ = flop_count(m_net, (x, y)) + self.assertDictEqual( + flop_dict, gt_dict, + 'Matmul operation failed to pass the flop count test.') def test_matmul_broadcast(self) -> None: """Test flop count for operation matmul.""" diff --git a/tests/test_hooks/test_logger_hook.py b/tests/test_hooks/test_logger_hook.py index 4a8114a9b8..aab2817a21 100644 --- a/tests/test_hooks/test_logger_hook.py +++ b/tests/test_hooks/test_logger_hook.py @@ -147,7 +147,12 @@ def test_after_val_epoch(self): logger_hook.after_val_epoch(runner) args = {'step': ANY, 'file_path': ANY} # expect visualizer log `time` and `metric` respectively - runner.visualizer.add_scalars.assert_called_with({'acc': 0.8}, **args) + runner.visualizer.add_scalars.assert_called_with( + { + 'time': 1, + 'datatime': 1, + 'acc': 0.8 + }, **args) # Test when `log_metric_by_epoch` is False logger_hook = LoggerHook(log_metric_by_epoch=False) diff --git a/tests/test_infer/test_infer.py b/tests/test_infer/test_infer.py index 2b6bc8983e..2d020b6300 100644 --- a/tests/test_infer/test_infer.py +++ b/tests/test_infer/test_infer.py @@ -10,10 +10,18 @@ from mmengine.infer import BaseInferencer from mmengine.registry import VISUALIZERS, DefaultScope from mmengine.testing import RunnerTestCase -from mmengine.utils import is_installed, is_list_of +from mmengine.utils import is_list_of from mmengine.visualization import Visualizer +def is_imported(package): + try: + __import__(package) + return True + except ImportError: + return False + + class ToyInferencer(BaseInferencer): preprocess_kwargs = {'pre_arg'} forward_kwargs = {'for_arg'} @@ -98,7 +106,7 @@ def test_init(self): 
ToyInferencer([self.epoch_based_cfg], self.ckpt_path) # Pass model as model name defined in metafile - if is_installed('mmdet'): + if is_imported('mmdet'): from mmdet.utils import register_all_modules register_all_modules() @@ -126,7 +134,7 @@ def test_call(self): inferencer(img_paths) @pytest.mark.skipif( - not is_installed('mmdet'), reason='mmdet is not installed') + not is_imported('mmdet'), reason='mmdet is not installed') def test_load_model_from_meta(self): from mmdet.utils import register_all_modules @@ -210,7 +218,7 @@ def test_preprocess(self): self.assertTrue(is_list_of(data, torch.Tensor)) @pytest.mark.skipif( - not is_installed('mmdet'), reason='mmdet is not installed') + not is_imported('mmdet'), reason='mmdet is not installed') def test_list_models(self): model_list = BaseInferencer.list_models('mmdet') self.assertTrue(len(model_list) > 0) diff --git a/tests/test_logging/test_logger.py b/tests/test_logging/test_logger.py index e0734702cc..8cd35ccc93 100644 --- a/tests/test_logging/test_logger.py +++ b/tests/test_logging/test_logger.py @@ -4,11 +4,13 @@ import re import sys from collections import OrderedDict +from contextlib import contextmanager from unittest.mock import patch import pytest from mmengine.logging import MMLogger, print_log +from mmengine.logging.logger import _get_device_id class TestLogger: @@ -49,10 +51,13 @@ def test_init_rank0(self, tmp_path): MMLogger._instance_dict.clear() @patch('mmengine.logging.logger._get_rank', lambda: 1) + @patch('mmengine.logging.logger._get_device_id', lambda: 1) + @patch('mmengine.logging.logger._get_world_size', lambda: 2) + @patch('mmengine.logging.logger._get_host_info', lambda: 'test') def test_init_rank1(self, tmp_path): # If `rank!=1`, the `loglevel` of file_handler is `logging.ERROR`. tmp_file = tmp_path / 'tmp_file.log' - log_path = tmp_path / 'tmp_file_rank1.log' + log_path = tmp_path / 'tmp_file_test_device1_rank1.log' logger = MMLogger.get_instance( 'rank1.pkg2', log_level='INFO', log_file=str(tmp_file)) assert len(logger.handlers) == 1 @@ -64,7 +69,7 @@ def test_init_rank1(self, tmp_path): assert logger.handlers[0].level == logging.ERROR assert logger.handlers[1].level == logging.INFO assert len(logger.handlers) == 2 - assert os.path.exists(log_path) + assert os.path.exists(str(log_path)) # `FileHandler` should be closed in Windows, otherwise we cannot # delete the temporary directory logging.shutdown() @@ -91,7 +96,8 @@ def test_handler(self, capsys, tmp_path, log_level): logger = MMLogger.get_instance( instance_name, log_level=log_level, log_file=tmp_file) logger.log(level=log_level, msg='welcome') - with open(tmp_path / 'tmp_file.log') as f: + + with open(tmp_file) as f: log_text = f.read() match = re.fullmatch( self.file_handler_regex_time + @@ -184,3 +190,70 @@ def test_set_level(self, capsys): logger.warning('hello') out, _ = capsys.readouterr() assert 'WARNING' in out + + def test_filter(self, capsys): + logger = MMLogger.get_instance('test_filter') + logger.warning('hello') + out, _ = capsys.readouterr() + assert 'WARNING' in out + # Filter repeated warning. 
+ logger.warning('hello') + out, _ = capsys.readouterr() + assert not out + # Pass new warning + logger.warning('hello1') + out, _ = capsys.readouterr() + assert 'WARNING' in out + + +@patch('torch.cuda.device_count', lambda: 4) +def test_get_device_id(): + + @contextmanager + def patch_env(local_rank, cuda_visible_devices): + ori_local_rank = os.getenv('LOCAL_RANK', None) + ori_cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + + if local_rank is not None: + os.environ['LOCAL_RANK'] = local_rank + if cuda_visible_devices is not None: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices + yield + if ori_local_rank is not None: + os.environ['LOCAL_RANK'] = ori_local_rank + elif 'LOCAL_RANK' in os.environ: + os.environ.pop('LOCAL_RANK') + if ori_cuda_visible_devices is not None: + os.environ['CUDA_VISIBLE_DEVICES'] = ori_cuda_visible_devices + elif 'CUDA_VISIBLE_DEVICES' in os.environ: + os.environ.pop('CUDA_VISIBLE_DEVICES') + + # cuda is not available and local_rank is not set + with patch('torch.cuda.is_available', lambda: False), \ + patch_env(None, '0,1,2,3'): + assert _get_device_id() == 0 + + # cuda is not available and local_rank is set + with patch('torch.cuda.is_available', lambda: False), \ + patch_env('1', '0,1,2,3'): + assert _get_device_id() == 1 + + # CUDA_VISIBLE_DEVICES will not influence non-cuda device + with patch('torch.cuda.is_available', lambda: False), \ + patch_env('1', '0,100,2,3'): + assert _get_device_id() == 1 + + # cuda is available and local_rank is not set + with patch('torch.cuda.is_available', lambda: True), \ + patch_env(None, '0,1,2,3'): + assert _get_device_id() == 0 + + # cuda is available and local_rank is set + with patch('torch.cuda.is_available', lambda: True), \ + patch_env('2', '0,1,2,3'): + assert _get_device_id() == 2 + + # CUDA_VISIBLE_DEVICES worked + with patch('torch.cuda.is_available', lambda: True), \ + patch_env('2', '0,1,3,5'): + assert _get_device_id() == 3 diff --git a/tests/test_runner/test_log_processor.py b/tests/test_runner/test_log_processor.py index b152a9c6cd..2009db708b 100644 --- a/tests/test_runner/test_log_processor.py +++ b/tests/test_runner/test_log_processor.py @@ -47,35 +47,38 @@ def test_check_custom_cfg(self): def test_parse_windows_size(self): log_processor = LogProcessor() # Test parse 'epoch' window_size. - log_processor.custom_cfg = [ - dict(data_src='loss_cls', window_size='epoch') - ] - custom_cfg = log_processor._parse_windows_size(self.runner, 1) + custom_cfg = [dict(data_src='loss_cls', window_size='epoch')] + custom_cfg = log_processor._parse_windows_size(self.runner, 1, + custom_cfg) assert custom_cfg[0]['window_size'] == 2 # Test parse 'global' window_size. - log_processor.custom_cfg = [ - dict(data_src='loss_cls', window_size='global') - ] - custom_cfg = log_processor._parse_windows_size(self.runner, 1) + custom_cfg = [dict(data_src='loss_cls', window_size='global')] + custom_cfg = log_processor._parse_windows_size(self.runner, 1, + custom_cfg) assert custom_cfg[0]['window_size'] == 11 # Test parse int window_size - log_processor.custom_cfg = [dict(data_src='loss_cls', window_size=100)] - custom_cfg = log_processor._parse_windows_size(self.runner, 1) + custom_cfg = [dict(data_src='loss_cls', window_size=100)] + custom_cfg = log_processor._parse_windows_size(self.runner, 1, + custom_cfg) assert custom_cfg[0]['window_size'] == 100 # Invalid type window_size will raise TypeError. 
- log_processor.custom_cfg = [dict(data_src='loss_cls', window_size=[])] + custom_cfg = [dict(data_src='loss_cls', window_size=[])] with pytest.raises(TypeError): - log_processor._parse_windows_size(custom_cfg, self.runner) + log_processor._parse_windows_size(self.runner, 1, custom_cfg) - @pytest.mark.parametrize('by_epoch,mode', - ([True, 'train'], [False, 'train'], [True, 'val'], - [False, 'val'], [True, 'test'], [False, 'test'])) - def test_get_log_after_iter(self, by_epoch, mode): + @pytest.mark.parametrize( + 'by_epoch,mode,log_with_hierarchy', + ([True, 'train', True], [True, 'train', False], [False, 'train', True], + [False, 'train', False], [True, 'val', True], [True, 'val', False], + [False, 'val', True], [False, 'val', False], [True, 'test', True], + [True, 'test', False], [False, 'test', True], [False, 'test', False])) + def test_get_log_after_iter(self, by_epoch, mode, log_with_hierarchy): # Prepare LoggerHook - log_processor = LogProcessor(by_epoch=by_epoch) + log_processor = LogProcessor( + by_epoch=by_epoch, log_with_hierarchy=log_with_hierarchy) log_processor._get_max_memory = MagicMock(return_value='100') eta = 40 self.runner.message_hub.update_info('eta', eta) @@ -84,8 +87,9 @@ def test_get_log_after_iter(self, by_epoch, mode): train_logs = dict(lr=0.1, time=1.0, data_time=1.0, loss_cls=1.0) else: train_logs = dict(time=1.0, data_time=1.0, loss_cls=1.0) - log_processor._collect_scalars = MagicMock(return_value=train_logs) - tag, out = log_processor.get_log_after_iter(self.runner, 1, mode) + log_processor._collect_scalars = \ + lambda *args, **kwargs: copy.deepcopy(train_logs) + _, out = log_processor.get_log_after_iter(self.runner, 1, mode) # Verify that the correct context have been logged. cur_loop = log_processor._get_cur_loop(self.runner, mode) if by_epoch: @@ -138,11 +142,13 @@ def test_get_log_after_iter(self, by_epoch, mode): assert out == log_str @pytest.mark.parametrize( - 'by_epoch,mode', - ([True, 'val'], [False, 'val'], [True, 'test'], [False, 'test'])) - def test_log_val(self, by_epoch, mode): + 'by_epoch,mode,log_with_hierarchy', + ([True, 'val', True], [True, 'val', False], [False, 'val', True], + [False, 'val', False], [True, 'test', True], [False, 'test', False])) + def test_log_val(self, by_epoch, mode, log_with_hierarchy): # Prepare LoggerHook - log_processor = LogProcessor(by_epoch=by_epoch) + log_processor = LogProcessor( + by_epoch=by_epoch, log_with_hierarchy=log_with_hierarchy) # Prepare validation information. 
scalar_logs = dict(accuracy=0.9, data_time=1.0) non_scalar_logs = dict( @@ -155,7 +161,7 @@ def test_log_val(self, by_epoch, mode): return_value=non_scalar_logs) _, out = log_processor.get_log_after_epoch(self.runner, 2, mode) expect_metric_str = ("accuracy: 0.9000 recall: {'cat': 1, 'dog': 0} " - 'cm: \ntensor([1, 2, 3])\n') + 'cm: \ntensor([1, 2, 3])\ndata_time: 1.0000 ') if by_epoch: if mode == 'test': assert out == 'Epoch(test) [5/5] ' + expect_metric_str diff --git a/tests/test_runner/test_runner.py b/tests/test_runner/test_runner.py index 1ef0f0c214..bd421ee3af 100644 --- a/tests/test_runner/test_runner.py +++ b/tests/test_runner/test_runner.py @@ -5,7 +5,7 @@ import os.path as osp import shutil import tempfile -from unittest import TestCase +from unittest import TestCase, skipIf import numpy as np import torch @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader, Dataset from mmengine.config import Config -from mmengine.dataset import COLLATE_FUNCTIONS, DefaultSampler, pseudo_collate +from mmengine.dataset import DefaultSampler, pseudo_collate from mmengine.evaluator import BaseMetric, Evaluator from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, Hook, IterTimerHook, LoggerHook, ParamSchedulerHook, @@ -24,10 +24,11 @@ from mmengine.model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor from mmengine.optim import (DefaultOptimWrapperConstructor, MultiStepLR, OptimWrapper, OptimWrapperDict, StepLR) -from mmengine.registry import (DATASETS, EVALUATOR, HOOKS, LOG_PROCESSORS, - LOOPS, METRICS, MODEL_WRAPPERS, MODELS, - OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, - PARAM_SCHEDULERS, RUNNERS, Registry) +from mmengine.registry import (DATASETS, EVALUATOR, FUNCTIONS, HOOKS, + LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS, + MODELS, OPTIM_WRAPPER_CONSTRUCTORS, + OPTIM_WRAPPERS, PARAM_SCHEDULERS, RUNNERS, + Registry) from mmengine.runner import (BaseLoop, EpochBasedTrainLoop, IterBasedTrainLoop, LogProcessor, Runner, TestLoop, ValLoop) from mmengine.runner.loops import _InfiniteDataloaderIterator @@ -352,7 +353,7 @@ def setUp(self): LOG_PROCESSORS.register_module(module=CustomLogProcessor, force=True) RUNNERS.register_module(module=CustomRunner, force=True) EVALUATOR.register_module(module=ToyEvaluator, force=True) - COLLATE_FUNCTIONS.register_module(module=custom_collate, force=True) + FUNCTIONS.register_module(module=custom_collate, force=True) self.temp_dir = tempfile.mkdtemp() epoch_based_cfg = dict( @@ -435,7 +436,7 @@ def tearDown(self): LOG_PROCESSORS.module_dict.pop('CustomLogProcessor') RUNNERS.module_dict.pop('CustomRunner') EVALUATOR.module_dict.pop('ToyEvaluator') - COLLATE_FUNCTIONS.module_dict.pop('custom_collate') + FUNCTIONS.module_dict.pop('custom_collate') logging.shutdown() MMLogger._instance_dict.clear() @@ -1704,6 +1705,24 @@ def train_step(self, *args, **kwargs): with self.assertRaisesRegex(AssertionError, 'If you want to validate'): runner.train() + @skipIf( + not hasattr(torch, 'compile'), + reason='torch.compile is not valid, please install PyTorch>=2.0.0') + def test_train_with_compile(self): + # 1. test with simple configuration + cfg = copy.deepcopy(self.epoch_based_cfg) + cfg.experiment_name = 'test_train_compile_simple' + cfg.compile = True + runner = Runner.from_cfg(cfg) + runner.train() + + # 2. 
test with advanced configuration
+        cfg = copy.deepcopy(self.epoch_based_cfg)
+        cfg.experiment_name = 'test_train_compile_advanced'
+        cfg.compile = dict(backend='inductor', mode='default')
+        runner = Runner.from_cfg(cfg)
+        runner.train()
+
     def test_val(self):
         cfg = copy.deepcopy(self.epoch_based_cfg)
         cfg.experiment_name = 'test_val1'
@@ -1756,6 +1775,24 @@ def get_outputs_callback(module, inputs, outputs):
         self.assertIn(predictions[0].dtype,
                       (torch.float16, torch.bfloat16))
 
+    @skipIf(
+        not hasattr(torch, 'compile'),
+        reason='torch.compile is not available, please install PyTorch>=2.0.0')
+    def test_val_with_compile(self):
+        # 1. test with simple configuration
+        cfg = copy.deepcopy(self.epoch_based_cfg)
+        cfg.experiment_name = 'test_val_compile_simple'
+        cfg.compile = True
+        runner = Runner.from_cfg(cfg)
+        runner.val()
+
+        # 2. test with advanced configuration
+        cfg = copy.deepcopy(self.epoch_based_cfg)
+        cfg.experiment_name = 'test_val_compile_advanced'
+        cfg.compile = dict(backend='inductor', mode='default')
+        runner = Runner.from_cfg(cfg)
+        runner.val()
+
     def test_test(self):
         cfg = copy.deepcopy(self.epoch_based_cfg)
         cfg.experiment_name = 'test_test1'
@@ -1810,6 +1847,24 @@ def get_outputs_callback(module, inputs, outputs):
         self.assertIn(predictions[0].dtype,
                       (torch.float16, torch.bfloat16))
 
+    @skipIf(
+        not hasattr(torch, 'compile'),
+        reason='torch.compile is not available, please install PyTorch>=2.0.0')
+    def test_test_with_compile(self):
+        # 1. test with simple configuration
+        cfg = copy.deepcopy(self.epoch_based_cfg)
+        cfg.experiment_name = 'test_test_compile_simple'
+        cfg.compile = True
+        runner = Runner.from_cfg(cfg)
+        runner.test()
+
+        # 2. test with advanced configuration
+        cfg = copy.deepcopy(self.epoch_based_cfg)
+        cfg.experiment_name = 'test_test_compile_advanced'
+        cfg.compile = dict(backend='inductor', mode='default')
+        runner = Runner.from_cfg(cfg)
+        runner.test()
+
     def test_register_hook(self):
         cfg = copy.deepcopy(self.epoch_based_cfg)
         cfg.experiment_name = 'test_register_hook'
diff --git a/tests/test_utils/test_manager.py b/tests/test_utils/test_manager.py
index be9348e2d5..913affb649 100644
--- a/tests/test_utils/test_manager.py
+++ b/tests/test_utils/test_manager.py
@@ -72,5 +72,5 @@ def test_get_instance(self):
         SubClassA.get_instance(name=1)
         # `get_instance` should not accept other arguments if corresponding
         # instance has been created.
-        with pytest.raises(AssertionError):
+        with pytest.warns(UserWarning):
             SubClassA.get_instance('name2', a=1, b=2)
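For completeness, a self-contained sketch of the contract the final `test_manager.py` change asserts; `get_instance_stub` is a hypothetical stand-in, not the real `ManagerMixin` API:

```python
import warnings

import pytest


def get_instance_stub(name, **kwargs):
    """Mimic the relaxed behaviour: warn instead of asserting."""
    if kwargs:
        # warnings.warn defaults to UserWarning, which pytest.warns catches.
        warnings.warn(f'{name} has been created, extra arguments are ignored')


def test_extra_kwargs_only_warn():
    with pytest.warns(UserWarning):
        get_instance_stub('name2', a=1, b=2)
```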