diff --git a/.circleci/config.yml b/.circleci/config.yml index 161b46a6c..a4ffba3b5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,152 +1,34 @@ version: 2.1 -jobs: - lint: - docker: - - image: cimg/python:3.7.4 - steps: - - checkout - - run: - name: Install pre-commit hook - command: | - pip install pre-commit - pre-commit install - - run: - name: Linting - command: pre-commit run --all-files - - run: - name: Check docstring coverage - command: | - pip install interrogate - interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 50 mmtrack - build_cpu: - parameters: - # The python version must match available image tags in - # https://circleci.com/developer/images/image/cimg/python - python: - type: string - default: "3.7.4" - torch: - type: string - torchvision: - type: string - docker: - - image: cimg/python:<< parameters.python >> - resource_class: large - steps: - - checkout - - run: - name: Install Libraries - command: | - sudo apt-get update - sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 - - run: - name: Configure Python & pip - command: | - python -m pip install --upgrade pip - python -m pip install wheel - - run: - name: Install PyTorch - command: | - python -V - python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html - - run: - name: Install mmtrack dependencies - command: | - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch<< parameters.torch >>/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - python -m pip install git+https://github.com/votchallenge/toolkit.git - - run: - name: Build and install - command: | - python -m pip install -e . - - run: - name: Run unittests - command: | - python -m coverage run --branch --source mmtrack -m pytest tests/ - python -m coverage xml - python -m coverage report -m - build_cu101: - machine: - image: ubuntu-1604-cuda-10.1:201909-23 - resource_class: gpu.nvidia.small - steps: - - checkout - - run: - name: Install Libraries - command: | - sudo apt-get update - sudo apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx - - run: - name: Configure Python & pip - command: | - pyenv global 3.7.0 - python -m pip install --upgrade pip - python -m pip install wheel - - run: - name: Install PyTorch - command: | - python -V - python -m pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html - - run: - name: Install mmtrack dependencies - # python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch${{matrix.torch_version}}/index.html - command: | - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - python -m pip install git+https://github.com/votchallenge/toolkit.git - - run: - name: Build and install - command: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 python -m pip install -e . 
-      - run:
-          name: Run unittests
-          command: |
-            python -m pytest tests/
+# this allows you to use CircleCI's dynamic configuration feature
+setup: true
+
+# the path-filtering orb is required to continue a pipeline based on
+# the path of an updated fileset
+orbs:
+  path-filtering: circleci/path-filtering@0.1.2
+
 workflows:
-  unit_tests:
+  # the always-run workflow is always triggered, regardless of the pipeline parameters.
+  always-run:
     jobs:
-      - lint
-      - build_cpu:
-          name: build_cpu_th1.6
-          torch: 1.6.0
-          torchvision: 0.7.0
-          requires:
-            - lint
-      - build_cpu:
-          name: build_cpu_th1.7
-          torch: 1.7.0
-          torchvision: 0.8.1
-          requires:
-            - lint
-      - build_cpu:
-          name: build_cpu_th1.8_py3.9
-          torch: 1.8.0
-          torchvision: 0.9.0
-          python: "3.9.0"
-          requires:
-            - lint
-      - build_cpu:
-          name: build_cpu_th1.9_py3.8
-          torch: 1.9.0
-          torchvision: 0.10.0
-          python: "3.8.0"
-          requires:
-            - lint
-      - build_cpu:
-          name: build_cpu_th1.9_py3.9
-          torch: 1.9.0
-          torchvision: 0.10.0
-          python: "3.9.0"
-          requires:
-            - lint
-      - build_cu101:
-          requires:
-            - build_cpu_th1.6
-            - build_cpu_th1.7
-            - build_cpu_th1.8_py3.9
-            - build_cpu_th1.9_py3.8
-            - build_cpu_th1.9_py3.9
+      # the path-filtering/filter job determines which pipeline
+      # parameters to update.
+      - path-filtering/filter:
+          name: check-updated-files
+          # 3-column, whitespace-delimited mapping. One mapping per
+          # line:
+          # <regex path-to-test> <parameter-to-set> <value-of-parameter>
+          mapping: |
+            mmtrack/.* lint_only false
+            requirements/.* lint_only false
+            tests/.* lint_only false
+            tools/.* lint_only false
+            configs/.* lint_only false
+            .circleci/.* lint_only false
+          base-revision: dev-1.x
+          # this is the path of the configuration we should trigger once
+          # path filtering and pipeline parameter value updates are
+          # complete. In this case, we are using the parent dynamic
+          # configuration itself.
+ config-path: .circleci/test.yml diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile new file mode 100644 index 000000000..d9cf8cc77 --- /dev/null +++ b/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/.circleci/test.yml b/.circleci/test.yml new file mode 100644 index 000000000..4b60ac947 --- /dev/null +++ b/.circleci/test.yml @@ -0,0 +1,198 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 50 mmtrack + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + default: "3.7.4" + torch: + type: string + torchvision: + type: string + mmcv: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 + - run: + name: Configure Python & pip + command: | + python -m pip install --upgrade pip + python -m pip install wheel + - run: + name: Install PyTorch + command: | + python -V + python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install mmtrack dependencies + command: | + python -m pip install git+https://github.com/votchallenge/toolkit.git + python -m pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main + python -m pip install << parameters.mmcv >> + python -m pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x + python -m pip install git+ssh://git@github.com/open-mmlab/mmclassification.git@dev-1.x + python -m pip install -r requirements.txt + python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git + python -m pip install git+https://github.com/TAO-Dataset/tao.git + - run: + name: Build and install + command: | + python -m pip install -e . 
+      - run:
+          name: Run unittests
+          command: |
+            python -m coverage run --branch --source mmtrack -m pytest tests/
+            python -m coverage xml
+            python -m coverage report -m
+  build_cuda:
+    parameters:
+      torch:
+        type: string
+      cuda:
+        type: enum
+        enum: ["10.1", "10.2", "11.1"]
+      cudnn:
+        type: integer
+        default: 7
+      mmcv:
+        type: string
+    machine:
+      image: ubuntu-2004-cuda-11.4:202110-01
+      # docker_layer_caching: true
+    resource_class: gpu.nvidia.small
+    steps:
+      - checkout
+      - run:
+          # Cloning repos in VM since Docker doesn't have access to the private key
+          name: Clone Repos
+          command: |
+            git clone -b main --depth 1 ssh://git@github.com/open-mmlab/mmengine.git /home/circleci/mmengine
+            git clone -b dev-3.x --depth 1 ssh://git@github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
+            git clone -b dev-1.x --depth 1 ssh://git@github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification
+      - run:
+          name: Build Docker image
+          command: |
+            docker build .circleci/docker -t mmtrack:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
+            docker run --gpus all -t -d -v /home/circleci/project:/mmtrack -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmdetection:/mmdetection -v /home/circleci/mmclassification:/mmclassification -w /mmtrack --name mmtrack mmtrack:gpu
+      - run:
+          name: Install mmtrack dependencies
+          command: |
+            docker exec mmtrack apt-get -y install git
+            docker exec mmtrack pip install -e /mmengine
+            docker exec mmtrack pip install << parameters.mmcv >>
+            docker exec mmtrack pip install -e /mmdetection
+            docker exec mmtrack pip install -e /mmclassification
+            docker exec mmtrack pip install -r requirements.txt
+            docker exec mmtrack python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git
+            docker exec mmtrack python -m pip install git+https://github.com/votchallenge/toolkit.git
+            docker exec mmtrack python -m pip install git+https://github.com/TAO-Dataset/tao.git
+      - run:
+          name: Build and install
+          command: |
+            docker exec mmtrack pip install -e .
+ - run: + name: Run unittests + command: | + docker exec mmtrack python -m pytest tests/ +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + pr_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - build_cpu: + name: minimum_version_cpu + torch: 1.7.0 + torchvision: 0.8.1 + python: 3.6.9 # The lowest python 3.6.x version available on CircleCI images + mmcv: https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/mmcv-2.0.0rc1-cp36-cp36m-manylinux1_x86_64.whl + requires: + - lint + - build_cpu: + name: maximum_version_cpu + torch: 1.9.0 + torchvision: 0.10.0 + python: 3.9.0 + mmcv: https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/mmcv-2.0.0rc1-cp39-cp39-manylinux1_x86_64.whl + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "10.2" + mmcv: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc1-cp37-cp37m-manylinux1_x86_64.whl + requires: + - hold + merge_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + mmcv: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0rc1-cp37-cp37m-manylinux1_x86_64.whl + cuda: "10.2" + filters: + branches: + only: + - dev-1.x diff --git a/.dev_scripts/benchmark/batch_train_list.txt b/.dev_scripts/benchmark/batch_train_list.txt index 51f28420c..d3f21bdca 100644 --- a/.dev_scripts/benchmark/batch_train_list.txt +++ b/.dev_scripts/benchmark/batch_train_list.txt @@ -1,13 +1,16 @@ # VID -configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py -configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py -configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py -configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py +configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py # MOT -configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py -configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py -configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py +configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py +configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py # SOT -configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py +configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py +configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py +configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py diff --git a/.dev_scripts/benchmark/gather_train_benchmark_metric.py 
b/.dev_scripts/benchmark/gather_train_benchmark_metric.py index 710dff1fb..df4cb6921 100644 --- a/.dev_scripts/benchmark/gather_train_benchmark_metric.py +++ b/.dev_scripts/benchmark/gather_train_benchmark_metric.py @@ -4,7 +4,7 @@ import json import os.path as osp -import mmcv +import mmengine try: import xlrd @@ -65,20 +65,26 @@ def parse_args(): config_name = osp.splitext(config_name)[0] result_path = osp.join(root_path, config_name) if osp.exists(result_path): + # 1 read config and excel - cfg = mmcv.Config.fromfile(config) - total_epochs = cfg.total_epochs + cfg = mmengine.Config.fromfile(config) + total_epochs = cfg.train_cfg.max_epochs if cfg.train_cfg else 0 # the first metric will be used to find the best ckpt has_final_ckpt = True if 'vid' in config: - eval_metrics = ['bbox_mAP_50'] + eval_metrics = ['coco/bbox_mAP_50'] elif 'mot' in config: - eval_metrics = ['MOTA', 'IDF1'] + eval_metrics = [ + 'motchallenge-metric/MOTA', 'motchallenge-metric/IDF1' + ] # tracktor and deepsort don't have ckpt. - has_final_ckpt = False + if 'deepsort' in result_path or 'tracktor' in result_path: + has_final_ckpt = False elif 'sot' in config: - eval_metrics = ['success', 'norm_precision', 'precision'] + eval_metrics = [ + 'sot/success', 'sot/norm_precision', 'sot/precision' + ] else: raise NotImplementedError( f'Not supported config: {config}') @@ -102,21 +108,32 @@ def parse_args(): ckpt_path = f'epoch_{total_epochs}.pth' if osp.exists(osp.join(result_path, ckpt_path)) or \ not has_final_ckpt: - log_json_path = list( - sorted(glob.glob(osp.join(result_path, - '*.log.json'))))[-1] + if has_final_ckpt: + log_json_path = list( + sorted( + glob.glob( + osp.join(result_path, '*', 'vis_data', + 'scalars.json'))))[-1] + else: + log_json_path = list( + sorted( + glob.glob( + osp.join(result_path, '*', '*.json'))))[-1] # 3 read metric result_dict = dict() with open(log_json_path, 'r') as f: for line in f.readlines(): log_line = json.loads(line) - if 'mode' not in log_line.keys(): + if 'lr' in log_line.keys(): continue - - if log_line['mode'] == 'val' or \ - log_line['mode'] == 'test': - result_dict[f"epoch_{log_line['epoch']}"] = { + if has_final_ckpt: + result_dict[f"epoch_{log_line['step']}"] = { + key: log_line[key] + for key in eval_metrics if key in log_line + } + else: + result_dict['test'] = { key: log_line[key] for key in eval_metrics if key in log_line } @@ -131,7 +148,7 @@ def parse_args(): best_epoch_results = result_dict[epoch] for metric in best_epoch_results: - if 'success' in best_epoch_results: + if 'sot/success' in best_epoch_results: performance = round(best_epoch_results[metric], 1) else: performance = round( diff --git a/.dev_scripts/benchmark/train_benchmark.sh b/.dev_scripts/benchmark/train_benchmark.sh index 78e2dcd28..d2cffcebb 100755 --- a/.dev_scripts/benchmark/train_benchmark.sh +++ b/.dev_scripts/benchmark/train_benchmark.sh @@ -2,67 +2,82 @@ PARTITION=$1 ROOT_DIR=$2 # VID -CONFIG=configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py -WORK_DIR=dff_faster_rcnn_r50_dc5_1x_imagenetvid +CONFIG=configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +WORK_DIR=dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & 
-CONFIG=configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py -WORK_DIR=fgfa_faster_rcnn_r50_dc5_1x_imagenetvid +CONFIG=configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +WORK_DIR=fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & -CONFIG=configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py -WORK_DIR=selsa_faster_rcnn_r50_dc5_1x_imagenetvid +CONFIG=configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +WORK_DIR=selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & -CONFIG=configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py -WORK_DIR=selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid +CONFIG=configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +WORK_DIR=selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & # MOT -CONFIG=configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py -WORK_DIR=bytetrack_yolox_x_crowdhuman_mot17-private-half +CONFIG=configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py +WORK_DIR=bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & + +CONFIG=configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +WORK_DIR=qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval +echo ${CONFIG} & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & # VIS -CONFIG=configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py -WORK_DIR=masktrack_rcnn_r50_fpn_12e_youtubevis2019 +CONFIG=configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py +WORK_DIR=masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019 echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & # SOT -CONFIG=configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py -WORK_DIR=siamese_rpn_r50_20e_lasot 
+CONFIG=configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py +WORK_DIR=siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot +echo ${CONFIG} & +./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null & + +CONFIG=configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py +ST1_WORK_DIR=stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot +echo ${CONFIG} & +./tools/slurm_train.sh ${PARTITION} ${ST1_WORK_DIR} ${CONFIG} ${ROOT_DIR}/${ST1_WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 >/dev/null + +CONFIG=configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py +ST2_WORK_DIR=stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot echo ${CONFIG} & -./tools/slurm_train.sh ${PARTITION} ${WORK_DIR} ${CONFIG} ${ROOT_DIR}/${WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 >/dev/null & +./tools/slurm_train.sh ${PARTITION} ${ST2_WORK_DIR} ${CONFIG} ${ROOT_DIR}/${ST2_WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 load_from=${ROOT_DIR}/${ST1_WORK_DIR}/epoch_500.pth >/dev/null # MOT -REID_CONFIG=configs/reid/resnet50_b32x8_MOT17.py -REID_WORK_DIR=resnet50_b32x8_MOT17 +REID_CONFIG=configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py +REID_WORK_DIR=reid_r50_8xb32-6e_mot17train80_test-mot17val20 echo ${REID_CONFIG} -./tools/slurm_train.sh ${PARTITION} ${REID_WORK_DIR} ${REID_CONFIG} ${ROOT_DIR}/${REID_WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 +./tools/slurm_train.sh ${PARTITION} ${REID_WORK_DIR} ${REID_CONFIG} ${ROOT_DIR}/${REID_WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 -DET_CONFIG=configs/det/faster-rcnn_r50_fpn_4e_mot17-half.py -DET_WORK_DIR=faster-rcnn_r50_fpn_4e_mot17-half +DET_CONFIG=configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +DET_WORK_DIR=faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval echo ${DET_CONFIG} -./tools/slurm_train.sh ${PARTITION} ${DET_WORK_DIR} ${DET_CONFIG} ${ROOT_DIR}/${DET_WORK_DIR} 8 --cfg-options checkpoint_config.max_keep_ckpts=1 +./tools/slurm_train.sh ${PARTITION} ${DET_WORK_DIR} ${DET_CONFIG} ${ROOT_DIR}/${DET_WORK_DIR} 8 --cfg-options default_hooks.checkpoint.max_keep_ckpts=1 -CONFIG=configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py -WORK_DIR=deepsort_faster-rcnn_fpn_4e_mot17-private-half +CONFIG=configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +WORK_DIR=deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval echo ${CONFIG} -./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --work-dir ${ROOT_DIR}/${WORK_DIR} --eval track --cfg-options model.detector.init_cfg.checkpoint=${ROOT_DIR}/${DET_WORK_DIR}/epoch_4.pth +./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --work-dir ${ROOT_DIR}/${WORK_DIR} --cfg-options model.detector.init_cfg.checkpoint=${ROOT_DIR}/${DET_WORK_DIR}/epoch_4.pth -CONFIG=configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py -WORK_DIR=tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half +CONFIG=configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +WORK_DIR=tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval echo ${CONFIG} -./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --work-dir ${ROOT_DIR}/${WORK_DIR} 
--eval track --cfg-options model.detector.init_cfg.checkpoint=${ROOT_DIR}/${DET_WORK_DIR}/epoch_4.pth model.reid.init_cfg.checkpoint=${ROOT_DIR}/${REID_WORK_DIR}/epoch_6.pth +./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --work-dir ${ROOT_DIR}/${WORK_DIR} --cfg-options model.detector.init_cfg.checkpoint=${ROOT_DIR}/${DET_WORK_DIR}/epoch_4.pth model.reid.init_cfg.checkpoint=${ROOT_DIR}/${REID_WORK_DIR}/epoch_6.pth # VIS -CONFIG=configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py -WORK_DIR=masktrack_rcnn_r50_fpn_12e_youtubevis2019 +CONFIG=configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py +WORK_DIR=masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019 echo ${CONFIG} -./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --eval-options resfile_path=${ROOT_DIR}/${WORK_DIR} --format-only --checkpoint ${ROOT_DIR}/${WORK_DIR}/epoch_12.pth +./tools/slurm_test.sh ${PARTITION} ${WORK_DIR} ${CONFIG} 8 --cfg-options test_evaluator.outfile_prefix=${ROOT_DIR}/${WORK_DIR} --checkpoint ${ROOT_DIR}/${WORK_DIR}/epoch_12.pth diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f4f1867e3..8d57d4594 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,225 +23,225 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -jobs: - build_cpu: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.7] - torch: [1.5.1, 1.7.1] - include: - - torch: 1.5.1 - torch_version: torch1.5 - torchvision: 0.6.1 - - torch: 1.7.1 - torch_version: torch1.7 - torchvision: 0.8.2 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - - name: Install mmtrack dependencies - run: | - pip install git+https://github.com/votchallenge/toolkit.git - pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/${{matrix.torch_version}}/index.html - pip install mmdet - pip install -r requirements.txt - pip install git+https://github.com/JonathonLuiten/TrackEval.git - pip install git+https://github.com/lvis-dataset/lvis-api.git - pip install git+https://github.com/TAO-Dataset/tao.git - - name: Build and install - run: rm -rf .eggs && pip install -e . - - name: Run unittests and generate coverage report - run: | - coverage run --branch --source mmtrack -m pytest tests/ - coverage xml - coverage report -m - - build_cuda101: - runs-on: ubuntu-18.04 - container: - image: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel - - strategy: - matrix: - python-version: [3.7] - torch: [1.5.1+cu101, 1.7.1+cu101] - include: - - torch: 1.5.1+cu101 - torch_version: torch1.5 - torchvision: 0.6.1+cu101 - - torch: 1.7.1+cu101 - torch_version: torch1.7 - torchvision: 0.8.2+cu101 - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Fetch GPG keys - run: | - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - # Add ppa source repo for python3.9. 
- - name: Add python3.9 source - run: | - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - if: ${{matrix.python-version == '3.9'}} - # Install python-dev for some packages which require libpython3.Xm. - # Github's setup-python cannot install python3.9-dev, so we have to use apt install. - # Set DEBIAN_FRONTEND=noninteractive to avoid some interactions. - - name: Install python-dev - run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python${{matrix.python-version}}-dev - - name: Install system dependencies - run: | - apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 - apt-get clean - rm -rf /var/lib/apt/lists/* - - name: Install PyTorch - run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html - - name: Install mmtrack dependencies - run: | - python -m pip install git+https://github.com/votchallenge/toolkit.git - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/${{matrix.torch_version}}/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git - python -m pip install git+https://github.com/lvis-dataset/lvis-api.git - python -m pip install git+https://github.com/TAO-Dataset/tao.git - - name: Build and install - run: | - rm -rf .eggs - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install . - - name: Run unittests and generate coverage report - run: | - coverage run --branch --source mmtrack -m pytest tests/ - coverage xml - coverage report -m - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - file: ./coverage.xml - flags: unittests - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - - build_cuda102: - runs-on: ubuntu-18.04 - container: - image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel - - strategy: - matrix: - python-version: [3.6, 3.7, 3.8, 3.9] - torch: [1.9.0+cu102] - include: - - torch: 1.9.0+cu102 - torch_version: torch1.9 - torchvision: 0.10.0+cu102 - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Fetch GPG keys - run: | - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - # Add ppa source repo for python3.9. - - name: Add python3.9 source - run: | - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - if: ${{matrix.python-version == '3.9'}} - # Install python-dev for some packages which require libpython3.Xm. - # Github's setup-python cannot install python3.9-dev, so we have to use apt install. - # Set DEBIAN_FRONTEND=noninteractive to avoid some interactions. 
- - name: Install python-dev - run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python${{matrix.python-version}}-dev - - name: Install system dependencies - run: | - apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 - apt-get clean - rm -rf /var/lib/apt/lists/* - - name: Install PyTorch - run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html - - name: Install mmtrack dependencies - run: | - python -m pip install git+https://github.com/votchallenge/toolkit.git - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu102/${{matrix.torch_version}}/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git - python -m pip install git+https://github.com/lvis-dataset/lvis-api.git - python -m pip install git+https://github.com/TAO-Dataset/tao.git - - name: Build and install - run: | - rm -rf .eggs - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install . - - name: Run unittests and generate coverage report - run: | - coverage run --branch --source mmtrack -m pytest tests/ - coverage xml - coverage report -m - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 - with: - files: ./coverage.xml - flags: unittests - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - test_windows: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [windows-2022] - python: [3.8] - platform: [cpu, cu111] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python }} - - name: Upgrade pip - run: python -m pip install pip --upgrade --user - - name: Install PyTorch - # As a complement to Linux CI, we test on PyTorch LTS version - run: python -m pip install torch==1.8.2+${{ matrix.platform }} torchvision==0.9.2+${{ matrix.platform }} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html - - name: Install votchallenge toolkit - run: python -m pip install git+https://github.com/votchallenge/toolkit.git - - name: Install MMCV - run: | - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8/index.html --only-binary mmcv-full - - name: Install mmdet - run: | - python -m pip install mmdet - - name: Install unittest dependencies - run: | - python -m pip install -r requirements/tests.txt -r requirements/runtime.txt - python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git - python -m pip install git+https://github.com/lvis-dataset/lvis-api.git - python -m pip install git+https://github.com/TAO-Dataset/tao.git - - name: Build and install - run: python -m pip install -e . 
- - name: Run unittests - run: | - coverage run --branch --source mmtrack -m pytest tests/ - - name: Generate coverage report - run: | - coverage xml - coverage report -m +#jobs: +# build_cpu: +# runs-on: ubuntu-latest +# strategy: +# matrix: +# python-version: [3.7] +# torch: [1.5.1, 1.7.1] +# include: +# - torch: 1.5.1 +# torch_version: torch1.5 +# torchvision: 0.6.1 +# - torch: 1.7.1 +# torch_version: torch1.7 +# torchvision: 0.8.2 +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python ${{ matrix.python-version }} +# uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Install PyTorch +# run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html +# - name: Install mmtrack dependencies +# run: | +# pip install git+https://github.com/votchallenge/toolkit.git +# pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/${{matrix.torch_version}}/index.html +# pip install mmdet +# pip install -r requirements.txt +# pip install git+https://github.com/JonathonLuiten/TrackEval.git +# pip install git+https://github.com/lvis-dataset/lvis-api.git +# pip install git+https://github.com/TAO-Dataset/tao.git +# - name: Build and install +# run: rm -rf .eggs && pip install -e . +# - name: Run unittests and generate coverage report +# run: | +# coverage run --branch --source mmtrack -m pytest tests/ +# coverage xml +# coverage report -m +# +# build_cuda101: +# runs-on: ubuntu-18.04 +# container: +# image: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel +# +# strategy: +# matrix: +# python-version: [3.7] +# torch: [1.5.1+cu101, 1.7.1+cu101] +# include: +# - torch: 1.5.1+cu101 +# torch_version: torch1.5 +# torchvision: 0.6.1+cu101 +# - torch: 1.7.1+cu101 +# torch_version: torch1.7 +# torchvision: 0.8.2+cu101 +# +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python ${{ matrix.python-version }} +# uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Fetch GPG keys +# run: | +# apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +# apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub +# # Add ppa source repo for python3.9. +# - name: Add python3.9 source +# run: | +# apt-get update && apt-get install -y software-properties-common +# add-apt-repository -y ppa:deadsnakes/ppa +# if: ${{matrix.python-version == '3.9'}} +# # Install python-dev for some packages which require libpython3.Xm. +# # Github's setup-python cannot install python3.9-dev, so we have to use apt install. +# # Set DEBIAN_FRONTEND=noninteractive to avoid some interactions. 
+# - name: Install python-dev +# run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python${{matrix.python-version}}-dev +# - name: Install system dependencies +# run: | +# apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 +# apt-get clean +# rm -rf /var/lib/apt/lists/* +# - name: Install PyTorch +# run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html +# - name: Install mmtrack dependencies +# run: | +# python -m pip install git+https://github.com/votchallenge/toolkit.git +# python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/${{matrix.torch_version}}/index.html +# python -m pip install mmdet +# python -m pip install -r requirements.txt +# python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git +# python -m pip install git+https://github.com/lvis-dataset/lvis-api.git +# python -m pip install git+https://github.com/TAO-Dataset/tao.git +# - name: Build and install +# run: | +# rm -rf .eggs +# python setup.py check -m -s +# TORCH_CUDA_ARCH_LIST=7.0 pip install . +# - name: Run unittests and generate coverage report +# run: | +# coverage run --branch --source mmtrack -m pytest tests/ +# coverage xml +# coverage report -m +# - name: Upload coverage to Codecov +# uses: codecov/codecov-action@v1.0.10 +# with: +# file: ./coverage.xml +# flags: unittests +# env_vars: OS,PYTHON +# name: codecov-umbrella +# fail_ci_if_error: false +# +# build_cuda102: +# runs-on: ubuntu-18.04 +# container: +# image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel +# +# strategy: +# matrix: +# python-version: [3.6, 3.7, 3.8, 3.9] +# torch: [1.9.0+cu102] +# include: +# - torch: 1.9.0+cu102 +# torch_version: torch1.9 +# torchvision: 0.10.0+cu102 +# +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python ${{ matrix.python-version }} +# uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Fetch GPG keys +# run: | +# apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +# apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub +# # Add ppa source repo for python3.9. +# - name: Add python3.9 source +# run: | +# apt-get update && apt-get install -y software-properties-common +# add-apt-repository -y ppa:deadsnakes/ppa +# if: ${{matrix.python-version == '3.9'}} +# # Install python-dev for some packages which require libpython3.Xm. +# # Github's setup-python cannot install python3.9-dev, so we have to use apt install. +# # Set DEBIAN_FRONTEND=noninteractive to avoid some interactions. 
+# - name: Install python-dev +# run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python${{matrix.python-version}}-dev +# - name: Install system dependencies +# run: | +# apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 +# apt-get clean +# rm -rf /var/lib/apt/lists/* +# - name: Install PyTorch +# run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html +# - name: Install mmtrack dependencies +# run: | +# python -m pip install git+https://github.com/votchallenge/toolkit.git +# python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu102/${{matrix.torch_version}}/index.html +# python -m pip install mmdet +# python -m pip install -r requirements.txt +# python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git +# python -m pip install git+https://github.com/lvis-dataset/lvis-api.git +# python -m pip install git+https://github.com/TAO-Dataset/tao.git +# - name: Build and install +# run: | +# rm -rf .eggs +# python setup.py check -m -s +# TORCH_CUDA_ARCH_LIST=7.0 pip install . +# - name: Run unittests and generate coverage report +# run: | +# coverage run --branch --source mmtrack -m pytest tests/ +# coverage xml +# coverage report -m +# - name: Upload coverage to Codecov +# uses: codecov/codecov-action@v2 +# with: +# files: ./coverage.xml +# flags: unittests +# env_vars: OS,PYTHON +# name: codecov-umbrella +# fail_ci_if_error: false +# test_windows: +# runs-on: ${{ matrix.os }} +# strategy: +# matrix: +# os: [windows-2022] +# python: [3.8] +# platform: [cpu, cu111] +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python ${{ matrix.python }} +# uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python }} +# - name: Upgrade pip +# run: python -m pip install pip --upgrade --user +# - name: Install PyTorch +# # As a complement to Linux CI, we test on PyTorch LTS version +# run: python -m pip install torch==1.8.2+${{ matrix.platform }} torchvision==0.9.2+${{ matrix.platform }} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html +# - name: Install votchallenge toolkit +# run: python -m pip install git+https://github.com/votchallenge/toolkit.git +# - name: Install MMCV +# run: | +# python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8/index.html --only-binary mmcv-full +# - name: Install mmdet +# run: | +# python -m pip install mmdet +# - name: Install unittest dependencies +# run: | +# python -m pip install -r requirements/tests.txt -r requirements/runtime.txt +# python -m pip install git+https://github.com/JonathonLuiten/TrackEval.git +# python -m pip install git+https://github.com/lvis-dataset/lvis-api.git +# python -m pip install git+https://github.com/TAO-Dataset/tao.git +# - name: Build and install +# run: python -m pip install -e . 
+# - name: Run unittests +# run: | +# coverage run --branch --source mmtrack -m pytest tests/ +# - name: Generate coverage report +# run: | +# coverage xml +# coverage report -m diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 4f27b9116..1d0272682 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,22 +3,22 @@ name: deploy on: push jobs: - build-n-publish: - runs-on: ubuntu-latest - if: startsWith(github.event.ref, 'refs/tags') - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - name: Install torch - run: pip install torch - - name: Install wheel - run: pip install wheel - - name: Build MMTracking - run: python setup.py sdist bdist_wheel - - name: Publish distribution to PyPI - run: | - pip install twine - twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} + build-n-publish: + runs-on: ubuntu-latest + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install torch + run: pip install torch + - name: Install wheel + run: pip install wheel + - name: Build MMTracking + run: python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 580509624..4df95cdc6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -6,22 +6,22 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - name: Install pre-commit hook - run: | - pip install pre-commit - pre-commit install - - name: Linting - run: pre-commit run --all-files - - name: Check docstring coverage - run: | - pip install interrogate - interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 80 mmtrack +#jobs: +# lint: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python 3.7 +# uses: actions/setup-python@v2 +# with: +# python-version: 3.7 +# - name: Install pre-commit hook +# run: | +# pip install pre-commit +# pre-commit install +# - name: Linting +# run: pre-commit run --all-files +# - name: Check docstring coverage +# run: | +# pip install interrogate +# interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 80 mmtrack diff --git a/.github/workflows/test_mim.yml b/.github/workflows/test_mim.yml index b0229f9fd..ea5270ec5 100644 --- a/.github/workflows/test_mim.yml +++ b/.github/workflows/test_mim.yml @@ -15,30 +15,30 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -jobs: - build_cpu: - runs-on: ubuntu-18.04 - strategy: - matrix: - python-version: [3.7] - torch: [1.8.0] - include: - - torch: 1.8.0 - torch_version: torch1.8 - torchvision: 0.9.0 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Upgrade pip - run: pip install pip --upgrade - - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu 
torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - - name: Install openmim - run: pip install openmim - - name: Build and install - run: rm -rf .eggs && mim install -e . - - name: test commands of mim - run: mim search mmtracking +#jobs: +# build_cpu: +# runs-on: ubuntu-18.04 +# strategy: +# matrix: +# python-version: [3.7] +# torch: [1.8.0] +# include: +# - torch: 1.8.0 +# torch_version: torch1.8 +# torchvision: 0.9.0 +# steps: +# - uses: actions/checkout@v2 +# - name: Set up Python ${{ matrix.python-version }} +# uses: actions/setup-python@v2 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Upgrade pip +# run: pip install pip --upgrade +# - name: Install PyTorch +# run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html +# - name: Install openmim +# run: pip install openmim +# - name: Build and install +# run: rm -rf .eggs && mim install -e . +# - name: test commands of mim +# run: mim search mmtracking diff --git a/.gitignore b/.gitignore index 2b290e0fe..95270b1cc 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,10 @@ mmtrack/.mim *.pth *.py~ *.sh~ + +# Debug +demo/demo.sh +demo/debug_mot.py + +# Demo Outputs +outputs/ diff --git a/.readthedocs.yml b/.readthedocs.yml index 7e66e93fb..f4d6c5a59 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -9,4 +9,3 @@ python: - requirements: requirements/docs.txt - requirements: requirements/runtime.txt - requirements: requirements/readthedocs.txt - - requirements: requirements/mminstall.txt diff --git a/README.md b/README.md index a01d165db..840572cb7 100644 --- a/README.md +++ b/README.md @@ -1,199 +1,19 @@ -
- -
 
-
-    OpenMMLab website (HOT)
-    OpenMMLab platform (TRY IT OUT)
-
 
- -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmtrack)](https://pypi.org/project/mmtrack/) -[![PyPI](https://img.shields.io/pypi/v/mmtrack)](https://pypi.org/project/mmtrack) -[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmtracking.readthedocs.io/en/latest/) -[![badge](https://github.com/open-mmlab/mmtracking/workflows/build/badge.svg)](https://github.com/open-mmlab/mmtracking/actions) -[![codecov](https://codecov.io/gh/open-mmlab/mmtracking/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmtracking) -[![license](https://img.shields.io/github/license/open-mmlab/mmtracking.svg)](https://github.com/open-mmlab/mmtracking/blob/master/LICENSE) - -[📘Documentation](https://mmtracking.readthedocs.io/) | -[🛠️Installation](https://mmtracking.readthedocs.io/en/latest/install.html) | -[👀Model Zoo](https://mmtracking.readthedocs.io/en/latest/model_zoo.html) | -[🆕Update News](https://mmtracking.readthedocs.io/en/latest/changelog.html) | -[🤔Reporting Issues](https://github.com/open-mmlab/mmtracking/issues/new/choose) - -
- -
- -English | [简体中文](README_zh-CN.md) - -
- -## Introduction - -MMTracking is an open source video perception toolbox by PyTorch. It is a part of [OpenMMLab](https://openmmlab.com) project. - -The master branch works with **PyTorch1.5+**. - -
- -
- -### Major features - -- **The First Unified Video Perception Platform** - - We are the first open source toolbox that unifies versatile video perception tasks include video object detection, multiple object tracking, single object tracking and video instance segmentation. - -- **Modular Design** - - We decompose the video perception framework into different components and one can easily construct a customized method by combining different modules. - -- **Simple, Fast and Strong** - - **Simple**: MMTracking interacts with other OpenMMLab projects. It is built upon [MMDetection](https://github.com/open-mmlab/mmdetection) that we can capitalize any detector only through modifying the configs. - - **Fast**: All operations run on GPUs. The training and inference speeds are faster than or comparable to other implementations. - - **Strong**: We reproduce state-of-the-art models and some of them even outperform the official implementations. - -## What's New - -Release [QDTrack](configs/mot/qdtrack) pretrained models. - -v0.13.0 was released in 29/04/2022. -Please refer to [changelog.md](docs/en/changelog.md) for details and release history. - -## Installation - -Please refer to [install.md](docs/en/install.md) for install instructions. - -## Getting Started - -Please see [dataset.md](docs/en/dataset.md) and [quick_run.md](docs/en/quick_run.md) for the basic usage of MMTracking. - -A Colab tutorial is provided. You may preview the notebook [here](./demo/MMTracking_Tutorial.ipynb) or directly run it on [Colab](https://colab.research.google.com/github/open-mmlab/mmtracking/blob/master/demo/MMTracking_Tutorial.ipynb). - -There are also usage [tutorials](docs/en/tutorials/), such as [learning about configs](docs/en/tutorials/config.md), [an example about detailed description of vid config](docs/en/tutorials/config_vid.md), [an example about detailed description of mot config](docs/en/tutorials/config_mot.md), [an example about detailed description of sot config](docs/en/tutorials/config_sot.md), [customizing dataset](docs/en/tutorials/customize_dataset.md), [customizing data pipeline](docs/en/tutorials/customize_data_pipeline.md), [customizing vid model](docs/en/tutorials/customize_vid_model.md), [customizing mot model](docs/en/tutorials/customize_mot_model.md), [customizing sot model](docs/en/tutorials/customize_sot_model.md), [customizing runtime settings](docs/en/tutorials/customize_runtime.md) and [useful tools](docs/en/useful_tools_scripts.md). - -## Benchmark and model zoo - -Results and models are available in the [model zoo](docs/en/model_zoo.md). 
- -### Video Object Detection - -Supported Methods - -- [x] [DFF](configs/vid/dff) (CVPR 2017) -- [x] [FGFA](configs/vid/fgfa) (ICCV 2017) -- [x] [SELSA](configs/vid/selsa) (ICCV 2019) -- [x] [Temporal RoI Align](configs/vid/temporal_roi_align) (AAAI 2021) - -Supported Datasets - -- [x] [ILSVRC](http://image-net.org/challenges/LSVRC/2017/) - -### Single Object Tracking - -Supported Methods - -- [x] [SiameseRPN++](configs/sot/siamese_rpn) (CVPR 2019) -- [x] [STARK](configs/sot/stark) (ICCV 2021) -- [ ] [PrDiMP](https://arxiv.org/abs/2003.12565) (CVPR2020) (WIP) - -Supported Datasets - -- [x] [LaSOT](http://vision.cs.stonybrook.edu/~lasot/) -- [x] [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/) -- [x] [TrackingNet](https://tracking-net.org/) -- [x] [OTB100](http://www.visual-tracking.net/) -- [x] [GOT10k](http://got-10k.aitestunion.com/) -- [x] [VOT2018](https://www.votchallenge.net/vot2018/) - -### Multi-Object Tracking - -Supported Methods - -- [x] [SORT/DeepSORT](configs/mot/deepsort) (ICIP 2016/2017) -- [x] [Tracktor](configs/mot/tracktor) (ICCV 2019) -- [x] [QDTrack](configs/mot/qdtrack) (CVPR 2021) -- [x] [ByteTrack](configs/mot/bytetrack) (arXiv 2021) -- [ ] [OC-SORT](https://arxiv.org/abs/2203.14360) (arXiv 2022) (WIP) - -Supported Datasets - -- [x] [MOT Challenge](https://motchallenge.net/) -- [x] [CrowdHuman](https://www.crowdhuman.org/) -- [x] [LVIS](https://www.lvisdataset.org/) -- [x] [TAO](https://taodataset.org/) -- [x] [DanceTrack](https://arxiv.org/abs/2111.14690) - -### Video Instance Segmentation - -Supported Methods - -- [x] [MaskTrack R-CNN](configs/vis/masktrack_rcnn) (ICCV 2019) - -Supported Datasets - -- [x] [YouTube-VIS](https://youtube-vos.org/dataset/vis/) - -## Contributing - -We appreciate all contributions to improve MMTracking. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) for the contributing guideline and [this discussion](https://github.com/open-mmlab/mmtracking/issues/73) for development roadmap. - -## Acknowledgement - -MMTracking is an open source project that welcome any contribution and feedback. -We wish that the toolbox and benchmark could serve the growing research -community by providing a flexible as well as standardized toolkit to reimplement existing methods -and develop their own new video perception methods. - -## Citation - -If you find this project useful in your research, please consider cite: - -```latex -@misc{mmtrack2020, - title={{MMTracking: OpenMMLab} video perception toolbox and benchmark}, - author={MMTracking Contributors}, - howpublished = {\url{https://github.com/open-mmlab/mmtracking}}, - year={2020} -} +# Installation +To setup `mot-mmtrack`, run the following command: +```bash +conda create -n mot-mmtrack python=3.10 +pip install torch torchvision openmim +mim install -r requirements/mminstall.txt +pip install -v -e . +pip install mmyolo +pip install git+https://github.com/JonathonLuiten/TrackEval.git ``` -## License - -This project is released under the [Apache 2.0 license](LICENSE). - -## Projects in OpenMMLab - -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. 
-- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. -- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition and understanding toolbox. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. -- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. -- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning Toolbox and Benchmark. -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab Model Compression Toolbox and Benchmark. -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab FewShot Learning Toolbox and Benchmark. -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab Generative Model toolbox and benchmark. -- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMlab deep learning model deployment toolset. +# Usage +Run the demo: +```bash +python demo/demo_mot_vis.py \ + configs/mot/deepsort/my_config.py \ + --input demo/demo.mp4 \ + --output outputs +``` \ No newline at end of file diff --git a/README_zh-CN.md b/README_zh-CN.md deleted file mode 100644 index 6f6d59897..000000000 --- a/README_zh-CN.md +++ /dev/null @@ -1,212 +0,0 @@ -
-[banner: OpenMMLab 官网 (HOT) / OpenMMLab 开放平台 (TRY IT OUT)]
- -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmtrack)](https://pypi.org/project/mmtrack/) -[![PyPI](https://img.shields.io/pypi/v/mmtrack)](https://pypi.org/project/mmtrack) -[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmtracking.readthedocs.io/en/latest/) -[![badge](https://github.com/open-mmlab/mmtracking/workflows/build/badge.svg)](https://github.com/open-mmlab/mmtracking/actions) -[![codecov](https://codecov.io/gh/open-mmlab/mmtracking/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmtracking) -[![license](https://img.shields.io/github/license/open-mmlab/mmtracking.svg)](https://github.com/open-mmlab/mmtracking/blob/master/LICENSE) - -[📘Documentation](https://mmtracking.readthedocs.io/zh_CN/latest/) | -[🛠️Installation](https://mmtracking.readthedocs.io/zh_CN/latest/install.html) | -[👀Model Zoo](https://mmtracking.readthedocs.io/zh_CN/latest/model_zoo.html) | -[🆕Update News](https://mmtracking.readthedocs.io/en/latest/changelog.html) | -[🤔Reporting Issues](https://github.com/open-mmlab/mmtracking/issues/new/choose) - -
- -
- -[English](/README.md) | 简体中文 - -
- -## 简介 - -MMTracking是一款基于PyTorch的视频目标感知开源工具箱,是[OpenMMLab](http://openmmlab.org/)项目的一部分。 - -主分支代码目前支持**PyTorch 1.5以上**的版本。 - -
- -
- -### 主要特性 - -- **首个开源一体化视频目标感知平台** - - MMTracking 是首个开源一体化视频目标感知工具箱,同时支持视频目标检测,多目标跟踪,单目标跟踪和视频个例分割等多种任务和算法。 - -- **模块化设计** - - MMTracking将统一的视频目标感知框架解耦成不同的模块组件,通过组合不同模块组件,用户可以便捷地构建自定义视频目标感知模型。 - -- **简洁、高效、强大** - - **简洁**:MMTracking与其他OpenMMLab平台充分交互。MMTracking充分复用[MMDetection](https://github.com/open-mmlab/mmdetection)中的已有模块,我们只需要修改配置文件就可以使用任何检测器。 - - **高效**:MMTracking所有操作都在GPU上运行。相比其他开源库的实现,MMTracking的训练和推理更加高效。 - - **强大**:MMTracking复现了SOTA性能的模型。受益于[MMDetection](https://github.com/open-mmlab/mmdetection)的持续推进,部分实现精度超出官方版本。 - -## 更新 - -v0.13.0版本已于2022年04月29日发布,可通过查阅[更新日志](docs/en/changelog.md)了解更多细节以及发布历史。 - -## 安装 - -请参考[安装指南](docs/en/install.md)进行安装。 - -## 开始使用MMTracking - -请参考[数据集](docs/en/dataset.md)和[快速开始](docs/en/quick_run.md)了解MMTracking的基本使用。 - -我们提供了跟踪的Colab教程,您可以在[这里](<(./demo/MMTracking_Tutorial.ipynb)>)预览或者直接在[Colab](https://colab.research.google.com/github/open-mmlab/mmtracking/blob/master/demo/MMTracking_Tutorial.ipynb)上运行。 - -MMTracking也提供了更详细的[教程](docs/en/tutorials/),比如[配置文件简介](docs/en/tutorials/config.md), [视频目标检测器配置文件详解](docs/en/tutorials/config_vid.md), [多目标跟踪器配置文件详解](docs/en/tutorials/config_mot.md), [单目标跟踪器配置文件详解](docs/en/tutorials/config_sot.md), [自定义数据集](docs/en/tutorials/customize_dataset.md), [自定义数据预处理流程](docs/en/tutorials/customize_data_pipeline.md), [自定义视频目标检测器](docs/en/tutorials/customize_vid_model.md), [自定义多目标跟踪器](docs/en/tutorials/customize_mot_model.md), [自定义单目标跟踪器](docs/en/tutorials/customize_sot_model.md), [自定义训练配置](docs/en/tutorials/customize_runtime.md) 以及 [有用的工具和脚本](docs/en/useful_tools_scripts.md). - -## 基准测试与模型库 - -本工具箱支持的各个模型的结果和设置都可以在[模型库](docs/en/model_zoo.md)页面中查看。 - -### 视频目标检测 - -支持的算法: - -- [x] [DFF](configs/vid/dff) (CVPR 2017) -- [x] [FGFA](configs/vid/fgfa) (ICCV 2017) -- [x] [SELSA](configs/vid/selsa) (ICCV 2019) -- [x] [Temporal RoI Align](configs/vid/temporal_roi_align) (AAAI 2021) - -支持的数据集: - -- [x] [ILSVRC](http://image-net.org/challenges/LSVRC/2017/) - -### 单目标跟踪 - -支持的算法: - -- [x] [SiameseRPN++](configs/sot/siamese_rpn) (CVPR 2019) -- [x] [STARK](configs/sot/stark) (ICCV 2021) -- [ ] [PrDiMP](https://arxiv.org/abs/2003.12565) (CVPR2020) (WIP) - -支持的数据集: - -- [x] [LaSOT](http://vision.cs.stonybrook.edu/~lasot/) -- [x] [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/) -- [x] [TrackingNet](https://tracking-net.org/) -- [x] [OTB100](http://www.visual-tracking.net/) -- [x] [GOT10k](http://got-10k.aitestunion.com/) -- [x] [VOT2018](https://www.votchallenge.net/vot2018/) - -### 多目标跟踪 - -支持的算法: - -- [x] [SORT/DeepSORT](configs/mot/deepsort) (ICIP 2016/2017) -- [x] [Tracktor](configs/mot/tracktor) (ICCV 2019) -- [x] [QDTrack](configs/mot/qdtrack) (CVPR 2021) -- [x] [ByteTrack](configs/mot/bytetrack) (arXiv 2021) -- [ ] [OC-SORT](https://arxiv.org/abs/2203.14360) (arXiv 2022) (WIP) - -支持的数据集: - -- [x] [MOT Challenge](https://motchallenge.net/) -- [x] [CrowdHuman](https://www.crowdhuman.org/) -- [x] [LVIS](https://www.lvisdataset.org/) -- [x] [TAO](https://taodataset.org/) -- [x] [DanceTrack](https://arxiv.org/abs/2111.14690) - -### 视频实例分割 - -支持的算法: - -- [x] [MaskTrack R-CNN](configs/vis/masktrack_rcnn) (ICCV 2019) - -支持的数据集: - -- [x] [YouTube-VIS](https://youtube-vos.org/dataset/vis/) - -## 参与贡献 - -我们非常欢迎用户对于MMTracking做出的任何贡献,可以参考[贡献指南](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md)文件了解更多细节和在这个[讨论](https://github.com/open-mmlab/mmtracking/issues/73)中规划MMTracking的开发计划。 - -## 致谢 - -MMTracking是一款开源项目,我们欢迎任何贡献和反馈。我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己新的视频目标感知方法。 - -## 引用 - -如果你觉得MMTracking对你的研究有所帮助,可以考虑引用它: - -```latex 
-@misc{mmtrack2020, - title={{MMTracking: OpenMMLab} video perception toolbox and benchmark}, - author={MMTracking Contributors}, - howpublished = {\url{https://github.com/open-mmlab/mmtracking}}, - year={2020} -} -``` - -## 许可 - -该项目遵循[Apache 2.0 license](/LICENSE)开源协议。 - -## OpenMMLab 的其他项目 - -- [MMCV](https://github.com/open-mmlab/mmcv):OpenMMLab计算机视觉基础库 -- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 -- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 -- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 -- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 -- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 - -## 欢迎加入 OpenMMLab 社区 - -扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=aCvMxdr3) - -
- -
- -我们会在 OpenMMLab 社区为大家 - -- 📢 分享 AI 框架的前沿核心技术 -- 💻 解读 PyTorch 常用模块源码 -- 📰 发布 OpenMMLab 的相关新闻 -- 🚀 介绍 OpenMMLab 开发的前沿算法 -- 🏃 获取更高效的问题答疑和意见反馈 -- 🔥 提供与各行各业开发者充分交流的平台 - -干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/configs/README.md b/configs/README.md deleted file mode 100644 index 996ceb9fe..000000000 --- a/configs/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Usage of configs - -## Training with configs - -Please refer to [Training](../docs/en/quick_run.md#training) to see the tutorials of training models. - -## Testing with configs - -Please refer to [Testing](../docs/en/quick_run.md#testing) to see the tutorials of testing models. - -## Inference with configs - -Please refer to [Inference](../docs/en/quick_run.md#inference) to see the tutorials of inferencing models. diff --git a/configs/_base_/datasets/dancetrack.py b/configs/_base_/datasets/dancetrack.py deleted file mode 100644 index 4cc961ac4..000000000 --- a/configs/_base_/datasets/dancetrack.py +++ /dev/null @@ -1,74 +0,0 @@ -# dataset settings -dataset_type = 'DanceTrackDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict( - type='SeqResize', - img_scale=(1088, 1088), - share_params=True, - ratio_range=(0.8, 1.2), - keep_ratio=True, - bbox_clip_border=False), - dict(type='SeqPhotoMetricDistortion', share_params=True), - dict( - type='SeqRandomCrop', - share_params=False, - crop_size=(1088, 1088), - bbox_clip_border=False), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), - dict( - type='VideoCollect', - keys=[ - 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', - 'gt_instance_ids' - ]), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) -] -data_root = 'data/dancetrack/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( - type=dataset_type, - visibility_thr=-1, - ann_file=data_root + 'annotations/train_cocoformat.json', - img_prefix=data_root + 'train', - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=10, - filter_key_img=True, - method='uniform'), - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=data_root + 'annotations/val_cocoformat.json', - img_prefix=data_root + 'val', - ref_img_sampler=None, - pipeline=test_pipeline), - test=dict( - type=dataset_type, - ann_file=data_root + 'annotations/val_cocoformat.json', - img_prefix=data_root + 'val', - ref_img_sampler=None, - pipeline=test_pipeline)) diff --git a/configs/_base_/datasets/got10k.py b/configs/_base_/datasets/got10k.py new file mode 100644 index 000000000..e0e46afbe --- /dev/null +++ b/configs/_base_/datasets/got10k.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + 
num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='GOT10kDataset', + data_root='data/', + ann_file='GOT10k/annotations/got10k_test_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + format_only=True, + metric_options=dict(dataset_type='got10k')) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/_base_/datasets/imagenet_vid_dff_style.py b/configs/_base_/datasets/imagenet_vid_dff_style.py index 331be1423..94b8596d0 100644 --- a/configs/_base_/datasets/imagenet_vid_dff_style.py +++ b/configs/_base_/datasets/imagenet_vid_dff_style.py @@ -1,73 +1,83 @@ # dataset settings dataset_type = 'ImagenetVIDDataset' data_root = 'data/ILSVRC/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# data pipeline train_pipeline = [ - dict(type='LoadMultiImagesFromFile'), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=16), dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_instance_ids']), - dict(type='ConcatVideoReferences'), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs') ] test_pipeline = [ dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1000, 600), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=16), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) + dict(type='mmdet.Resize', scale=(1000, 600), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) ] -data = dict( - samples_per_gpu=1, - workers_per_gpu=2, - train=[ - dict( - type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_train.json', - img_prefix=data_root + 'Data/VID', - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=9, - filter_key_img=False, - method='uniform'), - pipeline=train_pipeline), - dict( - type=dataset_type, - load_as_video=False, - ann_file=data_root + 'annotations/imagenet_det_30plus1cls.json', - img_prefix=data_root + 'Data/DET', - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=0, - filter_key_img=False, - method='uniform'), - pipeline=train_pipeline) - ], - val=dict( + +# dataloader +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/imagenet_vid_train.json', + data_prefix=dict(img_path='Data/VID'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + load_as_video=True, + ref_img_sampler=dict( + 
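The dataset files under `configs/_base_/datasets/` are not meant to be run on their own; an experiment config pulls one in via `_base_` and restates only what differs. A minimal sketch for the GOT10k base added above, assuming standard MMEngine config inheritance (the file name and alternative data root are illustrative, not part of this diff):

```python
# hypothetical configs/sot/my_got10k_test.py
_base_ = ['../_base_/datasets/got10k.py']

# MMEngine merges nested dicts, so only the changed field is restated here.
val_dataloader = dict(dataset=dict(data_root='/datasets/'))
test_dataloader = val_dataloader
```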
num_ref_imgs=1, + frame_range=9, + filter_key_img=True, + method='uniform')), + dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/imagenet_det_30plus1cls.json', + data_prefix=dict(img_path='Data/DET'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + load_as_video=False, + ref_img_sampler=dict( + num_ref_imgs=1, + frame_range=0, + filter_key_img=True, + method='uniform')) + ])) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_val.json', - img_prefix=data_root + 'Data/VID', - ref_img_sampler=None, + data_root=data_root, + ann_file='annotations/imagenet_vid_val.json', + data_prefix=dict(img_path='Data/VID'), pipeline=test_pipeline, - test_mode=True), - test=dict( - type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_val.json', - img_prefix=data_root + 'Data/VID', + load_as_video=True, ref_img_sampler=None, - pipeline=test_pipeline, test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='CocoVideoMetric', + ann_file=data_root + 'annotations/imagenet_vid_val.json', + metric='bbox') +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/imagenet_vid_fgfa_style.py b/configs/_base_/datasets/imagenet_vid_fgfa_style.py index dd83f2025..d090f4bde 100644 --- a/configs/_base_/datasets/imagenet_vid_fgfa_style.py +++ b/configs/_base_/datasets/imagenet_vid_fgfa_style.py @@ -1,80 +1,92 @@ # dataset settings dataset_type = 'ImagenetVIDDataset' data_root = 'data/ILSVRC/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# data pipeline train_pipeline = [ - dict(type='LoadMultiImagesFromFile'), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=16), dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_instance_ids']), - dict(type='ConcatVideoReferences'), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs') ] test_pipeline = [ - dict(type='LoadMultiImagesFromFile'), - dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=16), dict( - type='VideoCollect', - keys=['img'], - meta_keys=('num_left_ref_imgs', 'frame_stride')), - dict(type='ConcatVideoReferences'), - dict(type='MultiImagesToTensor', ref_prefix='ref'), - dict(type='ToList') + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=(1000, 600), keep_ratio=True), + ]), + dict(type='PackTrackInputs', pack_single_img=False) ] -data = dict( - samples_per_gpu=1, - workers_per_gpu=2, - train=[ - dict( - type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_train.json', - img_prefix=data_root + 'Data/VID', - 
ref_img_sampler=dict( - num_ref_imgs=2, - frame_range=9, - filter_key_img=True, - method='bilateral_uniform'), - pipeline=train_pipeline), - dict( - type=dataset_type, - load_as_video=False, - ann_file=data_root + 'annotations/imagenet_det_30plus1cls.json', - img_prefix=data_root + 'Data/DET', - ref_img_sampler=dict( - num_ref_imgs=2, - frame_range=0, - filter_key_img=False, - method='bilateral_uniform'), - pipeline=train_pipeline) - ], - val=dict( + +# dataloader +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/imagenet_vid_train.json', + data_prefix=dict(img_path='Data/VID'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + load_as_video=True, + ref_img_sampler=dict( + num_ref_imgs=2, + frame_range=9, + filter_key_img=True, + method='bilateral_uniform')), + dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/imagenet_det_30plus1cls.json', + data_prefix=dict(img_path='Data/DET'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + load_as_video=False, + ref_img_sampler=dict( + num_ref_imgs=2, + frame_range=0, + filter_key_img=False, + method='bilateral_uniform')) + ])) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_val.json', - img_prefix=data_root + 'Data/VID', - ref_img_sampler=dict( - num_ref_imgs=30, - frame_range=[-15, 15], - stride=1, - method='test_with_fix_stride'), + data_root=data_root, + ann_file='annotations/imagenet_vid_val.json', + data_prefix=dict(img_path='Data/VID'), pipeline=test_pipeline, - test_mode=True), - test=dict( - type=dataset_type, - ann_file=data_root + 'annotations/imagenet_vid_val.json', - img_prefix=data_root + 'Data/VID', + load_as_video=True, ref_img_sampler=dict( num_ref_imgs=30, frame_range=[-15, 15], stride=1, method='test_with_fix_stride'), - pipeline=test_pipeline, test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='CocoVideoMetric', + ann_file=data_root + 'annotations/imagenet_vid_val.json', + metric='bbox') +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/lasot.py b/configs/_base_/datasets/lasot.py new file mode 100644 index 000000000..f74adeb48 --- /dev/null +++ b/configs/_base_/datasets/lasot.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='LaSOTDataset', + data_root='data/', + ann_file='LaSOT_full/annotations/lasot_test_infos.txt', + data_prefix=dict(img_path='LaSOT_full/LaSOTBenchmark'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + metric='OPE', + metric_options=dict(only_eval_visible=True)) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git 
a/configs/_base_/datasets/mot_challenge.py b/configs/_base_/datasets/mot_challenge.py index 362a97989..de1866c77 100644 --- a/configs/_base_/datasets/mot_challenge.py +++ b/configs/_base_/datasets/mot_challenge.py @@ -1,74 +1,86 @@ # dataset settings dataset_type = 'MOTChallengeDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_root = 'data/MOT17/' + +# data pipeline train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), dict( - type='SeqResize', - img_scale=(1088, 1088), - share_params=True, - ratio_range=(0.8, 1.2), - keep_ratio=True, - bbox_clip_border=False), - dict(type='SeqPhotoMetricDistortion', share_params=True), + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict( + type='mmdet.RandomResize', + scale=(1088, 1088), + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type='mmdet.PhotoMetricDistortion') + ]), dict( - type='SeqRandomCrop', - share_params=False, - crop_size=(1088, 1088), - bbox_clip_border=False), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict( + type='mmdet.RandomCrop', + crop_size=(1088, 1088), + bbox_clip_border=False) + ]), dict( - type='VideoCollect', - keys=[ - 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', - 'gt_instance_ids' + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='mmdet.RandomFlip', prob=0.5), ]), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + dict(type='PackTrackInputs', ref_prefix='ref', num_key_frames=1) ] + test_pipeline = [ dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) ] -data_root = 'data/MOT17/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( type=dataset_type, + data_root=data_root, visibility_thr=-1, - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train', + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img_path='train'), + metainfo=dict(classes=('pedestrian', )), ref_img_sampler=dict( num_ref_imgs=1, frame_range=10, filter_key_img=True, method='uniform'), - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train', - ref_img_sampler=None, - pipeline=test_pipeline), - test=dict( + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( type=dataset_type, - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train', + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), ref_img_sampler=None, + load_as_video=True, + test_mode=True, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/mot_challenge_det.py b/configs/_base_/datasets/mot_challenge_det.py index dc2e8de7c..56e99210d 100644 --- a/configs/_base_/datasets/mot_challenge_det.py +++ b/configs/_base_/datasets/mot_challenge_det.py @@ -1,59 +1,66 @@ # dataset settings -dataset_type = 'CocoDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +dataset_type = 'mmdet.CocoDataset' +data_root = 'data/MOT17/' + train_pipeline = [ dict(type='LoadImageFromFile', to_float32=True), dict(type='LoadAnnotations', with_bbox=True), dict( - type='Resize', - img_scale=(1088, 1088), + type='RandomResize', + scale=(1088, 1088), ratio_range=(0.8, 1.2), keep_ratio=True, - bbox_clip_border=False), + clip_object_border=False), dict(type='PhotoMetricDistortion'), dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') ] + test_pipeline = [ dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1088, 1088), keep_ratio=True), dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']) - ]) + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) ] -data_root = 'data/MOT17/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( - type=dataset_type, - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train', - classes=('pedestrian', ), - pipeline=train_pipeline), - val=dict( + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( type=dataset_type, - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train', - classes=('pedestrian', ), - pipeline=test_pipeline), - test=dict( + data_root=data_root, + _scope_='mmdet', + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( type=dataset_type, - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train', - classes=('pedestrian', ), 
+ data_root=data_root, + _scope_='mmdet', + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + test_mode=True, pipeline=test_pipeline)) -evaluation = dict(metric=['bbox']) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.CocoMetric', + ann_file=data_root + 'annotations/half-val_cocoformat.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/mot_challenge_reid.py b/configs/_base_/datasets/mot_challenge_reid.py index e79c524f4..46cf99de9 100644 --- a/configs/_base_/datasets/mot_challenge_reid.py +++ b/configs/_base_/datasets/mot_challenge_reid.py @@ -1,51 +1,57 @@ +# dataset settings dataset_type = 'ReIDDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_root = 'data/MOT17/' + +# data pipeline train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict( - type='SeqResize', - img_scale=(128, 256), - share_params=False, - keep_ratio=False, - bbox_clip_border=False, - override=False), dict( - type='SeqRandomFlip', - share_params=False, - flip_ratio=0.5, - direction='horizontal'), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='VideoCollect', keys=['img', 'gt_label']), - dict(type='ReIDFormatBundle') + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='mmdet.Resize', + scale=(128, 256), + keep_ratio=False, + clip_object_border=False), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + ]), + dict(type='PackReIDInputs', meta_keys=('flip', 'flip_direction')) ] test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', img_scale=(128, 256), keep_ratio=False), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img'], meta_keys=[]) + dict(type='LoadImageFromFile', to_float32=True), + dict(type='mmdet.Resize', scale=(128, 256), keep_ratio=False), + dict(type='PackReIDInputs') ] -data_root = 'data/MOT17/' -data = dict( - samples_per_gpu=1, - workers_per_gpu=2, - train=dict( + +# dataloader +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( type=dataset_type, + data_root=data_root, triplet_sampler=dict(num_ids=8, ins_per_id=4), - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/train_80.txt', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - triplet_sampler=None, - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt', - pipeline=test_pipeline), - test=dict( + data_prefix=dict(img_path='reid/imgs'), + ann_file='reid/meta/train_80.txt', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( type=dataset_type, + data_root=data_root, triplet_sampler=None, - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt', + data_prefix=dict(img_path='reid/imgs'), + ann_file='reid/meta/val_20.txt', pipeline=test_pipeline)) -evaluation = dict(interval=1, metric='mAP') +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict(type='ReIDMetrics', metric=['mAP', 'CMC']) +test_evaluator = val_evaluator diff --git 
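The MOT17 bases above can be retargeted to another MOTChallenge split purely by overriding paths, the same way the MOT15/MOT16 detection configs later in this diff retarget theirs. A sketch for MOT20, assuming it is laid out like MOT17 (same `annotations/half-*_cocoformat.json` naming); the evaluator in `mot_challenge.py` takes no dataset-specific paths, so it can usually be inherited unchanged:

```python
# hypothetical configs/mot/my_tracker_mot20.py
_base_ = ['../_base_/datasets/mot_challenge.py']

data_root = 'data/MOT20/'
train_dataloader = dict(dataset=dict(data_root=data_root))
val_dataloader = dict(dataset=dict(data_root=data_root))
test_dataloader = val_dataloader
```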
a/configs/_base_/datasets/otb100.py b/configs/_base_/datasets/otb100.py new file mode 100644 index 000000000..91b3bf730 --- /dev/null +++ b/configs/_base_/datasets/otb100.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='OTB100Dataset', + data_root='data/', + ann_file='OTB100/annotations/otb100_infos.txt', + data_prefix=dict(img_path='OTB100'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + metric='OPE', + metric_options=dict(only_eval_visible=False)) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/_base_/datasets/tao.py b/configs/_base_/datasets/tao.py index 366941129..8df4b44e8 100644 --- a/configs/_base_/datasets/tao.py +++ b/configs/_base_/datasets/tao.py @@ -1,67 +1,69 @@ # dataset settings dataset_type = 'TaoDataset' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data_root = 'data/tao/' + +# data pipeline train_pipeline = [ - dict(type='LoadMultiImagesFromFile'), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict( - type='SeqResize', - img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), - (1333, 768), (1333, 800)], - share_params=True, - multiscale_mode='value', - keep_ratio=True), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_match_indices'], - ), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref'), + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs', ref_prefix='ref') ] + test_pipeline = [ dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) + type='PackTrackInputs', + pack_single_img=True, + meta_keys=('frame_index', 'neg_category_ids', + 'not_exhaustive_category_ids')) ] -data_root = 'data/tao/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( type='ClassBalancedDataset', oversample_thr=1e-3, dataset=dict( type=dataset_type, - 
classes=data_root + 'annotations/tao_classes.txt', + data_root='data/lvis/', load_as_video=False, - ann_file='data/lvis/annotations/lvisv0.5+coco_train.json', - img_prefix='data/lvis/train/', - pipeline=train_pipeline)), - val=dict( - type=dataset_type, - classes=data_root + 'annotations/tao_classes.txt', - ann_file=data_root + 'annotations/validation_482_classes.json', - img_prefix=data_root + 'val/', - ref_img_sampler=None, - pipeline=test_pipeline), - test=dict( + ref_img_sampler=dict(num_ref_imgs=1, frame_range=0), + metainfo=dict(CLASSES=data_root + 'annotations/tao_classes.txt'), + ann_file='annotations/lvisv0.5+coco_train.json', + data_prefix=dict(img_path='train'), + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( type=dataset_type, - classes=data_root + 'annotations/tao_classes.txt', - ann_file=data_root + 'annotations/validation_482_classes.json', - img_prefix=data_root + 'val/', + data_root=data_root, + metainfo=dict(CLASSES=data_root + 'annotations/tao_classes.txt'), + ann_file='annotations/validation_482_classes.json', ref_img_sampler=None, + load_as_video=True, + test_mode=True, pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/_base_/datasets/trackingnet.py b/configs/_base_/datasets/trackingnet.py new file mode 100644 index 000000000..b2827985f --- /dev/null +++ b/configs/_base_/datasets/trackingnet.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='TrackingNetDataset', + data_root='data/', + ann_file='TrackingNet/annotations/trackingnet_test_infos.txt', + data_prefix=dict(img_path='TrackingNet'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + format_only=True, + metric_options=dict(dataset_type='trackingnet')) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/_base_/datasets/uav123.py b/configs/_base_/datasets/uav123.py new file mode 100644 index 000000000..e34ceff93 --- /dev/null +++ b/configs/_base_/datasets/uav123.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='UAV123Dataset', + data_root='data/', + ann_file='UAV123/annotations/uav123_infos.txt', + data_prefix=dict(img_path='UAV123'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + metric='OPE', + metric_options=dict(only_eval_visible=False)) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/_base_/datasets/vot2018.py b/configs/_base_/datasets/vot2018.py new file mode 100644 index 000000000..e4f4de00c --- /dev/null +++ 
b/configs/_base_/datasets/vot2018.py @@ -0,0 +1,33 @@ +# test pipeline +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='VOTDataset', + data_root='data/', + ann_file='VOT2018/annotations/vot2018_infos.txt', + data_prefix=dict(img_path='VOT2018'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='SOTMetric', + metric='VOT', + metric_options=dict(dataset_type='vot2018')) +test_evaluator = val_evaluator + +# runner loop +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/_base_/datasets/youtube_vis.py b/configs/_base_/datasets/youtube_vis.py index 5474543bf..879858a9d 100644 --- a/configs/_base_/datasets/youtube_vis.py +++ b/configs/_base_/datasets/youtube_vis.py @@ -1,69 +1,70 @@ # dataset settings -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), dict( - type='SeqLoadAnnotations', - with_bbox=True, - with_mask=True, - with_track=True), - dict( - type='SeqResize', - share_params=True, - img_scale=(640, 360), - keep_ratio=True), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_instance_ids']), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict( + type='LoadTrackAnnotations', + with_instance_id=True, + with_mask=True, + with_bbox=True), + dict(type='mmdet.Resize', scale=(640, 360), keep_ratio=True), + dict(type='mmdet.RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs', ref_prefix='ref', num_key_frames=1) ] + test_pipeline = [ dict(type='LoadImageFromFile'), dict( - type='MultiScaleFlipAug', - img_scale=(640, 360), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) + type='LoadTrackAnnotations', + with_instance_id=True, + with_mask=True, + with_bbox=True), + dict(type='mmdet.Resize', scale=(640, 360), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) ] + dataset_type = 'YouTubeVISDataset' data_root = 'data/youtube_vis_2019/' -dataset_version = data_root[-5:-1] -data = dict( - samples_per_gpu=1, - workers_per_gpu=2, - train=dict( +dataset_version = data_root[-5:-1] # 2019 or 2021 +# dataloader +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( type=dataset_type, + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2019_train.json', - img_prefix=data_root + 'train/JPEGImages', + ann_file='annotations/youtube_vis_2019_train.json', + data_prefix=dict(img_path='train/JPEGImages'), + 
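Because `dataset_version` is derived from `data_root` at parse time, the YouTube-VIS base being introduced in this hunk bakes the 2019 paths into its dataloaders; a child config targeting the 2021 split has to restate them. A sketch, where the 2021 annotation file name is an assumption that simply mirrors the 2019 naming:

```python
# hypothetical configs/vis/my_vis_youtubevis2021.py
_base_ = ['../_base_/datasets/youtube_vis.py']

data_root = 'data/youtube_vis_2021/'
dataset_version = data_root[-5:-1]  # -> '2021'

# The base already holds the 2019 values, so the dataloader dict is
# overridden explicitly (val_dataloader/test_dataloader analogously).
train_dataloader = dict(
    dataset=dict(
        data_root=data_root,
        dataset_version=dataset_version,
        ann_file='annotations/youtube_vis_2021_train.json'))
```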
pipeline=train_pipeline, + load_as_video=True, ref_img_sampler=dict( num_ref_imgs=1, frame_range=100, filter_key_img=True, - method='uniform'), - pipeline=train_pipeline), - val=dict( - type=dataset_type, - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2019_valid.json', - img_prefix=data_root + 'valid/JPEGImages', - ref_img_sampler=None, - pipeline=test_pipeline), - test=dict( + method='uniform'))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( type=dataset_type, + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2019_valid.json', - img_prefix=data_root + 'valid/JPEGImages', + ann_file='annotations/youtube_vis_2019_valid.json', + data_prefix=dict(img_path='valid/JPEGImages'), + pipeline=test_pipeline, + load_as_video=True, ref_img_sampler=None, - pipeline=test_pipeline)) + test_mode=True)) +test_dataloader = val_dataloader diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py index 3a5f0e9ef..f455ef7aa 100644 --- a/configs/_base_/default_runtime.py +++ b/configs/_base_/default_runtime.py @@ -1,22 +1,24 @@ -# optimizer -optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict(grad_clip=None) -checkpoint_config = dict(interval=1) -# yapf:disable -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - # dict(type='TensorboardLoggerHook') - ]) -# yapf:enable -dist_params = dict(backend='nccl') +default_scope = 'mmtrack' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='TrackVisualizationHook', draw=False), +) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + log_level = 'INFO' load_from = None -resume_from = None -workflow = [('train', 1)] - -# disable opencv multithreading to avoid system being overloaded -opencv_num_threads = 0 -# set multi-process start method as `fork` to speed up the training -mp_start_method = 'fork' +resume = False diff --git a/configs/_base_/mmyolo_default_runtime.py b/configs/_base_/mmyolo_default_runtime.py new file mode 100644 index 000000000..74de21409 --- /dev/null +++ b/configs/_base_/mmyolo_default_runtime.py @@ -0,0 +1,28 @@ +default_scope = 'mmyolo' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmdet.DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False + +file_client_args = 
dict(backend='disk') \ No newline at end of file diff --git a/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py similarity index 100% rename from configs/_base_/models/cascade_mask_rcnn_r50_fpn.py rename to configs/_base_/models/cascade-mask-rcnn_r50_fpn.py diff --git a/configs/_base_/models/cascade_rcnn_r50_fpn.py b/configs/_base_/models/cascade-rcnn_r50_fpn.py similarity index 100% rename from configs/_base_/models/cascade_rcnn_r50_fpn.py rename to configs/_base_/models/cascade-rcnn_r50_fpn.py diff --git a/configs/_base_/models/faster_rcnn_r50_caffe_c4.py b/configs/_base_/models/faster-rcnn_r50-caffe-c4.py similarity index 100% rename from configs/_base_/models/faster_rcnn_r50_caffe_c4.py rename to configs/_base_/models/faster-rcnn_r50-caffe-c4.py diff --git a/configs/_base_/models/faster_rcnn_r50_dc5.py b/configs/_base_/models/faster-rcnn_r50-dc5.py similarity index 90% rename from configs/_base_/models/faster_rcnn_r50_dc5.py rename to configs/_base_/models/faster-rcnn_r50-dc5.py index ef50daabb..a77d096bc 100644 --- a/configs/_base_/models/faster_rcnn_r50_dc5.py +++ b/configs/_base_/models/faster-rcnn_r50-dc5.py @@ -1,6 +1,15 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=True) model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=16), detector=dict( type='FasterRCNN', + _scope_='mmdet', backbone=dict( type='ResNet', depth=50, @@ -9,7 +18,7 @@ strides=(1, 2, 2, 1), dilations=(1, 1, 1, 2), frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=norm_cfg, norm_eval=True, style='pytorch', init_cfg=dict( @@ -66,6 +75,7 @@ rpn=dict( assigner=dict( type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlaps2D'), pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, @@ -87,6 +97,7 @@ rcnn=dict( assigner=dict( type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlaps2D'), pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, diff --git a/configs/_base_/models/faster_rcnn_r50_fpn.py b/configs/_base_/models/faster-rcnn_r50_fpn.py similarity index 94% rename from configs/_base_/models/faster_rcnn_r50_fpn.py rename to configs/_base_/models/faster-rcnn_r50_fpn.py index 98310030b..25bd85cbb 100644 --- a/configs/_base_/models/faster_rcnn_r50_fpn.py +++ b/configs/_base_/models/faster-rcnn_r50_fpn.py @@ -1,6 +1,14 @@ model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + rgb_to_bgr=False, + pad_size_divisor=32), detector=dict( type='FasterRCNN', + _scope_='mmdet', backbone=dict( type='ResNet', depth=50, diff --git a/configs/_base_/models/mask_rcnn_r50_fpn.py b/configs/_base_/models/mask-rcnn_r50_fpn.py similarity index 94% rename from configs/_base_/models/mask_rcnn_r50_fpn.py rename to configs/_base_/models/mask-rcnn_r50_fpn.py index de7bfd8b0..4b8ce31bc 100644 --- a/configs/_base_/models/mask_rcnn_r50_fpn.py +++ b/configs/_base_/models/mask-rcnn_r50_fpn.py @@ -1,6 +1,14 @@ # model settings model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), detector=dict( + _scope_='mmdet', type='MaskRCNN', backbone=dict( type='ResNet', diff --git a/configs/_base_/models/retinanet_r50_fpn.py b/configs/_base_/models/retinanet_r50_fpn.py index 72fadfe06..4572c58c7 
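The rewritten runtime files replace the old `log_config`/`checkpoint_config`/`workflow` knobs with `default_hooks`, so per-experiment tweaks become partial dict overrides. A small sketch against `default_runtime.py`, with the interval and `draw` values chosen purely for illustration:

```python
# hypothetical experiment config
_base_ = ['../_base_/default_runtime.py']

# MMEngine merges nested dicts, so only the hook entries being changed
# need to be listed; the rest of default_hooks is inherited as-is.
default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=5),
    visualization=dict(type='TrackVisualizationHook', draw=True))
```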
100644 --- a/configs/_base_/models/retinanet_r50_fpn.py +++ b/configs/_base_/models/retinanet_r50_fpn.py @@ -1,9 +1,16 @@ # model settings +preprocess_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True, + pad_size_divisor=32) + model = dict( + preprocess_cfg=preprocess_cfg, detector=dict( - type='RetinaNet', + type='mmdet.RetinaNet', backbone=dict( - type='ResNet', + type='mmdet.ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), @@ -14,43 +21,45 @@ init_cfg=dict( type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( - type='FPN', + type='mmdet.FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs='on_input', num_outs=5), bbox_head=dict( - type='RetinaHead', + type='mmdet.RetinaHead', num_classes=80, in_channels=256, stacked_convs=4, feat_channels=256, anchor_generator=dict( - type='AnchorGenerator', + type='mmdet.AnchorGenerator', octave_base_scale=4, scales_per_octave=3, ratios=[0.5, 1.0, 2.0], strides=[8, 16, 32, 64, 128]), bbox_coder=dict( - type='DeltaXYWHBBoxCoder', + type='mmdet.DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( - type='FocalLoss', + type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), - loss_bbox=dict(type='L1Loss', loss_weight=1.0)), - # training and testing settings + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=1.0)), + # model training and testing settings train_cfg=dict( assigner=dict( - type='MaxIoUAssigner', + type='mmdet.MaxIoUAssigner', + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), pos_iou_thr=0.5, neg_iou_thr=0.4, min_pos_iou=0, ignore_iof_thr=-1), + sampler=dict(type='mmdet.PseudoSampler'), allowed_border=-1, pos_weight=-1, debug=False), diff --git a/configs/_base_/models/yolov7_l1_syncbn_fast.py b/configs/_base_/models/yolov7_l1_syncbn_fast.py new file mode 100644 index 000000000..5180202d7 --- /dev/null +++ b/configs/_base_/models/yolov7_l1_syncbn_fast.py @@ -0,0 +1,66 @@ +# parameters that often need to be modified +img_scale = (1088, 1088) # width, height + +# different from yolov5 +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 +] +strides = [8, 16, 32] +num_det_layers = 3 +num_classes = 1 + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True, + pad_size_divisor=32), + detector=dict( + type='YOLODetector', + _scope_='mmyolo', + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + obj_level_weights=[4., 1., 0.4], + # BatchYOLOv7Assigner params + prior_match_thr=4., + simota_candidate_topk=10, + 
simota_iou_weight=3.0, + simota_cls_weight=1.0), + test_cfg=dict( + multi_label=False, + nms_pre=30000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=300))) \ No newline at end of file diff --git a/configs/_base_/models/yolov7_l_syncbn_fast.py b/configs/_base_/models/yolov7_l_syncbn_fast.py new file mode 100644 index 000000000..b90c27d7f --- /dev/null +++ b/configs/_base_/models/yolov7_l_syncbn_fast.py @@ -0,0 +1,85 @@ +# parameters that often need to be modified +img_scale = (1088, 1088) # width, height + +# different from yolov5 +anchors = [ + [(12, 16), (19, 36), (40, 28)], # P3/8 + [(36, 75), (76, 55), (72, 146)], # P4/16 + [(142, 110), (192, 243), (459, 401)] # P5/32 +] +strides = [8, 16, 32] +num_det_layers = 3 +num_classes = 1 + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True, + rgb_to_bgr=False, + pad_size_divisor=32), + detector=dict( + type='YOLODetector', + _scope_='mmyolo', + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.3 * (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + reduction='mean', + loss_weight=0.05 * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.7 * ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=[4., 1., 0.4], + # BatchYOLOv7Assigner params + prior_match_thr=4., + simota_candidate_topk=10, + simota_iou_weight=3.0, + simota_cls_weight=1.0), + test_cfg=dict( + multi_label=False, + nms_pre=30000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300))) \ No newline at end of file diff --git a/configs/_base_/models/yolox_x_8x8.py b/configs/_base_/models/yolox_x_8x8.py index aeb786d61..c7c6623fe 100644 --- a/configs/_base_/models/yolox_x_8x8.py +++ b/configs/_base_/models/yolox_x_8x8.py @@ -2,11 +2,19 @@ img_scale = (640, 640) model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), detector=dict( + _scope_='mmdet', type='YOLOX', - input_size=img_scale, - random_size_range=(15, 25), - random_size_interval=10, backbone=dict( type='CSPDarknet', deepen_factor=1.33, widen_factor=1.25), neck=dict( diff --git a/configs/det/faster-rcnn_r101_fpn_4e_mot17-half.py b/configs/det/faster-rcnn_r101_fpn_4e_mot17-half.py deleted file mode 
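The YOLOv7 loss weights in `yolov7_l_syncbn_fast.py` are not literals but expressions in `num_classes`, `num_det_layers` and `img_scale`; with the values set in that config they evaluate as follows:

```python
# Worked numbers for the loss-weight expressions above
num_classes, num_det_layers = 1, 3
img_scale = (1088, 1088)

loss_cls_weight = 0.3 * (num_classes / 80 * 3 / num_det_layers)            # 0.00375
loss_bbox_weight = 0.05 * (3 / num_det_layers)                             # 0.05
loss_obj_weight = 0.7 * ((img_scale[0] / 640) ** 2 * 3 / num_det_layers)   # ~2.023
print(loss_cls_weight, loss_bbox_weight, loss_obj_weight)
```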
100644 index ab3de8801..000000000 --- a/configs/det/faster-rcnn_r101_fpn_4e_mot17-half.py +++ /dev/null @@ -1,13 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -model = dict( - detector=dict( - backbone=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet101')), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth' # noqa: E501 - ))) diff --git a/configs/det/faster-rcnn_r101_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/configs/det/faster-rcnn_r101_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..1ea5f791f --- /dev/null +++ b/configs/det/faster-rcnn_r101_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,13 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth' # noqa: E501 + )) diff --git a/configs/det/faster-rcnn_r50_fpn_4e_mot15-half.py b/configs/det/faster-rcnn_r50_fpn_4e_mot15-half.py deleted file mode 100644 index 085e579c2..000000000 --- a/configs/det/faster-rcnn_r50_fpn_4e_mot15-half.py +++ /dev/null @@ -1,14 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -# data -data_root = 'data/MOT15/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) diff --git a/configs/det/faster-rcnn_r50_fpn_4e_mot16-half.py b/configs/det/faster-rcnn_r50_fpn_4e_mot16-half.py deleted file mode 100644 index a3ae11d06..000000000 --- a/configs/det/faster-rcnn_r50_fpn_4e_mot16-half.py +++ /dev/null @@ -1,14 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -# data -data_root = 'data/MOT16/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) diff --git a/configs/det/faster-rcnn_r50_fpn_4e_mot17-half.py b/configs/det/faster-rcnn_r50_fpn_4e_mot17-half.py deleted file mode 100644 index a89479096..000000000 --- a/configs/det/faster-rcnn_r50_fpn_4e_mot17-half.py +++ /dev/null @@ -1,24 +0,0 @@ -USE_MMDET = True -_base_ = [ - '../_base_/models/faster_rcnn_r50_fpn.py', - '../_base_/datasets/mot_challenge_det.py', '../_base_/default_runtime.py' -] -model = dict( - detector=dict( - rpn_head=dict(bbox_coder=dict(clip_border=False)), - roi_head=dict( - bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 
'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 - ))) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[3]) -# runtime settings -total_epochs = 4 diff --git a/configs/det/faster-rcnn_r50_fpn_4e_mot17.py b/configs/det/faster-rcnn_r50_fpn_4e_mot17.py deleted file mode 100644 index 33790e407..000000000 --- a/configs/det/faster-rcnn_r50_fpn_4e_mot17.py +++ /dev/null @@ -1,8 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -# data -data_root = 'data/MOT17/' -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - test=dict(ann_file=data_root + 'annotations/train_cocoformat.json')) diff --git a/configs/det/faster-rcnn_r50_fpn_8e_mot20-half.py b/configs/det/faster-rcnn_r50_fpn_8e_mot20-half.py deleted file mode 100644 index 4eeb7e00c..000000000 --- a/configs/det/faster-rcnn_r50_fpn_8e_mot20-half.py +++ /dev/null @@ -1,28 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -model = dict( - detector=dict( - rpn_head=dict(bbox_coder=dict(clip_border=True)), - roi_head=dict( - bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1)))) -# data -data_root = 'data/MOT20/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[6]) -# runtime settings -total_epochs = 8 diff --git a/configs/det/faster-rcnn_r50_fpn_8e_mot20.py b/configs/det/faster-rcnn_r50_fpn_8e_mot20.py deleted file mode 100644 index 3c9ad268c..000000000 --- a/configs/det/faster-rcnn_r50_fpn_8e_mot20.py +++ /dev/null @@ -1,28 +0,0 @@ -USE_MMDET = True -_base_ = ['./faster-rcnn_r50_fpn_4e_mot17-half.py'] -model = dict( - detector=dict( - rpn_head=dict(bbox_coder=dict(clip_border=True)), - roi_head=dict( - bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1)))) -# data -data_root = 'data/MOT20/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - img_prefix=data_root + 'train')) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[6]) -# runtime settings -total_epochs = 8 diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py new file mode 100644 index 000000000..313a4f2ba --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py @@ -0,0 +1,12 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +# data +data_root = 'data/MOT15/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = 
dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + + 'annotations/half-val_cocoformat.json') +test_evaluator = val_evaluator diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py new file mode 100644 index 000000000..67e8cfb42 --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py @@ -0,0 +1,12 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +# data +data_root = 'data/MOT16/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + + 'annotations/half-val_cocoformat.json') +test_evaluator = val_evaluator diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..020b6e0ca --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,151 @@ +_base_ = [ + '../_base_/datasets/mot_challenge_det.py', '../_base_/default_runtime.py' +] +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + type='FasterRCNN', + _scope_='mmdet', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + clip_border=False), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + clip_border=False), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + 
assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)), + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + )) + +default_hooks = dict(visualization=dict(type='mmdet.DetVisualizationHook')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmdet.DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') + +# training schedule for 4e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=4, + by_epoch=True, + milestones=[3], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py new file mode 100644 index 000000000..463f80da7 --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py @@ -0,0 +1,13 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +# data +data_root = 'data/MOT17/' +train_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json') +test_evaluator = val_evaluator diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py b/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py new file mode 100644 index 000000000..8321da132 --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py @@ -0,0 +1,31 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +model = dict( + rpn_head=dict(bbox_coder=dict(clip_border=True)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1))) +# data +data_root = 'data/MOT20/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + + 'annotations/half-val_cocoformat.json') +test_evaluator = val_evaluator + +# training schedule for 8e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1) + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=8, + 
by_epoch=True, + milestones=[6], + gamma=0.1) +] diff --git a/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py b/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py new file mode 100644 index 000000000..bcb376613 --- /dev/null +++ b/configs/det/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py @@ -0,0 +1,34 @@ +_base_ = [ + './faster-rcnn_resnet50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py' +] +model = dict( + rpn_head=dict(bbox_coder=dict(clip_border=True)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1))) +# data +data_root = 'data/MOT20/' +train_dataloader = dict( + dataset=dict( + data_root=data_root, ann_file='annotations/train_cocoformat.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, ann_file='annotations/train_cocoformat.json')) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json') +test_evaluator = val_evaluator + +# training schedule for 8e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1) + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[6], + gamma=0.1) +] diff --git a/configs/det/yolov7_l1_syncbn_fast_mot17.py b/configs/det/yolov7_l1_syncbn_fast_mot17.py new file mode 100644 index 000000000..518559785 --- /dev/null +++ b/configs/det/yolov7_l1_syncbn_fast_mot17.py @@ -0,0 +1,260 @@ +_base_ = '../_base_/mmyolo_default_runtime.py' + +# parameters that often need to be modified +img_scale = (1088, 1088) # width, height + +# dataset settings +data_root = 'data/MOT17/' +dataset_type = 'YOLOv5CocoDataset' +max_epochs = 30 +batch_size = 8 +train_num_workers = 4 +val_num_workers = 2 +save_epoch_intervals = 10 +persistent_workers = True + +# only on Val +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=1, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# different from yolov5 +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 +] +strides = [8, 16, 32] +num_det_layers = 3 +num_classes = 1 + +# single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +model = dict( + type='YOLODetector', + _scope_='mmyolo', + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + reduction='mean', + loss_weight=0.1, + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=1.0), + obj_level_weights=[4., 1., 0.4], + # BatchYOLOv7Assigner params + prior_match_thr=4., + simota_candidate_topk=10, + simota_iou_weight=3.0, + simota_cls_weight=1.0), + test_cfg=dict( + multi_label=False, + nms_pre=30000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=0.2, # note + scaling_ratio_range=(0.1, 2.0), # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=0.2, # note + scaling_ratio_range=(0.1, 2.0), # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=[0.8, 0.2]) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=8.0, # note + beta=8.0, # note + prob=0.15, + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=batch_size, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), # FASTER + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), 
+ filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img='train/'), + ann_file='annotations/half-val_cocoformat.json', + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=0.001, + momentum=0.937, + weight_decay=0.0005, + nesterov=True), + constructor='YOLOv7OptimWrapperConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=0.01, # note + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=False, + interval=1, + save_best='auto', + max_keep_ckpts=3)) + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), # Can be accelerated + ann_file=data_root + 'annotations/half-val_cocoformat.json', + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') \ No newline at end of file diff --git a/configs/det/yolov7_l_syncbn_fast_mot17.py b/configs/det/yolov7_l_syncbn_fast_mot17.py new file mode 100644 index 000000000..be55874ac --- /dev/null +++ b/configs/det/yolov7_l_syncbn_fast_mot17.py @@ -0,0 +1,207 @@ +_base_ = [ + '../_base_/default_runtime.py', +] + +img_scale = (800, 1040) +batch_size = 2 +max_epochs = 5 +save_epoch_intervals = 1 + +# different from yolov5 +anchors = [ + [(12, 16), (19, 36), (40, 28)], # P3/8 + [(36, 75), (76, 55), (72, 146)], # P4/16 + [(142, 110), (192, 243), (459, 401)] # P5/32 +] +strides = [8, 16, 32] +num_det_layers = 3 +num_classes = 1 + +# single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +model = dict( + type='YOLODetector', + _scope_='mmyolo', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.3 * (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + reduction='mean', + loss_weight=0.05 * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.7 * ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=[4., 1., 0.4], + # BatchYOLOv7Assigner params + prior_match_thr=4., + simota_candidate_topk=10, + simota_iou_weight=3.0, + simota_cls_weight=1.0), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth'), + test_cfg=dict( + multi_label=False, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=False), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=0), + dict(type='mmdet.PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict(type='mmdet.Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# dataset settings +dataset_type = 'mmdet.CocoDataset' +data_root = 'data/MOT17/' + +train_dataloader = dict( + batch_size=batch_size, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + collate_fn=dict(type='yolov5_collate'), # FASTER + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + _scope_='mmdet', + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + test_mode=True, + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.CocoMetric', + ann_file=data_root + 'annotations/half-val_cocoformat.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=1e-3, + momentum=0.937, + weight_decay=0.0005, + nesterov=True)) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(270, 1)]) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + _scope_='mmyolo', + scheduler_type='cosine', + lr_factor=0.1, # note + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=False, + interval=1, + save_best='auto', + max_keep_ckpts=3)) + +custom_hooks = [ + dict( + type='mmdet.EMAHook', + ema_type='mmdet.ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] diff --git a/configs/det/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/configs/det/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..2d3ad7bee --- /dev/null +++ b/configs/det/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,209 @@ +_base_ = ['../_base_/default_runtime.py'] + +data_root = 'data/MOT17/' + +img_scale = (800, 1440) +batch_size = 4 + +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), + _scope_='mmdet', + type='YOLOX', + backbone=dict(type='CSPDarknet', deepen_factor=1.33, widen_factor=1.25), + neck=dict( + type='YOLOXPAFPN', + in_channels=[320, 640, 1280], + out_channels=320, + num_csp_blocks=4), + bbox_head=dict( + type='YOLOXHead', num_classes=1, in_channels=320, feat_channels=320), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 + )) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=False), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=False), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=False), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=False), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, 
keep_ratio=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=batch_size, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='MultiImageMixDataset', + _scope_='mmdet', + dataset=dict( + type='ConcatDataset', + _scope_='mmdet', + datasets=[ + dict( + type='CocoDataset', + data_root='data/MOT17', + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root=data_root, + _scope_='mmdet', + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img='train'), + metainfo=dict(CLASSES=('pedestrian', )), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +# default 8 gpu +lr = 0.001 / 8 * batch_size +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', lr=lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# some hyper parameters +# training settings +total_epochs = 80 +num_last_epochs = 10 +resume_from = None +interval = 5 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# learning policy +param_scheduler = [ + dict( + # use quadratic formula to warm up 1 epochs + # and lr is updated by iteration + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + # use cosine lr from 1 to 70 epoch + type='mmdet.CosineAnnealingLR', + eta_min=lr * 0.05, + begin=1, + T_max=total_epochs - num_last_epochs, + end=total_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 10 epochs + type='mmdet.ConstantLR', + by_epoch=True, + factor=1, + begin=total_epochs - num_last_epochs, + end=total_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='mmdet.EMAHook', + ema_type='mmdet.ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] +default_hooks = dict(checkpoint=dict(interval=1)) + +# evaluator +val_evaluator = dict( + type='mmdet.CocoMetric', + ann_file=data_root + 
'annotations/half-val_cocoformat.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator diff --git a/configs/det/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/configs/det/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 000000000..6485b6abb --- /dev/null +++ b/configs/det/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,124 @@ +_base_ = [ + './yolox_x_8xb4-80e_crowdhuman-mot17halftrain' + '_test-mot17halfval.py' +] + +img_scale = (896, 1600) + +model = dict( + data_preprocessor=dict(batch_augments=[ + dict(type='BatchSyncRandomResize', random_size_range=(640, 1152)) + ])) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=True), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=True), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=True), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + dataset=dict( + type='MultiImageMixDataset', + _scope_='mmdet', + dataset=dict( + type='ConcatDataset', + _scope_='mmdet', + datasets=[ + dict( + type='CocoDataset', + data_root='data/MOT20', + ann_file='annotations/train_cocoformat.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + dataset=dict( + type='CocoDataset', + data_root='data/MOT17', + _scope_='mmdet', + ann_file='annotations/train_cocoformat.json', + data_prefix=dict(img='train'), + metainfo=dict(CLASSES=('pedestrian', )), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = dict( + dataset=dict( + type='CocoDataset', + data_root='data/MOT20', + _scope_='mmdet', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img='test'), + metainfo=dict(CLASSES=('pedestrian', )), + test_mode=True, + pipeline=test_pipeline)) + +val_evaluator = dict( + type='mmdet.CocoMetric', + ann_file='data/MOT17/annotations/train_cocoformat.json', + metric='bbox', + format_only=False) 
+test_evaluator = dict( + type='mmdet.CocoMetric', + ann_file='data/MOT20/annotations/test_cocoformat.json', + metric='bbox', + format_only=True) diff --git a/configs/mot/README.md b/configs/mot/README.md deleted file mode 100644 index d64e73706..000000000 --- a/configs/mot/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Usage of MOT configs - -## Training with MOT configs - -Please refer to [Train MOT models](../../docs/en/quick_run.md#examples-of-training-mot-model) to see the examples. - -## Testing with MOT configs - -Please refer to [Test MOT models](../../docs/en/quick_run.md#examples-of-testing-mot-model) to see the examples. - -## Inference with MOT configs - -Please refer to [Inference MOT models](../../docs/en/quick_run.md#inference-motvis-models) to see the examples. diff --git a/configs/mot/bytetrack/README.md b/configs/mot/bytetrack/README.md index 74af0a943..09f86ee18 100644 --- a/configs/mot/bytetrack/README.md +++ b/configs/mot/bytetrack/README.md @@ -31,10 +31,10 @@ Please note that the performance on `MOT17-half-val` is comparable with the perf The reason is that ByteTrack tunes customized hyper-parameters (e.g., image resolution and the high threshold of detection score) for each video in `MOT17-test` set, while we use unified parameters. -| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | -| :-------: | :------: | :---------------------------: | :------------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :----------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-half-val | N | - | 67.7 | 78.6 | 79.2 | 12909 | 21024 | 666 | [config](bytetrack_yolox_x_crowdhuman_mot17-private-half.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | -| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-test | N | - | 61.7 | 78.1 | 74.8 | 36705 | 85032 | 2049 | [config](bytetrack_yolox_x_crowdhuman_mot17-private.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | +| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | +| :-------: | :------: | :---------------------------: | :------------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :-------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-half-val | N | - | 67.7 | 78.6 | 79.2 | 12909 | 21024 | 666 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | +| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-test | N | - | 61.7 | 78.1 | 74.8 | 36705 | 85032 | 2049 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | ## Results and models on MOT20 @@ -42,7 +42,62 @@ Since there are only 4 videos in `MOT20-train`, ByteTrack is validated on `MOT17 Please note that the MOTA on `MOT20-test` is slightly lower than that reported in the manuscript, because we don't tune the threshold for each video. -| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | -| :-------: | :------: | :----------------------: | :---------: | :----: | :------------: | :--: | :--: | :--: | :----: | :----: | :---: | :-----------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT17-train | N | - | 57.3 | 64.9 | 71.8 | 33,747 | 83,385 | 1,263 | [config](bytetrack_yolox_x_crowdhuman_mot20-private.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | -| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT20-test | N | - | 61.5 | 77.0 | 75.4 | 33,083 | 84,433 | 1,345 | [config](bytetrack_yolox_x_crowdhuman_mot20-private.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | +| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :-------: | :------: | :----------------------: | :---------: | :----: | :------------: | :--: | :--: | :--: | :----: | :----: | :---: | :------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT17-train | N | - | 57.3 | 64.9 | 71.8 | 33,747 | 83,385 | 1,263 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | +| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT20-test | N | - | 61.5 | 77.0 | 75.4 | 33,083 | 84,433 | 1,345 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. 
+ +```shell +# Training ByteTrack on the CrowdHuman and MOT17-half-train datasets with the following command +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +For more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell +# Example 1: Test on motXX-half-val set +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 \ + --checkpoint ./checkpoints/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth +``` + +**2.2 Example on MOTxx-test dataset** + +If you want to get the results on the [MOT Challenge](https://motchallenge.net/) test set, please use the following command to generate result files that can be used for submission. The result files will be stored in `./mot_17_test_res`; you can modify the save path in the `test_evaluator` of the config. + +```shell +# Example 2: Test on motXX-test set +# The number after the config file represents the number of GPUs used. +./tools/dist_test.sh \ + configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py 8 \ + --checkpoint ./checkpoints/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth +``` + +For more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a new video. + +```shell +python demo/demo_mot_vis.py \ + configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py \ + --checkpoint ./checkpoints/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` + +For more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). 
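The note in section 2.2 of the README above says the submission files land in `./mot_17_test_res` and that the path is controlled by the `test_evaluator` of the config. Below is a minimal sketch of how that override could be scripted with `mmengine`, assuming the config path added in this patch; the output prefix and dump filename are only illustrative placeholders, not names used anywhere in the repository.

```python
from mmengine.config import Config

# Load the ByteTrack MOT17-test config added in this patch
# (run from the repository root so the _base_ files resolve).
cfg = Config.fromfile(
    'configs/mot/bytetrack/'
    'bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py')

# The evaluator only formats submission files (format_only=True);
# redirect them away from the default ./mot_17_test_res prefix.
cfg.test_evaluator.outfile_prefix = './work_dirs/mot17_test_submission'

# Dump the modified config so it can be passed to tools/test.py unchanged.
cfg.dump('bytetrack_mot17test_custom_output.py')
```

Most OpenMMLab 2.0 test entry points also accept the same kind of override on the command line (e.g. `--cfg-options test_evaluator.outfile_prefix=...`); if the local `tools/test.py` exposes that flag, no config editing is needed at all.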
diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..cbaaff29e --- /dev/null +++ b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,217 @@ +_base_ = [ + '../../_base_/models/yolox_x_8x8.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +dataset_type = 'MOTChallengeDataset' +data_root = 'data/MOT17/' + +img_scale = (800, 1440) +batch_size = 4 + +model = dict( + type='ByteTrack', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=dict( + _scope_='mmdet', + bbox_head=dict(num_classes=1), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter'), + tracker=dict( + type='ByteTracker', + obj_score_thrs=dict(high=0.6, low=0.1), + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), + num_frames_retain=30)) + +train_pipeline = [ + dict( + type='mmdet.Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=False), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=False), + dict( + type='mmdet.MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=False), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=False), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='PackTrackInputs', pack_single_img=True) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='PackTrackInputs', pack_single_img=True) +] + +train_dataloader = dict( + _delete_=True, + batch_size=batch_size, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='mmdet.MultiImageMixDataset', + dataset=dict( + type='mmdet.ConcatDataset', + datasets=[ + dict( + type='mmdet.CocoDataset', + data_root='data/MOT17', + ann_file='annotations/half-train_cocoformat.json', + # TODO: mmdet use img as key, but img_path is needed + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + 
dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader +# optimizer +# default 8 gpu +lr = 0.001 / 8 * batch_size +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', lr=lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# some hyper parameters +# training settings +total_epochs = 80 +num_last_epochs = 10 +resume_from = None +interval = 5 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# learning policy +param_scheduler = [ + dict( + # use quadratic formula to warm up 1 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + # use cosine lr from 1 to 70 epoch + type='mmdet.CosineAnnealingLR', + eta_min=lr * 0.05, + begin=1, + T_max=total_epochs - num_last_epochs, + end=total_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 10 epochs + type='mmdet.ConstantLR', + by_epoch=True, + factor=1, + begin=total_epochs - num_last_epochs, + end=total_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='mmdet.EMAHook', + ema_type='mmdet.ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] +default_hooks = dict(checkpoint=dict(interval=1)) +# evaluator +val_evaluator = dict(postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) +]) +test_evaluator = val_evaluator diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 000000000..f91bbf9a8 --- /dev/null +++ b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,147 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py' +] + +dataset_type = 'MOTChallengeDataset' + +img_scale = (896, 1600) + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(640, 1152)) + ]), + tracker=dict( + weight_iou_with_det_scores=False, + match_iou_thrs=dict(high=0.3), + )) + +train_pipeline = [ + dict( + type='mmdet.Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=True), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 
2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=True), + dict( + type='mmdet.MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='PackTrackInputs', pack_single_img=True) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='PackTrackInputs', pack_single_img=True) +] + +train_dataloader = dict( + dataset=dict( + type='mmdet.MultiImageMixDataset', + dataset=dict( + type='mmdet.ConcatDataset', + datasets=[ + dict( + type='mmdet.CocoDataset', + data_root='data/MOT20', + ann_file='annotations/train_cocoformat.json', + # TODO: mmdet use img as key, but img_path is needed + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root='data/MOT17', + ann_file='annotations/train_cocoformat.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root='data/MOT20', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) + +test_evaluator = dict( + type='MOTChallengeMetrics', + postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) + ], + format_only=True, + outfile_prefix='./mot_20_test_res') diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..ef4f84ec0 --- /dev/null +++ b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,8 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 
'test-mot17halfval.py' +] + +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') +test_cfg = dict(type='TestLoop', fp16=True) diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py new file mode 100644 index 000000000..3aee1c188 --- /dev/null +++ b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py @@ -0,0 +1,17 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py' +] + +test_dataloader = dict( + dataset=dict( + data_root='data/MOT17/', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) +test_evaluator = dict( + type='MOTChallengeMetrics', + postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) + ], + format_only=True, + outfile_prefix='./mot_17_test_res') diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 000000000..9c652b825 --- /dev/null +++ b/configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,7 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py' +] + +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') +test_cfg = dict(type='TestLoop', fp16=True) diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py b/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py deleted file mode 100644 index 65b449884..000000000 --- a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py +++ /dev/null @@ -1,164 +0,0 @@ -_base_ = [ - '../../_base_/models/yolox_x_8x8.py', - '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' -] - -img_scale = (800, 1440) -samples_per_gpu = 4 - -model = dict( - type='ByteTrack', - detector=dict( - input_size=img_scale, - random_size_range=(18, 32), - bbox_head=dict(num_classes=1), - test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 - )), - motion=dict(type='KalmanFilter'), - tracker=dict( - type='ByteTracker', - obj_score_thrs=dict(high=0.6, low=0.1), - init_track_thr=0.7, - weight_iou_with_det_scores=True, - match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), - num_frames_retain=30)) - -train_pipeline = [ - dict( - type='Mosaic', - img_scale=img_scale, - pad_val=114.0, - bbox_clip_border=False), - dict( - type='RandomAffine', - scaling_ratio_range=(0.1, 2), - border=(-img_scale[0] // 2, -img_scale[1] // 2), - bbox_clip_border=False), - dict( - type='MixUp', - img_scale=img_scale, - ratio_range=(0.8, 1.6), - pad_val=114.0, - bbox_clip_border=False), - dict(type='YOLOXHSVRandomAug'), - dict(type='RandomFlip', flip_ratio=0.5), - dict( - type='Resize', - img_scale=img_scale, - keep_ratio=True, - bbox_clip_border=False), - dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), - dict(type='DefaultFormatBundle'), - 
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) -] - -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=img_scale, - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[0.0, 0.0, 0.0], - std=[1.0, 1.0, 1.0], - to_rgb=False), - dict( - type='Pad', - size_divisor=32, - pad_val=dict(img=(114.0, 114.0, 114.0))), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) -] -data = dict( - samples_per_gpu=samples_per_gpu, - workers_per_gpu=4, - persistent_workers=True, - train=dict( - _delete_=True, - type='MultiImageMixDataset', - dataset=dict( - type='CocoDataset', - ann_file=[ - 'data/MOT17/annotations/half-train_cocoformat.json', - 'data/crowdhuman/annotations/crowdhuman_train.json', - 'data/crowdhuman/annotations/crowdhuman_val.json' - ], - img_prefix=[ - 'data/MOT17/train', 'data/crowdhuman/train', - 'data/crowdhuman/val' - ], - classes=('pedestrian', ), - pipeline=[ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', with_bbox=True) - ], - filter_empty_gt=False), - pipeline=train_pipeline), - val=dict( - pipeline=test_pipeline, - interpolate_tracks_cfg=dict(min_num_frames=5, max_num_frames=20)), - test=dict( - pipeline=test_pipeline, - interpolate_tracks_cfg=dict(min_num_frames=5, max_num_frames=20))) - -# optimizer -# default 8 gpu -optimizer = dict( - type='SGD', - lr=0.001 / 8 * samples_per_gpu, - momentum=0.9, - weight_decay=5e-4, - nesterov=True, - paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) -optimizer_config = dict(grad_clip=None) - -# some hyper parameters -total_epochs = 80 -num_last_epochs = 10 -resume_from = None -interval = 5 - -# learning policy -lr_config = dict( - policy='YOLOX', - warmup='exp', - by_epoch=False, - warmup_by_epoch=True, - warmup_ratio=1, - warmup_iters=1, - num_last_epochs=num_last_epochs, - min_lr_ratio=0.05) - -custom_hooks = [ - dict( - type='YOLOXModeSwitchHook', - num_last_epochs=num_last_epochs, - priority=48), - dict( - type='SyncNormHook', - num_last_epochs=num_last_epochs, - interval=interval, - priority=48), - dict( - type='ExpMomentumEMAHook', - resume_from=resume_from, - momentum=0.0001, - priority=49) -] - -checkpoint_config = dict(interval=1) -evaluation = dict(metric=['bbox', 'track'], interval=1) -search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] - -# you need to set mode='dynamic' if you are using pytorch<=1.5.0 -fp16 = dict(loss_scale=dict(init_scale=512.)) diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private.py b/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private.py deleted file mode 100644 index 407990511..000000000 --- a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = ['./bytetrack_yolox_x_crowdhuman_mot17-private-half.py'] - -data = dict( - test=dict( - ann_file='data/MOT17/annotations/test_cocoformat.json', - img_prefix='data/MOT17/test')) diff --git a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot20-private.py b/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot20-private.py deleted file mode 100644 index 7fa0a7294..000000000 --- a/configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot20-private.py +++ /dev/null @@ -1,76 +0,0 @@ -_base_ = ['./bytetrack_yolox_x_crowdhuman_mot17-private-half.py'] - -img_scale = (896, 1600) - -model = dict( - detector=dict(input_size=img_scale, random_size_range=(20, 36)), - 
tracker=dict( - weight_iou_with_det_scores=False, - match_iou_thrs=dict(high=0.3), - )) - -train_pipeline = [ - dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), - dict( - type='RandomAffine', - scaling_ratio_range=(0.1, 2), - border=(-img_scale[0] // 2, -img_scale[1] // 2)), - dict( - type='MixUp', - img_scale=img_scale, - ratio_range=(0.8, 1.6), - pad_val=114.0), - dict(type='YOLOXHSVRandomAug'), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Resize', img_scale=img_scale, keep_ratio=True), - dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=img_scale, - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[0.0, 0.0, 0.0], - std=[1.0, 1.0, 1.0], - to_rgb=False), - dict( - type='Pad', - size_divisor=32, - pad_val=dict(img=(114.0, 114.0, 114.0))), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) -] -data = dict( - train=dict( - dataset=dict( - ann_file=[ - 'data/MOT20/annotations/train_cocoformat.json', - 'data/crowdhuman/annotations/crowdhuman_train.json', - 'data/crowdhuman/annotations/crowdhuman_val.json' - ], - img_prefix=[ - 'data/MOT20/train', 'data/crowdhuman/train', - 'data/crowdhuman/val' - ]), - pipeline=train_pipeline), - val=dict( - ann_file='data/MOT17/annotations/train_cocoformat.json', - img_prefix='data/MOT17/train', - pipeline=test_pipeline), - test=dict( - ann_file='data/MOT20/annotations/test_cocoformat.json', - img_prefix='data/MOT20/test', - pipeline=test_pipeline)) - -checkpoint_config = dict(interval=1) -evaluation = dict(metric=['bbox', 'track'], interval=1) diff --git a/configs/mot/bytetrack/metafile.yml b/configs/mot/bytetrack/metafile.yml index 0932fdfe4..1859a7049 100644 --- a/configs/mot/bytetrack/metafile.yml +++ b/configs/mot/bytetrack/metafile.yml @@ -7,14 +7,14 @@ Collections: Architecture: - YOLOX Paper: - URL: https://arxiv.org/abs/2110.06864 - Title: ByteTrack Multi-Object Tracking by Associating Every Detection Box + URL: https://arxiv.org/abs/2110.06864 + Title: ByteTrack Multi-Object Tracking by Associating Every Detection Box README: configs/mot/bytetrack/README.md Models: - - Name: bytetrack_yolox_x_crowdhuman_mot17-private-half + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval In Collection: ByteTrack - Config: configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py + Config: configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py Metadata: Training Data: CrowdHuman + MOT17-half-train Results: @@ -25,9 +25,9 @@ Models: IDF1: 79.2 Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth - - Name: bytetrack_yolox_x_crowdhuman_mot17-private + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test In Collection: ByteTrack - Config: configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private.py + Config: configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py Metadata: Training Data: CrowdHuman + MOT17-half-train Results: @@ -38,9 +38,9 @@ Models: IDF1: 74.8 
Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth - - Name: bytetrack_yolox_x_crowdhuman_mot20-private + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test In Collection: ByteTrack - Config: configs/mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot20-private.py + Config: configs/mot/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py Metadata: Training Data: CrowdHuman + MOT20-train Results: diff --git a/configs/mot/deepsort/README.md b/configs/mot/deepsort/README.md index b52f8fe3a..3a05b3145 100644 --- a/configs/mot/deepsort/README.md +++ b/configs/mot/deepsort/README.md @@ -17,14 +17,6 @@ Simple Online and Realtime Tracking (SORT) is a pragmatic approach to multiple o ```latex -@inproceedings{bewley2016simple, - title={Simple online and realtime tracking}, - author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben}, - booktitle={2016 IEEE International Conference on Image Processing (ICIP)}, - pages={3464--3468}, - year={2016}, - organization={IEEE} -} @inproceedings{wojke2017simple, title={Simple online and realtime tracking with a deep association metric}, author={Wojke, Nicolai and Bewley, Alex and Paulus, Dietrich}, @@ -37,17 +29,87 @@ Simple Online and Realtime Tracking (SORT) is a pragmatic approach to multiple o ## Results and models on MOT17 -We implement SORT and DeepSORT with independent detector and ReID models. To train a model by yourself, you need to train a detector following [here](../../det/) and also train a ReID model following [here](../../reid). -The configs in this folder are basically for inference. - -Currently we do not support training ReID models. +Currently we do not support training ReID models for DeepSORT. We directly use the ReID model from [Tracktor](https://github.com/phil-bergmann/tracking_wo_bnw). These missed features will be supported in the future. -| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | -| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :---: | :---: | :---: | :---------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SORT | R50-FasterRCNN-FPN | - | half-train | half-val | Y | 28.3 | 46.0 | 46.6 | 289 | 82451 | 4581 | [config](sort_faster-rcnn_fpn_4e_mot17-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) | -| SORT | R50-FasterRCNN-FPN | - | half-train | half-val | N | 18.6 | 62.0 | 57.8 | 15171 | 40437 | 5841 | [config](sort_faster-rcnn_fpn_4e_mot17-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) | -| DeepSORT | R50-FasterRCNN-FPN | R50 | half-train | half-val | Y | 20.4 | 48.1 | 60.8 | 283 | 82445 | 1199 | [config](deepsort_faster-rcnn_fpn_4e_mot17-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth) | -| DeepSORT | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | 13.8 | 63.8 | 69.6 | 15060 | 40326 | 3183 | [config](deepsort_faster-rcnn_fpn_4e_mot17-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth) | +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DeepSORT | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | 13.8 | 56.9 | 63.7 | 69.5 | 15051 | 40311 | 3312 | [config](deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth) | + +## Get started + +### 1. Training + +We implement DeepSORT with independent detector and ReID models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. + +You can train the detector as follows. + +```shell script +# Training Faster R-CNN on mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. 
+./tools/dist_train.sh \ + configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to know more about the detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell script +# Example 1: Test on motXX-half-val set. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to use your own detector checkpoint, you can change the config as follows. + +```python +model = dict( + detector=dict( + init_cfg=dict( + checkpoint= # noqa: E251 + 'path_to_your_checkpoint.pth' # noqa: E501 + ))) +``` + +Alternatively, you can specify it on the command line as follows. + +```shell script +./tools/dist_test.sh \ + configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 \ + --cfg-options model.detector.init_cfg.checkpoint=path_to_your_checkpoint.pth +``` + +**2.2 Example on MOTxx-test dataset** + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, +please use the following command to generate result files that can be used for submission. +The result files will be stored in `./mot_17_test_res`; you can modify the save path in the `test_evaluator` of the config. + +```shell script +# Example 2: Test on motxx-test set +# The number after the config file represents the number of GPUs used +./tools/dist_test.sh \ + configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py 8 +``` + +If you want to know more about the detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a new video. + +```shell +python demo/demo_mot_vis.py \ + configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` -Note: When running `demo_mot.py`, we suggest you use the config containing `private`, since `private` means the MOT method doesn't need external detections. +If you want to know more about the detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md).
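The `--cfg-options` override shown in the README above works because MMEngine merges dotted keys into the nested config dict. As a rough, self-contained illustration (not part of this patch; it assumes only the standard `mmengine.config.Config` API and reuses the README's `path_to_your_checkpoint.pth` placeholder), the same checkpoint swap can be done programmatically:

```python
# Sketch: reproduce `--cfg-options model.detector.init_cfg.checkpoint=...`
# by merging a dotted-key dict into the loaded DeepSORT config.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/mot/deepsort/'
    'deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')

# Same placeholder checkpoint path as in the README snippet above.
cfg.merge_from_dict(
    {'model.detector.init_cfg.checkpoint': 'path_to_your_checkpoint.pth'})

print(cfg.model.detector.init_cfg.checkpoint)  # verify the override took effect
```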
diff --git a/configs/mot/deepsort/base_aic.py b/configs/mot/deepsort/base_aic.py new file mode 100644 index 000000000..ec613f96d --- /dev/null +++ b/configs/mot/deepsort/base_aic.py @@ -0,0 +1,90 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] +model = dict( + type='DeepSORT', + detector=dict( + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='BaseReID', + data_preprocessor=None, + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 + )), + tracker=dict( + type='SORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/deepsort_biou.py b/configs/mot/deepsort/deepsort_biou.py new file mode 100644 index 000000000..b013b9e4d --- /dev/null +++ b/configs/mot/deepsort/deepsort_biou.py @@ -0,0 +1,96 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['mmtrack.models.trackers.my_sort_tracker'], + allow_failed_imports=False) + +model = dict( + type='DeepSORT', + detector=dict( + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 
'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='BaseReID', + data_preprocessor=None, + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 + )), + tracker=dict( + type='MySORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100, + biou=True)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-public-half.py b/configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-public-half.py deleted file mode 100644 index c89cfb5d3..000000000 --- a/configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-public-half.py +++ /dev/null @@ -1,27 +0,0 @@ -_base_ = ['./deepsort_faster-rcnn_fpn_4e_mot17-private-half.py'] -data_root = 'data/MOT17/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - val=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline), - test=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline)) diff --git a/configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py b/configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py similarity index 74% rename from 
configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py rename to configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py index 202f4506a..9c7c6f669 100644 --- a/configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py +++ b/configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -1,5 +1,5 @@ _base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/models/faster-rcnn_r50_fpn.py', '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' ] model = dict( @@ -16,8 +16,9 @@ motion=dict(type='KalmanFilter', center_only=False), reid=dict( type='BaseReID', + data_preprocessor=None, backbone=dict( - type='ResNet', + type='mmcls.ResNet', depth=50, num_stages=4, out_indices=(3, ), @@ -30,9 +31,8 @@ fc_channels=1024, out_channels=128, num_classes=380, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - loss_pairwise=dict( - type='TripletLoss', margin=0.3, loss_weight=1.0), + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU')), init_cfg=dict( @@ -41,7 +41,7 @@ 'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 )), tracker=dict( - type='SortTracker', + type='SORTTracker', obj_score_thr=0.5, reid=dict( num_samples=10, @@ -52,14 +52,9 @@ momentums=None, num_tentatives=2, num_frames_retain=100)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[3]) -# runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) -search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py new file mode 100644 index 000000000..c8694fefd --- /dev/null +++ b/configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py @@ -0,0 +1,22 @@ +_base_ = [ + './deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] +model = dict( + detector=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 + ))) + +# dataloader +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = dict( + dataset=dict( + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res') diff --git a/configs/mot/deepsort/deepsort_pose.py b/configs/mot/deepsort/deepsort_pose.py new file mode 100644 index 000000000..da0aa91c2 --- /dev/null +++ b/configs/mot/deepsort/deepsort_pose.py @@ -0,0 +1,129 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +custom_imports = dict( + imports=[ + 'mmtrack.models.reid.pose_reid', + ], allow_failed_imports=False) + +model = dict( + type='DeepSORT', + detector=dict( + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + 
bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='PoseReID', + base_reid=dict( + type='BaseReID', + data_preprocessor=None, + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict( + type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict( + type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 + )), + pose_model=dict( + type='TopdownPoseEstimator', + _scope_='mmpose', + data_preprocessor=None, + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2)), + init_cfg=dict( + type='Pretrained', + checkpoint= + 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth' + ), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=True, + ))), + tracker=dict( + type='SORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) + +train_dataloader = None + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/deepsort_reid_aic.py b/configs/mot/deepsort/deepsort_reid_aic.py new file mode 100644 index 000000000..b650875af --- /dev/null +++ b/configs/mot/deepsort/deepsort_reid_aic.py @@ -0,0 +1,75 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['mmtrack.models.reid.my_reid'], allow_failed_imports=False) + +model = dict( + type='DeepSORT', + detector=dict( + 
rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='MyReID', + model_name='osnet_x1_0', + model_path= + '../reid/logs/osnet_x1_0_from_scratch_full_data/model.pth.tar-5', + device='cuda', + ), + tracker=dict( + type='SORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/deepsort_yolo_x_aic.py b/configs/mot/deepsort/deepsort_yolo_x_aic.py new file mode 100644 index 000000000..78eb22796 --- /dev/null +++ b/configs/mot/deepsort/deepsort_yolo_x_aic.py @@ -0,0 +1,72 @@ +_base_ = [ + '../../_base_/models/yolox_x_8x8.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['mmtrack.models.reid.my_reid'], allow_failed_imports=False) + +model = dict( + type='DeepSORT', + detector=dict( + bbox_head=dict(num_classes=1), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint='./checkpoints/detector/epoch_10.pth')), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='MyReID', + model_name='osnet_x1_0', + model_path= + '../reid/logs/osnet_x1_0_from_scratch_full_data/model.pth.tar-5', + device='cuda', + ), + tracker=dict( + type='SORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/metafile.yml b/configs/mot/deepsort/metafile.yml index 2d9e97b86..d5d8a0997 100644 --- a/configs/mot/deepsort/metafile.yml +++ b/configs/mot/deepsort/metafile.yml @@ -1,5 +1,5 @@ Collections: - - Name: SORT + - Name: DeepSORT Metadata: Training Techniques: - SGD with Momentum @@ -8,82 +8,14 @@ Collections: - ResNet - FPN Paper: - URL: https://arxiv.org/abs/1602.00763 - Title: Simple Online and Realtime Tracking + URL: https://arxiv.org/abs/1703.07402 + Title: Simple Online and Realtime Tracking with a Deep Association Metric README: configs/mot/deepsort/README.md Models: - - Name: sort_faster-rcnn_fpn_4e_mot17-public-half - In Collection: SORT - Config: configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public-half.py - Metadata: - Training Data: MOT17-half-train - inference time (ms/im): - - value: 35.3 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640, 1088) - Results: - - Task: Multiple Object Tracking - Dataset: MOT17-half-val - Metrics: - MOTA: 46.0 - IDF1: 46.6 - Weights: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - - - Name: sort_faster-rcnn_fpn_4e_mot17-private-half - In Collection: SORT - Config: configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private-half.py - Metadata: - Training Data: MOT17-half-train - inference time (ms/im): - - value: 53.8 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640, 1088) - Results: - - Task: Multiple Object Tracking - Dataset: MOT17-half-val - Metrics: - MOTA: 62.0 - IDF1: 57.8 - Weights: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - - - Name: deepsort_faster-rcnn_fpn_4e_mot17-public-half - In Collection: SORT - Config: configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-public-half.py - Paper: - URL: https://arxiv.org/abs/1703.07402 - Title: Simple Online and Realtime Tracking with a Deep Association Metric - Metadata: - Training Data: MOT17-half-train - inference time (ms/im): - - value: 49.0 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640, 1088) - Results: - - Task: Multiple Object Tracking - Dataset: MOT17-half-val - Metrics: - MOTA: 48.1 - IDF1: 60.8 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth - - - Name: deepsort_faster-rcnn_fpn_4e_mot17-private-half - In Collection: SORT - Config: configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py - Paper: - URL: https://arxiv.org/abs/1703.07402 - Title: Simple Online and Realtime Tracking with a Deep Association Metric + - Name: deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval + In Collection: DeepSORT + Config: configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py Metadata: Training Data: MOT17-half-train inference time (ms/im): @@ -97,8 +29,9 @@ Models: - Task: Multiple Object Tracking Dataset: 
MOT17-half-val Metrics: - MOTA: 63.8 - IDF1: 69.6 + MOTA: 63.7 + IDF1: 69.5 + HOTA: 56.9 Weights: - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth diff --git a/configs/mot/deepsort/my_config.py b/configs/mot/deepsort/my_config.py new file mode 100644 index 000000000..613ca92c7 --- /dev/null +++ b/configs/mot/deepsort/my_config.py @@ -0,0 +1,129 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +custom_imports = dict( + imports=[ + 'mmtrack.models.mot.my_deep_sort', + 'mmtrack.models.trackers.my_sort_tracker' + ], + allow_failed_imports=False) + +model = dict( + type='MyDeepSORT', + detector=dict( + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter', center_only=False), + reid=dict( + type='BaseReID', + data_preprocessor=None, + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 + )), + pose=dict( + type='TopdownPoseEstimator', + _scope_='mmpose', + data_preprocessor=None, + backbone=dict( + type='ResNet', + depth=50, + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet50'), + ), + head=dict( + type='HeatmapHead', + in_channels=2048, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2)), + init_cfg=dict( + type='Pretrained', + checkpoint= + 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth' + ), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=True, + )), + tracker=dict( + type='MySORTTracker', + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100, + pose=True)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = '../../datasets/AIC23_Track1_MTMC_Tracking/' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='PackTrackInputs', pack_single_img=True) +] + +# dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation_cocoformat_subset_0.2_consec.json', + data_prefix=dict(img_path='validation'), + metainfo=dict(CLASSES=('person', )), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private.py b/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private.py deleted file mode 100644 index 30ecde712..000000000 --- a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = ['./sort_faster-rcnn_fpn_4e_mot17-private-half.py'] -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 - ))) -data_root = 'data/MOT17/' -test_set = 'train' -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - test=dict( - ann_file=data_root + f'annotations/{test_set}_cocoformat.json', - img_prefix=data_root + test_set)) diff --git a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public-half.py b/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public-half.py deleted file mode 100644 index 9fedf2090..000000000 --- a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public-half.py +++ /dev/null @@ -1,27 +0,0 @@ -_base_ = ['./sort_faster-rcnn_fpn_4e_mot17-private-half.py'] -data_root = 'data/MOT17/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - val=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline), - test=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline)) diff --git a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public.py b/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public.py deleted file mode 100644 index c856664dd..000000000 --- a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-public.py +++ /dev/null @@ -1,19 +0,0 @@ -_base_ = ['./sort_faster-rcnn_fpn_4e_mot17-public-half.py'] -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 - ))) -data_root = 'data/MOT17/' -test_set = 'train' -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - detection_file=data_root + 'annotations/train_detections.pkl'), - test=dict( - ann_file=data_root + f'annotations/{test_set}_cocoformat.json', - img_prefix=data_root + test_set, - detection_file=data_root + f'annotations/{test_set}_detections.pkl')) diff --git a/configs/mot/deepsort/test.ipynb b/configs/mot/deepsort/test.ipynb new file 
mode 100644 index 000000000..228bf5fb5 --- /dev/null +++ b/configs/mot/deepsort/test.ipynb @@ -0,0 +1,66 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "ckpt = torch.load(\"../../../checkpoints/res50_coco_256x192-ec54d7f3_20200709.pth\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mmpose_version': '0.1.0+96649df',\n", + " 'config': \"log_level = 'INFO'\\nload_from = None\\nresume_from = None\\ndist_params = dict(backend='nccl')\\nworkflow = [('train', 1)]\\ncheckpoint_config = dict(interval=10)\\nevaluation = dict(interval=8, metric='mAP')\\noptimizer = dict(type='Adam', lr=0.0005)\\noptimizer_config = dict(grad_clip=None)\\nlr_config = dict(\\n policy='step',\\n warmup='linear',\\n warmup_iters=500,\\n warmup_ratio=0.001,\\n step=[170, 200])\\ntotal_epochs = 210\\nlog_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])\\nchannel_cfg = dict(\\n num_output_channels=17,\\n dataset_joints=17,\\n dataset_channel=[[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ]],\\n inference_channel=[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ])\\nmodel = dict(\\n type='TopDown',\\n pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth',\\n backbone=dict(type='ResNet', depth=50),\\n keypoint_head=dict(type='SimpleHead', in_channels=2048, out_channels=17),\\n train_cfg=dict(),\\n test_cfg=dict(\\n flip_test=True,\\n post_process=True,\\n shift_heatmap=True,\\n unbiased_decoding=False,\\n modulate_kernel=11),\\n loss_pose=dict(type='JointsMSELoss', use_target_weight=True))\\ndata_cfg = dict(\\n image_size=[192, 256],\\n heatmap_size=[48, 64],\\n num_output_channels=17,\\n num_joints=17,\\n dataset_channel=[[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ]],\\n inference_channel=[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ],\\n soft_nms=False,\\n nms_thr=1.0,\\n oks_thr=0.9,\\n vis_thr=0.2,\\n bbox_thr=1.0,\\n use_gt_bbox=True,\\n image_thr=0.0,\\n bbox_file=\\n 'pretrained_models/det/COCO_val2017_detections_AP_H_56_person.json')\\ntrain_pipeline = [\\n dict(type='LoadImageFromFile'),\\n dict(type='RandomFlip', flip_prob=0.5),\\n dict(type='HalfBodyTransform', num_joints_half_body=8, prob_half_body=0.3),\\n dict(type='RandomScaleRotation', rot_factor=40, scale_factor=0.5),\\n dict(type='AffineTransform'),\\n dict(type='ToTensor'),\\n dict(\\n type='NormalizeTensor',\\n mean=[0.485, 0.456, 0.406],\\n std=[0.229, 0.224, 0.225]),\\n dict(type='GenerateTarget', sigma=2),\\n dict(\\n type='Collect',\\n keys=['img', 'target', 'target_weight'],\\n meta_keys=[\\n 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',\\n 'rotation', 'bbox_score', 'flip_pairs'\\n ])\\n]\\nvalid_pipeline = [\\n dict(type='LoadImageFromFile'),\\n dict(type='AffineTransform'),\\n dict(type='ToTensor'),\\n dict(\\n type='NormalizeTensor',\\n mean=[0.485, 0.456, 0.406],\\n std=[0.229, 0.224, 0.225]),\\n dict(\\n type='Collect',\\n keys=['img'],\\n meta_keys=[\\n 'image_file', 'center', 'scale', 'rotation', 'bbox_score',\\n 'flip_pairs'\\n ])\\n]\\ndata_root = 'data/coco'\\ndata = dict(\\n samples_per_gpu=64,\\n workers_per_gpu=2,\\n train=dict(\\n type='TopDownCocoDataset',\\n ann_file='data/coco/annotations/person_keypoints_train2017.json',\\n img_prefix='data/coco/train2017/',\\n data_cfg=dict(\\n image_size=[192, 256],\\n heatmap_size=[48, 64],\\n 
num_output_channels=17,\\n num_joints=17,\\n dataset_channel=[[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ]],\\n inference_channel=[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ],\\n soft_nms=False,\\n nms_thr=1.0,\\n oks_thr=0.9,\\n vis_thr=0.2,\\n bbox_thr=1.0,\\n use_gt_bbox=True,\\n image_thr=0.0,\\n bbox_file=\\n 'pretrained_models/det/COCO_val2017_detections_AP_H_56_person.json'\\n ),\\n pipeline=[\\n dict(type='LoadImageFromFile'),\\n dict(type='RandomFlip', flip_prob=0.5),\\n dict(\\n type='HalfBodyTransform',\\n num_joints_half_body=8,\\n prob_half_body=0.3),\\n dict(type='RandomScaleRotation', rot_factor=40, scale_factor=0.5),\\n dict(type='AffineTransform'),\\n dict(type='ToTensor'),\\n dict(\\n type='NormalizeTensor',\\n mean=[0.485, 0.456, 0.406],\\n std=[0.229, 0.224, 0.225]),\\n dict(type='GenerateTarget', sigma=2),\\n dict(\\n type='Collect',\\n keys=['img', 'target', 'target_weight'],\\n meta_keys=[\\n 'image_file', 'joints_3d', 'joints_3d_visible', 'center',\\n 'scale', 'rotation', 'bbox_score', 'flip_pairs'\\n ])\\n ]),\\n val=dict(\\n type='TopDownCocoDataset',\\n ann_file='data/coco/annotations/person_keypoints_val2017.json',\\n img_prefix='data/coco/val2017/',\\n data_cfg=dict(\\n image_size=[192, 256],\\n heatmap_size=[48, 64],\\n num_output_channels=17,\\n num_joints=17,\\n dataset_channel=[[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ]],\\n inference_channel=[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ],\\n soft_nms=False,\\n nms_thr=1.0,\\n oks_thr=0.9,\\n vis_thr=0.2,\\n bbox_thr=1.0,\\n use_gt_bbox=True,\\n image_thr=0.0,\\n bbox_file=\\n 'pretrained_models/det/COCO_val2017_detections_AP_H_56_person.json'\\n ),\\n pipeline=[\\n dict(type='LoadImageFromFile'),\\n dict(type='AffineTransform'),\\n dict(type='ToTensor'),\\n dict(\\n type='NormalizeTensor',\\n mean=[0.485, 0.456, 0.406],\\n std=[0.229, 0.224, 0.225]),\\n dict(\\n type='Collect',\\n keys=['img'],\\n meta_keys=[\\n 'image_file', 'center', 'scale', 'rotation', 'bbox_score',\\n 'flip_pairs'\\n ])\\n ]),\\n test=dict(\\n type='TopDownCocoDataset',\\n ann_file='data/coco/annotations/person_keypoints_val2017.json',\\n img_prefix='data/coco/val2017/',\\n data_cfg=dict(\\n image_size=[192, 256],\\n heatmap_size=[48, 64],\\n num_output_channels=17,\\n num_joints=17,\\n dataset_channel=[[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ]],\\n inference_channel=[\\n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16\\n ],\\n soft_nms=False,\\n nms_thr=1.0,\\n oks_thr=0.9,\\n vis_thr=0.2,\\n bbox_thr=1.0,\\n use_gt_bbox=True,\\n image_thr=0.0,\\n bbox_file=\\n 'pretrained_models/det/COCO_val2017_detections_AP_H_56_person.json'\\n ),\\n pipeline=[\\n dict(type='LoadImageFromFile'),\\n dict(type='AffineTransform'),\\n dict(type='ToTensor'),\\n dict(\\n type='NormalizeTensor',\\n mean=[0.485, 0.456, 0.406],\\n std=[0.229, 0.224, 0.225]),\\n dict(\\n type='Collect',\\n keys=['img'],\\n meta_keys=[\\n 'image_file', 'center', 'scale', 'rotation', 'bbox_score',\\n 'flip_pairs'\\n ])\\n ]))\\nwork_dir = 'work_dirs/res50_coco_256x192/'\\ngpu_ids = range(0, 1)\\nseed = None\\n\",\n", + " 'epoch': 210,\n", + " 'iter': 61320,\n", + " 'mmcv_version': '1.0rc0',\n", + " 'time': 'Wed Jul 8 17:24:25 2020'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ckpt['meta']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mot-mmtrack", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "e4bfa47eb115d085a5fd36886584e2476f8ebafa1f4dedfed6cf2234c0e3adec" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/configs/mot/ocsort/ocsort_faster-rcnn_r50_fpn.py b/configs/mot/ocsort/ocsort_faster-rcnn_r50_fpn.py new file mode 100644 index 000000000..fc4bed242 --- /dev/null +++ b/configs/mot/ocsort/ocsort_faster-rcnn_r50_fpn.py @@ -0,0 +1,41 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50_fpn.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +model = dict( + type='OCSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=dict( + rpn_head=dict(bbox_coder=dict(clip_border=False)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=False), num_classes=1)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter'), + tracker=dict( + type='OCSORTTracker', + obj_score_thr=0.3, + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thr=0.3, + num_tentatives=3, + vel_consist_weight=0.2, + vel_delta_t=3, + num_frames_retain=30)) + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/mot/ocsort/ocsort_yolox_x_crowdhuman_mot17-private-half.py b/configs/mot/ocsort/ocsort_yolox_x_crowdhuman_mot17-private-half.py new file mode 100644 index 000000000..36d7f52e6 --- /dev/null +++ b/configs/mot/ocsort/ocsort_yolox_x_crowdhuman_mot17-private-half.py @@ -0,0 +1,216 @@ +_base_ = [ + '../../_base_/models/yolox_x_8x8.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +dataset_type = 'MOTChallengeDataset' +data_root = 'data/MOT17/' + +img_scale = (800, 1440) +batch_size = 4 + +model = dict( + type='OCSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=dict( + _scope_='mmdet', + bbox_head=dict(num_classes=1), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 + )), + motion=dict(type='KalmanFilter'), + tracker=dict( + type='OCSORTTracker', + obj_score_thr=0.3, + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thr=0.3, + num_tentatives=3, + vel_consist_weight=0.2, + vel_delta_t=3, + num_frames_retain=30)) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=False), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=False), + dict( + type='MixUp', + img_scale=img_scale, + 
ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=False), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='Resize', + img_scale=img_scale, + keep_ratio=True, + bbox_clip_border=False), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackTrackInputs', pack_single_img=True) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='PackTrackInputs', pack_single_img=True) +] + +train_dataloader = dict( + _delete_=True, + batch_size=batch_size, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='mmdet.MultiImageMixDataset', + dataset=dict( + type='mmdet.ConcatDataset', + datasets=[ + dict( + type='mmdet.CocoDataset', + data_root='data/MOT17', + ann_file='annotations/half-train_cocoformat.json', + # TODO: mmdet use img as key, but img_path is needed + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + dict( + type='mmdet.CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(CLASSES=('pedestrian')), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations'), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) + +# optimizer +# default 8 gpu +lr = 0.001 / 8 * batch_size +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', lr=lr, momentum=0.9, weight_decay=5e-4, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# some hyper parameters +# training settings +total_epochs = 80 +num_last_epochs = 10 +resume_from = None +interval = 5 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# learning policy +param_scheduler = [ + dict( + # use quadratic formula to warm up 1 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + # use cosine lr from 1 to 70 epoch + type='mmdet.CosineAnnealingLR', + eta_min=lr * 0.05, + begin=1, + T_max=total_epochs - num_last_epochs, + end=total_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 10 epochs + 
type='mmdet.ConstantLR', + by_epoch=True, + factor=1, + begin=total_epochs - num_last_epochs, + end=total_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='mmdet.EMAHook', + ema_type='mmdet.ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] +default_hooks = dict(checkpoint=dict(interval=1)) +# evaluator +val_evaluator = dict(postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) +]) +test_evaluator = val_evaluator + diff --git a/configs/mot/qdtrack/README.md b/configs/mot/qdtrack/README.md index 895ddd30e..e675f34e2 100644 --- a/configs/mot/qdtrack/README.md +++ b/configs/mot/qdtrack/README.md @@ -29,7 +29,114 @@ Similarity learning has been recognized as a crucial step for object tracking. H ## Results and models on MOT17 -| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | -| :-----: | :----------: | :---------------------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :---: | :---: | :-----------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| QDTrack | Faster R-CNN | half-train | half-val | N | - | 57.1 | 68.2 | 68.5 | 8373 | 42939 | 1071 | [config](qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635.log.json) | -| QDTrack | Faster R-CNN | CrowdHuman + half-train | half-val | N | - | 59.1 | 71.7 | 71.6 | 6072 | 38733 | 867 | [config](qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453-68899b0a.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453.log.json) | +| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | +| :-----: | :----------: | :---------------------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :---: | :---: | :------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| QDTrack | Faster R-CNN | half-train | half-val | N | - | 57.1 | 68.2 | 68.5 | 8373 | 42939 | 1071 | [config](qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635.log.json) | +| QDTrack | Faster R-CNN | CrowdHuman + half-train | half-val | N | - | 59.1 | 71.7 | 71.6 | 6072 | 38733 | 867 | [config](qdtrack_faster-rcnn_r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453-68899b0a.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453.log.json) | + +## Results and models on LVIS dataset + +| Method | Detector | Train Set | Test Set | Inf time (fps) | AP | AP50 | AP75 | AP_S | AP_M | AP_L | Config | Download | +| :-----: | :----------: | :---------------: | :------------: | :------------: | :--: | :--: | :--: | :--: | :--: | :--: | :--------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| QDTrack | Faster R-CNN | LVISv0.5+COCO2017 | TAO validation | - | 17.2 | 28.6 | 17.7 | 5.3 | 13.0 | 22.1 | [config](qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513.log.json) | + +## Results and models on TAO dataset + +Note: If you want to achieve a track AP of 11.0 on the TAO dataset, you need to do pre-training on LVIS dataset. + +a. Pre-train the QDTrack on LVISv0.5+COCO2017 training set and save the model to `checkpoints/lvis/**.pth`. + +The pre-trained checkpoint is given above([model](https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth)). + +b. Modify the configs for TAO accordingly(set `load_from` to your **ckpt path**). + +See `1.2 Example on TAO Dataset` to get more details. + +We observe around 0.5 track AP fluctuations in performance, and provide the best model. 
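In config terms, step b amounts to pointing `load_from` at the checkpoint saved in step a. A minimal sketch (the path below simply follows the `checkpoints/lvis/` convention from step a together with the released checkpoint name; substitute your own file):

```python
# In configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py
# (or pass it via --cfg-options on the command line, see 1.2 below):
load_from = 'checkpoints/lvis/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth'
```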
+ +| Method | Detector | Train Set | Test Set | Inf time (fps) | Track AP(50:75) | Track AP50 | Track AP75 | Config | Download | +| :-----: | :----------: | :-------: | :------------: | :------------: | :-------------: | :--------: | :--------: | :----------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| QDTrack | Faster R-CNN | TAO train | TAO validation | - | 11.0 | 15.8 | 6.1 | [config](qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_12e_tao_20220613_211934-7cbf4062.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_12e_tao_20220613_211934.log.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +**1.1 Example on MOT Challenge Dataset** + +```shell +# Training QDTrack on crowdhuman and mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +**1.2 Example on TAO Dataset** + +- a. Pre-train the QDTrack on LVISv0.5+COCO2017 training set and save the model to `checkpoints/lvis/**.pth`. + +```shell +./tools/dist_train.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py 8 +``` + +- b. Modify the configs for TAO accordingly(set `load_from` to your **ckpt path**). + +```shell +./tools/dist_train.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py 8 \ + --cfg-options load_from=checkpoints/lvis/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell +# Example 1: Test on motXX-half-val set +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 \ + --checkpoint ./checkpoints/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453-68899b0a.pth +``` + +**2.2 Example on TAO dataset** + +Note that the previous section `Results and models on TAO dataset` is evaluated using this command. + +```shell +# Example 2: Test on TAO dataset +# The number after config file represents the number of GPUs used. +./tools/dist_test.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py 8 \ + --checkpoint ./checkpoints/qdtrack_faster-rcnn_r101_fpn_12e_tao_20220613_211934-7cbf4062.pth +``` + +In addition, you can use the following command to check the results of the previous section `Results and models on LVIS dataset`. + +```shell +# Please note that their test sets are the same, only the training sets are different. 
+./tools/dist_test.sh \ + configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py 8 \ + --checkpoint ./checkpoints/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth +``` + +If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_mot_vis.py \ + configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py \ + --checkpoint ./checkpoints/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453-68899b0a.pth \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` + +If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/mot/qdtrack/metafile.yml b/configs/mot/qdtrack/metafile.yml index 32cc2a17e..bde1518f3 100644 --- a/configs/mot/qdtrack/metafile.yml +++ b/configs/mot/qdtrack/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/pdf/2006.06664.pdf - Title: Quasi-Dense Similarity Learning for Multiple Object Tracking + URL: https://arxiv.org/pdf/2006.06664.pdf + Title: Quasi-Dense Similarity Learning for Multiple Object Tracking README: configs/mot/qdtrack/README.md Models: - - Name: qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half + - Name: qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval In Collection: QDTrack - Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py + Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py Metadata: Training Data: MOT17 Training Memory (GB): 5.83 @@ -29,9 +29,9 @@ Models: IDF1: 68.5 Weights: https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth - - Name: qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half + - Name: qdtrack_faster-rcnn_r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval In Collection: QDTrack - Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py + Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py Metadata: Training Data: MOT17, crowdhuman Training Memory (GB): 6.31 @@ -44,3 +44,35 @@ Models: MOTA: 71.7 IDF1: 71.6 Weights: https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17_20220315_163453-68899b0a.pth + + - Name: qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao + In Collection: QDTrack + Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py + Metadata: + Training Data: LVIS + Training Memory (GB): 9.78 + Epochs: 24 + Results: + - Task: Multi-object Tracking + Dataset: TAO + Metrics: + AP: 17.2 + AP50: 28.6 + AP75: 17.7 + Weights: https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_24e_lvis_20220430_024513-88911daf.pth + + - Name: qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao + In Collection: QDTrack + Config: configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py + Metadata: + Training Data: TAO + Training Memory (GB): 1.36 + Epochs: 12 + Results: + - Task: Multi-object Tracking + Dataset: TAO + Metrics: + Track AP: 11.0 + Track AP50: 15.8 + Track AP75: 6.1 + Weights: 
https://download.openmmlab.com/mmtracking/mot/qdtrack/tao_dataset/qdtrack_faster-rcnn_r101_fpn_12e_tao_20220613_211934-7cbf4062.pth diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py b/configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..3d277166a --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn-r50_fpn_8xb2-4e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,69 @@ +_base_ = [ + './qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_' + 'test-mot17halfval.py' +] + +# data pipeline +train_pipeline = [ + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=True), + dict( + type='mmdet.RandomResize', + resize_type='mmdet.Resize', + scale=(1088, 1088), + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type='mmdet.PhotoMetricDistortion') + ]), + dict( + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict( + type='mmdet.RandomCrop', + crop_size=(1088, 1088), + bbox_clip_border=False) + ]), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='mmdet.RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs', ref_prefix='ref') +] +mot_cfg = dict( + type='MOTChallengeDataset', + data_root='data/MOT17', + metainfo=dict(CLASSES=('pedestrian')), + visibility_thr=-1, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=dict( + num_ref_imgs=1, frame_range=10, filter_key_img=True, method='uniform'), + pipeline=train_pipeline) +crowdhuman_cfg = dict( + type='BaseVideoDataset', + data_root='data/crowdhuman', + load_as_video=False, + metainfo=dict(CLASSES=('pedestrian')), + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=dict(num_ref_imgs=1, frame_range=0), + pipeline=train_pipeline) + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[mot_cfg, crowdhuman_cfg])) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_12e_tao.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_12e_tao.py deleted file mode 100644 index a9d2b4ea4..000000000 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_12e_tao.py +++ /dev/null @@ -1,24 +0,0 @@ -# model settings -_base_ = ['./qdtrack_faster-rcnn_r101_fpn_24e_lvis.py'] -model = dict(freeze_detector=True) -data_root = 'data/tao/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( - dataset=dict( - classes=data_root + 'annotations/tao_classes.txt', - ann_file=data_root + 'annotations/train_482_classes.json', - img_prefix=data_root + 'train/', - load_as_video=True, - key_img_sampler=dict(interval=1), - ref_img_sampler=dict( - num_ref_imgs=1, frame_range=[-1, 1], method='uniform')))) -# learning policy -lr_config = dict(step=[8, 11]) -total_epochs = 12 -optimizer = dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=0.0001) -load_from = None -resume_from = None -evaluation = dict(metric=['track'], start=1, interval=1) -work_dir = None diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_24e_lvis.py 
b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_24e_lvis.py deleted file mode 100644 index 4a3dc6cd0..000000000 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_24e_lvis.py +++ /dev/null @@ -1,47 +0,0 @@ -# model settings -_base_ = [ - './qdtrack_faster-rcnn_r50_fpn_4e_base.py', '../../_base_/datasets/tao.py' -] -model = dict( - type='QDTrack', - detector=dict( - backbone=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet101')), - roi_head=dict( - bbox_head=dict( - loss_bbox=dict(type='L1Loss', loss_weight=1.0), - num_classes=482)), - test_cfg=dict( - rcnn=dict( - score_thr=0.0001, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=300)), - init_cfg=None), - tracker=dict( - _delete_=True, - type='QuasiDenseTAOTracker', - init_score_thr=0.0001, - obj_score_thr=0.0001, - match_score_thr=0.5, - memo_frames=10, - memo_momentum=0.8, - momentum_obj_score=0.5, - obj_score_diff_thr=1.0, - distractor_nms_thr=0.3, - distractor_score_thr=0.5, - match_metric='bisoftmax', - match_with_cosine=True)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=1000, - warmup_ratio=1.0 / 1000, - step=[16, 22]) -total_epochs = 24 -load_from = None -resume_from = None -evaluation = dict(metric=['bbox'], start=16, interval=2) -work_dir = None diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py new file mode 100644 index 000000000..1be896a2c --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-12e_tao.py @@ -0,0 +1,50 @@ +# model settings +_base_ = ['./qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py'] + +model = dict(freeze_detector=True) + +data_root = 'data/tao/' +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'), + dataset=dict( + dataset=dict( + data_root=data_root, + data_prefix=dict(img_path=''), + load_as_video=True, + metainfo=dict(CLASSES=(data_root + 'annotations/tao_classes.txt')), + ann_file='annotations/train_482_classes.json', + ref_img_sampler=dict( + num_ref_imgs=1, frame_range=[-1, 1], method='uniform')))) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='mmdet.MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11]) +] +# runtime settings +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=12, val_begin=0, val_interval=1) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=0.0001), + clip_grad=None) + +val_evaluator = dict(type='TAOMetric', metric=['tao_track_ap']) +test_evaluator = val_evaluator +load_from = None diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py new file mode 100644 index 000000000..85cc0314c --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r101_fpn_8xb2-24e_lvis_test-tao.py @@ -0,0 +1,74 @@ +# model settings +_base_ = [ + './qdtrack_faster-rcnn_r50_fpn_4e_base.py', '../../_base_/datasets/tao.py' +] +model = dict( + type='QDTrack', + data_preprocessor=dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + detector=dict( + backbone=dict( + depth=101, + 
norm_cfg=dict(requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet101')), + rpn_head=dict(bbox_coder=dict(clip_border=True)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict( + clip_border=True), num_classes=482)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=300)), + init_cfg=None), + track_head=dict(train_cfg=dict(assigner=dict(neg_iou_thr=0.3))), + tracker=dict( + _delete_=True, + type='QuasiDenseTAOTracker', + init_score_thr=0.0001, + obj_score_thr=0.0001, + match_score_thr=0.5, + memo_frames=10, + memo_momentum=0.8, + momentum_obj_score=0.5, + obj_score_diff_thr=1.0, + distractor_nms_thr=0.3, + distractor_score_thr=0.5, + match_metric='bisoftmax', + match_with_cosine=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='mmdet.MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22]) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=None) + +# runtime settings +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=16, val_interval=2) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# evaluator +val_evaluator = dict(type='TAOMetric', metric=['bbox']) +test_evaluator = val_evaluator diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py index 4c44880ae..2c393616c 100644 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py @@ -1,15 +1,24 @@ _base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/models/faster-rcnn_r50_fpn.py', '../../_base_/default_runtime.py' ] model = dict( type='QDTrack', + data_preprocessor=dict( + _delete_=True, + type='TrackDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + # TODO: it is different from the master branch + bgr_to_rgb=True, + pad_size_divisor=32), detector=dict( backbone=dict( norm_cfg=dict(requires_grad=False), style='caffe', init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet50')), + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), rpn_head=dict(bbox_coder=dict(clip_border=False)), roi_head=dict( bbox_head=dict( @@ -19,11 +28,15 @@ init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_1x_coco-person/' + 'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' + # noqa: E501 )), track_head=dict( type='QuasiDenseTrackHead', roi_extractor=dict( + _scope_='mmdet', type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, @@ -45,6 +58,7 @@ loss_bbox=dict(type='L1Loss', loss_weight=1.0), train_cfg=dict( assigner=dict( + _scope_='mmdet', type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.5, @@ -52,6 +66,7 @@ match_low_quality=False, ignore_iof_thr=-1), sampler=dict( + _scope_='mmdet', type='CombinedSampler', num=256, pos_fraction=0.5, @@ -72,10 +87,22 @@ nms_class_iou_thr=0.7, with_cats=True, 
match_metric='bisoftmax')) -# optimizer && learning policy -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -lr_config = dict(policy='step', step=[3]) +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) +# learning policy +param_scheduler = [ + dict( + type='mmdet.MultiStepLR', + begin=0, + end=4, + by_epoch=True, + milestones=[3]) +] + # runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py deleted file mode 100644 index f513739e4..000000000 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py +++ /dev/null @@ -1,52 +0,0 @@ -_base_ = ['./qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict( - type='SeqResize', - img_scale=(1088, 1088), - share_params=True, - ratio_range=(0.8, 1.2), - keep_ratio=True, - bbox_clip_border=False), - dict(type='SeqPhotoMetricDistortion', share_params=True), - dict( - type='SeqRandomCrop', - share_params=False, - crop_size=(1088, 1088), - bbox_clip_border=False), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), - dict( - type='VideoCollect', - keys=[ - 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', - 'gt_instance_ids' - ]), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') -] -mot_cfg = dict( - type='MOTChallengeDataset', - classes=('pedestrian', ), - visibility_thr=-1, - ann_file='data/MOT17/annotations/half-train_cocoformat.json', - img_prefix='data/MOT17/train', - ref_img_sampler=dict(num_ref_imgs=1, frame_range=10, method='uniform'), - pipeline=train_pipeline) -crowdhuman_cfg = dict( - type='CocoVideoDataset', - load_as_video=False, - classes=('pedestrian', ), - ann_file='data/crowdhuman/annotations/crowdhuman_train.json', - img_prefix='data/crowdhuman/train', - pipeline=train_pipeline) -data = dict( - train=dict( - _delete_=True, - type='ConcatDataset', - datasets=[mot_cfg, crowdhuman_cfg], - saparate_eval=False)) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py deleted file mode 100644 index 2d6a21bb2..000000000 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_dancetrack.py +++ /dev/null @@ -1,53 +0,0 @@ -_base_ = [ - './qdtrack_faster-rcnn_r50_fpn_4e_base.py', - '../../_base_/datasets/dancetrack.py', -] -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict( - type='SeqResize', - img_scale=(1088, 1088), - share_params=True, - ratio_range=(0.8, 1.2), - keep_ratio=True, - bbox_clip_border=False), - 
dict(type='SeqPhotoMetricDistortion', share_params=True), - dict( - type='SeqRandomCrop', - share_params=False, - crop_size=(1088, 1088), - bbox_clip_border=False), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), - dict( - type='VideoCollect', - keys=[ - 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', - 'gt_instance_ids' - ]), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) -] -data = dict( - train=dict(pipeline=train_pipeline), - val=dict(pipeline=test_pipeline), - test=dict(pipeline=test_pipeline)) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py deleted file mode 100644 index d38ab84c4..000000000 --- a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py +++ /dev/null @@ -1,53 +0,0 @@ -_base_ = [ - './qdtrack_faster-rcnn_r50_fpn_4e_base.py', - '../../_base_/datasets/mot_challenge.py', -] -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -train_pipeline = [ - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True), - dict( - type='SeqResize', - img_scale=(1088, 1088), - share_params=True, - ratio_range=(0.8, 1.2), - keep_ratio=True, - bbox_clip_border=False), - dict(type='SeqPhotoMetricDistortion', share_params=True), - dict( - type='SeqRandomCrop', - share_params=False, - crop_size=(1088, 1088), - bbox_clip_border=False), - dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5), - dict(type='SeqNormalize', **img_norm_cfg), - dict(type='SeqPad', size_divisor=32), - dict(type='MatchInstances', skip_nomatch=True), - dict( - type='VideoCollect', - keys=[ - 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices', - 'gt_instance_ids' - ]), - dict(type='SeqDefaultFormatBundle', ref_prefix='ref') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img']) - ]) -] -data = dict( - train=dict(pipeline=train_pipeline), - val=dict(pipeline=test_pipeline), - test=dict(pipeline=test_pipeline)) diff --git a/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..7245a261a --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,12 @@ +_base_ = [ + './qdtrack_faster-rcnn_r50_fpn_4e_base.py', + '../../_base_/datasets/mot_challenge.py', +] + +# evaluator +val_evaluator = [ + dict(type='CocoVideoMetric', metric=['bbox'], classwise=True), + dict(type='MOTChallengeMetrics', 
metric=['HOTA', 'CLEAR', 'Identity']) +] + +test_evaluator = val_evaluator diff --git a/configs/mot/qdtrack/qdtrack_yolox_x.py b/configs/mot/qdtrack/qdtrack_yolox_x.py new file mode 100644 index 000000000..8e4e5292f --- /dev/null +++ b/configs/mot/qdtrack/qdtrack_yolox_x.py @@ -0,0 +1,181 @@ +_base_ = [ + '../../_base_/models/yolox_x_8x8.py', + '../../_base_/default_runtime.py', + '../../_base_/datasets/mot_challenge.py' +] + +img_scale = (800, 1040) +strides = [8, 16, 32] + +model = dict( + type='QDTrackSSTG', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=dict( + _scope_='mmdet', + bbox_head=dict(num_classes=1), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 + )), + track_head=dict( + type='QuasiDenseTrackHead', + # roi_extractor=dict( + # _scope_='mmdet', + # type='GenericRoIExtractor', + # aggregation='concat', + # roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + # out_channels=1792, # 256 + 512 + 1024 + # featmap_strides=strides), + roi_extractor=dict( + _scope_='mmdet', + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=320, + featmap_strides=strides), + embed_head=dict( + type='QuasiDenseEmbedHead', + num_convs=4, + num_fcs=1, + in_channels=320, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + train_cfg=dict( + assigner=dict( + _scope_='mmdet', + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + _scope_='mmdet', + type='CombinedSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict(type='RandomSampler')))), + tracker=dict( + type='QuasiDenseTracker', + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax'), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) + +# optimizer +lr = 0.001 +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=lr, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# some hyper parameters +# training settings 
+total_epochs = 80 +num_last_epochs = 10 +resume_from = None +interval = 5 + +# learning policy +param_scheduler = [ + dict( + # use quadratic formula to warm up 1 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + # use cosine lr from 1 to 70 epoch + type='mmdet.CosineAnnealingLR', + eta_min=lr * 0.05, + begin=1, + T_max=total_epochs - num_last_epochs, + end=total_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 10 epochs + type='mmdet.ConstantLR', + by_epoch=True, + factor=1, + begin=total_epochs - num_last_epochs, + end=total_epochs, + ) +] + +# runtime settings +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +custom_hooks = [ + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='mmdet.EMAHook', + ema_type='mmdet.ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] +default_hooks = dict(checkpoint=dict(interval=1)) \ No newline at end of file diff --git a/configs/mot/sort/README.md b/configs/mot/sort/README.md new file mode 100644 index 000000000..cf5d8c41f --- /dev/null +++ b/configs/mot/sort/README.md @@ -0,0 +1,112 @@ +# Simple online and realtime tracking + +## Abstract + + + +This paper explores a pragmatic approach to multiple object tracking where the main focus is to associate objects efficiently for online and realtime applications. To this end, detection quality is identified as a key factor influencing tracking performance, where changing the detector can improve tracking by up to 18.9%. Despite only using a rudimentary combination of familiar techniques such as the Kalman Filter and Hungarian algorithm for the tracking components, this approach achieves an accuracy comparable to state-of-the-art online trackers. Furthermore, due to the simplicity of our tracking method, the tracker updates at a rate of 260 Hz which is over 20x faster than other state-of-the-art trackers. + + + +
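For readers new to SORT, the association step the abstract refers to boils down to building an IoU cost matrix between Kalman-predicted track boxes and the current detections, then solving it with the Hungarian algorithm. The sketch below is a framework-agnostic illustration (NumPy/SciPy only) and is not MMTracking code; the tracker actually used by the configs in this folder is `SORTTracker`.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment


def iou_matrix(tracks, dets):
    """Pairwise IoU between predicted track boxes and detections (x1, y1, x2, y2)."""
    ious = np.zeros((len(tracks), len(dets)))
    for i, t in enumerate(tracks):
        for j, d in enumerate(dets):
            x1, y1 = max(t[0], d[0]), max(t[1], d[1])
            x2, y2 = min(t[2], d[2]), min(t[3], d[3])
            inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
            union = ((t[2] - t[0]) * (t[3] - t[1]) +
                     (d[2] - d[0]) * (d[3] - d[1]) - inter)
            ious[i, j] = inter / union if union > 0 else 0.0
    return ious


def associate(tracks, dets, iou_thr=0.3):
    """Hungarian matching on (1 - IoU); keep pairs above the IoU threshold."""
    if len(tracks) == 0 or len(dets) == 0:
        return []
    cost = 1.0 - iou_matrix(tracks, dets)
    rows, cols = linear_sum_assignment(cost)  # minimizes total matching cost
    return [(r, c) for r, c in zip(rows, cols) if 1.0 - cost[r, c] >= iou_thr]
```

Unmatched detections would start new tracklets and unmatched tracks are dropped after a short grace period, which is essentially all the bookkeeping SORT adds on top of this step.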
+ +## Citation + + + +```latex +@inproceedings{bewley2016simple, + title={Simple online and realtime tracking}, + author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben}, + booktitle={2016 IEEE International Conference on Image Processing (ICIP)}, + pages={3464--3468}, + year={2016}, + organization={IEEE} +} +``` + +## Results and models on MOT17 + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :----: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------: | +| SORT | R50-FasterRCNN-FPN | - | half-train | half-val | N | 18.6 | 52.0 | 62.0 | 57.8 | 15150 | 40410 | 5847 | [config](sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) | + +## Get started + +### 1. Training + +We implement SORT with independent detector models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. + +You can train the detector as follows. + +```shell script +# Training Faster R-CNN on mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell script +# Example 1: Test on motXX-half-val set. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to use your own detector checkpoint, you can change the cfg as follows. + +```shell script +model = dict( + detector=dict( + init_cfg=dict( + checkpoint= # noqa: E251 + 'path_to_your_checkpoint.pth' # noqa: E501 + ))) +``` + +Or, you can specify it in commands as follows. + +```shell script +./tools/dist_test.sh \ + configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 \ + --cfg-options model.detector.init_cfg.checkpoint=path_to_your_checkpoint.pth +``` + +**2.2 Example on MOTxx-test dataset** + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, +please use the following command to generate result files that can be used for submission. +It will be stored in `./mot_17_test_res`, you can modify the saved path in `test_evaluator` of the config. + +```shell script +# Example 2: Test on motxx-test set +# The number after config file represents the number of GPUs used +./tools/dist_test.sh \ + configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py 8 +``` + +If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). 
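As mentioned in 2.2, the submission files default to `./mot_17_test_res`; changing the dump location is a one-line evaluator override. A hedged example (the output path below is only an illustration):

```python
# In sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py,
# or via `--cfg-options test_evaluator.outfile_prefix=...` on the command line.
# `format_only=True` skips metric computation and only writes the result files.
test_evaluator = dict(
    format_only=True, outfile_prefix='./work_dirs/mot17_test_submission')
```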
+ +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_mot_vis.py \ + configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` + +If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/mot/sort/metafile.yml b/configs/mot/sort/metafile.yml new file mode 100644 index 000000000..928a90bd9 --- /dev/null +++ b/configs/mot/sort/metafile.yml @@ -0,0 +1,35 @@ +Collections: + - Name: SORT + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - FPN + Paper: + URL: https://arxiv.org/abs/1602.00763 + Title: Simple Online and Realtime Tracking + README: configs/mot/sort/README.md + +Models: + - Name: sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval + In Collection: SORT + Config: configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: MOT17-half-train + inference time (ms/im): + - value: 53.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640, 1088) + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + MOTA: 62.0 + IDF1: 57.8 + HOTA: 52.0 + Weights: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth diff --git a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private-half.py b/configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py similarity index 60% rename from configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private-half.py rename to configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py index 7573565eb..484b2f686 100644 --- a/configs/mot/deepsort/sort_faster-rcnn_fpn_4e_mot17-private-half.py +++ b/configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -1,5 +1,5 @@ _base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/models/faster-rcnn_r50_fpn.py', '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' ] model = dict( @@ -15,15 +15,10 @@ )), motion=dict(type='KalmanFilter', center_only=False), tracker=dict( - type='SortTracker', obj_score_thr=0.5, match_iou_thr=0.5, reid=None)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[3]) -# runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) -search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] + type='SORTTracker', obj_score_thr=0.5, match_iou_thr=0.5, reid=None)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py new file mode 100644 index 000000000..aaddeb210 --- /dev/null +++ b/configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py @@ -0,0 +1,22 @@ +_base_ = [ + './sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] +model = dict( + detector=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 
'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 + ))) + +# dataloader +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = dict( + dataset=dict( + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res') diff --git a/configs/mot/strongsort/README.md b/configs/mot/strongsort/README.md new file mode 100644 index 000000000..1308f7a96 --- /dev/null +++ b/configs/mot/strongsort/README.md @@ -0,0 +1,130 @@ +# StrongSORT: Make DeepSORT Great Again + +## Abstract + + + +Existing Multi-Object Tracking (MOT) methods can be roughly classified as tracking-by-detection and joint-detection-association paradigms. Although the latter has elicited more attention and demonstrates comparable performance relative to the former, we claim that the tracking-by-detection paradigm is still the optimal solution in terms of tracking accuracy. In this paper, we revisit the classic tracker DeepSORT and upgrade it from various aspects, i.e., detection, embedding and association. The resulting tracker, called StrongSORT, sets new HOTA and IDF1 records on MOT17 and MOT20. We also present two lightweight and plug-and-play algorithms to further refine the tracking results. Firstly, an appearance-free link model (AFLink) is proposed to associate short tracklets into complete trajectories. To the best of our knowledge, this is the first global link model without appearance information. Secondly, we propose Gaussian-smoothed interpolation (GSI) to compensate for missing detections. Instead of ignoring motion information like linear interpolation, GSI is based on the Gaussian process regression algorithm and can achieve more accurate localizations. Moreover, AFLink and GSI can be plugged into various trackers with a negligible extra computational cost (591.9 and 140.9 Hz, respectively, on MOT17). By integrating StrongSORT with the two algorithms, the final tracker StrongSORT++ ranks first on MOT17 and MOT20 in terms of HOTA and IDF1 metrics and surpasses the second-place one by 1.3 - 2.2. Code will be released soon. + + + +
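Of the two plug-and-play pieces named above, Gaussian-smoothed interpolation is the easier one to picture: fit a Gaussian-process regressor over time for each box coordinate of a tracklet, then re-sample it on the full frame range so that missed detections are filled with smoothed, motion-aware boxes rather than straight lines. The snippet below is a purely conceptual sketch assuming scikit-learn; MMTracking's own version is the `InterpolateTracklets` post-processor with `use_gsi=True` used in the configs below.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF


def gsi_sketch(frame_ids, boxes, tau=10.0):
    """Conceptual Gaussian-smoothed interpolation for a single tracklet.

    frame_ids: (N,) observed frame indices; boxes: (N, 4) boxes as (x1, y1, x2, y2).
    Returns a dense frame range and GP-smoothed boxes covering the gaps.
    """
    t = np.asarray(frame_ids, dtype=float).reshape(-1, 1)
    boxes = np.asarray(boxes, dtype=float)
    dense_t = np.arange(t.min(), t.max() + 1).reshape(-1, 1)
    smoothed = np.empty((len(dense_t), 4))
    for k in range(4):  # one GP per box coordinate
        gpr = GaussianProcessRegressor(kernel=RBF(length_scale=tau))
        gpr.fit(t, boxes[:, k])
        smoothed[:, k] = gpr.predict(dense_t)
    return dense_t.ravel().astype(int), smoothed
```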
+ +## Citation + + + +```latex +@article{du2022strongsort, + title={Strongsort: Make deepsort great again}, + author={Du, Yunhao and Song, Yang and Yang, Bo and Zhao, Yanyun}, + journal={arXiv preprint arXiv:2202.13514}, + year={2022} +} +``` + +## Results and models on MOT17 + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :----------: | :------: | :--: | :---------------------------: | :------------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :----------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| StrongSORT++ | YOLOX-X | R50 | CrowdHuman + MOT17-half-train | MOT17-half-val | N | - | 70.9 | 78.3 | 83.2 | 15336 | 19065 | 621 | [config](strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) [AFLink](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth) | + +## Results and models on MOT20 + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :----------: | :------: | :--: | :----------------------: | :--------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :---------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| StrongSORT++ | YOLOX-X | R50 | CrowdHuman + MOT20-train | MOT20-test | N | - | 62.9 | 75.5 | 77.3 | 29043 | 96155 | 1640 | [config](strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py) | [detector](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) [AFLink](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth) | + +## Get started + +### 1. Training + +We implement StrongSORT with independent detector and ReID models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. + +You can train the detector as follows. + +```shell script +# Training YOLOX-X on crowdhuman and mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. 
+./tools/dist_train.sh \ + configs/det/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +And you can train the ReID model as follows. + +```shell script +# Training ReID model on mot17-train80 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell script +# Example 1: Test on motXX-half-val set. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to use your own detector and ReID checkpoint, you can change the cfg as follows. + +```shell script +model = dict( + detector=dict( + init_cfg=dict( + checkpoint= # noqa: E251 + 'path_to_your_det_checkpoint.pth' # noqa: E501 + )), + reid=dict( + init_cfg=dict( + checkpoint= # noqa: E251 + 'path_to_your_reid_checkpoint.pth' # noqa: E501 + ))) +``` + +Or, you can specify them in commands as follows. + +```shell script +./tools/dist_test.sh \ + configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 \ + --cfg-options model.detector.init_cfg.checkpoint=path_to_your_det_checkpoint.pth model.reid.init_cfg.checkpoint=path_to_your_reid_checkpoint.pth +``` + +**2.2 Example on MOTxx-test dataset** + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, +please use the following command to generate result files that can be used for submission. +It will be stored in `./mot_20_test_res`, you can modify the saved path in `test_evaluator` of the config. + +```shell script +# Example 2: Test on motxx-test set +# The number after config file represents the number of GPUs used +./tools/dist_test.sh \ + configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py 8 +``` + +If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_mot_vis.py \ + configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` + +If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). 
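Note that AFLink and GSI are not part of the tracker forward pass; in these configs they run as tracklet post-processing attached to the evaluator. A hedged example of keeping the appearance-free link step while dropping GSI (the keys and the AFLink checkpoint URL mirror the ones already used by the StrongSORT configs):

```python
# Keep the appearance-free link model, skip Gaussian-smoothed interpolation.
# Treat this as an illustrative override of postprocess_tracklet_cfg,
# not a tuned recipe.
val_evaluator = dict(postprocess_tracklet_cfg=[
    dict(
        type='AppearanceFreeLink',
        checkpoint=  # noqa: E251
        'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth',  # noqa: E501
        temporal_threshold=(0, 30),
        spatial_threshold=50,
        confidence_threshold=0.95),
])
test_evaluator = val_evaluator
```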
diff --git a/configs/mot/strongsort/metafile.yml b/configs/mot/strongsort/metafile.yml new file mode 100644 index 000000000..997ef1250 --- /dev/null +++ b/configs/mot/strongsort/metafile.yml @@ -0,0 +1,48 @@ +Collections: + - Name: StrongSORT++ + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - YOLOX + Paper: + URL: https://arxiv.org/abs/2202.13514 + Title: "StrongSORT: Make DeepSORT Great Again" + README: configs/mot/strongsort/README.md + +Models: + - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval + In Collection: StrongSORT++ + Config: configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: CrowdHuman + MOT17-half-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + MOTA: 78.3 + IDF1: 83.2 + HOTA: 70.9 + Weights: + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth + - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth + + - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test + In Collection: StrongSORT++ + Config: configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py + Metadata: + Training Data: CrowdHuman + MOT20-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT20-test + Metrics: + MOTA: 75.5 + IDF1: 77.3 + HOTA: 62.9 + Weights: + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth + - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth diff --git a/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 000000000..d5e7009d4 --- /dev/null +++ b/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,130 @@ +_base_ = [ + '../../_base_/models/yolox_x_8x8.py', + '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' +] + +model = dict( + type='StrongSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), + detector=dict( + _scope_='mmdet', + bbox_head=dict(num_classes=1), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth' # noqa: E501 + )), + kalman=dict(type='KalmanFilter', center_only=False, use_nsa=True), + cmc=dict( + type='CameraMotionCompensation', + warp_mode='cv2.MOTION_EUCLIDEAN', + num_iters=100, + stop_eps=0.00001), + reid=dict( + type='BaseReID', + data_preprocessor=None, + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + 
neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth' # noqa: E501 + )), + tracker=dict( + type='StrongSORTTracker', + obj_score_thr=0.6, + reid=dict( + num_samples=None, + img_scale=(256, 128), + img_norm_cfg=dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + match_score_thr=0.3, + motion_weight=0.02, + ), + match_iou_thr=0.7, + momentums=dict(embeds=0.1, ), + num_tentatives=2, + num_frames_retain=100)) + +dataset_type = 'MOTChallengeDataset' +data_root = 'data/MOT17/' +img_scale = (800, 1440) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='PackTrackInputs', pack_single_img=True) +] + +train_dataloader = None +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + ref_img_sampler=None, + load_as_video=True, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# evaluator +val_evaluator = dict(postprocess_tracklet_cfg=[ + dict( + type='AppearanceFreeLink', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth', # noqa: E501 + temporal_threshold=(0, 30), + spatial_threshold=50, + confidence_threshold=0.95, + ), + dict( + type='InterpolateTracklets', + min_num_frames=5, + max_num_frames=20, + use_gsi=True, + smooth_tau=10) +]) +test_evaluator = val_evaluator diff --git a/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 000000000..b41b63ceb --- /dev/null +++ b/configs/mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,52 @@ +_base_ = [ + './strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain' + '_test-mot17halfval.py' +] + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='mmdet.BatchSyncRandomResize', + random_size_range=(640, 1152)) + ]), + detector=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth' # noqa: E501 + )), + reid=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth' # noqa: E501 + )), +) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmdet.Resize', scale=(896, 1600), 
keep_ratio=True), + dict( + type='mmdet.Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='PackTrackInputs', pack_single_img=True) +] + +val_dataloader = dict( + dataset=dict( + data_root='data/MOT17', + ann_file='annotations/train_cocoformat.json', + data_prefix=dict(img_path='train'), + pipeline=test_pipeline)) +test_dataloader = dict( + dataset=dict( + data_root='data/MOT20', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'), + pipeline=test_pipeline)) + +test_evaluator = dict(format_only=True, outfile_prefix='./mot_20_test_res') diff --git a/configs/mot/tracktor/README.md b/configs/mot/tracktor/README.md index 5d694e8d9..08d6bded7 100644 --- a/configs/mot/tracktor/README.md +++ b/configs/mot/tracktor/README.md @@ -26,36 +26,24 @@ The problem of tracking multiple objects in a video sequence poses several chall } ``` -We implement Tracktor with independent detector and ReID models. To train a model by yourself, you need to train a detector following [here](../../det/) and also train a ReID model following [here](../../reid/). -The configs in this folder are basically for inference. - ## Results and models on MOT15 -| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | -| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :---: | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | Y | - | 61.8 | 64.9 | 1235 | 6877 | 116 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157.log.json) | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 66.8 | 68.4 | 3049 | 3922 | 179 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157.log.json) | +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | +| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :--: | :---: | :--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 54.3 | 66.6 | 68.3 | 3052 | 3957 | 178 | [config](tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157.log.json) | ## Results and models on MOT16 -| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | -| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | Y | - | 54.1 | 61.5 | 425 | 23894 | 182 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826.log.json) | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 63.4 | 66.2 | 4175 | 14911 | 444 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth) \| 
[reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826.log.json) | +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :---: | :---: | :--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 55.0 | 63.4 | 66.2 | 4179 | 14910 | 444 | [config](tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826.log.json) | ## Results and models on MOT17 -The implementations of Tracktor follow the official practices. -In the table below, the result marked with * (the last line) is the official one. -Our implementation outperform it by 4.9 points on MOTA and 3.3 points on IDF1. - -| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | -| :------------------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :---: | :----: | :---: | :------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | Y | 3.2 | 57.3 | 63.4 | 1254 | 67091 | 614 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | 3.1 | 64.1 | 66.9 | 11088 | 45762 | 1233 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) | -| Tracktor | R50-FasterRCNN-FPN | R50 | train | test | Y | 3.2 | 61.2 | 58.4 | 8609 | 207627 | 2634 | [config](tracktor_faster-rcnn_r50_fpn_4e_mot17-public.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) | -| Tracktor\* | R50-FasterRCNN-FPN | R50 | train | test | Y | - | 56.3 | 55.1 | 8866 | 235449 | 1987 | - | - | -| Tracktor
(FP16) | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 64.7 | 66.6 | 10710 | 45270 | 1152 | [config](tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/fp16/faster-rcnn_r50_fpn_fp16_4e_mot17-half_20210730_002436-f4ba7d61.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/fp16/faster-rcnn_r50_fpn_fp16_4e_mot17-half_20210730_002436.log.json) \| [reid](https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055-4747ee95.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055.log.json) | +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :------------------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | 3.1 | 55.8 | 64.1 | 67.0 | 11109 | 45771 | 1227 | [config](tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) | +| Tracktor
(FP16) | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 55.5 | 64.7 | 66.7 | 10668 | 45279 | 1185 | [config](tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/fp16/faster-rcnn_r50_fpn_fp16_4e_mot17-half_20210730_002436-f4ba7d61.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/fp16/faster-rcnn_r50_fpn_fp16_4e_mot17-half_20210730_002436.log.json) \| [reid](https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055-4747ee95.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055.log.json) | Note: @@ -63,15 +51,98 @@ Note: ## Results and models on MOT20 -The implementations of Tracktor follow the official practices. -In the table below, the result marked with * (the last line) is the official one. -Our implementation outperform it by 5.3 points on MOTA and 2.1 points on IDF1. +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :----: | :---: | :--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 52.4 | 70.9 | 64.1 | 5544 | 171729 | 1618 | [config](tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426.log.json) | + +## Get started + +### 1. Training + +We implement Tracktor with independent detector and ReID models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. + +You can train the detector as follows. + +```shell script +# Training Faster R-CNN on mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` -| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | -| :------: | :------------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :---: | :----: | :---: | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | Y | - | 70.6 | 65.4 | 3652 | 175955 | 1441 | [config](tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426.log.json) | -| Tracktor | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | - | 70.9 | 64.1 | 5539 | 171653 | 1619 | [config](tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426.log.json) | -| Tracktor | R50-FasterRCNN-FPN | R50 | train | test | Y | - | 57.9 | 54.8 | 16203 | 199485 | 2299 | [config](tracktor_faster-rcnn_r50_fpn_8e_mot20-public.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20_20210804_162232-7fde5e8d.pth) \| [detector_log](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20_20210804_162232.log.json) \| [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) \| [reid_log](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426.log.json) | -| Tracktor | R50-FasterRCNN-FPN\* | R50 | train | test | Y | - | 52.6 | 52.7 | 6930 | 236680 | 1648 | - | - | +And you can train the ReID model as follows. + +```shell script +# Training ReID model on mot17-train80 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell script +# Example 1: Test on motXX-half-val set. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. 
+./tools/dist_test.sh \
+    configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8
+```
+
+If you want to use your own detector and ReID checkpoints, you can modify the config as follows.
+
+```python
+model = dict(
+    detector=dict(
+        init_cfg=dict(
+            checkpoint=  # noqa: E251
+            'path_to_your_det_checkpoint.pth'  # noqa: E501
+        )),
+    reid=dict(
+        init_cfg=dict(
+            checkpoint=  # noqa: E251
+            'path_to_your_reid_checkpoint.pth'  # noqa: E501
+        )))
+```
+
+Alternatively, you can specify them on the command line as follows; a programmatic sketch of the same override is given at the end of this README.
+
+```shell script
+./tools/dist_test.sh \
+    configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 \
+    --cfg-options model.detector.init_cfg.checkpoint=path_to_your_det_checkpoint.pth model.reid.init_cfg.checkpoint=path_to_your_reid_checkpoint.pth
+```
+
+**2.2 Example on MOTxx-test dataset**
+
+If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set,
+please use the following command to generate result files that can be used for submission.
+The result files will be stored in `./mot_17_test_res`; you can modify the output path in the `test_evaluator` of the config.
+
+```shell script
+# Example 2: Test on motxx-test set
+# The number after config file represents the number of GPUs used
+./tools/dist_test.sh \
+    configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py 8
+```
+
+If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md).
+
+### 3. Inference
+
+Use a single GPU to run inference on a video and save the result as a video.
+
+```shell
+python demo/demo_mot_vis.py \
+    configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \
+    --input demo/demo.mp4 \
+    --output mot.mp4
+```
-Note: When running `demo_mot.py`, we suggest you use the config containing `private`, since `private` means the MOT method doesn't need external detections.
+If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md).
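+
+The checkpoint overrides shown above can also be prepared programmatically. The sketch below is only an illustration and is not part of the official tooling; it assumes the `mmengine` Config API used by the 1.x configs, and the checkpoint paths as well as the dumped file name `tracktor_custom_ckpt.py` are placeholders.
+
+```python
+from mmengine.config import Config
+
+# Load the Tracktor config from this folder.
+cfg = Config.fromfile(
+    'configs/mot/tracktor/'
+    'tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')
+
+# Point the detector and ReID branches at your own checkpoints
+# (placeholder paths, replace them with real files).
+cfg.model.detector.init_cfg.checkpoint = 'path_to_your_det_checkpoint.pth'
+cfg.model.reid.init_cfg.checkpoint = 'path_to_your_reid_checkpoint.pth'
+
+# Dump the modified config and pass it to tools/dist_test.sh as usual.
+cfg.dump('tracktor_custom_ckpt.py')
+```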
diff --git a/configs/mot/tracktor/metafile.yml b/configs/mot/tracktor/metafile.yml index 331019152..0623869d2 100644 --- a/configs/mot/tracktor/metafile.yml +++ b/configs/mot/tracktor/metafile.yml @@ -8,59 +8,30 @@ Collections: - ResNet - FPN Paper: - URL: https://arxiv.org/abs/1903.05625 - Title: Tracking without bells and whistles + URL: https://arxiv.org/abs/1903.05625 + Title: Tracking without bells and whistles README: configs/mot/tracktor/README.md Models: - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half + - Name: tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half.py + Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py Metadata: Training Data: MOT15-half-train Results: - Task: Multiple Object Tracking Dataset: MOT15-half-val Metrics: - MOTA: 61.8 - IDF1: 64.9 + MOTA: 66.6 + IDF1: 68.3 + HOTA: 54.3 Weights: - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half + - Name: tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half.py - Metadata: - Training Data: MOT15-half-train - Results: - - Task: Multiple Object Tracking - Dataset: MOT15-half-val - Metrics: - MOTA: 66.8 - IDF1: 68.4 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth - - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth - - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half.py - Metadata: - Training Data: MOT16-half-train - Results: - - Task: Multiple Object Tracking - Dataset: MOT16-half-val - Metrics: - MOTA: 54.1 - IDF1: 61.5 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth - - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth - - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half.py + Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py Metadata: Training Data: MOT16-half-train Results: @@ -69,35 +40,14 @@ Models: Metrics: MOTA: 63.4 IDF1: 66.2 + HOTA: 55.0 Weights: - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half.py - Metadata: - Training Data: MOT17-half-train - inference time (ms/im): - - value: 312.5 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640, 1088) - Results: - - Task: Multiple Object Tracking - Dataset: MOT17-half-val - Metrics: - MOTA: 57.3 - IDF1: 63.4 - Weights: - - 
https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth - - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half + - Name: tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py + Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py Metadata: Training Data: MOT17-half-train inference time (ms/im): @@ -112,51 +62,15 @@ Models: Dataset: MOT17-half-val Metrics: MOTA: 64.1 - IDF1: 66.5 + IDF1: 67.0 + HOTA: 55.8 Weights: - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth - - Name: tracktor_faster-rcnn_r50_fpn_4e_mot17-public - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public.py - Metadata: - Training Data: MOT17-train - inference time (ms/im): - - value: 312.5 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640, 1088) - Results: - - Task: Multiple Object Tracking - Dataset: MOT17-test - Metrics: - MOTA: 61.2 - IDF1: 58.4 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth - - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth - - - Name: tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half.py - Metadata: - Training Data: MOT20-half-train - Results: - - Task: Multiple Object Tracking - Dataset: MOT20-half-val - Metrics: - MOTA: 70.6 - IDF1: 65.4 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth - - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth - - - Name: tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half + - Name: tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half.py + Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py Metadata: Training Data: MOT20-half-train Results: @@ -165,28 +79,14 @@ Models: Metrics: MOTA: 70.9 IDF1: 64.1 + HOTA: 52.4 Weights: - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth - - Name: tracktor_faster-rcnn_r50_fpn_8e_mot20-public - In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public.py - Metadata: - Training Data: MOT20-train - Results: - - Task: Multiple Object Tracking - Dataset: MOT20-test - Metrics: - MOTA: 57.9 - IDF1: 54.8 - Weights: - - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20_20210804_162232-7fde5e8d.pth - - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth - - - Name: tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half + - Name: 
tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval In Collection: Tracktor - Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half.py + Config: configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval.py Metadata: Training Data: MOT17-half-train Results: @@ -194,7 +94,8 @@ Models: Dataset: MOT17-half-val Metrics: MOTA: 64.7 - IDF1: 66.6 + IDF1: 66.7 + HOTA: 55.5 Weights: - https://download.openmmlab.com/mmtracking/fp16/faster-rcnn_r50_fpn_fp16_4e_mot17-half_20210730_002436-f4ba7d61.pth - https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055-4747ee95.pth diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half.py deleted file mode 100644 index 87d65e2ff..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-public-half.py +++ /dev/null @@ -1,51 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] - -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot15-half_20210804_001040-ae733d0c.pth' # noqa: E501 - )), - reid=dict( - head=dict(num_classes=375), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth' # noqa: E501 - ))) -# data -data_root = 'data/MOT15/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - detection_file=data_root + 'annotations/half-train_detections.pkl', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half.py deleted file mode 100644 index 0de3c1aef..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-public-half.py +++ /dev/null @@ -1,51 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] - -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot16-half_20210804_001054-73477869.pth' # noqa: E501 - )), - reid=dict( - head=dict(num_classes=375), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 
'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth' # noqa: E501 - ))) -# data -data_root = 'data/MOT16/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - detection_file=data_root + 'annotations/half-train_detections.pkl', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py deleted file mode 100644 index 640ee4813..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py +++ /dev/null @@ -1,18 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] - -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 - ))) -# data -data_root = 'data/MOT17/' -test_set = 'test' -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - test=dict( - ann_file=data_root + f'annotations/{test_set}_cocoformat.json', - img_prefix=data_root + test_set)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half.py deleted file mode 100644 index 4aa273577..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half.py +++ /dev/null @@ -1,28 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] -# data -data_root = 'data/MOT17/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - val=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline), - test=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half_search.py 
b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half_search.py deleted file mode 100644 index fe7f18134..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public-half_search.py +++ /dev/null @@ -1,44 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] -model = dict( - tracker=dict( - type='TracktorTracker', - obj_score_thr=[0.4, 0.5, 0.6], - regression=dict( - obj_score_thr=[0.4, 0.5, 0.6], - nms=dict(type='nms', iou_threshold=0.6), - match_iou_thr=[0.3, 0.5]), - reid=dict( - num_samples=10, - img_scale=(256, 128), - img_norm_cfg=None, - match_score_thr=2.0, - match_iou_thr=0.2), - momentums=None, - num_frames_retain=10)) -# data -data_root = 'data/MOT17/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - val=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline), - test=dict( - detection_file=data_root + 'annotations/half-val_detections.pkl', - pipeline=test_pipeline)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public.py deleted file mode 100644 index a0e425c52..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-public.py +++ /dev/null @@ -1,41 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] - -model = dict( - detector=dict( - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 - ))) -# data -data_root = 'data/MOT17/' -test_set = 'test' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - detection_file=data_root + 'annotations/train_detections.pkl', - pipeline=test_pipeline), - test=dict( - ann_file=data_root + f'annotations/{test_set}_cocoformat.json', - img_prefix=data_root + test_set, - detection_file=data_root + f'annotations/{test_set}_detections.pkl', - pipeline=test_pipeline)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half.py deleted file mode 100644 index a5b0afc8a..000000000 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public-half.py +++ /dev/null @@ -1,57 +0,0 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] - -model = 
dict( - detector=dict( - rpn_head=dict(bbox_coder=dict(clip_border=True)), - roi_head=dict( - bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1)), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_8e_mot20-half_20210805_001244-2c323fd1.pth' # noqa: E501 - )), - reid=dict( - head=dict(num_classes=1705), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth' # noqa: E501 - ))) -data_root = 'data/MOT20/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadDetections'), - dict( - type='MultiScaleFlipAug', - img_scale=(1088, 1088), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='VideoCollect', keys=['img', 'public_bboxes']) - ]) -] -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - detection_file=data_root + 'annotations/half-train_detections.pkl', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - detection_file=data_root + 'annotations/half-val_detections.pkl', - img_prefix=data_root + 'train', - pipeline=test_pipeline)) -# learning policy -lr_config = dict(step=[6]) -# runtime settings -total_epochs = 8 diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py similarity index 56% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half.py rename to configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py index 47deab62b..d7274050c 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot15-private-half.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot15halftrain_test-mot15halfval.py @@ -1,4 +1,7 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] model = dict( detector=dict( @@ -14,15 +17,12 @@ checkpoint= # noqa: E251 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot15_20210803_192157-65b5e2d7.pth' # noqa: E501 ))) -# data + +# dataloader data_root = 'data/MOT15/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + )) +test_dataloader = val_dataloader diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half.py 
b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py similarity index 56% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half.py rename to configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py index 3cdff65e3..30051a28b 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot16-private-half.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot16halftrain_test-mot16halfval.py @@ -1,4 +1,7 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] model = dict( detector=dict( @@ -14,15 +17,12 @@ checkpoint= # noqa: E251 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot16_20210803_204826-1b3e3cfd.pth' # noqa: E501 ))) -# data + +# dataloader data_root = 'data/MOT16/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + )) +test_dataloader = val_dataloader diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py similarity index 77% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py rename to configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py index 6244313e6..8e5dd9eaf 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -1,5 +1,5 @@ _base_ = [ - '../../_base_/models/faster_rcnn_r50_fpn.py', + '../../_base_/models/faster-rcnn_r50_fpn.py', '../../_base_/datasets/mot_challenge.py', '../../_base_/default_runtime.py' ] @@ -17,7 +17,7 @@ reid=dict( type='BaseReID', backbone=dict( - type='ResNet', + type='mmcls.ResNet', depth=50, num_stages=4, out_indices=(3, ), @@ -30,9 +30,8 @@ fc_channels=1024, out_channels=128, num_classes=380, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - loss_pairwise=dict( - type='TripletLoss', margin=0.3, loss_weight=1.0), + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU')), init_cfg=dict( @@ -60,14 +59,9 @@ match_iou_thr=0.2), momentums=None, num_frames_retain=10)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=100, - warmup_ratio=1.0 / 100, - step=[3]) -# runtime settings -total_epochs = 4 -evaluation = dict(metric=['bbox', 'track'], interval=1) -search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML'] + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_search.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_search.py new file mode 100644 index 
000000000..e254f8f4f --- /dev/null +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_search.py @@ -0,0 +1,21 @@ +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] + +model = dict( + tracker=dict( + type='TracktorTracker', + obj_score_thr=[0.4, 0.5, 0.6], + regression=dict( + obj_score_thr=[0.4, 0.5, 0.6], + nms=dict(type='nms', iou_threshold=0.6), + match_iou_thr=[0.3, 0.5]), + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0, + match_iou_thr=0.2), + momentums=None, + num_frames_retain=10)) diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py new file mode 100644 index 000000000..ffcdc6a7b --- /dev/null +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py @@ -0,0 +1,24 @@ +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] + +model = dict( + detector=dict( + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-ffa52ae7.pth' # noqa: E501 + ))) + +# dataloader +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json', )) +test_dataloader = dict( + dataset=dict( + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'), + )) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res') diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py similarity index 57% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half.py rename to configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py index 61e3a5cb1..98fae20e0 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-private-half.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py @@ -1,4 +1,7 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] model = dict( detector=dict( @@ -17,19 +20,12 @@ checkpoint= # noqa: E251 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth' # noqa: E501 ))) -# data + +# dataloader data_root = 'data/MOT20/' -data = dict( - train=dict( - ann_file=data_root + 'annotations/half-train_cocoformat.json', - img_prefix=data_root + 'train'), - val=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train'), - test=dict( - ann_file=data_root + 'annotations/half-val_cocoformat.json', - img_prefix=data_root + 'train')) -# learning policy -lr_config = dict(step=[6]) -# runtime settings -total_epochs = 8 +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + )) +test_dataloader = val_dataloader diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20test.py similarity index 55% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public.py rename to 
configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20test.py index 14aabcf52..618fa4972 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8e_mot20-public.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20test.py @@ -1,4 +1,7 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] model = dict( detector=dict( @@ -17,18 +20,20 @@ checkpoint= # noqa: E251 'https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth' # noqa: E501 ))) + +# dataloader data_root = 'data/MOT20/' -test_set = 'test' -data = dict( - train=dict(ann_file=data_root + 'annotations/train_cocoformat.json'), - val=dict( - ann_file=data_root + 'annotations/train_cocoformat.json', - detection_file=data_root + 'annotations/train_detections.pkl'), - test=dict( - ann_file=data_root + f'annotations/{test_set}_cocoformat.json', - img_prefix=data_root + test_set, - detection_file=data_root + f'annotations/{test_set}_detections.pkl')) -# learning policy -lr_config = dict(step=[6]) -# runtime settings -total_epochs = 8 +val_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/train_cocoformat.json', + )) +test_dataloader = dict( + dataset=dict( + data_root=data_root, + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'), + )) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_20_test_res') diff --git a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half.py b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval.py similarity index 81% rename from configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half.py rename to configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval.py index bf6fb92a4..773ac226e 100644 --- a/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_fp16_4e_mot17-private-half.py +++ b/configs/mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-amp-4e_mot17halftrain_test-mot17halfval.py @@ -1,4 +1,7 @@ -_base_ = ['./tracktor_faster-rcnn_r50_fpn_4e_mot17-private-half.py'] +_base_ = [ + './tracktor_faster-rcnn_r50-fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] model = dict( detector=dict( @@ -13,4 +16,5 @@ checkpoint= # noqa: E251 'https://download.openmmlab.com/mmtracking/fp16/reid_r50_fp16_8x32_6e_mot17_20210731_033055-4747ee95.pth' # noqa: E501 ))) +# TODO: wait for mmengine fp16 = dict(loss_scale=512.) 
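+#
+# Assumed MMEngine-style alternative (illustrative sketch only, not verified
+# against this codebase): once the TODO above is resolved, mixed precision is
+# typically requested through an AMP optimizer wrapper instead of the legacy
+# `fp16` field, e.g.
+#
+# optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=512.)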
diff --git a/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py b/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py new file mode 100644 index 000000000..4e30b2296 --- /dev/null +++ b/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py @@ -0,0 +1,7 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=368)) +# data +data_root = 'data/MOT15/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader diff --git a/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py b/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py new file mode 100644 index 000000000..468b9bfb2 --- /dev/null +++ b/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py @@ -0,0 +1,7 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=371)) +# data +data_root = 'data/MOT16/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader diff --git a/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py b/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py new file mode 100644 index 000000000..b268a4ea9 --- /dev/null +++ b/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py @@ -0,0 +1,61 @@ +_base_ = [ + '../_base_/datasets/mot_challenge_reid.py', '../_base_/default_runtime.py' +] +model = dict( + type='BaseReID', + data_preprocessor=dict( + type='mmcls.ClsDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth' # noqa: E501 + )) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + clip_grad=None, + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=6, + by_epoch=True, + milestones=[5], + gamma=0.1) +] + +# train, val, test setting +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py b/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py new file mode 100644 index 000000000..8a8079961 --- /dev/null +++ b/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py @@ -0,0 +1,10 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=1701)) +# data +data_root = 'data/MOT20/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader 
= val_dataloader + +# train, val, test setting +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=7) diff --git a/configs/reid/resnet50_b32x8_MOT15.py b/configs/reid/resnet50_b32x8_MOT15.py deleted file mode 100644 index 3a9458625..000000000 --- a/configs/reid/resnet50_b32x8_MOT15.py +++ /dev/null @@ -1,15 +0,0 @@ -TRAIN_REID = True -_base_ = ['./resnet50_b32x8_MOT17.py'] -model = dict(reid=dict(head=dict(num_classes=368))) -# data -data_root = 'data/MOT15/' -data = dict( - train=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/train_80.txt'), - val=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt'), - test=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt')) diff --git a/configs/reid/resnet50_b32x8_MOT16.py b/configs/reid/resnet50_b32x8_MOT16.py deleted file mode 100644 index 9c33477dc..000000000 --- a/configs/reid/resnet50_b32x8_MOT16.py +++ /dev/null @@ -1,15 +0,0 @@ -TRAIN_REID = True -_base_ = ['./resnet50_b32x8_MOT17.py'] -model = dict(reid=dict(head=dict(num_classes=371))) -# data -data_root = 'data/MOT16/' -data = dict( - train=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/train_80.txt'), - val=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt'), - test=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt')) diff --git a/configs/reid/resnet50_b32x8_MOT17.py b/configs/reid/resnet50_b32x8_MOT17.py deleted file mode 100644 index ae2fed39a..000000000 --- a/configs/reid/resnet50_b32x8_MOT17.py +++ /dev/null @@ -1,42 +0,0 @@ -TRAIN_REID = True -_base_ = [ - '../_base_/datasets/mot_challenge_reid.py', '../_base_/default_runtime.py' -] -model = dict( - reid=dict( - type='BaseReID', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(3, ), - style='pytorch'), - neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), - head=dict( - type='LinearReIDHead', - num_fcs=1, - in_channels=2048, - fc_channels=1024, - out_channels=128, - num_classes=380, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - loss_pairwise=dict( - type='TripletLoss', margin=0.3, loss_weight=1.0), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU')), - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth' # noqa: E501 - ))) -# optimizer -optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict(grad_clip=None) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=1000, - warmup_ratio=1.0 / 1000, - step=[5]) -total_epochs = 6 diff --git a/configs/reid/resnet50_b32x8_MOT20.py b/configs/reid/resnet50_b32x8_MOT20.py deleted file mode 100644 index beef6223b..000000000 --- a/configs/reid/resnet50_b32x8_MOT20.py +++ /dev/null @@ -1,15 +0,0 @@ -TRAIN_REID = True -_base_ = ['./resnet50_b32x8_MOT17.py'] -model = dict(reid=dict(head=dict(num_classes=1701))) -# data -data_root = 'data/MOT20/' -data = dict( - train=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/train_80.txt'), - val=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt'), - test=dict( - data_prefix=data_root + 'reid/imgs', - ann_file=data_root + 'reid/meta/val_20.txt')) diff --git a/configs/sot/README.md 
b/configs/sot/README.md deleted file mode 100644 index 114005a7c..000000000 --- a/configs/sot/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Usage of SOT configs - -## Training with SOT configs - -Please refer to [Train SOT models](../../docs/en/quick_run.md#examples-of-training-sot-model) to see the examples. - -## Testing with SOT configs - -Please refer to [Test SOT models](../../docs/en/quick_run.md#examples-of-testing-sot-model) to see the examples. - -## Inference with SOT configs - -Please refer to [Inference SOT models](../../docs/en/quick_run.md#inference-sot-models) to see the examples. diff --git a/configs/sot/prdimp/README.md b/configs/sot/prdimp/README.md new file mode 100644 index 000000000..389a1032b --- /dev/null +++ b/configs/sot/prdimp/README.md @@ -0,0 +1,117 @@ +# Probabilistic Regression for Visual Tracking + +## Abstract + + + +Visual tracking is fundamentally the problem of regressing the state of the target in each video frame. While significant progress has been achieved, trackers are still prone to failures and inaccuracies. It is therefore crucial to represent the uncertainty in the target estimation. Although current prominent paradigms rely on estimating a state-dependent confidence score, this value lacks a clear probabilistic interpretation, complicating its use. +In this work, we therefore propose a probabilistic regression formulation and apply it to tracking. Our network predicts the conditional probability density of the target state given an input image. Crucially, our formulation is capable of modeling label noise stemming from inaccurate annotations and ambiguities in the task. The regression network is trained by minimizing the Kullback-Leibler divergence. When applied for tracking, our formulation not only allows a probabilistic representation of the output, but also substantially improves the performance. Our tracker sets a new state-of-the-art on six datasets, achieving 59.8% AUC on LaSOT and 75.8% Success on TrackingNet. The code and models are available at [this https URL](https://github.com/visionml/pytracking). + + + +
+ +
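As a brief aside on the formulation summarized in the abstract above, one way to write down the training objective it refers to is sketched below. This is our own notation, not taken from the paper's code or from this repository: `f_theta(y | x)` stands for the predicted conditional density of the target state `y` given the image `x`, and `p(y | y_ann)` for a label distribution around the annotated state that models label noise.

```latex
% Sketch (our notation) of the probabilistic regression objective described above:
% the network is trained by minimizing the KL divergence from the label density
% p(y | y_ann) to the predicted conditional density f_\theta(y | x).
L(\theta)
  = \mathrm{KL}\!\left( p(y \mid y_{\mathrm{ann}}) \,\middle\|\, f_\theta(y \mid x) \right)
  = \int p(y \mid y_{\mathrm{ann}}) \, \log \frac{p(y \mid y_{\mathrm{ann}})}{f_\theta(y \mid x)} \, \mathrm{d}y .
```

Up to an additive constant (the entropy of the label density), this is the cross-entropy between the label density and the predicted density, which is how the label-noise model enters the loss.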
+ +## Citation + + + +```latex +@inproceedings{Danelljan2020Probabilistic, + title={Probabilistic Regression for Visual Tracking}, + author={Danelljan, Martin and Van Gool, Luc and Timofte, Radu}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + year={2020} +} +``` + +## Results and models + +### LaSOT + +We provide the last-epoch model with its configuration and training log. + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :----: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :------------------------------------------------------------------------: | :-------------------------------------------------------------------------: | +| PrDiMP | R-50 | - | 50e | 13.9 | - | 59.7 | 67.7 | 60.5 | [config](prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200.json) | + +### TrackingNet + +The last-epoch model on LaSOT is submitted to [the evaluation server on TrackingNet Challenge](https://eval.ai/web/challenges/challenge-page/1805/). We provide the model with its configuration and training log. + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :----: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-------------------------------------------------------------------------------: | :-------------------------------------------------------------------------: | +| PrDiMP | R-50 | - | 50e | 13.9 | - | 75.2 | 80.5 | 70.1 | [config](prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py) | [model](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200.json) | + +### GOT10k + +The results of PrDiMP on GOT10k are reimplemented by ourselves. We only use the GOT10k train set to train the model, which is the common setting on GOT10k, while the PrDiMP paper trains on GOT10k, LaSOT, TrackingNet and COCO. With our setting, the result is about 1 point lower than that of the original PrDiMP setting. The last-epoch model is submitted to [the evaluation server on GOT10k Challenge](http://got-10k.aitestunion.com/). We provide the model with its configuration and training log.
+ +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AO | SR0.5 | SR0.75 | Config | Download | +| :----: | :------: | :---: | :-----: | :------: | :------------: | :--: | :---: | :----: | :--------------------------------------: | :------------------------------------------------------------------------: | +| PrDiMP | R-50 | - | 50e | 13.9 | - | 62.9 | 72.5 | 52.8 | [config](prdimp_r50_8xb10-50e_got10k.py) | [model](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k_20220907_173919-fa24df25.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k_20220907_173919.json) | + +## Get started + +### 1. Training + +Because hyperparameters such as the learning rate in the default configuration file are set for 8-GPU training, we recommend using 8 GPUs to reproduce the reported accuracy. The following is an example of training the PrDiMP model that is tested on the LaSOT dataset; training the model tested on GOT10k works the same way. + +```shell +# Train PrDiMP on the GOT10k, LaSOT, TrackingNet and COCO datasets. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ +    configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on LaSOT dataset** + +```shell +# Test PrDiMP on the LaSOT testset. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ +    configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py 8 \ +    --checkpoint ./checkpoints/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth +``` + +**2.2 Example on TrackingNet and GOT10k datasets** + +If you want to get the results on the [TrackingNet](https://eval.ai/web/challenges/challenge-page/1805/) and [GOT10k](http://got-10k.aitestunion.com/) test sets, please use the following commands to generate result files that can be used for submission. You can modify the save path in the `test_evaluator` of the config. + +```shell +# Test PrDiMP on the TrackingNet testset. +# The result is stored in `./results/prdimp_trackingnet.zip` by default. +# We use the checkpoint from the LaSOT config to test on TrackingNet. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ +    configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py 8 \ +    --checkpoint ./checkpoints/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth +``` + +```shell +# Test PrDiMP on the GOT10k testset. +# The result is stored in `./results/prdimp_got10k.zip` by default. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ +    configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k.py 8 \ +    --checkpoint ./checkpoints/prdimp_r50_8xb10-50e_got10k_20220907_173919-fa24df25.pth +``` + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a video.
+ +```shell +python demo/demo_sot.py \ + configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py \ + --checkpoint ./checkpoints/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth \ + --input demo/demo.mp4 \ + --output sot.mp4 +``` + +If you want to know about more detailed usage of `demo_sot.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/sot/prdimp/metafile.yml b/configs/sot/prdimp/metafile.yml new file mode 100644 index 000000000..cceaddc19 --- /dev/null +++ b/configs/sot/prdimp/metafile.yml @@ -0,0 +1,62 @@ +Collections: + - Name: PrDiMP + Metadata: + Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO + Training Techniques: + - Adam + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Paper: + URL: https://arxiv.org/abs/2003.12565 + Title: Probabilistic Regression for Visual Tracking + README: configs/sot/prdimp/README.md + +Models: + - Name: prdimp_r50_8xb10-50e_got10k + In Collection: PrDiMP + Config: configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k.py + Metadata: + Training Data: GOT10k + Training Memory (GB): 13.9 + Epochs: 50 + Results: + - Task: Single Object Tracking + Dataset: GOT10k + Metrics: + AO: 62.9 + SR0.5: 72.5 + SR0.75: 52.8 + Weights: https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k_20220907_173919-fa24df25.pth + + - Name: prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot + In Collection: PrDiMP + Config: configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py + Metadata: + Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO + Training Memory (GB): 13.9 + Epochs: 50 + Results: + - Task: Single Object Tracking + Dataset: LaSOT + Metrics: + Success: 59.7 + Norm Precision: 67.7 + Precision: 60.5 + Weights: https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth + + - Name: prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet + In Collection: PrDiMP + Config: configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py + Metadata: + Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO + Training Memory (GB): 13.9 + Epochs: 50 + Results: + - Task: Single Object Tracking + Dataset: TrackingNet + Metrics: + Success: 75.2 + Norm Precision: 80.5 + Precision: 70.1 + Weights: https://download.openmmlab.com/mmtracking/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot_20220822_082200-b7dbeca4.pth diff --git a/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py new file mode 100644 index 000000000..0ccce0db0 --- /dev/null +++ b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py @@ -0,0 +1,199 @@ +_base_ = ['../../_base_/default_runtime.py'] + +randomness = dict(seed=1, deterministic=False) + +# model setting +model = dict( + type='PrDiMP', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=[1, 1, 1], + out_indices=[1, 2], # 0, 1, 2 + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + cls_head=dict( + type='PrDiMPClsHead', + 
in_dim=1024, + out_dim=512, + filter_initializer=dict( + type='FilterInitializer', + filter_size=4, + feature_dim=512, + feature_stride=16), + filter_optimizer=dict( + type='PrDiMPFilterOptimizer', + num_iters=5, + feat_stride=16, + init_step_length=1.0, + init_filter_regular=0.05, + gauss_sigma=0.9, + alpha_eps=0.05, + min_filter_regular=0.05, + label_thres=0), + loss_cls=dict(type='KLGridLoss'), + locate_cfg=dict( + no_target_min_score=0.04, + distractor_thres=0.8, + hard_neg_thres=0.5, + target_neighborhood_scale=2.2, + dispalcement_scale=0.8, + update_scale_when_uncertain=True), + update_cfg=dict( + sample_memory_size=50, + normal_lr=0.01, + hard_neg_lr=0.02, + init_samples_min_weight=0.25, + train_skipping=20), + optimizer_cfg=dict( + init_update_iters=10, update_iters=2, hard_neg_iters=1), + train_cfg=dict( + feat_size=(18, 18), + img_size=(288, 288), + sigma_factor=0.05, + end_pad_if_even=True, + gauss_label_bias=0., + use_gauss_density=True, + label_density_norm=True, + label_density_threshold=0., + label_density_shrink=0, + loss_weights=dict(cls_init=0.25, cls_iter=1., cls_final=0.25))), + bbox_head=dict( + type='IouNetHead', + in_dim=(4 * 128, 4 * 256), + pred_in_dim=(256, 256), + pred_inter_dim=(256, 256), + loss_bbox=dict(type='KLMCLoss'), + bbox_cfg=dict( + num_init_random_boxes=9, + box_jitter_pos=0.1, + box_jitter_sz=0.5, + iounet_topk=3, + box_refine_step_length=2.5e-3, + box_refine_iter=10, + max_aspect_ratio=6, + box_refine_step_decay=1), + train_cfg=dict( + proposals_sigma=[(0.05, 0.05), (0.5, 0.5)], + gt_bboxes_sigma=(0.05, 0.05), + num_samples=128, + add_first_bbox=False, + loss_weights=dict(bbox=0.0025))), + test_cfg=dict( + img_sample_size=22 * 16, + feature_stride=16, + search_scale_factor=6, + patch_max_scale_change=1.5, + border_mode='inside_major', + bbox_inside_ratio=0.2, + init_aug_cfg=dict( + augmentation=dict( + fliplr=True, + rotate=[10, -10, 45, -45], + blur=[(3, 1), (1, 3), (2, 2)], + relativeshift=[(0.6, 0.6), (-0.6, 0.6), (0.6, -0.6), + (-0.6, -0.6)], + dropout=[0.2, 0.2]), + aug_expansion_factor=2, + random_shift_factor=1 / 3))) + +train_pipeline = [ + dict( + type='DiMPSampling', + num_search_frames=3, + num_template_frames=3, + max_frame_range=200), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='GrayAug', prob=0.05) + ]), + dict( + type='SeqBboxJitter', + center_jitter_factor=[3, 3, 3, 4.5, 4.5, 4.5], + scale_jitter_factor=[0.25, 0.25, 0.25, 0.5, 0.5, 0.5], + crop_size_factor=[5, 5, 5, 5, 5, 5]), + dict( + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict(type='CropLikeDiMP', crop_size_factor=5, output_size=288), + dict(type='BrightnessAug', jitter_range=0.2) + ]), + dict(type='PackTrackInputs', ref_prefix='search', num_template_frames=3) +] + +data_root = 'data/' +# dataset settings +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='QuotaSampler', samples_per_epoch=60000), + dataset=dict( + type='RandomSampleConcatDataset', + dataset_sampling_weights=[1, 1, 1, 1], + datasets=[ + dict( + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_vot_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='LaSOTDataset', + data_root=data_root, + ann_file='LaSOT_full/annotations/lasot_train_infos.txt', + 
data_prefix=dict(img_path='LaSOT_full/LaSOTBenchmark'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='TrackingNetDataset', + chunks_list=[0, 1, 2, 3], + data_root=data_root, + ann_file='TrackingNet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='TrackingNet'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='coco/annotations/instances_train2017.json', + data_prefix=dict(img_path='coco/train2017'), + pipeline=train_pipeline, + test_mode=False) + ])) + +# runner loop +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=50, val_interval=1) + +# learning policy +param_scheduler = dict(type='StepLR', step_size=15, gamma=0.2) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='Adam', lr=2e-4), + paramwise_cfg=dict( + custom_keys=dict( + backbone=dict(lr_multi=0.1), + classifier=dict(lr_multi=5), + bbox_regressor=dict(lr_multi=5)))) + +# checkpoint saving +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=10), + logger=dict(type='LoggerHook', interval=50)) diff --git a/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py new file mode 100644 index 000000000..daadf3d3f --- /dev/null +++ b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-lasot.py @@ -0,0 +1,4 @@ +_base_ = [ + '../../_base_/datasets/lasot.py', + './prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py' +] diff --git a/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py new file mode 100644 index 000000000..efad3a51a --- /dev/null +++ b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py @@ -0,0 +1,8 @@ +_base_ = [ + '../../_base_/datasets/trackingnet.py', + './prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py' +] + +# evaluator +val_evaluator = dict(outfile_prefix='results/prdimp_trackingnet') +test_evaluator = val_evaluator diff --git a/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k.py b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k.py new file mode 100644 index 000000000..69d71ee27 --- /dev/null +++ b/configs/sot/prdimp/prdimp_r50_8xb10-50e_got10k.py @@ -0,0 +1,22 @@ +_base_ = [ + '../../_base_/datasets/got10k.py', + './prdimp_r50_8xb10-50e_got10k-lasot-trackingnet-coco_base.py' +] + +train_pipeline = {{_base_.train_pipeline}} + +# dataset settings +data_root = {{_base_.data_root}} +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False)) + +# evaluator +val_evaluator = dict(outfile_prefix='results/prdimp/prdimp_got10k') +test_evaluator = val_evaluator diff --git a/configs/sot/siamese_rpn/README.md b/configs/sot/siamese_rpn/README.md index 6bd022ae6..a2951f496 100644 --- a/configs/sot/siamese_rpn/README.md +++ b/configs/sot/siamese_rpn/README.md @@ -34,10 +34,10 @@ Note that the checkpoints from 10-th to 20-th epoch will be evaluated during tra We provide the best model with its configuration and training log. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | -| :-------------------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-----------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SiamRPN++ | R-50 | - | 20e | 7.54 | 50.0 | 50.4 | 59.6 | 49.7 | [config](siamese_rpn_r50_20e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845.log.json) | -| SiamRPN++
(FP16) | R-50 | - | 20e | - | - | 50.4 | 59.6 | 49.2 | [config](siamese_rpn_r50_fp16_20e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/fp16/siamese_rpn_r50_fp16_20e_lasot_20220422_181501-ce30fdfd.pth) \| [log](https://download.openmmlab.com/mmtracking/fp16/siamese_rpn_r50_fp16_20e_lasot_20220422_181501.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :-------------------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :---------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SiamRPN++ | R-50 | - | 20e | 7.54 | 50.0 | 50.4 | 59.6 | 49.7 | [config](siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845.log.json) | +| SiamRPN++
(FP16) | R-50 | - | 20e | - | - | 50.4 | 59.6 | 49.2 | [config](siamese_rpn_r50_fp16_20e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/fp16/siamese_rpn_r50_fp16_20e_lasot_20220422_181501-ce30fdfd.pth) \| [log](https://download.openmmlab.com/mmtracking/fp16/siamese_rpn_r50_fp16_20e_lasot_20220422_181501.log.json) | Note: @@ -52,17 +52,17 @@ Experimentally, the hyperparameters search on UAV123 can bring around 1.0 Succes The results below are achieved without hyperparameters search. -| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm Precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SiamRPN++ | R-50 | - | 20e | 7.54 | - | 60 | 77.3 | 80.3 | [config](siamese_rpn_r50_20e_uav123.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_uav123/siamese_rpn_r50_20e_uav123_20220420_181845-dc2d4831.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_uav123/siamese_rpn_r50_20e_uav123_20220420_181845.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm Precision | Precision | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :----------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SiamRPN++ | R-50 | - | 20e | 7.54 | - | 60 | 77.3 | 80.3 | [config](siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_uav123/siamese_rpn_r50_20e_uav123_20220420_181845-dc2d4831.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_uav123/siamese_rpn_r50_20e_uav123_20220420_181845.log.json) | ### TrackingNet -The results of SiameseRPN++ in TrackingNet are reimplemented by ourselves. The best model on LaSOT is submitted to [the evaluation server on TrackingNet Challenge](http://eval.tracking-net.org/web/challenges/challenge-page/39/submission). We provide the best model with its configuration and training log. +The results of SiameseRPN++ in TrackingNet are reimplemented by ourselves. The best model on LaSOT is submitted to [the evaluation server on TrackingNet Challenge](https://eval.ai/web/challenges/challenge-page/1805/submission). We provide the best model with its configuration and training log. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SiamRPN++ | R-50 | - | 20e | 7.54 | - | 68.8 | 75.9 | 63.2 | [config](siamese_rpn_r50_20e_trackingnet.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :---------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SiamRPN++ | R-50 | - | 20e | 7.54 | - | 68.8 | 75.9 | 63.2 | [config](siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845.log.json) | ### OTB100 @@ -72,9 +72,9 @@ If you want to get better results, you can use the best checkpoint to search the **Note:** The results reported in the paper are 69.6 Success and 91.4 Precision. We train the SiameseRPN++ in the official [pysot](https://github.com/STVIR/pysot) codebase and can not reproduce the same results. We only get 66.1 Success and 86.7 Precision by following the training and hyperparameters searching instructions of pysot, which are lower than those of the paper by 3.5 Succuess and 4.7 Precision respectively. Without hyperparameters search, we get 65.3 Success and 85.8 Precision. In our codebase, the results below are also achieved without hyperparameters search, close to the results reproduced in pysot in the same setting. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm Precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SiamRPN++ | R-50 | - | 20e | - | - | 64.9 | 82.4 | 86.3 | [config](siamese_rpn_r50_20e_otb100.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232-6b8f1730.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm Precision | Precision | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :----------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SiamRPN++ | R-50 | - | 20e | - | - | 64.9 | 82.4 | 86.3 | [config](siamese-rpn_resnet50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232-6b8f1730.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232.log.json) | ### VOT2018 @@ -84,6 +84,62 @@ If you want to get better results, you can use the best checkpoint to search the **Note:** The result reported in the paper is 0.414 EAO. We train the SiameseRPN++ in the official [pysot](https://github.com/STVIR/pysot) codebase and can not reproduce the same result. We only get 0.364 EAO by following the training and hyperparameters searching instructions of pysot, which is lower than that of the paper by 0.05 EAO. Without hyperparameters search, we get 0.346 EAO. In our codebase, the results below are also achieved without hyperparameters search, close to the results reproduced in pysot in the same setting. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | EAO | Accuracy | Robustness | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :---: | :------: | :--------: | :--------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SiamRPN++ | R-50 | - | 20e | - | - | 0.348 | 0.588 | 0.295 | [config](siamese_rpn_r50_20e_vot2018.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018/siamese_rpn_r50_20e_vot2018_20220420_181845-1111f25e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018/siamese_rpn_r50_20e_vot2018_20220420_181845.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | EAO | Accuracy | Robustness | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :---: | :------: | :--------: | :-----------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SiamRPN++ | R-50 | - | 20e | - | - | 0.348 | 0.588 | 0.295 | [config](siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018.py) | [model](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018/siamese_rpn_r50_20e_vot2018_20220420_181845-1111f25e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_vot2018/siamese_rpn_r50_20e_vot2018_20220420_181845.log.json) | + +## Get started + +### 1. Training + +Because hyperparameters such as the learning rate in the default configuration file are set for 8-GPU training, we recommend using 8 GPUs to reproduce the reported accuracy. You can use the following command to start the training. + +```shell +# Train SiamRPN++ on the ImageNetVID, ImageNetDET and COCO datasets with the following command. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ +    configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py 8 +``` + +The models tested on LaSOT, TrackingNet, UAV123 and VOT2018 share the same training settings. For OTB100, there are some unique training [settings](./siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py). + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on LaSOT, UAV123, OTB100 and VOT2018 datasets** + +```shell +# Example 1: Test on the LaSOT testset. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs.
+./tools/dist_test.sh \ +    configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py 8 \ +    --checkpoint ./checkpoints/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth +``` + +**2.2 Example on TrackingNet dataset** + +If you want to get the results on the [TrackingNet](https://eval.ai/web/challenges/challenge-page/1805/) test set, please use the following command to generate result files that can be used for submission. The result is stored in `./results/siamese_rpn_trackingnet.zip` by default; you can modify the save path in the `test_evaluator` of the config. + +```shell +# Example 1: Test on the TrackingNet testset. +# We use the best checkpoint on LaSOT to test on TrackingNet. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ +    configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py 8 \ +    --checkpoint ./checkpoints/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth +``` + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a video. + +```shell +python demo/demo_sot.py \ +    configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py \ +    --checkpoint ./checkpoints/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth \ +    --input demo/demo.mp4 \ +    --output sot.mp4 +``` + +If you want to know about more detailed usage of `demo_sot.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/sot/siamese_rpn/metafile.yml b/configs/sot/siamese_rpn/metafile.yml index de491fedb..ae51a1151 100644 --- a/configs/sot/siamese_rpn/metafile.yml +++ b/configs/sot/siamese_rpn/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/abs/1812.11703 - Title: SiamRPN++ Evolution of Siamese Visual Tracking with Very Deep Networks + URL: https://arxiv.org/abs/1812.11703 + Title: SiamRPN++ Evolution of Siamese Visual Tracking with Very Deep Networks README: configs/sot/siamese_rpn/README.md Models: - - Name: siamese_rpn_r50_20e_lasot + - Name: siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot In Collection: SiameseRPN++ - Config: configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py + Config: configs/sot/siamese_rpn/siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py Metadata: Training Data: MSCOCO, ImageNet DET, ImageNet VID Training Memory (GB): 7.54 @@ -29,9 +29,9 @@ Models: Precision: 49.7 Weights: https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth - - Name: siamese_rpn_r50_20e_uav123 + - Name: siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123 In Collection: SiameseRPN++ - Config: configs/sot/siamese_rpn/siamese_rpn_r50_20e_uav123.py + Config: configs/sot/siamese_rpn/siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123.py Metadata: Training Data: MSCOCO, ImageNet DET, ImageNet VID Training Memory (GB): 7.54 @@ -45,9 +45,9 @@ Models: Precision: 80.3 Weights: https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_uav123/siamese_rpn_r50_20e_uav123_20220420_181845-dc2d4831.pth - - Name: siamese_rpn_r50_20e_trackingnet + - Name: siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet In Collection: SiameseRPN++ - Config: configs/sot/siamese_rpn/siamese_rpn_r50_20e_trackingnet.py + Config:
configs/sot/siamese_rpn/siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py Metadata: Training Data: MSCOCO, ImageNet DET, ImageNet VID Training Memory (GB): 7.54 @@ -61,9 +61,9 @@ Models: Precision: 63.2 Weights: https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_20e_lasot_20220420_181845-dd0f151e.pth - - Name: siamese_rpn_r50_20e_otb100 + - Name: siamese-rpn_resnet50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100 In Collection: SiameseRPN++ - Config: configs/sot/siamese_rpn/siamese_rpn_r50_20e_otb100.py + Config: configs/sot/siamese_rpn/siamese-rpn_resnet50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py Metadata: Training Data: MSCOCO, ImageNet DET, ImageNet VID Training Memory (GB): _ @@ -77,9 +77,9 @@ Models: Precision: 86.3 Weights: https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232-6b8f1730.pth - - Name: siamese_rpn_r50_20e_vot2018 + - Name: siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018 In Collection: SiameseRPN++ - Config: configs/sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py + Config: configs/sot/siamese_rpn/siamese-rpn_resnet50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018.py Metadata: Training Data: MSCOCO, ImageNet DET, ImageNet VID Training Memory (GB): _ diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py new file mode 100644 index 000000000..d8c7ea8e2 --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py @@ -0,0 +1,72 @@ +_base_ = [ + '../../_base_/datasets/otb100.py', + './siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py' +] + +crop_size = 511 +exemplar_size = 127 +search_size = 255 + +# model settings +model = dict( + test_cfg=dict(rpn=dict(penalty_k=0.4, window_influence=0.5, lr=0.4))) + +data_root = {{_base_.data_root}} +train_pipeline = [ + dict( + type='PairSampling', + frame_range=100, + pos_prob=0.8, + filter_template_img=False), + dict( + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict( + type='CropLikeSiamFC', + context_amount=0.5, + exemplar_size=exemplar_size, + crop_size=crop_size) + ]), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=dict(type='GrayAug', prob=0.2)), + dict( + type='SeqShiftScaleAug', + target_size=[exemplar_size, search_size], + shift=[4, 64], + scale=[0.05, 0.18]), + dict(type='SeqColorAug', prob=[1.0, 1.0]), + dict(type='SeqBlurAug', prob=[0.0, 0.2]), + dict(type='PackTrackInputs', ref_prefix='search', num_template_frames=1) +] + +# dataloader +train_dataloader = dict( + batch_size=16, + dataset=dict(datasets=[ + dict( + type='SOTImageNetVIDDataset', + data_root=data_root, + ann_file='ILSVRC/annotations/imagenet_vid_train.json', + data_prefix=dict(img_path='ILSVRC/Data/VID'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='coco/annotations/instances_train2017.json', + data_prefix=dict(img_path='coco/train2017'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='ILSVRC/annotations/imagenet_det_30plus1cls.json', + 
data_prefix=dict(img_path='ILSVRC/Data/DET'), + pipeline=train_pipeline, + test_mode=False) + ])) diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py new file mode 100644 index 000000000..18964240a --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py @@ -0,0 +1,173 @@ +_base_ = ['../../_base_/default_runtime.py'] + +randomness = dict(seed=1, deterministic=True) +find_unused_parameters = True +crop_size = 511 +exemplar_size = 127 +search_size = 255 + +# model settings +model = dict( + type='SiamRPN', + data_preprocessor=dict(type='TrackDataPreprocessor'), + backbone=dict( + type='SOTResNet', + depth=50, + out_indices=(1, 2, 3), + frozen_stages=4, + strides=(1, 2, 1, 1), + dilations=(1, 1, 2, 4), + norm_eval=True, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model' # noqa: E501 + )), + neck=dict( + type='mmdet.ChannelMapper', + in_channels=[512, 1024, 2048], + out_channels=256, + kernel_size=1, + norm_cfg=dict(type='BN'), + act_cfg=None), + head=dict( + type='SiameseRPNHead', + anchor_generator=dict( + type='SiameseRPNAnchorGenerator', + strides=[8], + ratios=[0.33, 0.5, 1, 2, 3], + scales=[8]), + in_channels=[256, 256, 256], + weighted_sum=True, + bbox_coder=dict( + type='mmdet.DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[1., 1., 1., 1.]), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0), + loss_bbox=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.2)), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='mmdet.MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.6, + match_low_quality=False, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + sampler=dict( + type='mmdet.RandomSampler', + num=64, + pos_fraction=0.25, + add_gt_as_proposals=False), + num_neg=16, + exemplar_size=exemplar_size, + search_size=search_size)), + test_cfg=dict( + exemplar_size=exemplar_size, + search_size=search_size, + context_amount=0.5, + center_size=7, + rpn=dict(penalty_k=0.05, window_influence=0.42, lr=0.38))) + +# data pipeline +data_root = 'data/' +train_pipeline = [ + dict( + type='PairSampling', + frame_range=100, + pos_prob=0.8, + filter_template_img=False), + dict( + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict( + type='CropLikeSiamFC', + context_amount=0.5, + exemplar_size=exemplar_size, + crop_size=crop_size) + ]), + dict( + type='SeqShiftScaleAug', + target_size=[exemplar_size, search_size], + shift=[4, 64], + scale=[0.05, 0.18]), + dict(type='SeqColorAug', prob=[1.0, 1.0]), + dict(type='SeqBlurAug', prob=[0.0, 0.2]), + dict(type='PackTrackInputs', ref_prefix='search', num_template_frames=1) +] + +# dataloader +train_dataloader = dict( + batch_size=28, + num_workers=4, + persistent_workers=True, + sampler=dict(type='QuotaSampler', samples_per_epoch=600000), + dataset=dict( + type='RandomSampleConcatDataset', + dataset_sampling_weights=[0.25, 0.2, 0.55], + datasets=[ + dict( + type='SOTImageNetVIDDataset', + data_root=data_root, + ann_file='ILSVRC/annotations/imagenet_vid_train.json', + data_prefix=dict(img_path='ILSVRC/Data/VID'), + pipeline=train_pipeline, + test_mode=False), + dict( + 
type='SOTCocoDataset', + data_root=data_root, + ann_file='coco/annotations/instances_train2017.json', + data_prefix=dict(img_path='coco/train2017'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='ILSVRC/annotations/imagenet_det_30plus1cls.json', + data_prefix=dict(img_path='ILSVRC/Data/DET'), + pipeline=train_pipeline, + test_mode=False) + ])) + +# runner loop +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=10, val_interval=1) + +# learning policy +param_scheduler = [ + dict( + type='SiamRPNExpLR', + start_factor=0.2, + end_factor=1.0, + by_epoch=True, + begin=0, + end=5, + endpoint=False), + dict( + type='SiamRPNExpLR', + start_factor=1.0, + end_factor=0.1, + by_epoch=True, + begin=5, + end=20, + endpoint=True) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=10.0, norm_type=2), + paramwise_cfg=dict( + custom_keys=dict(backbone=dict(lr_mult=0.1, decay_mult=1.0)))) + +custom_hooks = [ + dict( + type='SiamRPNBackboneUnfreezeHook', + backbone_start_train_epoch=10, + backbone_train_layers=['layer2', 'layer3', 'layer4']) +] diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py new file mode 100644 index 000000000..a501b967b --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py @@ -0,0 +1,4 @@ +_base_ = [ + '../../_base_/datasets/lasot.py', + './siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py' +] diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py new file mode 100644 index 000000000..88a26e093 --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-trackingnet.py @@ -0,0 +1,8 @@ +_base_ = [ + '../../_base_/datasets/trackingnet.py', + './siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py' +] + +# evaluator +val_evaluator = dict(outfile_prefix='results/siamese_rpn_trackingnet') +test_evaluator = val_evaluator diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123.py new file mode 100644 index 000000000..67a7abdf5 --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-uav123.py @@ -0,0 +1,8 @@ +_base_ = [ + '../../_base_/datasets/uav123.py', + './siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py' +] + +# model settings +model = dict( + test_cfg=dict(rpn=dict(penalty_k=0.1, window_influence=0.1, lr=0.5))) diff --git a/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018.py b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018.py new file mode 100644 index 000000000..4bfbc2322 --- /dev/null +++ b/configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-vot2018.py @@ -0,0 +1,10 @@ +_base_ = [ + '../../_base_/datasets/vot2018.py', + './siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_base.py' +] + +# model settings +model = dict( + test_cfg=dict( + 
rpn=dict(penalty_k=0.04, window_influence=0.44, lr=0.33), + test_mode='VOT')) diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py b/configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py deleted file mode 100644 index 239ce882d..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py +++ /dev/null @@ -1,202 +0,0 @@ -cudnn_benchmark = False -deterministic = True -seed = 1 -find_unused_parameters = True -crop_size = 511 -exemplar_size = 127 -search_size = 255 - -# model settings -model = dict( - type='SiamRPN', - backbone=dict( - type='SOTResNet', - depth=50, - out_indices=(1, 2, 3), - frozen_stages=4, - strides=(1, 2, 1, 1), - dilations=(1, 1, 2, 4), - norm_eval=True, - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model' # noqa: E501 - )), - neck=dict( - type='ChannelMapper', - in_channels=[512, 1024, 2048], - out_channels=256, - kernel_size=1, - norm_cfg=dict(type='BN'), - act_cfg=None), - head=dict( - type='SiameseRPNHead', - anchor_generator=dict( - type='SiameseRPNAnchorGenerator', - strides=[8], - ratios=[0.33, 0.5, 1, 2, 3], - scales=[8]), - in_channels=[256, 256, 256], - weighted_sum=True, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[1., 1., 1., 1.]), - loss_cls=dict( - type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), - loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1.2)), - train_cfg=dict( - rpn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.6, - neg_iou_thr=0.3, - min_pos_iou=0.6, - match_low_quality=False), - sampler=dict( - type='RandomSampler', - num=64, - pos_fraction=0.25, - add_gt_as_proposals=False), - num_neg=16, - exemplar_size=exemplar_size, - search_size=search_size)), - test_cfg=dict( - exemplar_size=exemplar_size, - search_size=search_size, - context_amount=0.5, - center_size=7, - rpn=dict(penalty_k=0.05, window_influence=0.42, lr=0.38))) - -data_root = 'data/' -train_pipeline = [ - dict( - type='PairSampling', - frame_range=100, - pos_prob=0.8, - filter_template_img=False), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False), - dict( - type='SeqCropLikeSiamFC', - context_amount=0.5, - exemplar_size=exemplar_size, - crop_size=crop_size), - dict( - type='SeqShiftScaleAug', - target_size=[exemplar_size, search_size], - shift=[4, 64], - scale=[0.05, 0.18]), - dict(type='SeqColorAug', prob=[1.0, 1.0]), - dict(type='SeqBlurAug', prob=[0.0, 0.2]), - dict(type='VideoCollect', keys=['img', 'gt_bboxes', 'is_positive_pairs']), - dict(type='ConcatSameTypeFrames'), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') -] -test_pipeline = [ - dict(type='LoadImageFromFile', to_float32=True), - dict(type='LoadAnnotations', with_bbox=True, with_label=False), - dict( - type='MultiScaleFlipAug', - scale_factor=1, - flip=False, - transforms=[ - dict(type='VideoCollect', keys=['img', 'gt_bboxes']), - dict(type='ImageToTensor', keys=['img']) - ]) -] -# dataset settings -data = dict( - samples_per_gpu=28, - workers_per_gpu=4, - persistent_workers=True, - samples_per_epoch=600000, - train=dict( - type='RandomSampleConcatDataset', - dataset_sampling_weights=[0.25, 0.2, 0.55], - dataset_cfgs=[ - dict( - type='SOTImageNetVIDDataset', - ann_file=data_root + - 'ILSVRC/annotations/imagenet_vid_train.json', - img_prefix=data_root + 'ILSVRC/Data/VID', - pipeline=train_pipeline, - split='train', - test_mode=False), - 
dict( - type='SOTCocoDataset', - ann_file=data_root + - 'coco/annotations/instances_train2017.json', - img_prefix=data_root + 'coco/train2017', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='SOTCocoDataset', - ann_file=data_root + - 'ILSVRC/annotations/imagenet_det_30plus1cls.json', - img_prefix=data_root + 'ILSVRC/Data/DET', - pipeline=train_pipeline, - split='train', - test_mode=False) - ]), - val=dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_test_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark', - pipeline=test_pipeline, - split='test', - test_mode=True, - only_eval_visible=True), - test=dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_test_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark', - pipeline=test_pipeline, - split='test', - test_mode=True, - only_eval_visible=True)) -# optimizer -optimizer = dict( - type='SGD', - lr=0.005, - momentum=0.9, - weight_decay=0.0001, - paramwise_cfg=dict( - custom_keys=dict(backbone=dict(lr_mult=0.1, decay_mult=1.0)))) -optimizer_config = dict( - type='SiameseRPNOptimizerHook', - backbone_start_train_epoch=10, - backbone_train_layers=['layer2', 'layer3', 'layer4'], - grad_clip=dict(max_norm=10.0, norm_type=2)) -# learning policy -lr_config = dict( - policy='SiameseRPN', - lr_configs=[ - dict(type='step', start_lr_factor=0.2, end_lr_factor=1.0, end_epoch=5), - dict(type='log', start_lr_factor=1.0, end_lr_factor=0.1, end_epoch=20), - ]) -# checkpoint saving -checkpoint_config = dict(interval=1) -evaluation = dict( - metric=['track'], - interval=1, - start=10, - rule='greater', - save_best='success') -# yapf:disable -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - # dict(type='TensorboardLoggerHook') - ]) -# yapf:enable -# runtime settings -total_epochs = 20 -dist_params = dict(backend='nccl') -log_level = 'INFO' -work_dir = './work_dirs/xxx' -load_from = None -resume_from = None -workflow = [('train', 1)] diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_otb100.py b/configs/sot/siamese_rpn/siamese_rpn_r50_20e_otb100.py deleted file mode 100644 index 4d90abec7..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_otb100.py +++ /dev/null @@ -1,73 +0,0 @@ -_base_ = ['./siamese_rpn_r50_20e_lasot.py'] - -crop_size = 511 -exemplar_size = 127 -search_size = 255 - -# model settings -model = dict( - test_cfg=dict(rpn=dict(penalty_k=0.4, window_influence=0.5, lr=0.4))) - -data_root = 'data/' -train_pipeline = [ - dict( - type='PairSampling', - frame_range=100, - pos_prob=0.8, - filter_template_img=False), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False), - dict( - type='SeqCropLikeSiamFC', - context_amount=0.5, - exemplar_size=exemplar_size, - crop_size=crop_size), - dict(type='SeqGrayAug', prob=0.2), - dict( - type='SeqShiftScaleAug', - target_size=[exemplar_size, search_size], - shift=[4, 64], - scale=[0.05, 0.18]), - dict(type='SeqColorAug', prob=[1.0, 1.0]), - dict(type='SeqBlurAug', prob=[0.0, 0.2]), - dict(type='VideoCollect', keys=['img', 'gt_bboxes', 'is_positive_pairs']), - dict(type='ConcatSameTypeFrames'), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') -] -# dataset settings -data = dict( - samples_per_gpu=16, - train=dict(dataset_cfgs=[ - dict( - type='SOTImageNetVIDDataset', - ann_file=data_root + 'ILSVRC/annotations/imagenet_vid_train.json', - img_prefix=data_root + 'ILSVRC/Data/VID', - 
pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='SOTCocoDataset', - ann_file=data_root + 'coco/annotations/instances_train2017.json', - img_prefix=data_root + 'coco/train2017', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='SOTCocoDataset', - ann_file=data_root + - 'ILSVRC/annotations/imagenet_det_30plus1cls.json', - img_prefix=data_root + 'ILSVRC/Data/DET', - pipeline=train_pipeline, - split='train', - test_mode=False) - ]), - val=dict( - type='OTB100Dataset', - ann_file=data_root + 'otb100/annotations/otb100_infos.txt', - img_prefix=data_root + 'otb100', - only_eval_visible=False), - test=dict( - type='OTB100Dataset', - ann_file=data_root + 'otb100/annotations/otb100_infos.txt', - img_prefix=data_root + 'otb100', - only_eval_visible=False)) diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_trackingnet.py b/configs/sot/siamese_rpn/siamese_rpn_r50_20e_trackingnet.py deleted file mode 100644 index bb33ebef0..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_trackingnet.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = ['./siamese_rpn_r50_20e_lasot.py'] - -data_root = 'data/' -# dataset settings -data = dict( - test=dict( - type='TrackingNetDataset', - ann_file=data_root + - 'trackingnet/annotations/trackingnet_test_infos.txt', - img_prefix=data_root + 'trackingnet')) diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_uav123.py b/configs/sot/siamese_rpn/siamese_rpn_r50_20e_uav123.py deleted file mode 100644 index 27e3dcbd1..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_uav123.py +++ /dev/null @@ -1,19 +0,0 @@ -_base_ = ['./siamese_rpn_r50_20e_lasot.py'] - -# model settings -model = dict( - test_cfg=dict(rpn=dict(penalty_k=0.1, window_influence=0.1, lr=0.5))) - -data_root = 'data/' -# dataset settings -data = dict( - val=dict( - type='UAV123Dataset', - ann_file=data_root + 'UAV123/annotations/uav123_infos.txt', - img_prefix=data_root + 'UAV123', - only_eval_visible=False), - test=dict( - type='UAV123Dataset', - ann_file=data_root + 'UAV123/annotations/uav123_infos.txt', - img_prefix=data_root + 'UAV123', - only_eval_visible=False)) diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py b/configs/sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py deleted file mode 100644 index ab62e33d5..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py +++ /dev/null @@ -1,23 +0,0 @@ -_base_ = ['./siamese_rpn_r50_20e_lasot.py'] - -# model settings -model = dict( - test_cfg=dict( - rpn=dict(penalty_k=0.04, window_influence=0.44, lr=0.33), - test_mode='VOT')) - -data_root = 'data/' -# dataset settings -data = dict( - val=dict( - type='VOTDataset', - dataset_type='vot2018', - ann_file=data_root + 'vot2018/annotations/vot2018_infos.txt', - img_prefix=data_root + 'vot2018'), - test=dict( - type='VOTDataset', - dataset_type='vot2018', - ann_file=data_root + 'vot2018/annotations/vot2018_infos.txt', - img_prefix=data_root + 'vot2018')) -evaluation = dict( - metric=['track'], interval=1, start=10, rule='greater', save_best='eao') diff --git a/configs/sot/siamese_rpn/siamese_rpn_r50_fp16_20e_lasot.py b/configs/sot/siamese_rpn/siamese_rpn_r50_fp16_20e_lasot.py deleted file mode 100644 index 98edae0d7..000000000 --- a/configs/sot/siamese_rpn/siamese_rpn_r50_fp16_20e_lasot.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = ['./siamese_rpn_r50_20e_lasot.py'] -optimizer_config = dict(type='SiameseRPNFp16OptimizerHook') -fp16 = dict(loss_scale=512.) 
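One note on the deleted `siamese_rpn_r50_fp16_20e_lasot.py` just above: it enabled mixed precision through the 0.x-style `fp16` field plus `SiameseRPNFp16OptimizerHook`, neither of which exists in the 1.x layout this patch moves to. A minimal sketch of a 1.x-style counterpart is shown below; it is an assumption on our part rather than a file added by this patch, reusing mmengine's `AmpOptimWrapper` and carrying over the old `loss_scale` value.

```python
# Hypothetical FP16 counterpart of the new LaSOT config (a sketch, not part of this patch).
# mmengine's AmpOptimWrapper takes over the roles of the removed
# `fp16 = dict(loss_scale=512.)` field and SiameseRPNFp16OptimizerHook.
_base_ = ['./siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py']

# Only the wrapper type and loss scale change; the optimizer, gradient clipping and
# paramwise settings are inherited from the base config's `optim_wrapper`.
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=512.)
```

If an FP16 variant is meant to survive the migration, something along these lines would be its natural shape in the new config style.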
diff --git a/configs/sot/stark/README.md b/configs/sot/stark/README.md index fa6b1399d..752940fc0 100644 --- a/configs/sot/stark/README.md +++ b/configs/sot/stark/README.md @@ -28,11 +28,13 @@ In this paper, we present a new tracking architecture with an encoder-decoder tr ## Results and models -The STARK is trained in 2 stages. We denote the 1st-stage model as `STARK-ST1`, and denote the 2nd-stage model as `STARK-ST2`. The following models we provide are the last-epoch models by default. - -Models from the 2 stages have different configurations. For example, `stark_st1_r50_500e_got10k` is the configuration of the 1st-stage model and `stark_st2_r50_50e_got10k` is the configuration of the 2nd-stage model. +``` +bash ./tools/dist_train.sh \ + ${CONFIG_FILE} \ + ${GPU_NUM} \ +``` -Note: We have to pass an extra parameter `cfg-options` containing the key `load_from` from shell command to load the pretrained 1st-stage model when training the 2nd-stage model. Here is an example: +When training the 2nd-stage model, we have to pass an extra parameter `cfg-options` containing the key `load_from` from shell command to load the pretrained 1st-stage model. Here is an example: ``` bash ./tools/dist_train.sh \ @@ -45,25 +47,136 @@ bash ./tools/dist_train.sh \ We provide the last-epoch model with its configuration and training log. -| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-----------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 67.0 | 77.3 | 71.7 | [config](stark_st1_r50_500e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654.log.json) | -| STARK-ST2 | R-50 | - | 50e | 2.31 | - | 67.8 | 78.5 | 73.0 | [config](stark_st2_r50_50e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :---------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 67.0 | 77.3 | 71.7 | [config](stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth) 
\| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654.log.json) | +| STARK-ST2 | R-50 | - | 50e | 2.31 | - | 67.8 | 78.5 | 73.0 | [config](stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201.log.json) | ### TrackingNet -The results of STARK in TrackingNet are reimplemented by ourselves. The last-epoch model on TrackingNet is submitted to [the evaluation server on TrackingNet Challenge](http://eval.tracking-net.org/web/challenges/challenge-page/39/submission). We provide the model with its configuration and training log. +The results of STARK in TrackingNet are reimplemented by ourselves. The last-epoch model on LaSOT is submitted to [the evaluation server on TrackingNet Challenge](https://eval.ai/web/challenges/challenge-page/1805/). We provide the model with its configuration and training log. -| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-----------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 80.3 | 85.0 | 77.7 | [config](stark_st1_r50_500e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654.log.json) | -| STARK-ST2 | R-50 | - | 50e | 2.31 | - | 81.4 | 86.2 | 79.0 | [config](stark_st2_r50_50e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :---------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 80.3 | 85.0 | 77.7 | [config](stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654.log.json) | +| STARK-ST2 | R-50 | 
- | 50e | 2.31 | - | 81.4 | 86.2 | 79.0 | [config](stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201.log.json) | ### GOT10k -The results of STARK in GOT10k are reimplemented by ourselves. The last-epoch model on GOT10k is submitted to [the evaluation server on GOT10k Challenge](http://got-10k.aitestunion.com/). We provide the model with its configuration and training log. +The results of STARK in GOT10k are reimplemented by ourselves. The last-epoch model is submitted to [the evaluation server on GOT10k Challenge](http://got-10k.aitestunion.com/). We provide the model with its configuration and training log. + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AO | SR0.5 | SR0.75 | Config | Download | +| :-------: | :------: | :---: | :-----: | :------: | :------------: | :--: | :--------------: | :---------------: | :-------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 68.1 | 77.4 | 62.4 | [config](stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-got10k.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_got10k/stark_st1_r50_500e_got10k_20220223_125400-40ead158.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_got10k/stark_st1_r50_500e_got10k_20220223_125400.log.json) | +| STARK-ST2 | R-50 | - | 50e | 2.31 | - | 68.3 | 77.6 | 62.7 | [config](stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-got10k.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_got10k/stark_st2_r50_50e_got10k_20220226_124213-ee39bbff.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_got10k/stark_st2_r50_50e_got10k_20220226_124213.log.json) | + +## Get started + +### 1. Training + +Because hyperparameters such as the learning rate in the default configuration files are tuned for 8 GPUs, we recommend training with 8 GPUs in order to reproduce the reported accuracy. + +The STARK is trained in 2 stages. We denote the 1st-stage model as `STARK-ST1`, and denote the 2nd-stage model as `STARK-ST2`. The models we provide below are the last-epoch models by default. + +Models from the 2 stages have different configurations. For example, `stark-st1_r50_8xb16-500e_got10k.py` is the configuration of the 1st-stage model and `stark-st2_r50_8xb16-50e_got10k.py` is the configuration of the 2nd-stage model. The following is an example of training STARK on the GOT10k dataset. Training on LaSOT and TrackingNet is similar. + +**Note** that STARK cannot be trained with PyTorch 1.8.0 because of a potential [bug](https://github.com/pytorch/pytorch/pull/52944/files) in the AdamW optimizer. + +**Training the 1st-stage model** + +```shell +# Train STARK-ST1 on the GOT10k dataset with the following command. +# The number after the config file represents the number of GPUs used.
Here we use 8 GPUs +./tools/dist_train.sh \ + configs/sot/stark/stark-st1_r50_8xb16-500e_got10k.py 8 +``` + +**Training the 2nd-stage model** + +When training the 2nd-stage model, we have to pass an extra parameter `cfg-options` containing the key `load_from` from shell command to load the pretrained 1st-stage model. Here is an example: + +```shell +# Train STARK-ST2 on the GOT10k dataset with the following command. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs +./tools/dist_train.sh \ + configs/sot/stark/stark-st2_r50_8xb16-50e_got10k.py 8 \ + --cfg-options load_from=${STARK-ST1 model} +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +**2.1 Example on LaSOT dataset** + +```shell +# Example 1: Test STARK-ST1 on LaSOT testset +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py 8 \ + --checkpoint ./checkpoints/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth +``` + +```shell +# Example 2: Test STARK-ST2 on LaSOT testset +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py 8 \ + --checkpoint ./checkpoints/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth +``` + +**2.2 Examples on TrackingNet and GOT10k datasets** + +If you want to get results on the [TrackingNet](https://eval.ai/web/challenges/challenge-page/1805/) and [GOT10k](http://got-10k.aitestunion.com/) benchmarks, please use the following commands to generate result files that can be submitted to the evaluation servers. You can modify the saved path in `test_evaluator` of the config (see the command-line sketch after the inference example below). + +```shell +# Example 3: Test STARK-ST1 on TrackingNet testset. +# The result is stored in `./results/stark_st1_trackingnet.zip` by default. +# We use the checkpoint trained with the LaSOT config to test on TrackingNet. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py 8 \ + --checkpoint ./checkpoints/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth +``` + +```shell +# Example 4: Test STARK-ST1 on GOT10k testset. +# The result is stored in `./results/stark_st1_got10k.zip` by default. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st1_r50_8xb16-500e_got10k.py 8 \ + --checkpoint ./checkpoints/stark_st1_r50_500e_got10k_20220223_125400-40ead158.pth +``` + +```shell +# Example 5: Test STARK-ST2 on TrackingNet testset. +# The result is stored in `./results/stark_st2_trackingnet.zip` by default. +# We use the checkpoint trained with the LaSOT config to test on TrackingNet. +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py 8 \ + --checkpoint ./checkpoints/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth +``` + +```shell +# Example 6: Test STARK-ST2 on GOT10k testset. +# The result is stored in `./results/stark_st2_got10k.zip` by default. +# The number after the config file represents the number of GPUs used.
Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/sot/stark/stark-st2_r50_8xb16-50e_got10k.py 8 \ + --checkpoint ./checkpoints/stark_st2_r50_50e_got10k_20220226_124213-ee39bbff.pth +``` + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a video. + +```shell +python demo/demo_sot.py \ + configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py \ + --checkpoint ./checkpoints/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth \ + --input demo/demo.mp4 \ + --output sot.mp4 +``` -| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | Success | Norm precision | Precision | Config | Download | -| :-------: | :------: | :---: | :-----: | :------: | :------------: | :-----: | :------------: | :-------: | :-----------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| STARK-ST1 | R-50 | - | 500e | 8.45 | - | 68.1 | 77.4 | 62.4 | [config](stark_st1_r50_500e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_got10k/stark_st1_r50_500e_got10k_20220223_125400-40ead158.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_got10k/stark_st1_r50_500e_got10k_20220223_125400.log.json) | -| STARK-ST2 | R-50 | - | 50e | 2.31 | - | 68.3 | 77.6 | 62.7 | [config](stark_st2_r50_50e_lasot.py) | [model](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_got10k/stark_st2_r50_50e_got10k_20220226_124213-ee39bbff.pth) \| [log](https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_got10k/stark_st2_r50_50e_got10k_20220226_124213.log.json) | +If you want to know about more detailed usage of `demo_sot.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md).
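Supplementary to section 2.2 above: the result path does not have to be edited inside the config file. The following is only a sketch; it assumes that `dist_test.sh` forwards `--cfg-options` to `tools/test.py` the same way `dist_train.sh` does, and it reuses the `test_evaluator.outfile_prefix` key shown in the TrackingNet test configs. The output path is illustrative.

```shell
# Sketch: override the submission-file prefix from the command line instead of editing the config.
# `results/my_stark_st2_trackingnet` is a hypothetical output path; adjust it to your setup.
./tools/dist_test.sh \
    configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py 8 \
    --checkpoint ./checkpoints/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth \
    --cfg-options test_evaluator.outfile_prefix=results/my_stark_st2_trackingnet
```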
diff --git a/configs/sot/stark/metafile.yml b/configs/sot/stark/metafile.yml index eb9ebec15..657b49dc8 100644 --- a/configs/sot/stark/metafile.yml +++ b/configs/sot/stark/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/abs/2103.17154 - Title: Learning Spatio-Temporal Transformer for Visual Tracking + URL: https://arxiv.org/abs/2103.17154 + Title: Learning Spatio-Temporal Transformer for Visual Tracking README: configs/sot/stark/README.md Models: - - Name: stark_st1_r50_500e_got10k + - Name: stark-st1_resnet50_8xb16-500e_got10k In Collection: STARK - Config: configs/sot/stark/stark_st1_r50_500e_got10k.py + Config: configs/sot/stark/stark-st1_resnet50_8xb16-500e_got10k.py Metadata: Training Data: GOT10k Training Memory (GB): 8.45 @@ -29,9 +29,9 @@ Models: SR0.75: 62.4 Weights: https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_got10k/stark_st1_r50_500e_got10k_20220223_125400-40ead158.pth - - Name: stark_st2_r50_50e_got10k + - Name: stark-st2_resnet50_8xb16-50e_got10k In Collection: STARK - Config: configs/sot/stark/stark_st2_r50_50e_got10k.py + Config: configs/sot/stark/stark-st2_resnet50_8xb16-50e_got10k.py Metadata: Training Data: GOT10k Training Memory (GB): 2.31 @@ -45,9 +45,9 @@ Models: SR0.75: 62.7 Weights: https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_got10k/stark_st2_r50_50e_got10k_20220226_124213-ee39bbff.pth - - Name: stark_st1_r50_500e_lasot + - Name: stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot In Collection: STARK - Config: configs/sot/stark/stark_st1_r50_500e_lasot.py + Config: configs/sot/stark/stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py Metadata: Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO Training Memory (GB): 8.45 @@ -61,9 +61,9 @@ Models: Precision: 71.7 Weights: https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_lasot/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth - - Name: stark_st2_r50_50e_lasot + - Name: stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot In Collection: STARK - Config: configs/sot/stark/stark_st2_r50_50e_lasot.py + Config: configs/sot/stark/stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py Metadata: Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO Training Memory (GB): 2.31 @@ -77,9 +77,9 @@ Models: Precision: 73.0 Weights: https://download.openmmlab.com/mmtracking/sot/stark/stark_st2_r50_50e_lasot/stark_st2_r50_50e_lasot_20220416_170201-b1484149.pth - - Name: stark_st1_r50_500e_trackingnet + - Name: stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet In Collection: STARK - Config: configs/sot/stark/stark_st1_r50_500e_trackingnet.py + Config: configs/sot/stark/stark-st1_resnet50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py Metadata: Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO Training Memory (GB): 8.45 @@ -93,9 +93,9 @@ Models: Precision: 77.7 Weights: https://download.openmmlab.com/mmtracking/sot/stark/stark_st1_r50_500e_trackingnet/stark_st1_r50_500e_lasot_20220414_185654-9c19e39e.pth - - Name: stark_st2_r50_50e_trackingnet + - Name: stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet In Collection: STARK - Config: configs/sot/stark/stark_st2_r50_50e_trackingnet.py + Config: configs/sot/stark/stark-st2_resnet50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py Metadata: Training Data: GOT10k, LaSOT, TrackingNet, MSCOCO Training Memory (GB): 2.31 
diff --git a/configs/sot/stark/stark_st1_r50_500e_got10k.py b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py similarity index 52% rename from configs/sot/stark/stark_st1_r50_500e_got10k.py rename to configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py index 789b108e6..d963d714f 100644 --- a/configs/sot/stark/stark_st1_r50_500e_got10k.py +++ b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py @@ -1,12 +1,17 @@ -cudnn_benchmark = False -deterministic = True -seed = 1 +_base_ = ['../../_base_/default_runtime.py'] -# model setting +randomness = dict(seed=1, deterministic=True) + +# model settings model = dict( type='Stark', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), backbone=dict( - type='ResNet', + type='mmdet.ResNet', depth=50, num_stages=3, strides=(1, 2, 2), @@ -17,7 +22,7 @@ norm_cfg=dict(type='BN', requires_grad=False), init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), neck=dict( - type='ChannelMapper', + type='mmdet.ChannelMapper', in_channels=[1024], out_channels=256, kernel_size=1, @@ -28,7 +33,7 @@ transformer=dict( type='StarkTransformer', encoder=dict( - type='DetrTransformerEncoder', + type='mmdet.DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', @@ -46,7 +51,7 @@ ffn_drop=0.1), operation_order=('self_attn', 'norm', 'ffn', 'norm'))), decoder=dict( - type='DetrTransformerDecoder', + type='mmdet.DetrTransformerDecoder', return_intermediate=False, num_layers=6, transformerlayers=dict( @@ -65,21 +70,22 @@ 'ffn', 'norm'))), ), positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), bbox_head=dict( type='CornerPredictorHead', inplanes=256, channel=256, feat_size=20, stride=16), - loss_bbox=dict(type='L1Loss', loss_weight=5.0), - loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=5.0), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=2.0)), test_cfg=dict( search_factor=5.0, search_size=320, template_factor=2.0, template_size=128, - update_intervals=[200])) + num_templates=2)) data_root = 'data/' train_pipeline = [ @@ -90,14 +96,15 @@ max_frame_range=[200], cls_pos_prob=0.5, train_cls_head=False), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False), - dict(type='SeqGrayAug', prob=0.05), dict( - type='SeqRandomFlip', - share_params=True, - flip_ratio=0.5, - direction='horizontal'), + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='GrayAug', prob=0.05), + dict(type='mmdet.RandomFlip', prob=0.5, direction='horizontal') + ]), dict( type='SeqBboxJitter', center_jitter_factor=[0, 0, 4.5], @@ -107,105 +114,68 @@ type='SeqCropLikeStark', crop_size_factor=[2, 2, 5], output_size=[128, 128, 320]), - dict(type='SeqBrightnessAug', jitter_range=0.2), - dict( - type='SeqRandomFlip', - share_params=False, - flip_ratio=0.5, - direction='horizontal'), dict( - type='SeqNormalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), + type='TransformBroadcaster', + share_random_params=True, + transforms=[dict(type='BrightnessAug', jitter_range=0.2)]), 
dict(type='CheckPadMaskValidity', stride=16), - dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'padding_mask'], - meta_keys=('valid')), - dict(type='ConcatSameTypeFrames', num_key_frames=2), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') + dict(type='PackTrackInputs', ref_prefix='search', num_template_frames=2) ] -img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) -test_pipeline = [ - dict(type='LoadImageFromFile', to_float32=True), - dict(type='LoadAnnotations', with_bbox=True, with_label=False), - dict( - type='MultiScaleFlipAug', - scale_factor=1, - flip=False, - transforms=[ - dict(type='Normalize', **img_norm_cfg), - dict(type='VideoCollect', keys=['img', 'gt_bboxes']), - dict(type='ImageToTensor', keys=['img']) - ]) -] # dataset settings -data = dict( - samples_per_gpu=16, - workers_per_gpu=8, +train_dataloader = dict( + batch_size=16, + num_workers=4, persistent_workers=True, - samples_per_epoch=60000, - train=dict( + sampler=dict(type='QuotaSampler', samples_per_epoch=60000), + dataset=dict( type='RandomSampleConcatDataset', - dataset_sampling_weights=[1], - dataset_cfgs=[ + dataset_sampling_weights=[1, 1, 1, 1], + datasets=[ dict( type='GOT10kDataset', - ann_file=data_root + - 'got10k/annotations/got10k_train_infos.txt', - img_prefix=data_root + 'got10k', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_vot_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='LaSOTDataset', + data_root=data_root, + ann_file='LaSOT_full/annotations/lasot_train_infos.txt', + data_prefix=dict(img_path='LaSOT_full/LaSOTBenchmark'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='TrackingNetDataset', + data_root=data_root, + ann_file='TrackingNet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='TrackingNet'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='coco/annotations/instances_train2017.json', + data_prefix=dict(img_path='coco/train2017'), pipeline=train_pipeline, - split='train', test_mode=False) - ]), - val=dict( - type='GOT10kDataset', - ann_file=data_root + 'got10k/annotations/got10k_test_infos.txt', - img_prefix=data_root + 'got10k', - pipeline=test_pipeline, - split='test', - test_mode=True), - test=dict( - type='GOT10kDataset', - ann_file=data_root + 'got10k/annotations/got10k_test_infos.txt', - img_prefix=data_root + 'got10k', - pipeline=test_pipeline, - split='test', - test_mode=True)) + ])) + +# runner loop +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=500, val_begin=500, val_interval=1) + +# learning policy +param_scheduler = dict(type='MultiStepLR', milestones=[400], gamma=0.1) # optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.0001, +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), paramwise_cfg=dict( custom_keys=dict(backbone=dict(lr_mult=0.1, decay_mult=1.0)))) -optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) -# learning policy -lr_config = dict(policy='step', step=[400]) + # checkpoint saving -checkpoint_config = dict(interval=100) -evaluation = dict( - metric=['track'], - interval=100, - start=501, - rule='greater', - save_best='success') -# yapf:disable -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - # dict(type='TensorboardLoggerHook') - ]) -# yapf:enable -# runtime 
settings -total_epochs = 500 -dist_params = dict(backend='nccl') -log_level = 'INFO' -work_dir = './work_dirs/xxx' -load_from = None -resume_from = None -workflow = [('train', 1)] +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=100)) diff --git a/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py new file mode 100644 index 000000000..afcd6b6f0 --- /dev/null +++ b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-lasot.py @@ -0,0 +1,4 @@ +_base_ = [ + '../../_base_/datasets/lasot.py', + './stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py' +] diff --git a/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py new file mode 100644 index 000000000..f5628614e --- /dev/null +++ b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_test-trackingnet.py @@ -0,0 +1,8 @@ +_base_ = [ + '../../_base_/datasets/trackingnet.py', + './stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py' +] + +# evaluator +val_evaluator = dict(outfile_prefix='results/stark_st1_trackingnet') +test_evaluator = val_evaluator diff --git a/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k.py b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k.py new file mode 100644 index 000000000..9802668bf --- /dev/null +++ b/configs/sot/stark/stark-st1_r50_8xb16-500e_got10k.py @@ -0,0 +1,22 @@ +_base_ = [ + '../../_base_/datasets/got10k.py', + './stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py' +] + +train_pipeline = {{_base_.train_pipeline}} + +# dataset settings +data_root = {{_base_.data_root}} +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False)) + +# evaluator +val_evaluator = dict(outfile_prefix='results/stark_st1_got10k') +test_evaluator = val_evaluator diff --git a/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py new file mode 100644 index 000000000..4b8279881 --- /dev/null +++ b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py @@ -0,0 +1,97 @@ +_base_ = ['./stark-st1_r50_8xb16-500e_got10k-lasot-trackingnet-coco_base.py'] + +# model setting +model = dict( + type='Stark', + head=dict( + type='StarkHead', + cls_head=dict( + type='ScoreHead', + input_dim=256, + hidden_dim=256, + output_dim=1, + num_layers=3, + use_bn=False), + frozen_modules=['transformer', 'bbox_head', 'query_embedding'], + loss_cls=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True)), + frozen_modules=['backbone', 'neck']) + +data_root = {{_base_.data_root}} +# the only difference of ``train_pipeline`` compared with that in stark_st1 is +# ``train_cls_head=True`` in ``TridentSampling``. 
+train_pipeline = [ + dict( + type='TridentSampling', + num_search_frames=1, + num_template_frames=2, + max_frame_range=[200], + cls_pos_prob=0.5, + train_cls_head=True), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='LoadTrackAnnotations', with_instance_id=False), + dict(type='GrayAug', prob=0.05), + dict(type='mmdet.RandomFlip', prob=0.5, direction='horizontal') + ]), + dict( + type='SeqBboxJitter', + center_jitter_factor=[0, 0, 4.5], + scale_jitter_factor=[0, 0, 0.5], + crop_size_factor=[2, 2, 5]), + dict( + type='SeqCropLikeStark', + crop_size_factor=[2, 2, 5], + output_size=[128, 128, 320]), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[dict(type='BrightnessAug', jitter_range=0.2)]), + dict(type='CheckPadMaskValidity', stride=16), + dict(type='PackTrackInputs', ref_prefix='search', num_template_frames=2) +] + +# dataset settings +train_dataloader = dict( + dataset=dict(datasets=[ + dict( + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_vot_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='LaSOTDataset', + data_root=data_root, + ann_file='LaSOT_full/annotations/lasot_train_infos.txt', + data_prefix=dict(img_path='LaSOT_full/LaSOTBenchmark'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='TrackingNetDataset', + data_root=data_root, + ann_file='TrackingNet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='TrackingNet'), + pipeline=train_pipeline, + test_mode=False), + dict( + type='SOTCocoDataset', + data_root=data_root, + ann_file='coco/annotations/instances_train2017.json', + data_prefix=dict(img_path='coco/train2017'), + pipeline=train_pipeline, + test_mode=False) + ])) + +# runner loop +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=50, val_interval=1) + +# learning policy +param_scheduler = dict(type='MultiStepLR', milestones=[40], gamma=0.1) + +# checkpoint saving +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=10)) diff --git a/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py new file mode 100644 index 000000000..e9488e359 --- /dev/null +++ b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-lasot.py @@ -0,0 +1,7 @@ +_base_ = [ + '../../_base_/datasets/lasot.py', + './stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py' +] + +# model setting +model = dict(test_cfg=dict(update_intervals=[300])) diff --git a/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py new file mode 100644 index 000000000..4044b3c9a --- /dev/null +++ b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_test-trackingnet.py @@ -0,0 +1,11 @@ +_base_ = [ + '../../_base_/datasets/trackingnet.py', + './stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py' +] + +# model setting +model = dict(test_cfg=dict(update_intervals=[25])) + +# evaluator +val_evaluator = dict(outfile_prefix='results/stark_st2_trackingnet') +test_evaluator = val_evaluator diff --git a/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k.py b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k.py new file mode 100644 index 
000000000..56beba266 --- /dev/null +++ b/configs/sot/stark/stark-st2_r50_8xb16-50e_got10k.py @@ -0,0 +1,24 @@ +_base_ = [ + '../../_base_/datasets/got10k.py', + './stark-st2_r50_8xb16-50e_got10k-lasot-trackingnet-coco_base.py' +] + +model = dict(test_cfg=dict(update_intervals=[200])) + +train_pipeline = {{_base_.train_pipeline}} + +# dataset settings +data_root = {{_base_.data_root}} +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False)) + +# evaluator +val_evaluator = dict(outfile_prefix='results/stark_st2_got10k') +test_evaluator = val_evaluator diff --git a/configs/sot/stark/stark_st1_r50_500e_lasot.py b/configs/sot/stark/stark_st1_r50_500e_lasot.py deleted file mode 100644 index 2e0f1ac15..000000000 --- a/configs/sot/stark/stark_st1_r50_500e_lasot.py +++ /dev/null @@ -1,90 +0,0 @@ -_base_ = ['./stark_st1_r50_500e_got10k.py'] - -data_root = 'data/' -train_pipeline = [ - dict( - type='TridentSampling', - num_search_frames=1, - num_template_frames=2, - max_frame_range=[200], - cls_pos_prob=0.5, - train_cls_head=False), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=False), - dict(type='SeqGrayAug', prob=0.05), - dict( - type='SeqRandomFlip', - share_params=True, - flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqBboxJitter', - center_jitter_factor=[0, 0, 4.5], - scale_jitter_factor=[0, 0, 0.5], - crop_size_factor=[2, 2, 5]), - dict( - type='SeqCropLikeStark', - crop_size_factor=[2, 2, 5], - output_size=[128, 128, 320]), - dict(type='SeqBrightnessAug', jitter_range=0.2), - dict( - type='SeqRandomFlip', - share_params=False, - flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqNormalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='CheckPadMaskValidity', stride=16), - dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'padding_mask'], - meta_keys=('valid')), - dict(type='ConcatSameTypeFrames', num_key_frames=2), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') -] - -# dataset settings -data = dict( - train=dict( - type='RandomSampleConcatDataset', - dataset_sampling_weights=[1, 1, 1, 1], - dataset_cfgs=[ - dict( - type='GOT10kDataset', - ann_file=data_root + - 'got10k/annotations/got10k_train_infos.txt', - img_prefix=data_root + 'got10k', - pipeline=train_pipeline, - split='train_vot', - test_mode=False), - dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_train_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='TrackingNetDataset', - ann_file=data_root + - 'trackingnet/annotations/trackingnet_train_infos.txt', - img_prefix=data_root + 'trackingnet', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='SOTCocoDataset', - ann_file=data_root + - 'coco/annotations/instances_train2017.json', - img_prefix=data_root + 'coco/train2017', - pipeline=train_pipeline, - split='train', - test_mode=False) - ]), - test=dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_test_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark')) diff --git a/configs/sot/stark/stark_st1_r50_500e_trackingnet.py b/configs/sot/stark/stark_st1_r50_500e_trackingnet.py deleted file mode 100644 index 
76c6aba9b..000000000 --- a/configs/sot/stark/stark_st1_r50_500e_trackingnet.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = ['./stark_st1_r50_500e_lasot.py'] - -data_root = 'data/' -data = dict( - test=dict( - type='TrackingNetDataset', - ann_file=data_root + - 'trackingnet/annotations/trackingnet_test_infos.txt', - img_prefix=data_root + 'trackingnet')) diff --git a/configs/sot/stark/stark_st2_r50_50e_got10k.py b/configs/sot/stark/stark_st2_r50_50e_got10k.py deleted file mode 100644 index c8226e14c..000000000 --- a/configs/sot/stark/stark_st2_r50_50e_got10k.py +++ /dev/null @@ -1,85 +0,0 @@ -_base_ = ['./stark_st1_r50_500e_got10k.py'] - -# model setting -model = dict( - type='Stark', - head=dict( - type='StarkHead', - cls_head=dict( - type='ScoreHead', - input_dim=256, - hidden_dim=256, - output_dim=1, - num_layers=3, - use_bn=False), - frozen_modules=['transformer', 'bbox_head', 'query_embedding'], - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True)), - frozen_modules=['backbone', 'neck']) - -data_root = 'data/' -train_pipeline = [ - dict( - type='TridentSampling', - num_search_frames=1, - num_template_frames=2, - max_frame_range=[200], - cls_pos_prob=0.5, - train_cls_head=True), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=True), - dict(type='SeqGrayAug', prob=0.05), - dict( - type='SeqRandomFlip', - share_params=True, - flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqBboxJitter', - center_jitter_factor=[0, 0, 4.5], - scale_jitter_factor=[0, 0, 0.5], - crop_size_factor=[2, 2, 5]), - dict( - type='SeqCropLikeStark', - crop_size_factor=[2, 2, 5], - output_size=[128, 128, 320]), - dict(type='SeqBrightnessAug', jitter_range=0.2), - dict( - type='SeqRandomFlip', - share_params=False, - flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqNormalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='CheckPadMaskValidity', stride=16), - dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'padding_mask'], - meta_keys=('valid')), - dict(type='ConcatSameTypeFrames', num_key_frames=2), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') -] - -# dataset settings -data = dict( - train=dict(dataset_cfgs=[ - dict( - type='GOT10kDataset', - ann_file=data_root + 'got10k/annotations/got10k_train_infos.txt', - img_prefix=data_root + 'got10k', - pipeline=train_pipeline, - split='train', - test_mode=False) - ])) - -# learning policy -lr_config = dict(policy='step', step=[40]) -# checkpoint saving -checkpoint_config = dict(interval=10) -evaluation = dict(interval=100, start=51) -# yapf:enable -# runtime settings -total_epochs = 50 -load_from = 'logs/stark_st1_got10k_online/epoch_500.pth' diff --git a/configs/sot/stark/stark_st2_r50_50e_lasot.py b/configs/sot/stark/stark_st2_r50_50e_lasot.py deleted file mode 100644 index 1636dffb8..000000000 --- a/configs/sot/stark/stark_st2_r50_50e_lasot.py +++ /dev/null @@ -1,93 +0,0 @@ -_base_ = ['./stark_st2_r50_50e_got10k.py'] - -# model setting -model = dict(test_cfg=dict(update_intervals=[300])) - -data_root = 'data/' -train_pipeline = [ - dict( - type='TridentSampling', - num_search_frames=1, - num_template_frames=2, - max_frame_range=[200], - cls_pos_prob=0.5, - train_cls_head=True), - dict(type='LoadMultiImagesFromFile', to_float32=True), - dict(type='SeqLoadAnnotations', with_bbox=True, with_label=True), - dict(type='SeqGrayAug', prob=0.05), - dict( - type='SeqRandomFlip', - share_params=True, - 
flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqBboxJitter', - center_jitter_factor=[0, 0, 4.5], - scale_jitter_factor=[0, 0, 0.5], - crop_size_factor=[2, 2, 5]), - dict( - type='SeqCropLikeStark', - crop_size_factor=[2, 2, 5], - output_size=[128, 128, 320]), - dict(type='SeqBrightnessAug', jitter_range=0.2), - dict( - type='SeqRandomFlip', - share_params=False, - flip_ratio=0.5, - direction='horizontal'), - dict( - type='SeqNormalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='CheckPadMaskValidity', stride=16), - dict( - type='VideoCollect', - keys=['img', 'gt_bboxes', 'gt_labels', 'padding_mask'], - meta_keys=('valid')), - dict(type='ConcatSameTypeFrames', num_key_frames=2), - dict(type='SeqDefaultFormatBundle', ref_prefix='search') -] - -# dataset settings -data = dict( - train=dict( - type='RandomSampleConcatDataset', - dataset_sampling_weights=[1, 1, 1, 1], - dataset_cfgs=[ - dict( - type='GOT10kDataset', - ann_file=data_root + - 'got10k/annotations/got10k_train_infos.txt', - img_prefix=data_root + 'got10k', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_train_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='TrackingNetDataset', - ann_file=data_root + - 'trackingnet/annotations/trackingnet_train_infos.txt', - img_prefix=data_root + 'trackingnet', - pipeline=train_pipeline, - split='train', - test_mode=False), - dict( - type='SOTCocoDataset', - ann_file=data_root + - 'coco/annotations/instances_train2017.json', - img_prefix=data_root + 'coco/train2017', - pipeline=train_pipeline, - split='train', - test_mode=False) - ]), - test=dict( - type='LaSOTDataset', - ann_file=data_root + 'lasot/annotations/lasot_test_infos.txt', - img_prefix=data_root + 'lasot/LaSOTBenchmark')) diff --git a/configs/sot/stark/stark_st2_r50_50e_trackingnet.py b/configs/sot/stark/stark_st2_r50_50e_trackingnet.py deleted file mode 100644 index ad0bcaa14..000000000 --- a/configs/sot/stark/stark_st2_r50_50e_trackingnet.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = ['./stark_st2_r50_50e_lasot.py'] - -# model setting -model = dict(test_cfg=dict(update_intervals=[25])) - -data_root = 'data/' -data = dict( - test=dict( - type='TrackingNetDataset', - ann_file=data_root + - 'trackingnet/annotations/trackingnet_test_infos.txt', - img_prefix=data_root + 'trackingnet')) diff --git a/configs/vid/README.md b/configs/vid/README.md deleted file mode 100644 index 9cd97d689..000000000 --- a/configs/vid/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Usage of VID configs - -## Training with VID configs - -Please refer to [Train VID models](../../docs/en/quick_run.md#examples-of-training-vid-model) to see the examples. - -## Testing with VID configs - -Please refer to [Test VID models](../../docs/en/quick_run.md#examples-of-testing-vid-model) to see the examples. - -## Inference with VID configs - -Please refer to [Inference VID models](../../docs/en/quick_run.md#inference-vid-models) to see the examples. diff --git a/configs/vid/dff/README.md b/configs/vid/dff/README.md index 51ae3164f..efddfcf15 100644 --- a/configs/vid/dff/README.md +++ b/configs/vid/dff/README.md @@ -30,8 +30,45 @@ Deep convolutional neutral networks have achieved great success on image recogni We observe around 1 mAP fluctuations in performance, and provide the best model. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | -| :----: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :--------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| DFF | R-50-DC5 | pytorch | 7e | 2.50 | 44.0 | 70.3 | [config](dff_faster_rcnn_r50_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250.log.json) | -| DFF | R-101-DC5 | pytorch | 7e | 3.25 | 39.8 | 73.5 | [config](dff_faster_rcnn_r101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid/dff_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172720-ad732e17.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid/dff_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172720.log.json) | -| DFF | X-101-DC5 | pytorch | 7e | 4.95 | - | 75.5 | [config](dff_faster_rcnn_x101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid/dff_faster_rcnn_x101_dc5_1x_imagenetvid_20210819_095932-0a9e6cb5.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid/dff_faster_rcnn_x101_dc5_1x_imagenetvid_20210819_095932.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | +| :----: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DFF | R-50-DC5 | pytorch | 7e | 2.50 | 44.0 | 70.3 | [config](dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250.log.json) | +| DFF | R-101-DC5 | pytorch | 7e | 3.25 | 39.8 | 73.5 | [config](dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid/dff_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172720-ad732e17.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid/dff_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172720.log.json) | +| DFF | X-101-DC5 | pytorch | 7e | 4.95 | - | 75.5 | [config](dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py) | 
[model](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid/dff_faster_rcnn_x101_dc5_1x_imagenetvid_20210819_095932-0a9e6cb5.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid/dff_faster_rcnn_x101_dc5_1x_imagenetvid_20210819_095932.log.json) | + +## Get started + +### 1. Training + +Because hyperparameters such as the learning rate in the default configuration file are tuned for 8 GPUs, we recommend training with 8 GPUs in order to reproduce the reported accuracy. You can use the following command to start the training. + +```shell +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs +./tools/dist_train.sh \ + configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +```shell +# The number after the config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 \ + --checkpoint ./checkpoints/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth +``` + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a video. + +```shell +python demo/demo_vid.py \ + configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py \ + --checkpoint ./checkpoints/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth \ + --input demo/demo.mp4 \ + --output vid.mp4 +``` + +If you want to know about more detailed usage of `demo_vid.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid.py b/configs/vid/dff/dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py similarity index 74% rename from configs/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid.py rename to configs/vid/dff/dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py index c603ed414..b7d60e609 100644 --- a/configs/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid.py +++ b/configs/vid/dff/dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./dff_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py b/configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py new file mode 100644 index 000000000..63ceff1b2 --- /dev/null +++ b/configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py @@ -0,0 +1,51 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50-dc5.py', + '../../_base_/datasets/imagenet_vid_dff_style.py', + '../../_base_/default_runtime.py' +] +model = dict( + type='DFF', + detector=dict( + train_cfg=dict( + rpn_proposal=dict(max_per_img=1000), + rcnn=dict(sampler=dict(num=512)))), + motion=dict( + type='FlowNetSimple', + img_scale_factor=0.5, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/pretrained_weights/flownet_simple.pth' # noqa: E501 + )), + train_cfg=None, + test_cfg=dict(key_frame_interval=10)) + +# training schedule +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=7, val_interval=7) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, +
by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=7, + by_epoch=True, + milestones=[2, 5], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +visualizer = dict(type='DetLocalVisualizer') diff --git a/configs/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid.py b/configs/vid/dff/dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py similarity index 81% rename from configs/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid.py rename to configs/vid/dff/dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py index a71b71080..e0f5b5f9b 100644 --- a/configs/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid.py +++ b/configs/vid/dff/dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py b/configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py deleted file mode 100644 index d32f6142e..000000000 --- a/configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py +++ /dev/null @@ -1,36 +0,0 @@ -_base_ = [ - '../../_base_/models/faster_rcnn_r50_dc5.py', - '../../_base_/datasets/imagenet_vid_dff_style.py', - '../../_base_/default_runtime.py' -] -model = dict( - type='DFF', - detector=dict( - train_cfg=dict( - rpn_proposal=dict(max_per_img=1000), - rcnn=dict(sampler=dict(num=512)))), - motion=dict( - type='FlowNetSimple', - img_scale_factor=0.5, - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/pretrained_weights/flownet_simple.pth' # noqa: E501 - )), - train_cfg=None, - test_cfg=dict(key_frame_interval=10)) - -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - step=[2, 5]) -# runtime settings -total_epochs = 7 -evaluation = dict(metric=['bbox'], interval=7) diff --git a/configs/vid/dff/metafile.yml b/configs/vid/dff/metafile.yml index a3d908e1b..d5922afab 100644 --- a/configs/vid/dff/metafile.yml +++ b/configs/vid/dff/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/abs/1611.07715 - Title: Deep Feature Flow for Video Recognition + URL: https://arxiv.org/abs/1611.07715 + Title: Deep Feature Flow for Video Recognition README: configs/vid/dff/README.md Models: - - Name: dff_faster_rcnn_r50_dc5_1x_imagenetvid + - Name: dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid In Collection: DFF - Config: configs/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid.py + Config: configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 2.50 Results: @@ -25,9 +25,9 @@ Models: box AP@0.5: 70.3 Weights: https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth - - Name: dff_faster_rcnn_r101_dc5_1x_imagenetvid + - Name: dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid In Collection: DFF - Config: configs/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid.py + Config: configs/vid/dff/dff_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory 
(GB): 3.25 Results: @@ -37,9 +37,9 @@ Models: box AP@0.5: 73.5 Weights: https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid/dff_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172720-ad732e17.pth - - Name: dff_faster_rcnn_x101_dc5_1x_imagenetvid + - Name: dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid In Collection: DFF - Config: configs/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid.py + Config: configs/vid/dff/dff_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 4.95 Results: diff --git a/configs/vid/fgfa/README.md b/configs/vid/fgfa/README.md index 701c4c388..8bd28554a 100644 --- a/configs/vid/fgfa/README.md +++ b/configs/vid/fgfa/README.md @@ -30,8 +30,45 @@ Extending state-of-the-art object detectors from image to video is challenging. We observe around 1 mAP fluctuations in performance, and provide the best model. -| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | -| :----: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :---------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| FGFA | R-50-DC5 | pytorch | 7e | 4.10 | 6.9 | 74.7 | [config](fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657-f42016f3.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657.log.json) | -| FGFA | R-101-DC5 | pytorch | 7e | 5.80 | 6.4 | 77.8 | [config](fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid_20201219_011831-9c9d8183.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid_20201219_011831.log.json) | -| FGFA | X-101-DC5 | pytorch | 7e | 9.74 | - | 79.6 | [config](fgfa_faster_rcnn_x101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid_20210818_223334-8723c594.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid_20210818_223334.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | +| :----: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :--------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| FGFA | R-50-DC5 | pytorch | 7e | 4.10 | 6.9 | 74.7 | [config](fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py) | 
[model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657-f42016f3.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657.log.json) | +| FGFA | R-101-DC5 | pytorch | 7e | 5.80 | 6.4 | 77.8 | [config](fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid_20201219_011831-9c9d8183.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid_20201219_011831.log.json) | +| FGFA | X-101-DC5 | pytorch | 7e | 9.74 | - | 79.6 | [config](fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid_20210818_223334-8723c594.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid_20210818_223334.log.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# The number after config file represents the number of GPUs used. Here we use 8 GPUs +./tools/dist_train.sh \ + configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +```shell +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 \ + --checkpoint ./checkpoints/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657-f42016f3.pth +``` + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_vid.py \ + configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py \ + --checkpoint ./checkpoints/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657-f42016f3.pth \ + --input demo/demo.mp4 \ + --output vid.mp4 +``` + +If you want to know about more detailed usage of `demo_vid.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). 
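The `8xb1` tag in the new config names reflects the assumed setup of 8 GPUs with 1 sample per GPU, which is why the README above recommends 8 GPUs to reproduce accuracy. If 8 GPUs are not available, one common adjustment (not covered by this PR) is to scale the learning rate with the total batch size. The snippet below is only a sketch under the usual linear scaling rule; the derived config file name is hypothetical.

```python
# Hypothetical derived config, e.g. fgfa_faster-rcnn_r50-dc5_4xb1-7e_imagenetvid.py
# (not part of this PR). The base config sets lr=0.01 for 8 GPUs x 1 img/GPU;
# with 4 GPUs x 1 img/GPU the linear scaling rule suggests halving it.
_base_ = ['./fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py']

# Only the learning rate is overridden; the SGD settings are inherited.
optim_wrapper = dict(optimizer=dict(lr=0.005))
```

Such a derived file can then be passed to `train.py`/`dist_train.sh` in place of the shipped config.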
diff --git a/configs/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py b/configs/vid/fgfa/fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py similarity index 74% rename from configs/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py rename to configs/vid/fgfa/fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py index 060b01478..e4a5d30a8 100644 --- a/configs/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py +++ b/configs/vid/fgfa/fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py b/configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py new file mode 100644 index 000000000..fb1160672 --- /dev/null +++ b/configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py @@ -0,0 +1,48 @@ +_base_ = [ + '../../_base_/models/faster-rcnn_r50-dc5.py', + '../../_base_/datasets/imagenet_vid_fgfa_style.py', + '../../_base_/default_runtime.py' +] +model = dict( + type='FGFA', + motion=dict( + type='FlowNetSimple', + img_scale_factor=0.5, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/pretrained_weights/flownet_simple.pth' # noqa: E501 + )), + aggregator=dict( + type='EmbedAggregator', num_convs=1, channels=512, kernel_size=3), +) + +# training schedule +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=7, val_interval=7) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=7, + by_epoch=True, + milestones=[2, 5], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +visualizer = dict(type='DetLocalVisualizer') diff --git a/configs/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid.py b/configs/vid/fgfa/fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py similarity index 81% rename from configs/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid.py rename to configs/vid/fgfa/fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py index 011a82403..1ca5de0bb 100644 --- a/configs/vid/dff/dff_faster_rcnn_x101_dc5_1x_imagenetvid.py +++ b/configs/vid/fgfa/fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./dff_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py b/configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py deleted file mode 100644 index 0fcdac968..000000000 --- a/configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py +++ /dev/null @@ -1,32 +0,0 @@ -_base_ = [ - '../../_base_/models/faster_rcnn_r50_dc5.py', - '../../_base_/datasets/imagenet_vid_fgfa_style.py', - '../../_base_/default_runtime.py' -] -model = dict( - type='FGFA', - motion=dict( - type='FlowNetSimple', - img_scale_factor=0.5, - init_cfg=dict( - type='Pretrained', - checkpoint= # noqa: E251 - 'https://download.openmmlab.com/mmtracking/pretrained_weights/flownet_simple.pth' # noqa: E501 - )), - aggregator=dict( - type='EmbedAggregator', num_convs=1, channels=512, kernel_size=3)) - -# 
optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - step=[2, 5]) -# runtime settings -total_epochs = 7 -evaluation = dict(metric=['bbox'], interval=7) diff --git a/configs/vid/fgfa/metafile.yml b/configs/vid/fgfa/metafile.yml index 28995319f..90812efdd 100644 --- a/configs/vid/fgfa/metafile.yml +++ b/configs/vid/fgfa/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/abs/1703.10025 - Title: Flow-Guided Feature Aggregation for Video Object Detection + URL: https://arxiv.org/abs/1703.10025 + Title: Flow-Guided Feature Aggregation for Video Object Detection README: configs/vid/fgfa/README.md Models: - - Name: fgfa_faster_rcnn_r50_dc5_1x_imagenetvid + - Name: fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid In Collection: FGFA - Config: configs/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid.py + Config: configs/vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 4.10 Results: @@ -25,9 +25,9 @@ Models: box AP@0.5: 74.7 Weights: https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid/fgfa_faster_rcnn_r50_dc5_1x_imagenetvid_20201228_022657-f42016f3.pth - - Name: fgfa_faster_rcnn_r101_dc5_1x_imagenetvid + - Name: fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid In Collection: FGFA - Config: configs/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py + Config: configs/vid/fgfa/fgfa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 5.80 Results: @@ -37,9 +37,9 @@ Models: box AP@0.5: 77.8 Weights: https://download.openmmlab.com/mmtracking/vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid_20201219_011831-9c9d8183.pth - - Name: fgfa_faster_rcnn_x101_dc5_1x_imagenetvid + - Name: fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid In Collection: FGFA - Config: configs/vid/fgfa/fgfa_faster_rcnn_x101_dc5_1x_imagenetvid.py + Config: configs/vid/fgfa/fgfa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 9.74 Results: diff --git a/configs/vid/selsa/README.md b/configs/vid/selsa/README.md index c095d6b41..2247e058d 100644 --- a/configs/vid/selsa/README.md +++ b/configs/vid/selsa/README.md @@ -30,13 +30,50 @@ Video objection detection (VID) has been a rising research direction in recent y We observe around 1 mAP fluctuations in performance, and provide the best model. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | -| :---------------: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :--------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| SELSA | R-50-DC5 | pytorch | 7e | 3.49 | 7.5 | 78.4 | [config](selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835.log.json) | -| SELSA | R-101-DC5 | pytorch | 7e | 5.18 | 7.2 | 81.5 | [config](selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724-aa961bcc.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724.log.json) | -| SELSA | X-101-DC5 | pytorch | 7e | 9.15 | - | 83.1 | [config](selsa_faster_rcnn_x101_dc5_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid/selsa_faster_rcnn_x101_dc5_1x_imagenetvid_20210825_205641-10252965.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid/selsa_faster_rcnn_x101_dc5_1x_imagenetvid_20210825_205641.log.json) | -| SELSA
(FP16) | R-50-DC5 | pytorch | 7e | 2.71 | - | 78.7 | [config](selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/fp16/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid_20210728_193846-dce6eb09.pth) \| [log](https://download.openmmlab.com/mmtracking/fp16/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid_20210728_193846.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | +| :---------------: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SELSA | R-50-DC5 | pytorch | 7e | 3.49 | 7.5 | 78.4 | [config](selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835.log.json) | +| SELSA | R-101-DC5 | pytorch | 7e | 5.18 | 7.2 | 81.5 | [config](selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724-aa961bcc.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724.log.json) | +| SELSA | X-101-DC5 | pytorch | 7e | 9.15 | - | 83.1 | [config](selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid/selsa_faster_rcnn_x101_dc5_1x_imagenetvid_20210825_205641-10252965.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid/selsa_faster_rcnn_x101_dc5_1x_imagenetvid_20210825_205641.log.json) | +| SELSA
(FP16) | R-50-DC5 | pytorch | 7e | 2.71 | - | 78.7 | [config](selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/fp16/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid_20210728_193846-dce6eb09.pth) \| [log](https://download.openmmlab.com/mmtracking/fp16/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid_20210728_193846.log.json) | Note: - `FP16` means Mixed Precision (FP16) is adopted in training. + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# The number after config file represents the number of GPUs used. Here we use 8 GPUs +./tools/dist_train.sh \ + configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +```shell +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 \ + --checkpoint ./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth +``` + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_vid.py \ + configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py \ + --checkpoint ./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth \ + --input demo/demo.mp4 \ + --output vid.mp4 +``` + +If you want to know about more detailed usage of `demo_vid.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). 
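At test time, the SELSA configs added by this PR aggregate features from 14 reference frames per key frame with an adaptive stride (`num_ref_imgs=14`, `frame_range=[-7, 7]`). If that does not fit in GPU memory during testing, one possible adjustment, sketched below and not part of this PR, is a derived config that samples fewer reference frames; this may cost some box AP@50.

```python
# Hypothetical derived config (not added by this PR): fewer reference frames
# at test time to reduce memory, at a possible cost in box AP@50.
_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py']

val_dataloader = dict(
    dataset=dict(
        ref_img_sampler=dict(
            num_ref_imgs=6,        # base config uses 14
            frame_range=[-3, 3],   # base config uses [-7, 7]
            method='test_with_adaptive_stride')))
test_dataloader = val_dataloader
```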
diff --git a/configs/vid/selsa/metafile.yml b/configs/vid/selsa/metafile.yml index f8ac3d23d..25083256e 100644 --- a/configs/vid/selsa/metafile.yml +++ b/configs/vid/selsa/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/abs/1907.06390 - Title: Sequence Level Semantics Aggregation for Video Object Detection + URL: https://arxiv.org/abs/1907.06390 + Title: Sequence Level Semantics Aggregation for Video Object Detection README: configs/vid/selsa/README.md Models: - - Name: selsa_faster_rcnn_r50_dc5_1x_imagenetvid + - Name: selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid In Collection: SELSA - Config: configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py + Config: configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 3.49 Results: @@ -25,9 +25,9 @@ Models: box AP@0.5: 78.4 Weights: https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth - - Name: selsa_faster_rcnn_r101_dc5_1x_imagenetvid + - Name: selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid In Collection: SELSA - Config: configs/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py + Config: configs/vid/selsa/selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 5.18 Results: @@ -37,9 +37,9 @@ Models: box AP@0.5: 81.5 Weights: https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724-aa961bcc.pth - - Name: selsa_faster_rcnn_x101_dc5_1x_imagenetvid + - Name: selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid In Collection: SELSA - Config: configs/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid.py + Config: configs/vid/selsa/selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 9.15 Results: @@ -49,9 +49,9 @@ Models: box AP@0.5: 83.1 Weights: https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid/selsa_faster_rcnn_x101_dc5_1x_imagenetvid_20210825_205641-10252965.pth - - Name: selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid + - Name: selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid In Collection: SELSA - Config: configs/vid/selsa/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid.py + Config: configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid.py Metadata: Training Data: ImageNet DET, ImageNet VID Training Memory (GB): 2.71 diff --git a/configs/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py b/configs/vid/selsa/selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py similarity index 73% rename from configs/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py rename to configs/vid/selsa/selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py index 1ef18eead..555cdcc77 100644 --- a/configs/vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py +++ b/configs/vid/selsa/selsa_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py b/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py new file mode 100644 index 000000000..7b0c6a6cf --- /dev/null +++ b/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py @@ -0,0 +1,63 @@ +_base_ = [ + 
'../../_base_/models/faster-rcnn_r50-dc5.py', + '../../_base_/datasets/imagenet_vid_fgfa_style.py', + '../../_base_/default_runtime.py' +] +model = dict( + type='SELSA', + detector=dict( + roi_head=dict( + type='mmtrack.SelsaRoIHead', + bbox_head=dict( + type='mmtrack.SelsaBBoxHead', + num_shared_fcs=2, + aggregator=dict( + type='mmtrack.SelsaAggregator', + in_channels=1024, + num_attention_blocks=16)), + bbox_roi_extractor=dict( + type='mmtrack.SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=512, + featmap_strides=[16])))) + +# dataset settings +val_dataloader = dict( + dataset=dict( + ref_img_sampler=dict( + _delete_=True, + num_ref_imgs=14, + frame_range=[-7, 7], + method='test_with_adaptive_stride'))) +test_dataloader = val_dataloader + +# training schedule +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=7, val_interval=7) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=7, + by_epoch=True, + milestones=[2, 5], + gamma=0.1) +] + +visualizer = dict(type='DetLocalVisualizer') diff --git a/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid.py b/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid.py new file mode 100644 index 000000000..0ff7bceac --- /dev/null +++ b/configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-amp-7e_imagenetvid.py @@ -0,0 +1,2 @@ +_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] +fp16 = dict(loss_scale=512.) 
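The AMP variant above enables mixed precision through the `fp16 = dict(loss_scale=512.)` field. For comparison, mixed precision can also be expressed through MMEngine's `AmpOptimWrapper`. The following is only a sketch of such an override, assuming the default runner; it is not the configuration added by this PR.

```python
# Illustration only (not the config added by this PR): AMP expressed through
# MMEngine's optimizer wrapper instead of the top-level `fp16` field.
_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py']

optim_wrapper = dict(
    type='AmpOptimWrapper',   # mixed-precision wrapper from MMEngine
    loss_scale=512.0,         # matches the loss scale used above
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
    clip_grad=dict(max_norm=35, norm_type=2))
```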
diff --git a/configs/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid.py b/configs/vid/selsa/selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py similarity index 81% rename from configs/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid.py rename to configs/vid/selsa/selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py index 11d9c8f83..3cdc0aeda 100644 --- a/configs/vid/selsa/selsa_faster_rcnn_x101_dc5_1x_imagenetvid.py +++ b/configs/vid/selsa/selsa_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py'] +_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py b/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py deleted file mode 100644 index 7995ad8bf..000000000 --- a/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py +++ /dev/null @@ -1,47 +0,0 @@ -_base_ = [ - '../../_base_/models/faster_rcnn_r50_dc5.py', - '../../_base_/datasets/imagenet_vid_fgfa_style.py', - '../../_base_/default_runtime.py' -] -model = dict( - type='SELSA', - detector=dict( - roi_head=dict( - type='SelsaRoIHead', - bbox_head=dict( - type='SelsaBBoxHead', - num_shared_fcs=2, - aggregator=dict( - type='SelsaAggregator', - in_channels=1024, - num_attention_blocks=16))))) - -# dataset settings -data = dict( - val=dict( - ref_img_sampler=dict( - _delete_=True, - num_ref_imgs=14, - frame_range=[-7, 7], - method='test_with_adaptive_stride')), - test=dict( - ref_img_sampler=dict( - _delete_=True, - num_ref_imgs=14, - frame_range=[-7, 7], - method='test_with_adaptive_stride'))) - -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - step=[2, 5]) -# runtime settings -total_epochs = 7 -evaluation = dict(metric=['bbox'], interval=7) diff --git a/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid.py b/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid.py deleted file mode 100644 index 0da26b8f3..000000000 --- a/configs/vid/selsa/selsa_faster_rcnn_r50_dc5_fp16_1x_imagenetvid.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = ['./selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py'] -fp16 = dict(loss_scale=512.) diff --git a/configs/vid/temporal_roi_align/README.md b/configs/vid/temporal_roi_align/README.md index cfb3af80b..6ed83d310 100644 --- a/configs/vid/temporal_roi_align/README.md +++ b/configs/vid/temporal_roi_align/README.md @@ -34,8 +34,47 @@ We observed that the performance of this method has a fluctuation of about 0.5 m Note that the numbers of selsa modules in this method and `SELSA` are 3 and 2 respectively. This is because another selsa modules improve this method by 0.2 points but degrade `SELSA` by 0.5 points. We choose the best settings for the two methods for a fair comparison. 
-| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | -| :----------------: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :--------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Temporal RoI Align | R-50-DC5 | pytorch | 7e | 4.14 | - | 79.8 | [config](selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714-939fd657.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714.log.json) | -| Temporal RoI Align | R-101-DC5 | pytorch | 7e | 5.83 | - | 82.6 | [config](selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid_20210822_111621-22cb96b9.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid_20210822_111621.log.json) | -| Temporal RoI Align | X-101-DC5 | pytorch | 7e | 9.74 | - | 84.1 | [config](selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid_20210822_164036-4471ac42.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid_20210822_164036.log.json) | +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP@50 | Config | Download | +| :----------------: | :-------: | :-----: | :-----: | :------: | :------------: | :-------: | :-------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Temporal RoI Align | R-50-DC5 | pytorch | 7e | 4.14 | - | 79.8 | [config](selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714-939fd657.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714.log.json) | +| 
Temporal RoI Align | R-101-DC5 | pytorch | 7e | 5.83 | - | 82.6 | [config](selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid_20210822_111621-22cb96b9.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid_20210822_111621.log.json) | +| Temporal RoI Align | X-101-DC5 | pytorch | 7e | 9.74 | - | 84.1 | [config](selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py) | [model](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid_20210822_164036-4471ac42.pth) \| [log](https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid_20210822_164036.log.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell script +# The number after config file represents the number of GPUs used. Here we use 8 GPUs +./tools/dist_train.sh \ + configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +```shell script +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_test.sh \ + configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py 8 \ + --checkpoint ./checkpoints/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714-939fd657.pth +``` + +### 3. Inference + +Use a single GPU to predict a video and save it as a video. + +```shell script +python demo/demo_vid.py \ + configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py \ + --checkpoint ./checkpoints/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714-939fd657.pth \ + --input demo/demo.mp4 \ + --output vid.mp4 +``` + +If you want to know about more detailed usage of `demo_vid.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). 
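The note earlier in this README about the number of selsa modules (3 here versus 2 in plain `SELSA`) shows up in the configs as the `num_shared_fcs` field of `SelsaBBoxHead`: the Temporal RoI Align config added by this PR uses 3, while the plain SELSA config uses 2. As an illustration only, a hypothetical derived config that switches this model back to two aggregation steps could look like the sketch below.

```python
# Hypothetical derived config (not part of this PR): use 2 selsa aggregation
# steps instead of the 3 used by the Temporal RoI Align config in this PR.
_base_ = ['./selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py']

model = dict(
    detector=dict(
        roi_head=dict(
            bbox_head=dict(num_shared_fcs=2))))
```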
diff --git a/configs/vid/temporal_roi_align/metafile.yml b/configs/vid/temporal_roi_align/metafile.yml index 67b0339f7..9ec87264c 100644 --- a/configs/vid/temporal_roi_align/metafile.yml +++ b/configs/vid/temporal_roi_align/metafile.yml @@ -8,14 +8,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://ojs.aaai.org/index.php/AAAI/article/view/16234 - Title: Temporal ROI Align for Video Object Recognition + URL: https://ojs.aaai.org/index.php/AAAI/article/view/16234 + Title: Temporal ROI Align for Video Object Recognition README: configs/vid/temporal_roi_align/README.md Models: - - Name: selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid + - Name: selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid In Collection: SELSA-TemporalRoIAlign - Config: configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py + Config: configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 4.14 Results: @@ -25,9 +25,9 @@ Models: box AP@0.5: 79.8 Weights: https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid_20210820_162714-939fd657.pth - - Name: selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid + - Name: selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid In Collection: SELSA-TemporalRoIAlign - Config: configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py + Config: configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 5.83 Results: @@ -37,9 +37,9 @@ Models: box AP@0.5: 82.6 Weights: https://download.openmmlab.com/mmtracking/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid_20210822_111621-22cb96b9.pth - - Name: selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid + - Name: selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid In Collection: SELSA-TemporalRoIAlign - Config: configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid.py + Config: configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py Metadata: Training Memory (GB): 9.74 Results: diff --git a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py similarity index 71% rename from configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py rename to configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py index 20f7a35d4..fbbabd29f 100644 --- a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py +++ b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py'] +_base_ = ['./selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py new file mode 100644 index 000000000..d07125cce --- /dev/null +++ b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py @@ -0,0 +1,65 @@ +_base_ = [ + 
'../../_base_/models/faster-rcnn_r50-dc5.py', + '../../_base_/datasets/imagenet_vid_fgfa_style.py', + '../../_base_/default_runtime.py' +] +model = dict( + type='SELSA', + detector=dict( + roi_head=dict( + type='mmtrack.SelsaRoIHead', + bbox_roi_extractor=dict( + type='mmtrack.TemporalRoIAlign', + num_most_similar_points=2, + num_temporal_attention_blocks=4, + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=512, + featmap_strides=[16]), + bbox_head=dict( + type='mmtrack.SelsaBBoxHead', + num_shared_fcs=3, + aggregator=dict( + type='mmtrack.SelsaAggregator', + in_channels=1024, + num_attention_blocks=16))))) + +# dataset settings +val_dataloader = dict( + dataset=dict( + ref_img_sampler=dict( + _delete_=True, + num_ref_imgs=14, + frame_range=[-7, 7], + method='test_with_adaptive_stride'))) +test_dataloader = val_dataloader + +# training schedule +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=7, val_interval=7) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=7, + by_epoch=True, + milestones=[2, 5], + gamma=0.1) +] + +visualizer = dict(type='DetLocalVisualizer') diff --git a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid.py b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py similarity index 79% rename from configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid.py rename to configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py index 2d54cc454..6e704d5d0 100644 --- a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_x101_dc5_7e_imagenetvid.py +++ b/configs/vid/temporal_roi_align/selsa-troialign_faster-rcnn_x101-dc5_8xb1-7e_imagenetvid.py @@ -1,4 +1,4 @@ -_base_ = ['./selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py'] +_base_ = ['./selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py b/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py deleted file mode 100644 index 1a32c60d5..000000000 --- a/configs/vid/temporal_roi_align/selsa_troialign_faster_rcnn_r50_dc5_7e_imagenetvid.py +++ /dev/null @@ -1,55 +0,0 @@ -_base_ = [ - '../../_base_/models/faster_rcnn_r50_dc5.py', - '../../_base_/datasets/imagenet_vid_fgfa_style.py', - '../../_base_/default_runtime.py' -] -model = dict( - type='SELSA', - detector=dict( - roi_head=dict( - type='SelsaRoIHead', - bbox_roi_extractor=dict( - type='TemporalRoIAlign', - num_most_similar_points=2, - num_temporal_attention_blocks=4, - roi_layer=dict( - type='RoIAlign', output_size=7, sampling_ratio=2), - out_channels=512, - featmap_strides=[16]), - bbox_head=dict( - type='SelsaBBoxHead', - num_shared_fcs=3, - aggregator=dict( - type='SelsaAggregator', - in_channels=1024, - num_attention_blocks=16))))) - -# dataset settings -data = dict( - val=dict( - ref_img_sampler=dict( - _delete_=True, - num_ref_imgs=14, - frame_range=[-7, 7], - method='test_with_adaptive_stride')), - test=dict( - ref_img_sampler=dict( - _delete_=True, 
- num_ref_imgs=14, - frame_range=[-7, 7], - method='test_with_adaptive_stride'))) - -# optimizer -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) -# learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - step=[2, 5]) -# runtime settings -total_epochs = 7 -evaluation = dict(metric=['bbox'], interval=7) diff --git a/configs/vis/README.md b/configs/vis/README.md deleted file mode 100644 index 033bacb0b..000000000 --- a/configs/vis/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Usage of VIS configs - -## Training with VIS configs - -Please refer to [Train VIS models](../../docs/en/quick_run.md#examples-of-training-vis-model) to see the examples. - -## Testing with VIS configs - -Please refer to [Test VIS models](../../docs/en/quick_run.md#examples-of-testing-vis-model) to see the examples. - -## Inference with VIS configs - -Please refer to [Inference VIS models](../../docs/en/quick_run.md#inference-motvis-models) to see the examples. diff --git a/configs/vis/mask2former/README.md b/configs/vis/mask2former/README.md new file mode 100644 index 000000000..8155cd835 --- /dev/null +++ b/configs/vis/mask2former/README.md @@ -0,0 +1,78 @@ +# Mask2Former for Video Instance Segmentation + +## Abstract + + + +We find Mask2Former also achieves state-of-the-art performance on video instance segmentation without modifying the architecture, the loss or even the training pipeline. In this report, we show universal image segmentation architectures trivially generalize to video segmentation by directly predicting 3D segmentation volumes. Specifically, Mask2Former sets a new state-of-the-art of 60.4 AP on YouTubeVIS-2019 and 52.6 AP on YouTubeVIS-2021. We believe Mask2Former is also capable of handling video semantic and panoptic segmentation, given its versatility in image segmentation. We hope this will make state-of-theart video segmentation research more accessible and bring more attention to designing universal image and video segmentation architectures. + + + +
+ +## Citation + + + +```latex +@inproceedings{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={CVPR}, + year={2022} +} +``` + +## Results and models of Mask2Former on YouTube-VIS 2021 validation dataset + +Note: Codalab has closed the evaluation portal of `YouTube-VIS 2019`, so we do not provide the results of `YouTube-VIS 2019` at present. If you want to evaluate the results of `YouTube-VIS 2021`, at present, you can submit the result to the evaluation portal of `YouTube-VIS 2022`. The value of `AP_S` is the result of `YouTube-VIS 2021`. + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :----------------------: | :------: | :-----: | :-----: | :------: | :------------: | :--: | :---------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask2Former | R-50 | pytorch | 8e | 6.0 | - | 41.2 | [config](mask2former_r50_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021_20220818_164043-1cab1219.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021_20220818_164043.json) | +| Mask2Former | R-101 | pytorch | 8e | 7.5 | - | 42.3 | [config](mask2former_r101_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747-b7a7d7cc.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747.json) | +| Mask2Former(200 queries) | Swin-L | pytorch | 8e | 18.5 | - | 52.3 | [config](mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752-c04b720e.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# Training Mask2Former on YouTube-VIS-2019 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +If you want to get the results of the [YouTube-VOS](https://youtube-vos.org/dataset/vis/) val/test set, please use the following command to generate result files that can be used for submission. 
It will be stored in `./youtube_vis_results.submission_file.zip`, you can modify the saved path in `test_evaluator` of the config. + +```shell +# The number after config file represents the number of GPUs used. +./tools/dist_test.sh \ + configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py 8 \ + --checkpoint ./checkpoints/xxx +``` + +If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/demo_mot_vis.py \ + configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py \ + --checkpoint ./checkpoints/xxx \ + --input demo/demo.mp4 \ + --output vis.mp4 +``` + +If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2019.py b/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2019.py new file mode 100644 index 000000000..04e7460e8 --- /dev/null +++ b/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2019.py @@ -0,0 +1,12 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'mask2former/mask2former_r101_lsj_8x2_50e_coco/' + 'mask2former_r101_lsj_8x2_50e_coco_20220426_100250-c50b6fa6.pth')) diff --git a/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021.py b/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021.py new file mode 100644 index 000000000..c890adbae --- /dev/null +++ b/configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021.py @@ -0,0 +1,12 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2021.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'mask2former/mask2former_r101_lsj_8x2_50e_coco/' + 'mask2former_r101_lsj_8x2_50e_coco_20220426_100250-c50b6fa6.pth')) diff --git a/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py b/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py new file mode 100644 index 000000000..c503f828b --- /dev/null +++ b/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py @@ -0,0 +1,227 @@ +_base_ = [ + '../../_base_/datasets/youtube_vis.py', '../../_base_/default_runtime.py' +] + +num_classes = 40 +num_frames = 2 +model = dict( + type='Mask2Former', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + _scope_='mmdet', + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + track_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_frames=num_frames, + 
num_transformer_feat_level=3, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=128, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding3D', num_feats=128, normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v2.0/' + 'mask2former/mask2former_r50_lsj_8x2_50e_coco/' + 'mask2former_r50_lsj_8x2_50e_coco_20220506_191028-8e96e88b.pth')) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +# learning policy +max_iters = 6000 +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_iters, + by_epoch=False, + milestones=[ + 4000, + ], + gamma=0.1) +# runtime settings +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=6001) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + 
checkpoint=dict( + type='CheckpointHook', by_epoch=False, save_last=True, interval=2000)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) + +train_pipeline = [ + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile'), + dict( + type='LoadTrackAnnotations', + with_instance_id=True, + with_mask=True, + with_bbox=True), + dict(type='mmdet.Resize', scale=(640, 360), keep_ratio=True), + dict(type='mmdet.RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs', num_key_frames=num_frames) +] +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + dataset=dict( + pipeline=train_pipeline, + ref_img_sampler=dict( + num_ref_imgs=1, + frame_range=5, + filter_key_img=True, + method='uniform'))) +val_dataloader = dict( + num_workers=2, + sampler=dict(type='VideoSampler'), + batch_sampler=dict(type='EntireVideoBatchSampler'), +) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='YouTubeVISMetric', + metric='youtube_vis_ap', + outfile_prefix='./youtube_vis_results', + format_only=True) +test_evaluator = val_evaluator diff --git a/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021.py b/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021.py new file mode 100644 index 000000000..462c34295 --- /dev/null +++ b/configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021.py @@ -0,0 +1,32 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py' + +dataset_type = 'YouTubeVISDataset' +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] # 2019 or 2021 + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader +# learning policy +max_iters = 8000 +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_iters, + by_epoch=False, + milestones=[ + 5500, + ], + gamma=0.1) +# runtime settings +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=8001) diff --git a/configs/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py b/configs/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py new file mode 100644 index 000000000..d4c70ba56 --- /dev/null +++ b/configs/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py @@ -0,0 +1,64 @@ +_base_ = ['./mask2former_r50_8xb2-8e_youtubevis2021.py'] +depths = [2, 2, 18, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='mmdet.SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=depths, + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=None), + track_head=dict( + type='Mask2FormerHead', + in_channels=[192, 384, 768, 1536], + num_queries=200), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask2former/' + 'mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic/' + 'mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic_' + 
'20220407_104949-d4919c44.pth')) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/vis/mask2former/metafile.yml b/configs/vis/mask2former/metafile.yml new file mode 100644 index 000000000..026201428 --- /dev/null +++ b/configs/vis/mask2former/metafile.yml @@ -0,0 +1,53 @@ +Collections: + - Name: Mask2Former + Metadata: + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - Mask2Former + Paper: + URL: https://arxiv.org/pdf/2112.10764.pdf + Title: Mask2Former for Video Instance Segmentation + README: configs/vis/mask2former/README.md + +Models: + - Name: mask2former_r50_8xb2-8e_youtubevis2021 + In Collection: Mask2Former + Config: configs/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 6.0 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 41.2 + Weights: https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2021_20220818_164043-1cab1219.pth + + - Name: mask2former_r101_8xb2-8e_youtubevis2021 + In Collection: Mask2Former + Config: configs/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 7.5 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 42.3 + Weights: https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747-b7a7d7cc.pth + + - Name: mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py + In Collection: Mask2Former + Config: configs/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 18.5 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 52.3 + Weights: https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752-c04b720e.pth diff --git a/configs/vis/masktrack_rcnn/README.md b/configs/vis/masktrack_rcnn/README.md index 82d8c80ff..de30ce201 100644 --- a/configs/vis/masktrack_rcnn/README.md +++ b/configs/vis/masktrack_rcnn/README.md @@ -31,18 +31,60 @@ In this paper we present a new computer vision task, named video instance segmen As mentioned in [Issues #6](https://github.com/youtubevos/MaskTrackRCNN/issues/6#issuecomment-502503505) in MaskTrack R-CNN, 
the result is kind of unstable for different trials, which ranges from 28 AP to 31 AP when using R-50-FPN as backbone. The checkpoint provided below is the best one from two experiments. -| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | -| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | :-----------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 30.2 | [config](masktrack_rcnn_r50_fpn_12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830.log.json) | -| MaskTrack R-CNN | Mask R-CNN | R-101-FPN | pytorch | 12e | 2.27 | - | 32.2 | [config](masktrack_rcnn_r101_fpn_12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038.log.json) | -| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 34.7 | [config](masktrack_rcnn_x101_fpn_12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205.log.json) | +| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | :--------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 30.2 | [config](masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830.log.json) | +| MaskTrack R-CNN | Mask R-CNN | 
R-101-FPN | pytorch | 12e | 2.27 | - | 32.2 | [config](masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038.log.json) | +| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 34.7 | [config](masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205.log.json) | ## Results and models of MaskTrack R-CNN on YouTube-VIS 2021 validation dataset The checkpoint provided below is the best one from two experiments. -| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | -| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | :-----------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 28.7 | [config](masktrack_rcnn_r50_fpn_12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948.log.json) | -| MaskTrack R-CNN | Mask R-CNN | R-101-FPN | pytorch | 12e | 2.27 | - | 31.3 | [config](masktrack_rcnn_r101_fpn_12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509.log.json) | -| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 33.5 | [config](masktrack_rcnn_x101_fpn_12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943-90831df4.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943.log.json) | +| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | 
:--------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 28.7 | [config](masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948.log.json) | +| MaskTrack R-CNN | Mask R-CNN | R-101-FPN | pytorch | 12e | 2.27 | - | 31.3 | [config](masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509.log.json) | +| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 33.5 | [config](masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943-90831df4.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943.log.json) | + +## Get started + +### 1. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# Training MaskTrack R-CNN on YouTube-VIS-2019 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +./tools/dist_train.sh \ + configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 2. Testing and evaluation + +If you want to get the results of the [YouTube-VOS](https://youtube-vos.org/dataset/vis/) val/test set, please use the following command to generate result files that can be used for submission. It will be stored in `./youtube_vis_results.submission_file.zip`, you can modify the saved path in `test_evaluator` of the config. + +```shell +# The number after config file represents the number of GPUs used. 
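+# If only one GPU is available, a non-distributed run is sketched below
+# (commented out); it assumes `tools/test.py` accepts the same config and
+# `--checkpoint` arguments that `dist_test.sh` forwards to it.
+# python tools/test.py \
+#     configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py \
+#     --checkpoint ./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth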
+./tools/dist_test.sh \ + configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py 8 \ + --checkpoint ./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth +``` + +If you want to know about more detailed usage of `test.py/dist_test.sh/slurm_test.sh`, please refer to this [document](../../../docs/en/user_guides/4_train_test.md). + +### 3. Inference + +Use a single GPU to run inference on a video and save the result as a new video. + +```shell +python demo/demo_mot_vis.py \ + configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py \ + --checkpoint ./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth \ + --input demo/demo.mp4 \ + --output vis.mp4 +``` + +If you want to know about more detailed usage of `demo_mot_vis.py`, please refer to this [document](../../../docs/en/user_guides/3_inference.md). diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py similarity index 85% rename from configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019.py rename to configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py index 96d0b1a87..4be492d54 100644 --- a/configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019.py +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py @@ -1,4 +1,4 @@ -_base_ = ['./masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'] +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py similarity index 53% rename from configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021.py rename to configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py index 1ea3b4fd2..81bae4af8 100644 --- a/configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021.py +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py @@ -1,4 +1,4 @@ -_base_ = ['./masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'] +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] model = dict( detector=dict( backbone=dict( @@ -13,16 +13,16 @@ data_root = 'data/youtube_vis_2021/' dataset_version = data_root[-5:-1] -data = dict( - train=dict( - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_train.json', - img_prefix=data_root + 'train/JPEGImages'), - val=dict( + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages'), - test=dict( + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages')) + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py similarity index 67% rename from 
configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py rename to configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py index e46c83ec4..bedef077f 100644 --- a/configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py @@ -1,5 +1,5 @@ _base_ = [ - '../../_base_/models/mask_rcnn_r50_fpn.py', + '../../_base_/models/mask-rcnn_r50_fpn.py', '../../_base_/datasets/youtube_vis.py', '../../_base_/default_runtime.py' ] model = dict( @@ -22,6 +22,7 @@ type='RoITrackHead', roi_extractor=dict( type='SingleRoIExtractor', + _scope_='mmdet', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), @@ -34,6 +35,7 @@ train_cfg=dict( assigner=dict( type='MaxIoUAssigner', + _scope_='mmdet', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, @@ -41,6 +43,7 @@ ignore_iof_thr=-1), sampler=dict( type='RandomSampler', + _scope_='mmdet', num=128, pos_fraction=0.25, neg_pos_ub=-1, @@ -53,16 +56,36 @@ num_frames_retain=20)) # optimizer -optimizer = dict(type='SGD', lr=0.00125, momentum=0.9, weight_decay=0.0001) -optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.00125, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + # learning policy -lr_config = dict( - policy='step', - warmup='linear', - warmup_iters=500, - warmup_ratio=1.0 / 3, - step=[8, 11]) +param_scheduler = [ + dict( + type='mmdet.LinearLR', + start_factor=1.0 / 3.0, + by_epoch=False, + begin=0, + end=500), + dict( + type='mmdet.MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] # runtime settings -total_epochs = 12 -evaluation = dict(metric=['track_segm'], interval=13) +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=13) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# evaluator +val_evaluator = dict( + type='YouTubeVISMetric', + metric='youtube_vis_ap', + outfile_prefix='./youtube_vis_results', + format_only=True) +test_evaluator = val_evaluator diff --git a/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py new file mode 100644 index 000000000..47263d509 --- /dev/null +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py @@ -0,0 +1,17 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] + +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py similarity index 88% rename from configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019.py rename to configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py index a42ad68d2..e7e3f11e1 100644 --- 
a/configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019.py +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py @@ -1,4 +1,4 @@ -_base_ = ['./masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'] +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] model = dict( detector=dict( backbone=dict( diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021.py b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py similarity index 57% rename from configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021.py rename to configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py index 95f899da8..ea4c8b924 100644 --- a/configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021.py +++ b/configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py @@ -1,4 +1,4 @@ -_base_ = ['./masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'] +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] model = dict( detector=dict( backbone=dict( @@ -17,16 +17,16 @@ data_root = 'data/youtube_vis_2021/' dataset_version = data_root[-5:-1] -data = dict( - train=dict( - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_train.json', - img_prefix=data_root + 'train/JPEGImages'), - val=dict( + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages'), - test=dict( + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages')) + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021.py b/configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021.py deleted file mode 100644 index ceeebc471..000000000 --- a/configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021.py +++ /dev/null @@ -1,17 +0,0 @@ -_base_ = ['./masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'] - -data_root = 'data/youtube_vis_2021/' -dataset_version = data_root[-5:-1] -data = dict( - train=dict( - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_train.json', - img_prefix=data_root + 'train/JPEGImages'), - val=dict( - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages'), - test=dict( - dataset_version=dataset_version, - ann_file=data_root + 'annotations/youtube_vis_2021_valid.json', - img_prefix=data_root + 'valid/JPEGImages')) diff --git a/configs/vis/masktrack_rcnn/metafile.yml b/configs/vis/masktrack_rcnn/metafile.yml index 52b6af968..882d8b46b 100644 --- a/configs/vis/masktrack_rcnn/metafile.yml +++ b/configs/vis/masktrack_rcnn/metafile.yml @@ -7,14 +7,14 @@ Collections: Architecture: - ResNet Paper: - URL: https://arxiv.org/pdf/1905.04804.pdf - Title: Video Instance Segmentation + URL: https://arxiv.org/pdf/1905.04804.pdf + Title: Video Instance Segmentation README: configs/vis/masktrack_rcnn/README.md Models: - - Name: masktrack_rcnn_r50_fpn_12e_youtubevis2019 + - Name: 
masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py Metadata: Training Data: YouTube-VIS 2019 Training Memory (GB): 1.16 @@ -25,9 +25,9 @@ Models: AP: 30.2 Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth - - Name: masktrack_rcnn_r101_fpn_12e_youtubevis2019 + - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py Metadata: Training Data: YouTube-VIS 2019 Training Memory (GB): 2.27 @@ -38,9 +38,9 @@ Models: AP: 32.2 Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth - - Name: masktrack_rcnn_x101_fpn_12e_youtubevis2019 + - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py Metadata: Training Data: YouTube-VIS 2019 Training Memory (GB): 3.69 @@ -51,9 +51,9 @@ Models: AP: 34.7 Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth - - Name: masktrack_rcnn_r50_fpn_12e_youtubevis2021 + - Name: masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py Metadata: Training Data: YouTube-VIS 2021 Training Memory (GB): 1.16 @@ -64,9 +64,9 @@ Models: AP: 28.7 Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth - - Name: masktrack_rcnn_r101_fpn_12e_youtubevis2021 + - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py Metadata: Training Data: YouTube-VIS 2021 Training Memory (GB): 2.27 @@ -77,9 +77,9 @@ Models: AP: 31.3 Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth - - Name: masktrack_rcnn_x101_fpn_12e_youtubevis2021 + - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021 In Collection: MaskTrack R-CNN - Config: configs/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021.py + Config: configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py Metadata: Training Data: YouTube-VIS 2021 Training Memory (GB): 3.69 diff --git a/demo/MMTracking_Tutorial.ipynb b/demo/MMTracking_Tutorial.ipynb index e63d31e7b..596e2c7e8 100644 --- 
a/demo/MMTracking_Tutorial.ipynb +++ b/demo/MMTracking_Tutorial.ipynb @@ -39,14 +39,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "f8ced8f4-b07b-4216-8953-f7af6928b77c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f8ced8f4-b07b-4216-8953-f7af6928b77c", - "outputId": "6b5af5d7-5b52-4804-aae5-6edebf920816" + "outputId": "b078d371-b978-4c10-fc9b-d4b45e6c5a40" }, "outputs": [ { @@ -75,135 +75,195 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "6b4f093f-e197-42bd-ba64-dc905e379382", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6b4f093f-e197-42bd-ba64-dc905e379382", - "outputId": "cb4c26d9-d955-4a13-e789-814dbbd8847e" + "outputId": "b82ac7ec-2f1c-4d53-95e3-3b4d365a9ab0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Looking in links: https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html\n", - "Collecting mmcv-full\n", - " Downloading https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/mmcv_full-1.4.8-cp37-cp37m-manylinux1_x86_64.whl (45.7 MB)\n", - "\u001b[K |████████████████████████████████| 45.7 MB 19 kB/s \n", - "\u001b[?25hCollecting addict\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n", + "Collecting torch==1.10.0+cu111\n", + " Downloading https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (2137.6 MB)\n", + "\u001b[K |████████████▌ | 834.1 MB 1.3 MB/s eta 0:16:48tcmalloc: large alloc 1147494400 bytes == 0x394ce000 @ 0x7fd48a2f2615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7\n", + "\u001b[K |███████████████▉ | 1055.7 MB 1.2 MB/s eta 0:14:29tcmalloc: large alloc 1434370048 bytes == 0x7db24000 @ 0x7fd48a2f2615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7\n", + "\u001b[K |████████████████████ | 1336.2 MB 1.2 MB/s eta 0:11:02tcmalloc: large alloc 1792966656 bytes == 0x2956000 @ 0x7fd48a2f2615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7\n", + "\u001b[K |█████████████████████████▎ | 1691.1 MB 1.2 MB/s eta 0:06:08tcmalloc: large alloc 2241208320 bytes == 0x6d73e000 @ 0x7fd48a2f2615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7\n", + "\u001b[K |████████████████████████████████| 2137.6 MB 1.3 MB/s eta 0:00:01tcmalloc: large alloc 2137645056 bytes == 0xf30a0000 @ 0x7fd48a2f11e7 0x4a3940 0x4a39cc 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 
0x511e2c 0x593dd7 0x511e2c 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x548ae9\n", + "tcmalloc: large alloc 2672058368 bytes == 0x1e6bf6000 @ 0x7fd48a2f2615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x549576 0x593fce 0x511e2c 0x593dd7 0x511e2c 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576\n", + "\u001b[K |████████████████████████████████| 2137.6 MB 399 bytes/s \n", + "\u001b[?25hCollecting torchvision==0.11.0+cu111\n", + " Downloading https://download.pytorch.org/whl/cu111/torchvision-0.11.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (21.9 MB)\n", + "\u001b[K |████████████████████████████████| 21.9 MB 556 kB/s \n", + "\u001b[?25hCollecting torchaudio==0.10.0\n", + " Downloading https://download.pytorch.org/whl/rocm4.1/torchaudio-0.10.0%2Brocm4.1-cp37-cp37m-linux_x86_64.whl (2.7 MB)\n", + "\u001b[K |████████████████████████████████| 2.7 MB 59.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.10.0+cu111) (4.1.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torchvision==0.11.0+cu111) (1.21.6)\n", + "Requirement already satisfied: pillow!=8.3.0,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision==0.11.0+cu111) (7.1.2)\n", + "Installing collected packages: torch, torchvision, torchaudio\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.12.1+cu113\n", + " Uninstalling torch-1.12.1+cu113:\n", + " Successfully uninstalled torch-1.12.1+cu113\n", + " Attempting uninstall: torchvision\n", + " Found existing installation: torchvision 0.13.1+cu113\n", + " Uninstalling torchvision-0.13.1+cu113:\n", + " Successfully uninstalled torchvision-0.13.1+cu113\n", + " Attempting uninstall: torchaudio\n", + " Found existing installation: torchaudio 0.12.1+cu113\n", + " Uninstalling torchaudio-0.12.1+cu113:\n", + " Successfully uninstalled torchaudio-0.12.1+cu113\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torchtext 0.13.1 requires torch==1.12.1, but you have torch 1.10.0+cu111 which is incompatible.\u001b[0m\n", + "Successfully installed torch-1.10.0+cu111 torchaudio-0.10.0+rocm4.1 torchvision-0.11.0+cu111\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting mmengine\n", + " Downloading mmengine-0.1.0-py3-none-any.whl (280 kB)\n", + "\u001b[K |████████████████████████████████| 280 kB 9.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.7/dist-packages (from mmengine) (4.6.0.66)\n", + "Collecting addict\n", " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", - "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.7/dist-packages (from mmcv-full) (4.1.2.30)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from mmcv-full) (3.13)\n", - "Requirement already satisfied: Pillow in /usr/local/lib/python3.7/dist-packages (from mmcv-full) (7.1.2)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from mmcv-full) (21.3)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmcv-full) (1.21.6)\n", "Collecting yapf\n", " Downloading yapf-0.32.0-py2.py3-none-any.whl (190 kB)\n", - "\u001b[K |████████████████████████████████| 190 kB 7.2 MB/s \n", - "\u001b[?25hRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->mmcv-full) (3.0.8)\n", - "Installing collected packages: yapf, addict, mmcv-full\n", - "Successfully installed addict-2.4.0 mmcv-full-1.4.8 yapf-0.32.0\n", - "Collecting mmdet\n", - " Downloading mmdet-2.23.0-py3-none-any.whl (1.4 MB)\n", - "\u001b[K |████████████████████████████████| 1.4 MB 9.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: pycocotools in /usr/local/lib/python3.7/dist-packages (from mmdet) (2.0.4)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmdet) (1.21.6)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmdet) (3.2.2)\n", + "\u001b[K |████████████████████████████████| 190 kB 61.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmengine) (3.2.2)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.7/dist-packages (from mmengine) (1.1.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from mmengine) (6.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmengine) (1.21.6)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine) (0.11.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine) (1.4.4)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine) (2.8.2)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmengine) (4.1.1)\n", + "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->mmengine) (1.15.0)\n", + "Installing collected packages: yapf, addict, mmengine\n", + "Successfully installed addict-2.4.0 mmengine-0.1.0 yapf-0.32.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Looking in links: https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html\n", + "Collecting mmcv>=2.0.0rc1\n", + " Downloading https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/mmcv-2.0.0rc1-cp37-cp37m-manylinux1_x86_64.whl (47.5 MB)\n", + "\u001b[K |████████████████████████████████| 47.5 MB 173 kB/s \n", + "\u001b[?25hRequirement already satisfied: mmengine in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (0.1.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (21.3)\n", + "Requirement already satisfied: addict in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (2.4.0)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (7.1.2)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (6.0)\n", + "Requirement already satisfied: opencv-python>=3 in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (4.6.0.66)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (1.21.6)\n", + "Requirement already satisfied: yapf in /usr/local/lib/python3.7/dist-packages (from mmcv>=2.0.0rc1) (0.32.0)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.7/dist-packages (from mmengine->mmcv>=2.0.0rc1) (1.1.0)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmengine->mmcv>=2.0.0rc1) (3.2.2)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine->mmcv>=2.0.0rc1) (1.4.4)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine->mmcv>=2.0.0rc1) (2.8.2)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine->mmcv>=2.0.0rc1) (3.0.9)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmengine->mmcv>=2.0.0rc1) (0.11.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmengine->mmcv>=2.0.0rc1) (4.1.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->mmengine->mmcv>=2.0.0rc1) (1.15.0)\n", + "Installing collected packages: mmcv\n", + "Successfully installed mmcv-2.0.0rc1\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting mmdet>=3.0.0rc0\n", + " Downloading mmdet-3.0.0rc0-py3-none-any.whl (1.5 MB)\n", + "\u001b[K |████████████████████████████████| 1.5 MB 8.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from mmdet>=3.0.0rc0) (1.15.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmdet>=3.0.0rc0) (1.21.6)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.7/dist-packages (from 
mmdet>=3.0.0rc0) (2.0.4)\n", "Collecting terminaltables\n", " Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from mmdet) (1.15.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet) (0.11.0)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet) (2.8.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet) (1.4.2)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet) (3.0.8)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmdet) (4.1.1)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmdet>=3.0.0rc0) (3.2.2)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet>=3.0.0rc0) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet>=3.0.0rc0) (2.8.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet>=3.0.0rc0) (0.11.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmdet>=3.0.0rc0) (1.4.4)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmdet>=3.0.0rc0) (4.1.1)\n", "Installing collected packages: terminaltables, mmdet\n", - "Successfully installed mmdet-2.23.0 terminaltables-3.1.10\n", + "Successfully installed mmdet-3.0.0rc0 terminaltables-3.1.10\n", "Cloning into 'mmtracking'...\n", - "remote: Enumerating objects: 4189, done.\u001b[K\n", - "remote: Counting objects: 100% (4189/4189), done.\u001b[K\n", - "remote: Compressing objects: 100% (1638/1638), done.\u001b[K\n", - "remote: Total 4189 (delta 2502), reused 3981 (delta 2433), pack-reused 0\u001b[K\n", - "Receiving objects: 100% (4189/4189), 1.67 MiB | 17.14 MiB/s, done.\n", - "Resolving deltas: 100% (2502/2502), done.\n", + "remote: Enumerating objects: 8692, done.\u001b[K\n", + "remote: Counting objects: 100% (1182/1182), done.\u001b[K\n", + "remote: Compressing objects: 100% (553/553), done.\u001b[K\n", + "remote: Total 8692 (delta 682), reused 1044 (delta 611), pack-reused 7510\u001b[K\n", + "Receiving objects: 100% (8692/8692), 3.10 MiB | 14.54 MiB/s, done.\n", + "Resolving deltas: 100% (5522/5522), done.\n", "/content/mmtracking\n", - "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from -r requirements/build.txt (line 1)) (0.29.28)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from -r requirements/build.txt (line 1)) (0.29.32)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from -r requirements/build.txt (line 2)) (1.21.6)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Obtaining file:///content/mmtracking\n", - "Collecting attributee==0.1.5\n", - " 
Downloading attributee-0.1.5.tar.gz (11 kB)\n", - "Collecting dotty_dict\n", - " Downloading dotty_dict-1.3.0.tar.gz (32 kB)\n", + "Collecting attributee\n", + " Downloading attributee-0.1.7.tar.gz (11 kB)\n", "Collecting lap\n", " Downloading lap-0.4.0.tar.gz (1.5 MB)\n", - "\u001b[K |████████████████████████████████| 1.5 MB 12.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (3.2.2)\n", - "Collecting mmcls>=0.16.0\n", - " Downloading mmcls-0.22.1-py2.py3-none-any.whl (548 kB)\n", - "\u001b[K |████████████████████████████████| 548 kB 56.2 MB/s \n", + "\u001b[K |████████████████████████████████| 1.5 MB 9.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (3.2.2)\n", + "Collecting mmcls>=1.0.0rc0\n", + " Downloading mmcls-1.0.0rc0-py2.py3-none-any.whl (557 kB)\n", + "\u001b[K |████████████████████████████████| 557 kB 62.5 MB/s \n", "\u001b[?25hCollecting motmetrics\n", " Downloading motmetrics-1.2.5-py3-none-any.whl (161 kB)\n", - "\u001b[K |████████████████████████████████| 161 kB 73.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: opencv-python in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (4.1.2.30)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (21.3)\n", - "Requirement already satisfied: pandas<=1.3.5 in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (1.3.5)\n", - "Collecting pycocotools<=2.0.2\n", - " Downloading pycocotools-2.0.2.tar.gz (23 kB)\n", - "Requirement already satisfied: scipy<=1.7.3 in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (1.4.1)\n", - "Requirement already satisfied: seaborn in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (0.11.2)\n", - "Requirement already satisfied: terminaltables in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (3.1.10)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from mmtrack==0.12.0) (4.64.0)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmcls>=0.16.0->mmtrack==0.12.0) (1.21.6)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas<=1.3.5->mmtrack==0.12.0) (2022.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<=1.3.5->mmtrack==0.12.0) (2.8.2)\n", - "Requirement already satisfied: setuptools>=18.0 in /usr/local/lib/python3.7/dist-packages (from pycocotools<=2.0.2->mmtrack==0.12.0) (57.4.0)\n", - "Requirement already satisfied: cython>=0.27.3 in /usr/local/lib/python3.7/dist-packages (from pycocotools<=2.0.2->mmtrack==0.12.0) (0.29.28)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==0.12.0) (3.0.8)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==0.12.0) (1.4.2)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==0.12.0) (0.11.0)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmtrack==0.12.0) (4.1.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from 
python-dateutil>=2.7.3->pandas<=1.3.5->mmtrack==0.12.0) (1.15.0)\n", - "Collecting setuptools_scm\n", - " Downloading setuptools_scm-6.4.2-py3-none-any.whl (37 kB)\n", + "\u001b[K |████████████████████████████████| 161 kB 69.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (21.3)\n", + "Requirement already satisfied: pandas<=1.3.5 in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (1.3.5)\n", + "Requirement already satisfied: pycocotools in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (2.0.4)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (1.0.2)\n", + "Requirement already satisfied: scipy<=1.7.3 in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (1.7.3)\n", + "Requirement already satisfied: seaborn in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (0.11.2)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (0.8.10)\n", + "Requirement already satisfied: terminaltables in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (3.1.10)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from mmtrack==1.0.0rc0) (4.64.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mmcls>=1.0.0rc0->mmtrack==1.0.0rc0) (1.21.6)\n", + "Collecting rich\n", + " Downloading rich-12.5.1-py3-none-any.whl (235 kB)\n", + "\u001b[K |████████████████████████████████| 235 kB 68.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas<=1.3.5->mmtrack==1.0.0rc0) (2022.2.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<=1.3.5->mmtrack==1.0.0rc0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<=1.3.5->mmtrack==1.0.0rc0) (1.15.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==1.0.0rc0) (3.0.9)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==1.0.0rc0) (1.4.4)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mmtrack==1.0.0rc0) (0.11.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mmtrack==1.0.0rc0) (4.1.1)\n", "Collecting xmltodict>=0.12.0\n", - " Downloading xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)\n", - "Requirement already satisfied: tomli>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from setuptools_scm->dotty_dict->mmtrack==0.12.0) (2.0.1)\n", - "Building wheels for collected packages: attributee, pycocotools, dotty-dict, lap\n", + " Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)\n", + "Collecting commonmark<0.10.0,>=0.9.0\n", + " Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n", + "\u001b[K |████████████████████████████████| 51 kB 8.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: pygments<3.0.0,>=2.6.0 in /usr/local/lib/python3.7/dist-packages (from rich->mmcls>=1.0.0rc0->mmtrack==1.0.0rc0) (2.6.1)\n", + "Requirement already satisfied: joblib>=0.11 in 
/usr/local/lib/python3.7/dist-packages (from scikit-learn->mmtrack==1.0.0rc0) (1.1.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->mmtrack==1.0.0rc0) (3.1.0)\n", + "Building wheels for collected packages: attributee, lap\n", " Building wheel for attributee (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for attributee: filename=attributee-0.1.5-py3-none-any.whl size=12076 sha256=2cf5dd822113ae59026c7ddfd3c9cf740aea07ab346a901ed205f39bb2531db7\n", - " Stored in directory: /root/.cache/pip/wheels/0f/12/3a/b7e98eb4e3d373862bf9f160f77171b72a3825c4867064d8b2\n", - " Building wheel for pycocotools (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for pycocotools: filename=pycocotools-2.0.2-cp37-cp37m-linux_x86_64.whl size=264436 sha256=1ff4f755882b4265eb3e43b54da649ee84b0ea0df416777b0776d0c3c68955e2\n", - " Stored in directory: /root/.cache/pip/wheels/bc/cf/1b/e95c99c5f9d1648be3f500ca55e7ce55f24818b0f48336adaf\n", - " Building wheel for dotty-dict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for dotty-dict: filename=dotty_dict-1.3.0-py3-none-any.whl size=7682 sha256=833334eb4b45b1732a113221af97e0a343d2fbd4d08601f07943726d4e855d12\n", - " Stored in directory: /root/.cache/pip/wheels/2d/13/9a/3c1bbc95fedfbcb79816e33fd6439fdccf111aeea1b9be6640\n", + " Created wheel for attributee: filename=attributee-0.1.7-py3-none-any.whl size=12696 sha256=73367345c942b1997e2a4c1def6389b0cadb090b21ab6015830b699e7fc69dfd\n", + " Stored in directory: /root/.cache/pip/wheels/28/2d/85/6a50232dcc3c9814e3bd623402757e1759e57eb99aed930729\n", " Building wheel for lap (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for lap: filename=lap-0.4.0-cp37-cp37m-linux_x86_64.whl size=1590213 sha256=81c1fbc8b3a1dcacedae4e2c1f8d9921cca478fb0c7e0c047f9b3ac5b9cda70a\n", + " Created wheel for lap: filename=lap-0.4.0-cp37-cp37m-linux_x86_64.whl size=1590201 sha256=cc56f5c066424629f72b58945914bbe5918fe55f9938f7e1a85c43d2c66c6c8e\n", " Stored in directory: /root/.cache/pip/wheels/b1/0b/e3/ef9daf1b5547b56389e42c80c3100f1e6479bf5fd00fd9d6ba\n", - "Successfully built attributee pycocotools dotty-dict lap\n", - "Installing collected packages: xmltodict, setuptools-scm, pycocotools, motmetrics, mmcls, lap, dotty-dict, attributee, mmtrack\n", - " Attempting uninstall: pycocotools\n", - " Found existing installation: pycocotools 2.0.4\n", - " Uninstalling pycocotools-2.0.4:\n", - " Successfully uninstalled pycocotools-2.0.4\n", + "Successfully built attributee lap\n", + "Installing collected packages: commonmark, xmltodict, rich, motmetrics, mmcls, lap, attributee, mmtrack\n", " Running setup.py develop for mmtrack\n", - "Successfully installed attributee-0.1.5 dotty-dict-1.3.0 lap-0.4.0 mmcls-0.22.1 mmtrack-0.12.0 motmetrics-1.2.5 pycocotools-2.0.2 setuptools-scm-6.4.2 xmltodict-0.12.0\n", + "Successfully installed attributee-0.1.7 commonmark-0.9.1 lap-0.4.0 mmcls-1.0.0rc0 mmtrack-1.0.0rc0 motmetrics-1.2.5 rich-12.5.1 xmltodict-0.13.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting git+https://github.com/JonathonLuiten/TrackEval.git\n", - " Cloning https://github.com/JonathonLuiten/TrackEval.git to /tmp/pip-req-build-dqaie2ts\n", - " Running command git clone -q https://github.com/JonathonLuiten/TrackEval.git /tmp/pip-req-build-dqaie2ts\n", + " Cloning https://github.com/JonathonLuiten/TrackEval.git to /tmp/pip-req-build-2qhoo76v\n", 
+ " Running command git clone -q https://github.com/JonathonLuiten/TrackEval.git /tmp/pip-req-build-2qhoo76v\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from trackeval==1.0.dev1) (1.21.6)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from trackeval==1.0.dev1) (1.4.1)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from trackeval==1.0.dev1) (1.7.3)\n", "Building wheels for collected packages: trackeval\n", " Building wheel for trackeval (PEP 517) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for trackeval: filename=trackeval-1.0.dev1-py3-none-any.whl size=121499 sha256=1a9a22d286fa771f961b78dd17820ee9fd9d0eb527b0d6e84e5944616ea2feff\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-qv3rte23/wheels/f3/ca/38/409a5a8b4faf77d7e99a90462e20a4723c5b0f20fa12364aa7\n", + " Created wheel for trackeval: filename=trackeval-1.0.dev1-py3-none-any.whl size=121499 sha256=2eb58220d115d6d402ec919174b2a665dbcb05063c08547b32d3e4e7523bede6\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-jvadplo0/wheels/f3/ca/38/409a5a8b4faf77d7e99a90462e20a4723c5b0f20fa12364aa7\n", "Successfully built trackeval\n", "Installing collected packages: trackeval\n", "Successfully installed trackeval-1.0.dev1\n" @@ -211,14 +271,20 @@ } ], "source": [ + "# install pytorch\n", + "!pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + "\n", + "# install MMEngine\n", + "!pip install mmengine\n", + "\n", "# install MMCV\n", - "!pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html\n", + "!pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html\n", "\n", "# install MMDetection\n", - "!pip install mmdet\n", + "!pip install 'mmdet>=3.0.0rc0'\n", "\n", "# clone the MMTracking repository\n", - "!git clone https://github.com/open-mmlab/mmtracking.git\n", + "!git clone -b 1.x https://github.com/open-mmlab/mmtracking.git\n", "%cd mmtracking\n", "\n", "# install MMTracking and its dependencies\n", @@ -230,33 +296,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "03a4a583-78e7-40a1-a6ef-d80056989546", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "03a4a583-78e7-40a1-a6ef-d80056989546", - "outputId": "e183515f-9615-41e2-b405-70f4c0d8465d" + "outputId": "01ed3453-9d8b-4d91-e641-bdaf12259194" }, "outputs": [ { "data": { "text/plain": [ - "{'CUDA available': True,\n", - " 'CUDA_HOME': '/usr/local/cuda',\n", - " 'GCC': 'gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0',\n", - " 'GPU 0': 'Tesla T4',\n", - " 'MMCV': '1.4.8',\n", - " 'MMCV CUDA Compiler': '11.1',\n", - " 'MMCV Compiler': 'GCC 7.3',\n", - " 'NVCC': 'Build cuda_11.1.TC455_06.29190527_0',\n", - " 'OpenCV': '4.1.2',\n", - " 'PyTorch': '1.10.0+cu111',\n", - " 'PyTorch compiling details': 'PyTorch built with:\\n - GCC 7.3\\n - C++ Version: 201402\\n - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\\n - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\\n - LAPACK is enabled (usually provided by MKL)\\n - NNPACK is enabled\\n - CPU capability usage: AVX2\\n - CUDA Runtime 11.1\\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\\n - CuDNN 8.0.5\\n - Magma 2.5.2\\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \\n',\n", - " 'Python': '3.7.13 (default, Mar 16 2022, 17:37:17) [GCC 7.5.0]',\n", - " 'TorchVision': '0.11.1+cu111',\n", - " 'sys.platform': 'linux'}" + "OrderedDict([('sys.platform', 'linux'),\n", + " ('Python', '3.7.13 (default, Apr 24 2022, 01:04:09) [GCC 7.5.0]'),\n", + " ('CUDA available', True),\n", + " ('numpy_random_seed', 2147483648),\n", + " ('GPU 0', 'Tesla T4'),\n", + " ('CUDA_HOME', '/usr/local/cuda'),\n", + " ('NVCC', 'Cuda compilation tools, release 11.1, V11.1.105'),\n", + " ('GCC',\n", + " 'x86_64-linux-gnu-gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0'),\n", + " ('PyTorch', '1.10.0+cu111'),\n", + " ('PyTorch compiling details',\n", + " 'PyTorch built with:\\n - GCC 7.3\\n - C++ Version: 201402\\n - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\\n - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\\n - LAPACK is enabled (usually provided by MKL)\\n - NNPACK is enabled\\n - CPU capability usage: AVX2\\n - CUDA Runtime 11.1\\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\\n - CuDNN 8.0.5\\n - Magma 2.5.2\\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \\n'),\n", + " ('TorchVision', '0.11.0+cu111'),\n", + " ('OpenCV', '4.6.0'),\n", + " ('MMEngine', '0.1.0')])" ] }, "execution_count": 3, @@ -265,20 +332,20 @@ } ], "source": [ - "from mmcv import collect_env\n", + "from mmengine.utils.dl_utils import collect_env\n", "collect_env()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "ff6aea79-2ce9-4b1c-b3c4-3f92d1a4e34c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ff6aea79-2ce9-4b1c-b3c4-3f92d1a4e34c", - "outputId": "1494d3f0-ebba-4aa4-e1dc-575615fb3794" + "outputId": "b7bced1c-800f-4314-9f2b-ad6c82ee1b47" }, "outputs": [ { @@ -288,8 +355,8 @@ "1.10.0+cu111 True\n", "11.1\n", "GCC 7.3\n", - "2.23.0\n", - "0.12.0\n" + "3.0.0rc0\n", + "1.0.0rc0\n" ] } ], @@ -325,52 +392,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "dd7c8466-f057-455f-985a-71e5f22c36e4", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dd7c8466-f057-455f-985a-71e5f22c36e4", - "outputId": "a9866284-1762-4852-b5f7-194737289ed8" + "outputId": "3160be6e-2a84-49c5-f5e0-3dab58a607a5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-04-20 04:30:20-- https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth\n", - "Resolving download.openmmlab.com (download.openmmlab.com)... 47.88.36.72\n", - "Connecting to download.openmmlab.com (download.openmmlab.com)|47.88.36.72|:443... connected.\n", + "--2022-09-06 08:05:44-- https://download.openmmlab.com/mmtracking/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 
47.89.140.71\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|47.89.140.71|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 282801031 (270M) [application/octet-stream]\n", "Saving to: ‘./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth’\n", "\n", - "selsa_faster_rcnn_r 100%[===================>] 269.70M 8.66MB/s in 30s \n", + "selsa_faster_rcnn_r 100%[===================>] 269.70M 7.75MB/s in 31s \n", "\n", - "2022-04-20 04:30:51 (9.01 MB/s) - ‘./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth’ saved [282801031/282801031]\n", + "2022-09-06 08:06:16 (8.70 MB/s) - ‘./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth’ saved [282801031/282801031]\n", "\n", - "--2022-04-20 04:30:51-- https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth\n", - "Resolving download.openmmlab.com (download.openmmlab.com)... 47.88.36.72\n", - "Connecting to download.openmmlab.com (download.openmmlab.com)|47.88.36.72|:443... connected.\n", + "--2022-09-06 08:06:16-- https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_lasot/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 47.89.140.71\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|47.89.140.71|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 216134418 (206M) [application/octet-stream]\n", "Saving to: ‘./checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth’\n", "\n", - "siamese_rpn_r50_1x_ 100%[===================>] 206.12M 11.0MB/s in 21s \n", + "siamese_rpn_r50_1x_ 100%[===================>] 206.12M 7.60MB/s in 24s \n", "\n", - "2022-04-20 04:31:12 (9.96 MB/s) - ‘./checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth’ saved [216134418/216134418]\n", + "2022-09-06 08:06:41 (8.60 MB/s) - ‘./checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth’ saved [216134418/216134418]\n", "\n", - "--2022-04-20 04:31:12-- https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth\n", - "Resolving download.openmmlab.com (download.openmmlab.com)... 47.88.36.72\n", - "Connecting to download.openmmlab.com (download.openmmlab.com)|47.88.36.72|:443... connected.\n", + "--2022-09-06 08:06:41-- https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 47.89.140.71\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|47.89.140.71|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 232596799 (222M) [application/octet-stream]\n", "Saving to: ‘./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth’\n", "\n", - "masktrack_rcnn_r50_ 100%[===================>] 221.82M 8.28MB/s in 27s \n", + "masktrack_rcnn_r50_ 100%[===================>] 221.82M 9.09MB/s in 25s \n", "\n", - "2022-04-20 04:31:40 (8.13 MB/s) - ‘./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth’ saved [232596799/232596799]\n", + "2022-09-06 08:07:07 (8.92 MB/s) - ‘./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth’ saved [232596799/232596799]\n", "\n" ] } @@ -389,132 +456,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "420dae4b-4426-405e-97fb-7823943b8ee8", "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 418, - "referenced_widgets": [ - "c2cdc25a1b9644b0ac9a45a5e5609314", - "7aacc564ceed400582af233b42822f7d", - "89b71747f37a4213be4750c562a7ccd9", - "7da6f24364594f56a573d31665913645", - "95eaa4b589df4f4da736e0c104759fd1", - "36dd9bf50205454493fc50a1cef64d23", - "8ea164a027b044278794e87a396a548a", - "bc127741c92f4b65ada91a4c4890ec87", - "f140214cae9c45f39c0ca28d856fe596", - "12bedcdf97a94489a2356e89d393d107", - "3d277b18e9d2486492614f9093bf19a4", - "0ce49a6424414d268a0224b7e57739b6", - "0b656d0fd04b4c05825748fdf9a34d48", - "aaa43b3573904c72af7fb76fb981bf36", - "c2d658f5adc3416e9be42fa615e2061d", - "cbddeebf0f8f464ca15e5899a8cee2a7", - "68c48555d8dd49f49e26956904746e60", - "758197b8b211425f864204242e518fed", - "6159a01b9931442c8b8968464517f4d5", - "591fb75b51d14989abfd7a5325afaff6", - "683972d6d4c4489dae2394c222f020d4", - "ab7f46e98c794292893599541f343047" - ] + "base_uri": "https://localhost:8080/" }, "id": "420dae4b-4426-405e-97fb-7823943b8ee8", - "outputId": "ce13d7f0-b5c5-4aea-af61-d3ee93291f02" + "outputId": "74b49b29-5651-4b25-e866-b755c04c4797" }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-04-20 04:49:35,918 - mmtrack - INFO - initialize FasterRCNN with init_cfg {'type': 'Pretrained', 'checkpoint': 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth'}\n", - "2022-04-20 04:49:35,920 - mmcv - INFO - load model from: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth\n", - "2022-04-20 04:49:35,922 - mmcv - INFO - load checkpoint from http path: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth\n", - "Downloading: \"https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth\" to /root/.cache/torch/hub/checkpoints/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c2cdc25a1b9644b0ac9a45a5e5609314", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0.00/158M [00:00>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 3.4 task/s, elapsed: 2s, ETA: 0s\n", + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 1.1 task/s, elapsed: 7s, ETA: 0s\n", " making the output video at ./demo/mot.mp4 with a FPS of 3.0\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 23.5 task/s, elapsed: 0s, ETA: 0s\n" + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 10.7 task/s, elapsed: 1s, ETA: 0s\n" ] } ], "source": [ "# run mot demo\n", "import mmcv\n", + "import mmengine\n", "import tempfile\n", "from mmtrack.apis 
import inference_mot, init_model\n", - "mot_config = './configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py'\n", + "from mmtrack.utils import register_all_modules\n", + "from mmtrack.registry import VISUALIZERS\n", + "\n", + "register_all_modules(init_default_scope=True)\n", + "mot_config = './configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py'\n", "input_video = './demo/demo.mp4'\n", "imgs = mmcv.VideoReader(input_video)\n", "# build the model from a config file\n", "mot_model = init_model(mot_config, device='cuda:0')\n", - "prog_bar = mmcv.ProgressBar(len(imgs))\n", + "\n", + "# build the visualizer. Different name for creating different visualizer instance\n", + "mot_model.cfg.visualizer.name = 'mot_visualizer'\n", + "visualizer = VISUALIZERS.build(mot_model.cfg.visualizer)\n", + "visualizer.dataset_meta = mot_model.dataset_meta\n", + "\n", + "prog_bar = mmengine.ProgressBar(len(imgs))\n", "out_dir = tempfile.TemporaryDirectory()\n", "out_path = out_dir.name\n", + "\n", "# test and show/save the images\n", "for i, img in enumerate(imgs):\n", " result = inference_mot(mot_model, img, frame_id=i)\n", - " mot_model.show_result(\n", - " img,\n", - " result,\n", + " visualizer.add_datasample(\n", + " 'mot',\n", + " img[..., ::-1],\n", + " data_sample=result,\n", " show=False,\n", - " wait_time=int(1000. / imgs.fps),\n", - " out_file=f'{out_path}/{i:06d}.jpg')\n", + " out_file=f'{out_path}/{i:06d}.jpg',\n", + " wait_time=float(1 / int(imgs.fps)),\n", + " step=i)\n", " prog_bar.update()\n", "\n", "output = './demo/mot.mp4'\n", @@ -525,44 +553,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "d4a97033-b779-4169-84c9-781c58840ae5", "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 386, + "height": 309, "referenced_widgets": [ - "d189fcaddeff41fcb9cd7b27969d53de", - "f507f59b242a4a5ebb204c4bb84a8241", - "939e2242b3eb4388aa4e30ebccd020cc", - "15eafa933541440d822371cc6afce196", - "4ea1a24a8ffc4269b46dc1dc7539a0f3", - "a1106afabced4d418406b7bbb4f826cc", - "9a95f05e979e4a9bbad4675883350b1d", - "2bb53ac1784d421ca2050b5aa6701a0d", - "82f81af913424d3c96ba2cb8d905258a", - "3187d7f735ab45cfad16f5bb3471dc0b", - "24da18975ff0468a9bc0c294c45b8a16" + "a3fc7d320ed3417d833db4e9db0fce5c", + "4eb851325f4b4cab82b8cbd979237520", + "9f2fb13e136642728fbc97c8d7390118", + "33267c31afd64c58b153a357aaa4569b", + "98f236bb20444b919f136766394978ff", + "ec62bc9001d349e696b8d82942e5b8c4", + "b55031500bb345b58d0676c47e3c7843", + "dd2d50cb741b4a79a13f0bf4a2b2242b", + "9c5adcbe51ef438db298bff6a09c2065", + "936d523d65254c10a9c64033283ca0f9", + "4ea9a77bcea840e686d3525d4ef86cc8" ] }, "id": "d4a97033-b779-4169-84c9-781c58840ae5", - "outputId": "a44ea1ef-bd8a-48c5-e297-97df926fb729" + "outputId": "76592f9f-93ff-4ce9-dd03-9839fca4f2f8" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "09/06 08:09:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\n", + "09/06 08:09:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - http loads checkpoint from path: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-20 04:50:33,389 - mmtrack - INFO - initialize MaskRCNN with init_cfg {'type': 
'Pretrained', 'checkpoint': 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'}\n", - "2022-04-20 04:50:33,390 - mmcv - INFO - load model from: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\n", - "2022-04-20 04:50:33,393 - mmcv - INFO - load checkpoint from http path: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\n", "Downloading: \"https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\" to /root/.cache/torch/hub/checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d189fcaddeff41fcb9cd7b27969d53de", + "model_id": "a3fc7d320ed3417d833db4e9db0fce5c", "version_major": 2, "version_minor": 0 }, @@ -574,48 +607,51 @@ "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2022-04-20 04:50:53,187 - mmcv - WARNING - The model and loaded state dict do not match exactly\n", + "09/06 08:09:25 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", "\n", "size mismatch for roi_head.bbox_head.fc_cls.weight: copying a param with shape torch.Size([81, 1024]) from checkpoint, the shape in current model is torch.Size([41, 1024]).\n", "size mismatch for roi_head.bbox_head.fc_cls.bias: copying a param with shape torch.Size([81]) from checkpoint, the shape in current model is torch.Size([41]).\n", "size mismatch for roi_head.bbox_head.fc_reg.weight: copying a param with shape torch.Size([320, 1024]) from checkpoint, the shape in current model is torch.Size([160, 1024]).\n", "size mismatch for roi_head.bbox_head.fc_reg.bias: copying a param with shape torch.Size([320]) from checkpoint, the shape in current model is torch.Size([160]).\n", "size mismatch for roi_head.mask_head.conv_logits.weight: copying a param with shape torch.Size([80, 256, 1, 1]) from checkpoint, the shape in current model is torch.Size([40, 256, 1, 1]).\n", - "size mismatch for roi_head.mask_head.conv_logits.bias: copying a param with shape torch.Size([80]) from checkpoint, the shape in current model is torch.Size([40]).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "load checkpoint from local path: ./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 3.1 task/s, elapsed: 3s, ETA: 0s\n", + "size mismatch for roi_head.mask_head.conv_logits.bias: copying a param with shape torch.Size([80]) from checkpoint, the shape in current model is torch.Size([40]).\n", + "local loads checkpoint from path: ./checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth\n", + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 1.1 task/s, elapsed: 7s, ETA: 0s\n", " making the output video at ./demo/vis.mp4 with a FPS of 3.0\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 24.1 task/s, elapsed: 0s, ETA: 0s\n" + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 10.8 task/s, elapsed: 1s, ETA: 0s\n" ] } ], "source": [ "# run vis demo\n", "from mmtrack.apis import inference_mot\n", - "vis_config = './configs/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py'\n", + "vis_config = 
'./configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'\n", "vis_checkpoint = './checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth'\n", "# build the model from a config file and a checkpoint file\n", "vis_model = init_model(vis_config, vis_checkpoint, device='cuda:0')\n", + "\n", + "# build the visualizer. Different name for creating different visualizer instance\n", + "vis_model.cfg.visualizer.name = 'vis_visualizer'\n", + "visualizer = VISUALIZERS.build(vis_model.cfg.visualizer)\n", + "visualizer.dataset_meta = vis_model.dataset_meta\n", + "\n", "imgs = mmcv.VideoReader(input_video)\n", - "prog_bar = mmcv.ProgressBar(len(imgs))\n", + "prog_bar = mmengine.ProgressBar(len(imgs))\n", "out_dir = tempfile.TemporaryDirectory()\n", "out_path = out_dir.name\n", "for i, img in enumerate(imgs):\n", " result = inference_mot(vis_model, img, frame_id=i)\n", - " vis_model.show_result(\n", - " img,\n", - " result,\n", - " wait_time=int(1000. / imgs.fps),\n", - " out_file=f'{out_path}/{i:06d}.jpg')\n", + " visualizer.add_datasample(\n", + " 'vis',\n", + " img[..., ::-1],\n", + " data_sample=result,\n", + " show=False,\n", + " out_file=f'{out_path}/{i:06d}.jpg',\n", + " wait_time=float(1 / int(imgs.fps)),\n", + " step=i)\n", " prog_bar.update()\n", "output = './demo/vis.mp4'\n", "print(f'\\n making the output video at {output} with a FPS of {imgs.fps}')\n", @@ -625,44 +661,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "abd0863b-933c-42d1-8442-70565d1b4b55", "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 406, + "height": 240, "referenced_widgets": [ - "091f72866bc64f8bba239684fe9ddeb6", - "c20f8918d08d4e07a544ac69541038fd", - "be103dd0cc5543be9dea3dd8c7890187", - "5be59a3bfc0148478a662f436e1677a8", - "4c616e4740f24416bbea27ba49709d50", - "6ad0273eb02440109ba7c2cde43739f9", - "5f08eb6751fc4438a5de45051218530f", - "57bc05e1d2b64e899fd524c13cd47f5a", - "9244d2f08f8c4e2a80e4f2f4921f1b0b", - "01fa8d3fb39b483195f7cfc293c6b3ba", - "ac746c3a37c145c0bc0d40fa7cabfd2d" + "d8911837396446c19d76bc22c5be63e2", + "4679ff15b0db45fb9931757e0328d296", + "64e629d4b1c74d2ab6c9ddc52ff4a2a9", + "e00476c38c4d473191eeed5d6c42ecce", + "a5c4355973c342acbae02ff5c32a677f", + "5d7f321bb1fe4f32b3c1733a846ef8f5", + "dc309139c6764cefa17ef542ebb36d3e", + "3a09540c2a4247debd7c35b384e7ebed", + "20b5b5e82f5548c98037ee236732ccfe", + "4ebcc9b78ac64aedb146685a2437612c", + "3d25971bbb3444b797b0faac0a934ffb" ] }, "id": "abd0863b-933c-42d1-8442-70565d1b4b55", - "outputId": "62ea5c55-2b21-43de-97a6-922afac8bbbb" + "outputId": "ba9acbe5-1488-4992-9916-36aff7bd4622" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "09/06 08:09:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: torchvision://resnet50\n", + "09/06 08:09:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - torchvision loads checkpoint from path: torchvision://resnet50\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-20 04:50:57,048 - mmtrack - INFO - initialize ResNet with init_cfg {'type': 'Pretrained', 'checkpoint': 'torchvision://resnet50'}\n", - "2022-04-20 04:50:57,049 - mmcv - INFO - load model from: torchvision://resnet50\n", - "2022-04-20 04:50:57,052 - mmcv - INFO - load checkpoint from torchvision path: torchvision://resnet50\n", "Downloading: \"https://download.pytorch.org/models/resnet50-0676ba61.pth\" to 
/root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "091f72866bc64f8bba239684fe9ddeb6", + "model_id": "d8911837396446c19d76bc22c5be63e2", "version_major": 2, "version_minor": 0 }, @@ -674,62 +715,47 @@ "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2022-04-20 04:50:57,780 - mmcv - WARNING - The model and loaded state dict do not match exactly\n", + "09/06 08:09:34 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", "\n", "unexpected key in source state_dict: fc.weight, fc.bias\n", "\n", - "2022-04-20 04:50:57,811 - mmtrack - INFO - initialize ChannelMapper with init_cfg {'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}\n", - "2022-04-20 04:50:57,875 - mmtrack - INFO - initialize RPNHead with init_cfg {'type': 'Normal', 'layer': 'Conv2d', 'std': 0.01}\n", - "2022-04-20 04:50:57,894 - mmtrack - INFO - initialize SelsaBBoxHead with init_cfg [{'type': 'Normal', 'std': 0.01, 'override': {'name': 'fc_cls'}}, {'type': 'Normal', 'std': 0.001, 'override': {'name': 'fc_reg'}}, {'type': 'Xavier', 'distribution': 'uniform', 'override': [{'name': 'shared_fcs'}, {'name': 'cls_fcs'}, {'name': 'reg_fcs'}]}]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "load checkpoint from local path: ./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth\n", - "[ ] 0/8, elapsed: 0s, ETA:" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/content/mmtracking/mmtrack/datasets/pipelines/formatting.py:138: UserWarning: The 'ConcatVideoReferences' class will be deprecated in the future, please use 'ConcatSameTypeFrames' instead\n", - " \"The 'ConcatVideoReferences' class will be deprecated in the \"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 2.5 task/s, elapsed: 3s, ETA: 0s\n", + "local loads checkpoint from path: ./checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth\n", + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 1.1 task/s, elapsed: 7s, ETA: 0s\n", " making the output video at ./demo/vid.mp4 with a FPS of 3.0\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 24.7 task/s, elapsed: 0s, ETA: 0s\n" + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 11.1 task/s, elapsed: 1s, ETA: 0s\n" ] } ], "source": [ "# run vid demo\n", "from mmtrack.apis import inference_vid\n", - "vid_config = './configs/vid/selsa/selsa_faster_rcnn_r50_dc5_1x_imagenetvid.py'\n", + "vid_config = './configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py'\n", "vid_checkpoint = './checkpoints/selsa_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_204835-2f5a4952.pth'\n", "# build the model from a config file and a checkpoint file\n", "vid_model = init_model(vid_config, vid_checkpoint, device='cuda:0')\n", + "\n", + "# build the visualizer. 
Different name for creating different visualizer instance\n", + "vid_model.cfg.visualizer.name = 'vid_visualizer'\n", + "visualizer = VISUALIZERS.build(vid_model.cfg.visualizer)\n", + "visualizer.dataset_meta = vid_model.dataset_meta\n", + "\n", "imgs = mmcv.VideoReader(input_video)\n", - "prog_bar = mmcv.ProgressBar(len(imgs))\n", + "prog_bar = mmengine.ProgressBar(len(imgs))\n", "out_dir = tempfile.TemporaryDirectory()\n", "out_path = out_dir.name\n", "for i, img in enumerate(imgs):\n", " result = inference_vid(vid_model, img, frame_id=i)\n", - " vid_model.show_result(\n", - " img,\n", - " result,\n", - " wait_time=int(1000. / imgs.fps),\n", - " out_file=f'{out_path}/{i:06d}.jpg')\n", + " visualizer.add_datasample(\n", + " 'vid',\n", + " img[..., ::-1],\n", + " data_sample=result,\n", + " show=False,\n", + " out_file=f'{out_path}/{i:06d}.jpg',\n", + " wait_time=float(1 / int(imgs.fps)),\n", + " step=i)\n", " prog_bar.update()\n", "output = './demo/vid.mp4'\n", "print(f'\\n making the output video at {output} with a FPS of {imgs.fps}')\n", @@ -739,44 +765,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "0189f86a-b216-4f63-a58a-97e40e326869", "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 227, + "height": 240, "referenced_widgets": [ - "321a9bbadb4b4167ac37f84320bdf76f", - "f69886a5f4984ee09804aaa411660955", - "fc977a68f1494ad68c203f5544ce1521", - "995bca8653914372874f8861a38f0185", - "f636c4fc5cfe4489a47a14afc88a0b9c", - "7dce393a07d14ccbb1d43554976aeb1c", - "ddfcad5fc8854b5cbe0fe8840013ffe5", - "e8b957c1eadc45ba9f7092df0d080f08", - "3590a55482b14c379f1a63433c140cfb", - "535faa89033349c08b3bd31ff5be0018", - "5c2860d7559940a7be431ab1228bbe6f" + "2db8224b7bb849c0a3c8a281bb57eb22", + "b811cd2dd72040fe86cdc5238b522d06", + "378b791183a24c558b2ce4afc1fe8a57", + "9f46a04413cf4cce9f56a710aebe1424", + "fb5e2db40e004e21a852c9f28740615c", + "40b5616f3b3741d68210206a1d852418", + "482683c5f93f41108ef0added3850519", + "3942fdab3efa47e98ef1398b27eade57", + "357be7f73b2848f89efe394758edc839", + "b704c75bb577455dbac176c728b5540b", + "e34931d882f44879aee48bfd46d62dce" ] }, "id": "0189f86a-b216-4f63-a58a-97e40e326869", - "outputId": "a8fccd93-2171-4110-d971-4b6277dfc3c2" + "outputId": "3160aaa9-20cf-45d6-e78d-ff4ffdb08bff" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "09/06 08:09:44 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model\n", + "09/06 08:09:44 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - http loads checkpoint from path: https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-20 04:51:02,961 - mmtrack - INFO - initialize SOTResNet with init_cfg {'type': 'Pretrained', 'checkpoint': 'https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model'}\n", - "2022-04-20 04:51:02,967 - mmcv - INFO - load model from: https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model\n", - "2022-04-20 04:51:02,968 - mmcv - INFO - load checkpoint from http path: https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model\n", "Downloading: \"https://download.openmmlab.com/mmtracking/pretrained_weights/sot_resnet50.model\" to /root/.cache/torch/hub/checkpoints/sot_resnet50.model\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": 
"321a9bbadb4b4167ac37f84320bdf76f", + "model_id": "2db8224b7bb849c0a3c8a281bb57eb22", "version_major": 2, "version_minor": 0 }, @@ -791,33 +822,58 @@ "name": "stdout", "output_type": "stream", "text": [ - "load checkpoint from local path: ./checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth\n", - "Warning: The model doesn't have classes\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 8.7 task/s, elapsed: 1s, ETA: 0s\n", + "local loads checkpoint from path: ./checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth\n", + "[ ] 0/8, elapsed: 0s, ETA:" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/content/mmtracking/mmtrack/apis/inference.py:90: UserWarning: dataset_meta or class names are missed, use None by default.\n", + " warnings.warn('dataset_meta or class names are missed, '\n", + "/usr/local/lib/python3.7/dist-packages/mmengine/visualization/visualizer.py:170: UserWarning: `Visualizer` backend is not initialized because save_dir is None.\n", + " warnings.warn('`Visualizer` backend is not initialized '\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 1.6 task/s, elapsed: 5s, ETA: 0s\n", " making the output video at ./demo/sot.mp4 with a FPS of 3.0\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 25.0 task/s, elapsed: 0s, ETA: 0s\n" + "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 8/8, 11.0 task/s, elapsed: 1s, ETA: 0s\n" ] } ], "source": [ "# run sot demo\n", "from mmtrack.apis import inference_sot\n", - "sot_config = './configs/sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py'\n", + "sot_config = './configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py'\n", "sot_checkpoint = './checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth'\n", "# build the model from a config file and a checkpoint file\n", "sot_model = init_model(sot_config, sot_checkpoint, device='cuda:0')\n", + "\n", + "# build the visualizer. Different name for creating different visualizer instance\n", + "sot_model.cfg.visualizer.name = 'sot_visualizer'\n", + "visualizer = VISUALIZERS.build(sot_model.cfg.visualizer)\n", + "visualizer.dataset_meta = sot_model.dataset_meta\n", + "\n", "init_bbox = [371, 411, 450, 646]\n", "imgs = mmcv.VideoReader(input_video)\n", - "prog_bar = mmcv.ProgressBar(len(imgs))\n", + "prog_bar = mmengine.ProgressBar(len(imgs))\n", "out_dir = tempfile.TemporaryDirectory()\n", "out_path = out_dir.name\n", "for i, img in enumerate(imgs):\n", " result = inference_sot(sot_model, img, init_bbox, frame_id=i)\n", - " sot_model.show_result(\n", - " img,\n", - " result,\n", - " wait_time=int(1000. 
/ imgs.fps),\n", - " out_file=f'{out_path}/{i:06d}.jpg')\n", + " visualizer.add_datasample(\n", + " 'vid',\n", + " img[..., ::-1],\n", + " data_sample=result,\n", + " show=False,\n", + " out_file=f'{out_path}/{i:06d}.jpg',\n", + " wait_time=float(1 / int(imgs.fps)),\n", + " step=i)\n", " prog_bar.update()\n", "output = './demo/sot.mp4'\n", "print(f'\\n making the output video at {output} with a FPS of {imgs.fps}')\n", @@ -849,30 +905,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "a91a55bd-14be-46bf-aa18-5e30d9abe5b7", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a91a55bd-14be-46bf-aa18-5e30d9abe5b7", - "outputId": "6c21c659-20e9-464d-c576-fcf2d726865d" + "outputId": "837e0474-91fd-4090-f2e1-9017d2f6a389" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2022-04-20 04:51:26-- https://download.openmmlab.com/mmtracking/data/MOT17_tiny.zip\n", - "Resolving download.openmmlab.com (download.openmmlab.com)... 47.88.36.72\n", - "Connecting to download.openmmlab.com (download.openmmlab.com)|47.88.36.72|:443... connected.\n", + "--2022-09-06 08:10:13-- https://download.openmmlab.com/mmtracking/data/MOT17_tiny.zip\n", + "Resolving download.openmmlab.com (download.openmmlab.com)... 47.89.140.71\n", + "Connecting to download.openmmlab.com (download.openmmlab.com)|47.89.140.71|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 344566302 (329M) [application/zip]\n", "Saving to: ‘./data/MOT17_tiny.zip’\n", "\n", - "MOT17_tiny.zip 100%[===================>] 328.60M 7.03MB/s in 31s \n", + "MOT17_tiny.zip 100%[===================>] 328.60M 8.43MB/s in 38s \n", "\n", - "2022-04-20 04:51:58 (10.6 MB/s) - ‘./data/MOT17_tiny.zip’ saved [344566302/344566302]\n", + "2022-09-06 08:10:52 (8.63 MB/s) - ‘./data/MOT17_tiny.zip’ saved [344566302/344566302]\n", "\n" ] } @@ -885,14 +941,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "d0db0d5f-b192-48ee-b145-149f33ad3685", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d0db0d5f-b192-48ee-b145-149f33ad3685", - "outputId": "27bef171-e8f0-4c97-d70d-b5a7e370cd61" + "outputId": "8cad1091-09cf-4c97-9992-74af7d480dc3" }, "outputs": [ { @@ -900,31 +956,31 @@ "output_type": "stream", "text": [ "Converting train set to COCO format\n", - "100% 2/2 [00:00<00:00, 2.70it/s]\n", - "train has 145 instances.\n", + "100% 2/2 [00:00<00:00, 2.09it/s]\n", + "train has 224 instances.\n", "Done! Saved as ./data/MOT17_tiny/annotations/train_cocoformat.json and ./data/MOT17_tiny/annotations/train_detections.pkl\n", "Converting test set to COCO format\n", "0it [00:00, ?it/s]\n", "test has 0 instances.\n", "Done! Saved as ./data/MOT17_tiny/annotations/test_cocoformat.json and ./data/MOT17_tiny/annotations/test_detections.pkl\n", "Converting half-train set to COCO format\n", - "100% 2/2 [00:01<00:00, 1.06it/s]\n", - "half-train has 104 instances.\n", + "100% 2/2 [00:01<00:00, 1.01it/s]\n", + "half-train has 182 instances.\n", "Done! Saved as ./data/MOT17_tiny/annotations/half-train_cocoformat.json and ./data/MOT17_tiny/annotations/half-train_detections.pkl\n", "Converting half-val set to COCO format\n", - "100% 2/2 [00:01<00:00, 1.03it/s]\n", - "half-val has 122 instances.\n", + "100% 2/2 [00:02<00:00, 1.01s/it]\n", + "half-val has 201 instances.\n", "Done! 
Saved as ./data/MOT17_tiny/annotations/half-val_cocoformat.json and ./data/MOT17_tiny/annotations/half-val_detections.pkl\n", - "100% 2/2 [08:35<00:00, 257.80s/it]\n" + "100% 2/2 [09:35<00:00, 287.68s/it]\n" ] } ], "source": [ "# convert the dataset to coco format\n", - "!python ./tools/convert_datasets/mot/mot2coco.py -i ./data/MOT17_tiny/ -o ./data/MOT17_tiny/annotations --split-train --convert-det\n", + "!python ./tools/dataset_converters/mot/mot2coco.py -i ./data/MOT17_tiny/ -o ./data/MOT17_tiny/annotations --split-train --convert-det\n", "# crop pedestrian patches from the original dataset for training reid model. It may take a few minutes.\n", "!rm -rf ./data/MOT17_tiny/reid\n", - "!python ./tools/convert_datasets/mot/mot2reid.py -i ./data/MOT17_tiny/ -o ./data/MOT17_tiny/reid --val-split 0.9 --vis-threshold 0.8" + "!python ./tools/dataset_converters/mot/mot2reid.py -i ./data/MOT17_tiny/ -o ./data/MOT17_tiny/reid --val-split 0.9 --vis-threshold 0.8" ] }, { @@ -940,14 +996,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "bce04095-8586-45c5-a556-40c51d08b2cb", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bce04095-8586-45c5-a556-40c51d08b2cb", - "outputId": "05f85750-c853-4887-c8e4-306401b866c3" + "outputId": "407e272f-b1fe-4fb2-9265-55af95525335" }, "outputs": [ { @@ -955,334 +1011,555 @@ "output_type": "stream", "text": [ "Config:\n", - "model = dict(\n", - " detector=dict(\n", - " type='FasterRCNN',\n", - " backbone=dict(\n", - " type='ResNet',\n", - " depth=50,\n", - " num_stages=4,\n", - " out_indices=(0, 1, 2, 3),\n", - " frozen_stages=1,\n", - " norm_cfg=dict(type='BN', requires_grad=True),\n", - " norm_eval=True,\n", - " style='pytorch',\n", - " init_cfg=dict(\n", - " type='Pretrained', checkpoint='torchvision://resnet50')),\n", - " neck=dict(\n", - " type='FPN',\n", - " in_channels=[256, 512, 1024, 2048],\n", - " out_channels=256,\n", - " num_outs=5),\n", - " rpn_head=dict(\n", - " type='RPNHead',\n", - " in_channels=256,\n", - " feat_channels=256,\n", - " anchor_generator=dict(\n", - " type='AnchorGenerator',\n", - " scales=[8],\n", - " ratios=[0.5, 1.0, 2.0],\n", - " strides=[4, 8, 16, 32, 64]),\n", - " bbox_coder=dict(\n", - " type='DeltaXYWHBBoxCoder',\n", - " target_means=[0.0, 0.0, 0.0, 0.0],\n", - " target_stds=[1.0, 1.0, 1.0, 1.0],\n", - " clip_border=False),\n", - " loss_cls=dict(\n", - " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", - " loss_bbox=dict(\n", - " type='SmoothL1Loss', beta=0.1111111111111111,\n", - " loss_weight=1.0)),\n", - " roi_head=dict(\n", - " type='StandardRoIHead',\n", - " bbox_roi_extractor=dict(\n", - " type='SingleRoIExtractor',\n", - " roi_layer=dict(\n", - " type='RoIAlign', output_size=7, sampling_ratio=0),\n", - " out_channels=256,\n", - " featmap_strides=[4, 8, 16, 32]),\n", - " bbox_head=dict(\n", - " type='Shared2FCBBoxHead',\n", - " in_channels=256,\n", - " fc_out_channels=1024,\n", - " roi_feat_size=7,\n", - " num_classes=1,\n", - " bbox_coder=dict(\n", - " type='DeltaXYWHBBoxCoder',\n", - " target_means=[0.0, 0.0, 0.0, 0.0],\n", - " target_stds=[0.1, 0.1, 0.2, 0.2],\n", - " clip_border=False),\n", - " reg_class_agnostic=False,\n", - " loss_cls=dict(\n", - " type='CrossEntropyLoss',\n", - " use_sigmoid=False,\n", - " loss_weight=1.0),\n", - " loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),\n", - " train_cfg=dict(\n", - " rpn=dict(\n", - " assigner=dict(\n", - " type='MaxIoUAssigner',\n", - " pos_iou_thr=0.7,\n", - " neg_iou_thr=0.3,\n", - " 
min_pos_iou=0.3,\n", - " match_low_quality=True,\n", - " ignore_iof_thr=-1),\n", - " sampler=dict(\n", - " type='RandomSampler',\n", - " num=256,\n", - " pos_fraction=0.5,\n", - " neg_pos_ub=-1,\n", - " add_gt_as_proposals=False),\n", - " allowed_border=-1,\n", - " pos_weight=-1,\n", - " debug=False),\n", - " rpn_proposal=dict(\n", - " nms_pre=2000,\n", - " max_per_img=1000,\n", - " nms=dict(type='nms', iou_threshold=0.7),\n", - " min_bbox_size=0),\n", - " rcnn=dict(\n", - " assigner=dict(\n", - " type='MaxIoUAssigner',\n", - " pos_iou_thr=0.5,\n", - " neg_iou_thr=0.5,\n", - " min_pos_iou=0.5,\n", - " match_low_quality=False,\n", - " ignore_iof_thr=-1),\n", - " sampler=dict(\n", - " type='RandomSampler',\n", - " num=512,\n", - " pos_fraction=0.25,\n", - " neg_pos_ub=-1,\n", - " add_gt_as_proposals=True),\n", - " pos_weight=-1,\n", - " debug=False)),\n", - " test_cfg=dict(\n", - " rpn=dict(\n", - " nms_pre=1000,\n", - " max_per_img=1000,\n", - " nms=dict(type='nms', iou_threshold=0.7),\n", - " min_bbox_size=0),\n", - " rcnn=dict(\n", - " score_thr=0.05,\n", - " nms=dict(type='nms', iou_threshold=0.5),\n", - " max_per_img=100)),\n", - " init_cfg=dict(\n", - " type='Pretrained',\n", - " checkpoint=\n", - " 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'\n", - " )))\n", - "dataset_type = 'CocoDataset'\n", - "img_norm_cfg = dict(\n", - " mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)\n", + "dataset_type = 'mmdet.CocoDataset'\n", + "data_root = 'data/MOT17_tiny/'\n", "train_pipeline = [\n", " dict(type='LoadImageFromFile', to_float32=True),\n", " dict(type='LoadAnnotations', with_bbox=True),\n", " dict(\n", - " type='Resize',\n", - " img_scale=(1088, 1088),\n", + " type='RandomResize',\n", + " scale=(1088, 1088),\n", " ratio_range=(0.8, 1.2),\n", " keep_ratio=True,\n", - " bbox_clip_border=False),\n", + " clip_object_border=False),\n", " dict(type='PhotoMetricDistortion'),\n", " dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False),\n", - " dict(type='RandomFlip', flip_ratio=0.5),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='DefaultFormatBundle'),\n", - " dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", "]\n", "test_pipeline = [\n", " dict(type='LoadImageFromFile'),\n", + " dict(type='Resize', scale=(1088, 1088), keep_ratio=True),\n", " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'])\n", - " ])\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", "]\n", - "data_root = 'data/MOT17_tiny/'\n", - "data = dict(\n", - " samples_per_gpu=2,\n", - " workers_per_gpu=2,\n", - " train=dict(\n", - " type='CocoDataset',\n", - " ann_file='data/MOT17_tiny/annotations/half-train_cocoformat.json',\n", - " 
img_prefix='data/MOT17_tiny/train',\n", - " classes=('pedestrian', ),\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='mmdet.CocoDataset',\n", + " data_root='data/MOT17_tiny/',\n", + " _scope_='mmdet',\n", + " ann_file='annotations/half-train_cocoformat.json',\n", + " data_prefix=dict(img='train/'),\n", + " metainfo=dict(CLASSES=('pedestrian', )),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", " pipeline=[\n", " dict(type='LoadImageFromFile', to_float32=True),\n", " dict(type='LoadAnnotations', with_bbox=True),\n", " dict(\n", - " type='Resize',\n", - " img_scale=(1088, 1088),\n", + " type='RandomResize',\n", + " scale=(1088, 1088),\n", " ratio_range=(0.8, 1.2),\n", " keep_ratio=True,\n", - " bbox_clip_border=False),\n", + " clip_object_border=False),\n", " dict(type='PhotoMetricDistortion'),\n", " dict(\n", " type='RandomCrop',\n", " crop_size=(1088, 1088),\n", " bbox_clip_border=False),\n", - " dict(type='RandomFlip', flip_ratio=0.5),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='DefaultFormatBundle'),\n", - " dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])\n", - " ]),\n", - " val=dict(\n", - " type='CocoDataset',\n", - " ann_file='data/MOT17_tiny/annotations/half-val_cocoformat.json',\n", - " img_prefix='data/MOT17_tiny/train',\n", - " classes=('pedestrian', ),\n", - " pipeline=[\n", - " dict(type='LoadImageFromFile'),\n", - " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'])\n", - " ])\n", - " ]),\n", - " test=dict(\n", - " type='CocoDataset',\n", - " ann_file='data/MOT17_tiny/annotations/half-val_cocoformat.json',\n", - " img_prefix='data/MOT17_tiny/train',\n", - " classes=('pedestrian', ),\n", - " pipeline=[\n", - " dict(type='LoadImageFromFile'),\n", - " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'])\n", - " ])\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", " ]))\n", - "evaluation = dict(metric=['bbox'])\n", - "optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)\n", - "optimizer_config = dict(grad_clip=None)\n", - "checkpoint_config = dict(interval=1)\n", - "log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])\n", - "dist_params = dict(backend='nccl')\n", + "val_dataloader = None\n", + "test_dataloader = None\n", + "val_evaluator = None\n", + "test_evaluator = None\n", + "default_scope = 
'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='mmdet.DetVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='mmdet.DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='mot_visualizer')\n", "log_level = 'INFO'\n", "load_from = None\n", - "resume_from = None\n", - "workflow = [('train', 1)]\n", - "opencv_num_threads = 0\n", - "mp_start_method = 'fork'\n", - "USE_MMDET = True\n", - "lr_config = dict(\n", - " policy='step',\n", - " warmup='linear',\n", - " warmup_iters=100,\n", - " warmup_ratio=0.01,\n", - " step=[3])\n", - "total_epochs = 4\n", + "resume = False\n", + "model = dict(\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " bgr_to_rgb=True,\n", + " pad_size_divisor=32),\n", + " type='FasterRCNN',\n", + " _scope_='mmdet',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=True),\n", + " norm_eval=True,\n", + " style='pytorch',\n", + " init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0],\n", + " clip_border=False),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(\n", + " type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2],\n", + " clip_border=False),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " 
num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)),\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'\n", + " ))\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1)\n", + "val_cfg = None\n", + "test_cfg = None\n", + "param_scheduler = [\n", + " dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=4,\n", + " by_epoch=True,\n", + " milestones=[3],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))\n", "work_dir = './tutorial_exps/detector'\n", - "seed = 0\n", + "randomness = dict(seed=0, deterministic=False)\n", "gpu_ids = range(0, 1)\n", "\n" ] } ], "source": [ - "import mmcv\n", - "from mmdet.apis import set_random_seed\n", - "cfg = mmcv.Config.fromfile('./configs/det/faster-rcnn_r50_fpn_4e_mot17-half.py')\n", + "import mmengine\n", + "from mmengine.runner import set_random_seed\n", + "cfg = mmengine.Config.fromfile('./configs/det/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')\n", "cfg.data_root = 'data/MOT17_tiny/'\n", - "cfg.data.test.ann_file = cfg.data.test.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.ann_file = cfg.data.train.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.val.ann_file = cfg.data.val.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", + "cfg.train_dataloader.dataset.data_root = 'data/MOT17_tiny/'\n", + "cfg.test_dataloader = cfg.test_cfg = cfg.test_evaluator = None\n", + "cfg.val_dataloader = cfg.val_cfg = cfg.val_evaluator = None\n", + "# different name for creating different visualizer instance\n", + "cfg.visualizer.name = 'mot_visualizer'\n", "\n", - "cfg.data.test.img_prefix = cfg.data.test.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.img_prefix = cfg.data.train.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.val.img_prefix = cfg.data.val.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", "\n", "cfg.work_dir = './tutorial_exps/detector'\n", - "cfg.seed = 0\n", - "set_random_seed(0, deterministic=False)\n", + "cfg.randomness = dict(seed=0, deterministic=False)\n", "cfg.gpu_ids = range(1)\n", "print(f'Config:\\n{cfg.pretty_text}')" ] }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 31, "id": "889b4255-50be-4da3-85c4-dcd19c8111ac", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ - "7f20458c3bf7422fba34beba519c27bd", - "9e5d67b0f3fb4baaa7254a102464057b", - "f750ac53345746ee8c0364b5f850a56d", - "3ed731e3194e4465aa7b79579f8eaaca", - "7a4444bb015249ae9caa2b14c21fa0fd", - "9718af278d2d45e99d1355cafc2c7518", - "cb11dac7ded84ad59f12a03b04660b3c", - "f2613e8b5e1e453db11090a1aa888799", - "9bdb4cdea53b429cad0e8c418fd7e715", - "750088e53a064a13872ed5fa5616face", - "eb8e5c96ffea40e298a38db2b5a38ace" + "4757d999925e44748b3b4b8476ff7069", + "64133dab9fa94eb9ad7819da4740a90f", + "34b3be1af1704bd4a1a25992ff3a2c33", + "1a722030c2344bf6a5ba1941c9e056de", + "e0b78a16c9d8488eb6f2b78788bc1987", + "36f603cf7dab445ab6120b805799c1d0", + "f572376e2dbb437c95e1f62820d6f0d3", + "9c5e87a7ba0f4b8395a3fd0027508930", + "a83c0a2e5ee8432d9ecb05468194f3ed", + "c1a6f0f21a864752bb77a435e5633c5f", + "f4510cc71cf54df785fbe760b7dfc32c" ] }, "id": "889b4255-50be-4da3-85c4-dcd19c8111ac", - "outputId": "bcc096ba-5ce0-4eb3-ab81-776d54424104" + "outputId": "b9fb7a8b-ef72-4735-99a8-48e721f64130" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "09/06 08:48:30 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.7.13 (default, Apr 24 2022, 01:04:09) [GCC 7.5.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 0\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.1, V11.1.105\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0\n", + " PyTorch: 1.10.0+cu111\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 7.3\n", + " - C++ Version: 201402\n", + " - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.1\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\n", + " - CuDNN 8.0.5\n", + " - Magma 2.5.2\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n", + "\n", + " TorchVision: 0.11.0+cu111\n", + " OpenCV: 4.6.0\n", + " MMEngine: 0.1.0\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 0\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "09/06 08:48:31 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Config:\n", + "dataset_type = 'mmdet.CocoDataset'\n", + "data_root = 'data/MOT17_tiny/'\n", + "train_pipeline = [\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomResize',\n", + " scale=(1088, 1088),\n", + " ratio_range=(0.8, 1.2),\n", + " keep_ratio=True,\n", + " clip_object_border=False),\n", + " dict(type='PhotoMetricDistortion'),\n", + " dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='Resize', scale=(1088, 1088), keep_ratio=True),\n", + " dict(\n", + " type='PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=2,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " batch_sampler=dict(type='mmdet.AspectRatioBatchSampler'),\n", + " dataset=dict(\n", + " type='mmdet.CocoDataset',\n", + " data_root='data/MOT17_tiny/',\n", + " _scope_='mmdet',\n", + " 
ann_file='annotations/half-train_cocoformat.json',\n", + " data_prefix=dict(img='train/'),\n", + " metainfo=dict(CLASSES=('pedestrian', )),\n", + " filter_cfg=dict(filter_empty_gt=True, min_size=32),\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(type='LoadAnnotations', with_bbox=True),\n", + " dict(\n", + " type='RandomResize',\n", + " scale=(1088, 1088),\n", + " ratio_range=(0.8, 1.2),\n", + " keep_ratio=True,\n", + " clip_object_border=False),\n", + " dict(type='PhotoMetricDistortion'),\n", + " dict(\n", + " type='RandomCrop',\n", + " crop_size=(1088, 1088),\n", + " bbox_clip_border=False),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackDetInputs')\n", + " ]))\n", + "val_dataloader = None\n", + "test_dataloader = None\n", + "val_evaluator = None\n", + "test_evaluator = None\n", + "default_scope = 'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='mmdet.DetVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='mmdet.DetLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='mot_visualizer')\n", + "log_level = 'INFO'\n", + "load_from = None\n", + "resume = False\n", + "model = dict(\n", + " data_preprocessor=dict(\n", + " type='DetDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " bgr_to_rgb=True,\n", + " pad_size_divisor=32),\n", + " type='FasterRCNN',\n", + " _scope_='mmdet',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=True),\n", + " norm_eval=True,\n", + " style='pytorch',\n", + " init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0],\n", + " clip_border=False),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(\n", + " type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 
0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2],\n", + " clip_border=False),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),\n", + " loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)),\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'\n", + " ))\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1)\n", + "val_cfg = None\n", + "test_cfg = None\n", + "param_scheduler = [\n", + " dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=4,\n", + " by_epoch=True,\n", + " milestones=[3],\n", + " gamma=0.1)\n", + "]\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))\n", + "work_dir = './tutorial_exps/detector'\n", + "randomness = dict(seed=0, deterministic=False)\n", + "gpu_ids = range(0, 1)\n", + "\n", + "Result has been saved to /content/mmtracking/tutorial_exps/detector/modules_statistic_results.json\n", + "09/06 08:48:32 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "loading annotations into memory...\n", + "Done (t=0.54s)\n", + "creating index...\n", + "index created!\n", + "09/06 08:48:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\n", + "09/06 08:48:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - http loads checkpoint from path: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-20 05:00:52,752 - mmtrack - INFO - initialize 
FasterRCNN with init_cfg {'type': 'Pretrained', 'checkpoint': 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'}\n", - "2022-04-20 05:00:52,754 - mmcv - INFO - load model from: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\n", - "2022-04-20 05:00:52,757 - mmcv - INFO - load checkpoint from http path: http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\n", "Downloading: \"http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\" to /root/.cache/torch/hub/checkpoints/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7f20458c3bf7422fba34beba519c27bd", + "model_id": "4757d999925e44748b3b4b8476ff7069", "version_major": 2, "version_minor": 0 }, @@ -1294,141 +1571,323 @@ "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2022-04-20 05:01:06,227 - mmcv - WARNING - The model and loaded state dict do not match exactly\n", + "09/06 08:48:53 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", "\n", "size mismatch for roi_head.bbox_head.fc_cls.weight: copying a param with shape torch.Size([81, 1024]) from checkpoint, the shape in current model is torch.Size([2, 1024]).\n", "size mismatch for roi_head.bbox_head.fc_cls.bias: copying a param with shape torch.Size([81]) from checkpoint, the shape in current model is torch.Size([2]).\n", "size mismatch for roi_head.bbox_head.fc_reg.weight: copying a param with shape torch.Size([320, 1024]) from checkpoint, the shape in current model is torch.Size([4, 1024]).\n", - "size mismatch for roi_head.bbox_head.fc_reg.bias: copying a param with shape torch.Size([320]) from checkpoint, the shape in current model is torch.Size([4]).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "loading annotations into memory...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/mmdet/apis/train.py:135: UserWarning: config is now expected to have a `runner` section, please set `runner` in your config.\n", - " 'please set `runner` in your config.', UserWarning)\n", - "2022-04-20 05:01:06,593 - mmdet - INFO - Start running, host: root@597380361c27, work_dir: /content/mmtracking/tutorial_exps/detector\n", - "2022-04-20 05:01:06,597 - mmdet - INFO - Hooks will be executed in the following order:\n", - "before_run:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(NORMAL ) CheckpointHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_train_epoch:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_train_iter:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - "after_train_iter:\n", - "(ABOVE_NORMAL) OptimizerHook \n", - "(NORMAL ) CheckpointHook \n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "after_train_epoch:\n", 
- "(NORMAL ) CheckpointHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_val_epoch:\n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_val_iter:\n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - "after_val_iter:\n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - "after_val_epoch:\n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "after_run:\n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "2022-04-20 05:01:06,598 - mmdet - INFO - workflow: [('train', 1)], max: 4 epochs\n", - "2022-04-20 05:01:06,603 - mmdet - INFO - Checkpoints will be saved to /content/mmtracking/tutorial_exps/detector by HardDiskBackend.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done (t=0.24s)\n", - "creating index...\n", - "index created!\n" + "size mismatch for roi_head.bbox_head.fc_reg.bias: copying a param with shape torch.Size([320]) from checkpoint, the shape in current model is torch.Size([4]).\n", + "09/06 08:48:53 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Checkpoints will be saved to /content/mmtracking/tutorial_exps/detector by HardDiskBackend.\n", + "09/06 08:49:17 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][50/414] lr: 1.0000e-02 eta: 0:12:31 time: 0.4476 data_time: 0.0086 memory: 4097 loss_rpn_cls: 0.0432 loss_rpn_bbox: 0.1128 loss_cls: 0.3478 acc: 87.7930 loss_bbox: 0.2661 loss: 0.7698\n", + "09/06 08:49:40 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][100/414] lr: 2.0000e-02 eta: 0:12:03 time: 0.4664 data_time: 0.0077 memory: 4096 loss_rpn_cls: 0.0310 loss_rpn_bbox: 0.1023 loss_cls: 0.3020 acc: 85.7422 loss_bbox: 0.2093 loss: 0.6447\n", + "09/06 08:50:03 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][150/414] lr: 2.0000e-02 eta: 0:11:41 time: 0.4669 data_time: 0.0085 memory: 4097 loss_rpn_cls: 0.0210 loss_rpn_bbox: 0.0953 loss_cls: 0.3098 acc: 87.9883 loss_bbox: 0.2082 loss: 0.6343\n", + "09/06 08:50:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][200/414] lr: 2.0000e-02 eta: 0:11:20 time: 0.4539 data_time: 0.0089 memory: 4097 loss_rpn_cls: 0.0263 loss_rpn_bbox: 0.0771 loss_cls: 0.2932 acc: 84.3750 loss_bbox: 0.2172 loss: 0.6138\n", + "09/06 08:50:50 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][250/414] lr: 2.0000e-02 eta: 0:10:54 time: 0.4517 data_time: 0.0098 memory: 4097 loss_rpn_cls: 0.0162 loss_rpn_bbox: 0.0778 loss_cls: 0.2588 acc: 86.9141 loss_bbox: 0.1758 loss: 0.5286\n", + "09/06 08:51:13 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][300/414] lr: 2.0000e-02 eta: 0:10:33 time: 0.4759 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0133 loss_rpn_bbox: 0.0772 loss_cls: 0.2558 acc: 87.4023 loss_bbox: 0.1703 loss: 0.5165\n", + "09/06 08:51:36 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][350/414] lr: 2.0000e-02 eta: 0:10:07 time: 0.4605 data_time: 0.0085 memory: 4096 loss_rpn_cls: 0.0155 loss_rpn_bbox: 0.0722 loss_cls: 0.2608 acc: 90.4297 loss_bbox: 0.1860 loss: 0.5345\n", + "09/06 08:51:59 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][400/414] lr: 2.0000e-02 eta: 0:09:43 time: 0.4660 data_time: 0.0085 memory: 4097 loss_rpn_cls: 0.0159 loss_rpn_bbox: 0.0762 loss_cls: 0.2045 acc: 92.2852 loss_bbox: 0.1252 loss: 0.4218\n", + "09/06 08:52:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: 
faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_20220906_084830\n", + "09/06 08:52:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "09/06 08:52:32 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][50/414] lr: 2.0000e-02 eta: 0:09:08 time: 0.4438 data_time: 0.0085 memory: 4097 loss_rpn_cls: 0.0117 loss_rpn_bbox: 0.0568 loss_cls: 0.2145 acc: 92.7734 loss_bbox: 0.1412 loss: 0.4241\n", + "09/06 08:52:55 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][100/414] lr: 2.0000e-02 eta: 0:08:47 time: 0.4764 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0128 loss_rpn_bbox: 0.0551 loss_cls: 0.2117 acc: 92.8711 loss_bbox: 0.1495 loss: 0.4291\n", + "09/06 08:53:19 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][150/414] lr: 2.0000e-02 eta: 0:08:24 time: 0.4226 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0185 loss_rpn_bbox: 0.0557 loss_cls: 0.2072 acc: 89.3555 loss_bbox: 0.1273 loss: 0.4087\n", + "09/06 08:53:41 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][200/414] lr: 2.0000e-02 eta: 0:08:00 time: 0.4544 data_time: 0.0081 memory: 4097 loss_rpn_cls: 0.0110 loss_rpn_bbox: 0.0498 loss_cls: 0.1904 acc: 90.1367 loss_bbox: 0.1130 loss: 0.3641\n", + "09/06 08:54:04 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][250/414] lr: 2.0000e-02 eta: 0:07:36 time: 0.4227 data_time: 0.0082 memory: 4097 loss_rpn_cls: 0.0146 loss_rpn_bbox: 0.0475 loss_cls: 0.2017 acc: 90.4297 loss_bbox: 0.1279 loss: 0.3917\n", + "09/06 08:54:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][300/414] lr: 2.0000e-02 eta: 0:07:13 time: 0.4572 data_time: 0.0083 memory: 4097 loss_rpn_cls: 0.0100 loss_rpn_bbox: 0.0637 loss_cls: 0.1965 acc: 93.2617 loss_bbox: 0.1399 loss: 0.4102\n", + "09/06 08:54:50 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][350/414] lr: 2.0000e-02 eta: 0:06:50 time: 0.4661 data_time: 0.0093 memory: 4097 loss_rpn_cls: 0.0096 loss_rpn_bbox: 0.0440 loss_cls: 0.1747 acc: 91.6016 loss_bbox: 0.1094 loss: 0.3377\n", + "09/06 08:55:13 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][400/414] lr: 2.0000e-02 eta: 0:06:27 time: 0.4538 data_time: 0.0085 memory: 4097 loss_rpn_cls: 0.0083 loss_rpn_bbox: 0.0413 loss_cls: 0.1579 acc: 94.7266 loss_bbox: 0.1079 loss: 0.3153\n", + "09/06 08:55:19 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_20220906_084830\n", + "09/06 08:55:19 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 2 epochs\n", + "09/06 08:55:44 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][50/414] lr: 2.0000e-02 eta: 0:05:56 time: 0.4519 data_time: 0.0089 memory: 4097 loss_rpn_cls: 0.0062 loss_rpn_bbox: 0.0405 loss_cls: 0.1676 acc: 91.6992 loss_bbox: 0.1083 loss: 0.3226\n", + "09/06 08:56:08 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][100/414] lr: 2.0000e-02 eta: 0:05:33 time: 0.4734 data_time: 0.0091 memory: 4097 loss_rpn_cls: 0.0049 loss_rpn_bbox: 0.0337 loss_cls: 0.1571 acc: 94.1406 loss_bbox: 0.0945 loss: 0.2903\n", + "09/06 08:56:31 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][150/414] lr: 2.0000e-02 eta: 0:05:10 time: 0.4527 data_time: 0.0087 memory: 4097 loss_rpn_cls: 0.0112 loss_rpn_bbox: 0.0406 loss_cls: 0.1630 acc: 93.9453 loss_bbox: 0.1075 loss: 0.3223\n", + "09/06 08:56:41 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: 
faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_20220906_084830\n", + "09/06 08:56:54 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][200/414] lr: 2.0000e-02 eta: 0:04:48 time: 0.4468 data_time: 0.0086 memory: 4097 loss_rpn_cls: 0.0069 loss_rpn_bbox: 0.0554 loss_cls: 0.1763 acc: 90.3320 loss_bbox: 0.1082 loss: 0.3468\n", + "09/06 08:57:17 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][250/414] lr: 2.0000e-02 eta: 0:04:25 time: 0.4645 data_time: 0.0084 memory: 4097 loss_rpn_cls: 0.0079 loss_rpn_bbox: 0.0441 loss_cls: 0.1617 acc: 94.3359 loss_bbox: 0.1107 loss: 0.3244\n", + "09/06 08:57:40 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][300/414] lr: 2.0000e-02 eta: 0:04:02 time: 0.4733 data_time: 0.0089 memory: 4097 loss_rpn_cls: 0.0078 loss_rpn_bbox: 0.0374 loss_cls: 0.1390 acc: 95.5078 loss_bbox: 0.1000 loss: 0.2842\n", + "09/06 08:58:03 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][350/414] lr: 2.0000e-02 eta: 0:03:39 time: 0.4550 data_time: 0.0088 memory: 4097 loss_rpn_cls: 0.0071 loss_rpn_bbox: 0.0365 loss_cls: 0.1668 acc: 93.8477 loss_bbox: 0.1056 loss: 0.3161\n", + "09/06 08:58:26 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [3][400/414] lr: 2.0000e-02 eta: 0:03:16 time: 0.4494 data_time: 0.0080 memory: 4097 loss_rpn_cls: 0.0048 loss_rpn_bbox: 0.0341 loss_cls: 0.1349 acc: 95.3125 loss_bbox: 0.0942 loss: 0.2680\n", + "09/06 08:58:32 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_20220906_084830\n", + "09/06 08:58:32 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 3 epochs\n", + "09/06 08:58:58 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][50/414] lr: 2.0000e-03 eta: 0:02:46 time: 0.4481 data_time: 0.0088 memory: 4097 loss_rpn_cls: 0.0031 loss_rpn_bbox: 0.0288 loss_cls: 0.1293 acc: 96.3867 loss_bbox: 0.0897 loss: 0.2510\n", + "09/06 08:59:22 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][100/414] lr: 2.0000e-03 eta: 0:02:23 time: 0.4767 data_time: 0.0085 memory: 4097 loss_rpn_cls: 0.0048 loss_rpn_bbox: 0.0314 loss_cls: 0.1376 acc: 94.2383 loss_bbox: 0.1000 loss: 0.2737\n", + "09/06 08:59:45 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][150/414] lr: 2.0000e-03 eta: 0:02:01 time: 0.4850 data_time: 0.0098 memory: 4096 loss_rpn_cls: 0.0037 loss_rpn_bbox: 0.0232 loss_cls: 0.1152 acc: 92.9688 loss_bbox: 0.0722 loss: 0.2142\n", + "09/06 09:00:08 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][200/414] lr: 2.0000e-03 eta: 0:01:38 time: 0.4694 data_time: 0.0089 memory: 4097 loss_rpn_cls: 0.0054 loss_rpn_bbox: 0.0279 loss_cls: 0.1285 acc: 93.4570 loss_bbox: 0.0880 loss: 0.2497\n", + "09/06 09:00:31 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][250/414] lr: 2.0000e-03 eta: 0:01:15 time: 0.4595 data_time: 0.0084 memory: 4097 loss_rpn_cls: 0.0030 loss_rpn_bbox: 0.0255 loss_cls: 0.1147 acc: 95.4102 loss_bbox: 0.0822 loss: 0.2254\n", + "09/06 09:00:54 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][300/414] lr: 2.0000e-03 eta: 0:00:52 time: 0.4778 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0063 loss_rpn_bbox: 0.0279 loss_cls: 0.1356 acc: 93.7500 loss_bbox: 0.0900 loss: 0.2598\n", + "09/06 09:01:17 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][350/414] lr: 2.0000e-03 eta: 0:00:29 time: 0.4581 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0064 loss_rpn_bbox: 0.0233 loss_cls: 0.1073 
acc: 96.5820 loss_bbox: 0.0767 loss: 0.2138\n", + "09/06 09:01:40 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [4][400/414] lr: 2.0000e-03 eta: 0:00:06 time: 0.4541 data_time: 0.0090 memory: 4097 loss_rpn_cls: 0.0037 loss_rpn_bbox: 0.0238 loss_cls: 0.1253 acc: 96.5820 loss_bbox: 0.0766 loss: 0.2293\n", + "09/06 09:01:46 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval_20220906_084830\n", + "09/06 09:01:46 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 4 epochs\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-04-20 05:01:31,165 - mmdet - INFO - Epoch [1][50/414]\tlr: 9.902e-03, eta: 0:13:04, time: 0.488, data_time: 0.057, memory: 4076, loss_rpn_cls: 0.0903, loss_rpn_bbox: 0.1182, loss_cls: 0.4110, acc: 80.7461, loss_bbox: 0.3526, loss: 0.9721\n", - "2022-04-20 05:01:53,688 - mmdet - INFO - Epoch [1][100/414]\tlr: 1.980e-02, eta: 0:12:10, time: 0.450, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0428, loss_rpn_bbox: 0.1039, loss_cls: 0.3214, acc: 86.3828, loss_bbox: 0.2210, loss: 0.6892\n", - "2022-04-20 05:02:16,652 - mmdet - INFO - Epoch [1][150/414]\tlr: 2.000e-02, eta: 0:11:41, time: 0.460, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0323, loss_rpn_bbox: 0.0995, loss_cls: 0.3064, acc: 86.8105, loss_bbox: 0.2305, loss: 0.6687\n", - "2022-04-20 05:02:39,044 - mmdet - INFO - Epoch [1][200/414]\tlr: 2.000e-02, eta: 0:11:11, time: 0.448, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0250, loss_rpn_bbox: 0.0851, loss_cls: 0.2920, acc: 87.6348, loss_bbox: 0.2078, loss: 0.6100\n", - "2022-04-20 05:03:01,524 - mmdet - INFO - Epoch [1][250/414]\tlr: 2.000e-02, eta: 0:10:45, time: 0.450, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0215, loss_rpn_bbox: 0.0885, loss_cls: 0.2655, acc: 88.7930, loss_bbox: 0.1828, loss: 0.5583\n", - "2022-04-20 05:03:23,948 - mmdet - INFO - Epoch [1][300/414]\tlr: 2.000e-02, eta: 0:10:20, time: 0.448, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0183, loss_rpn_bbox: 0.0782, loss_cls: 0.2394, acc: 89.7695, loss_bbox: 0.1668, loss: 0.5026\n", - "2022-04-20 05:03:46,433 - mmdet - INFO - Epoch [1][350/414]\tlr: 2.000e-02, eta: 0:09:55, time: 0.450, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0138, loss_rpn_bbox: 0.0567, loss_cls: 0.2319, acc: 89.9453, loss_bbox: 0.1573, loss: 0.4597\n", - "2022-04-20 05:04:08,858 - mmdet - INFO - Epoch [1][400/414]\tlr: 2.000e-02, eta: 0:09:31, time: 0.449, data_time: 0.011, memory: 4076, loss_rpn_cls: 0.0162, loss_rpn_bbox: 0.0612, loss_cls: 0.2286, acc: 90.2344, loss_bbox: 0.1549, loss: 0.4609\n", - "2022-04-20 05:04:14,853 - mmdet - INFO - Saving checkpoint at 1 epochs\n", - "2022-04-20 05:04:41,618 - mmdet - INFO - Epoch [2][50/414]\tlr: 2.000e-02, eta: 0:08:51, time: 0.494, data_time: 0.056, memory: 4076, loss_rpn_cls: 0.0112, loss_rpn_bbox: 0.0556, loss_cls: 0.2086, acc: 91.0938, loss_bbox: 0.1399, loss: 0.4153\n", - "2022-04-20 05:05:04,096 - mmdet - INFO - Epoch [2][100/414]\tlr: 2.000e-02, eta: 0:08:29, time: 0.449, data_time: 0.009, memory: 4076, loss_rpn_cls: 0.0142, loss_rpn_bbox: 0.0651, loss_cls: 0.2114, acc: 91.0215, loss_bbox: 0.1455, loss: 0.4361\n", - "2022-04-20 05:05:26,701 - mmdet - INFO - Epoch [2][150/414]\tlr: 2.000e-02, eta: 0:08:07, time: 0.452, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0134, loss_rpn_bbox: 0.0591, loss_cls: 0.2047, acc: 91.0449, loss_bbox: 0.1366, loss: 0.4139\n", - "2022-04-20 05:05:49,108 - mmdet - INFO - Epoch [2][200/414]\tlr: 
2.000e-02, eta: 0:07:45, time: 0.448, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0120, loss_rpn_bbox: 0.0558, loss_cls: 0.1954, acc: 91.4688, loss_bbox: 0.1318, loss: 0.3950\n", - "2022-04-20 05:06:11,711 - mmdet - INFO - Epoch [2][250/414]\tlr: 2.000e-02, eta: 0:07:23, time: 0.452, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0103, loss_rpn_bbox: 0.0494, loss_cls: 0.1877, acc: 91.9941, loss_bbox: 0.1246, loss: 0.3719\n", - "2022-04-20 05:06:34,057 - mmdet - INFO - Epoch [2][300/414]\tlr: 2.000e-02, eta: 0:07:01, time: 0.447, data_time: 0.009, memory: 4076, loss_rpn_cls: 0.0112, loss_rpn_bbox: 0.0543, loss_cls: 0.1757, acc: 92.4961, loss_bbox: 0.1208, loss: 0.3621\n", - "2022-04-20 05:06:56,363 - mmdet - INFO - Epoch [2][350/414]\tlr: 2.000e-02, eta: 0:06:38, time: 0.446, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0139, loss_rpn_bbox: 0.0483, loss_cls: 0.1883, acc: 91.9629, loss_bbox: 0.1334, loss: 0.3840\n", - "2022-04-20 05:07:18,669 - mmdet - INFO - Epoch [2][400/414]\tlr: 2.000e-02, eta: 0:06:16, time: 0.446, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0096, loss_rpn_bbox: 0.0529, loss_cls: 0.1849, acc: 92.0371, loss_bbox: 0.1230, loss: 0.3703\n", - "2022-04-20 05:07:24,693 - mmdet - INFO - Saving checkpoint at 2 epochs\n", - "2022-04-20 05:07:51,179 - mmdet - INFO - Epoch [3][50/414]\tlr: 2.000e-02, eta: 0:05:44, time: 0.493, data_time: 0.055, memory: 4076, loss_rpn_cls: 0.0130, loss_rpn_bbox: 0.0511, loss_cls: 0.1712, acc: 92.7227, loss_bbox: 0.1197, loss: 0.3550\n", - "2022-04-20 05:08:13,608 - mmdet - INFO - Epoch [3][100/414]\tlr: 2.000e-02, eta: 0:05:22, time: 0.448, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0092, loss_rpn_bbox: 0.0470, loss_cls: 0.1818, acc: 92.3516, loss_bbox: 0.1197, loss: 0.3577\n", - "2022-04-20 05:08:36,172 - mmdet - INFO - Epoch [3][150/414]\tlr: 2.000e-02, eta: 0:05:00, time: 0.452, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0100, loss_rpn_bbox: 0.0480, loss_cls: 0.1692, acc: 92.8691, loss_bbox: 0.1111, loss: 0.3383\n", - "2022-04-20 05:08:58,612 - mmdet - INFO - Epoch [3][200/414]\tlr: 2.000e-02, eta: 0:04:38, time: 0.449, data_time: 0.009, memory: 4076, loss_rpn_cls: 0.0092, loss_rpn_bbox: 0.0416, loss_cls: 0.1636, acc: 93.0352, loss_bbox: 0.1088, loss: 0.3231\n", - "2022-04-20 05:09:21,221 - mmdet - INFO - Epoch [3][250/414]\tlr: 2.000e-02, eta: 0:04:16, time: 0.452, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0092, loss_rpn_bbox: 0.0464, loss_cls: 0.1630, acc: 93.0508, loss_bbox: 0.1082, loss: 0.3268\n", - "2022-04-20 05:09:43,526 - mmdet - INFO - Epoch [3][300/414]\tlr: 2.000e-02, eta: 0:03:54, time: 0.446, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0086, loss_rpn_bbox: 0.0413, loss_cls: 0.1602, acc: 93.1465, loss_bbox: 0.1061, loss: 0.3162\n", - "2022-04-20 05:10:05,856 - mmdet - INFO - Epoch [3][350/414]\tlr: 2.000e-02, eta: 0:03:32, time: 0.447, data_time: 0.010, memory: 4076, loss_rpn_cls: 0.0098, loss_rpn_bbox: 0.0396, loss_cls: 0.1594, acc: 93.2461, loss_bbox: 0.1122, loss: 0.3210\n", - "2022-04-20 05:10:28,156 - mmdet - INFO - Epoch [3][400/414]\tlr: 2.000e-02, eta: 0:03:10, time: 0.446, data_time: 0.009, memory: 4076, loss_rpn_cls: 0.0071, loss_rpn_bbox: 0.0440, loss_cls: 0.1539, acc: 93.3965, loss_bbox: 0.1079, loss: 0.3130\n", - "2022-04-20 05:10:34,139 - mmdet - INFO - Saving checkpoint at 3 epochs\n", - "2022-04-20 05:11:00,630 - mmdet - INFO - Epoch [4][50/414]\tlr: 2.000e-03, eta: 0:02:40, time: 0.491, data_time: 0.056, memory: 4076, loss_rpn_cls: 0.0069, loss_rpn_bbox: 0.0311, loss_cls: 0.1461, acc: 
93.8008, loss_bbox: 0.0945, loss: 0.2786\n", - "2022-04-20 05:11:23,069 - mmdet - INFO - Epoch [4][100/414]\tlr: 2.000e-03, eta: 0:02:18, time: 0.449, data_time: 0.009, memory: 4077, loss_rpn_cls: 0.0051, loss_rpn_bbox: 0.0305, loss_cls: 0.1382, acc: 94.1562, loss_bbox: 0.0933, loss: 0.2671\n", - "2022-04-20 05:11:45,602 - mmdet - INFO - Epoch [4][150/414]\tlr: 2.000e-03, eta: 0:01:56, time: 0.451, data_time: 0.010, memory: 4077, loss_rpn_cls: 0.0041, loss_rpn_bbox: 0.0296, loss_cls: 0.1318, acc: 94.4727, loss_bbox: 0.0914, loss: 0.2569\n", - "2022-04-20 05:12:08,081 - mmdet - INFO - Epoch [4][200/414]\tlr: 2.000e-03, eta: 0:01:34, time: 0.449, data_time: 0.010, memory: 4077, loss_rpn_cls: 0.0048, loss_rpn_bbox: 0.0269, loss_cls: 0.1206, acc: 94.9609, loss_bbox: 0.0827, loss: 0.2350\n", - "2022-04-20 05:12:30,694 - mmdet - INFO - Epoch [4][250/414]\tlr: 2.000e-03, eta: 0:01:12, time: 0.452, data_time: 0.010, memory: 4077, loss_rpn_cls: 0.0036, loss_rpn_bbox: 0.0263, loss_cls: 0.1191, acc: 94.9473, loss_bbox: 0.0811, loss: 0.2301\n", - "2022-04-20 05:12:53,057 - mmdet - INFO - Epoch [4][300/414]\tlr: 2.000e-03, eta: 0:00:50, time: 0.447, data_time: 0.010, memory: 4077, loss_rpn_cls: 0.0043, loss_rpn_bbox: 0.0273, loss_cls: 0.1191, acc: 94.9453, loss_bbox: 0.0842, loss: 0.2349\n", - "2022-04-20 05:13:15,463 - mmdet - INFO - Epoch [4][350/414]\tlr: 2.000e-03, eta: 0:00:28, time: 0.448, data_time: 0.011, memory: 4077, loss_rpn_cls: 0.0045, loss_rpn_bbox: 0.0257, loss_cls: 0.1154, acc: 95.1660, loss_bbox: 0.0806, loss: 0.2262\n", - "2022-04-20 05:13:37,778 - mmdet - INFO - Epoch [4][400/414]\tlr: 2.000e-03, eta: 0:00:06, time: 0.447, data_time: 0.010, memory: 4077, loss_rpn_cls: 0.0045, loss_rpn_bbox: 0.0268, loss_cls: 0.1229, acc: 94.8574, loss_bbox: 0.0858, loss: 0.2400\n", - "2022-04-20 05:13:43,772 - mmdet - INFO - Saving checkpoint at 4 epochs\n" - ] + "data": { + "text/plain": [ + "FasterRCNN(\n", + " (data_preprocessor): DetDataPreprocessor()\n", + " (backbone): ResNet(\n", + " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n", + " (layer1): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer2): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): 
ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer3): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (4): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (5): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (layer4): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " )\n", + " )\n", + " init_cfg={'type': 'Pretrained', 'checkpoint': 'torchvision://resnet50'}\n", + " (neck): FPN(\n", + " (lateral_convs): ModuleList(\n", + " (0): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))\n", + " )\n", + " (1): ConvModule(\n", + " (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))\n", + " )\n", + " (2): ConvModule(\n", + " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))\n", + " )\n", + " (3): ConvModule(\n", + " (conv): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))\n", + " )\n", + " )\n", + " (fpn_convs): ModuleList(\n", + " (0): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " )\n", + " (1): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " )\n", + " (2): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " )\n", + " 
(3): ConvModule(\n", + " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " )\n", + " )\n", + " )\n", + " init_cfg={'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}\n", + " (rpn_head): RPNHead(\n", + " (loss_cls): CrossEntropyLoss(avg_non_ignore=False)\n", + " (loss_bbox): SmoothL1Loss()\n", + " (rpn_conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (rpn_cls): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))\n", + " (rpn_reg): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))\n", + " )\n", + " init_cfg={'type': 'Normal', 'layer': 'Conv2d', 'std': 0.01}\n", + " (roi_head): StandardRoIHead(\n", + " (bbox_roi_extractor): SingleRoIExtractor(\n", + " (roi_layers): ModuleList(\n", + " (0): RoIAlign(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=0, pool_mode=avg, aligned=True, use_torchvision=False)\n", + " (1): RoIAlign(output_size=(7, 7), spatial_scale=0.125, sampling_ratio=0, pool_mode=avg, aligned=True, use_torchvision=False)\n", + " (2): RoIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0, pool_mode=avg, aligned=True, use_torchvision=False)\n", + " (3): RoIAlign(output_size=(7, 7), spatial_scale=0.03125, sampling_ratio=0, pool_mode=avg, aligned=True, use_torchvision=False)\n", + " )\n", + " )\n", + " (bbox_head): Shared2FCBBoxHead(\n", + " (loss_cls): CrossEntropyLoss(avg_non_ignore=False)\n", + " (loss_bbox): SmoothL1Loss()\n", + " (fc_cls): Linear(in_features=1024, out_features=2, bias=True)\n", + " (fc_reg): Linear(in_features=1024, out_features=4, bias=True)\n", + " (shared_convs): ModuleList()\n", + " (shared_fcs): ModuleList(\n", + " (0): Linear(in_features=12544, out_features=1024, bias=True)\n", + " (1): Linear(in_features=1024, out_features=1024, bias=True)\n", + " )\n", + " (cls_convs): ModuleList()\n", + " (cls_fcs): ModuleList()\n", + " (reg_convs): ModuleList()\n", + " (reg_fcs): ModuleList()\n", + " (relu): ReLU(inplace=True)\n", + " )\n", + " init_cfg=[{'type': 'Normal', 'std': 0.01, 'override': {'name': 'fc_cls'}}, {'type': 'Normal', 'std': 0.001, 'override': {'name': 'fc_reg'}}, {'type': 'Xavier', 'distribution': 'uniform', 'override': [{'name': 'shared_fcs'}, {'name': 'cls_fcs'}, {'name': 'reg_fcs'}]}]\n", + " )\n", + ")\n", + "init_cfg={'type': 'Pretrained', 'checkpoint': 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "import os.path as osp\n", "\n", - "from mmtrack.datasets import build_dataset\n", - "from mmdet.apis import train_detector as train_model\n", - "from mmdet.models import build_detector as build_model\n", + "from mmengine.utils import mkdir_or_exist\n", + "from mmengine.runner import Runner\n", "\n", - "mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))\n", - "model = build_model(cfg.model.detector)\n", - "model.init_weights()\n", - "datasets = [build_dataset(cfg.data.train)]\n", - "model.CLASSES = datasets[0].CLASSES\n", - "train_model(model, datasets, cfg)" + "mkdir_or_exist(osp.abspath(cfg.work_dir))\n", + "runner = Runner.from_cfg(cfg)\n", + "runner.train()" ] }, { @@ -1444,14 +1903,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "6705deeb-a9d7-42e2-9d52-b51b7b588d1f", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6705deeb-a9d7-42e2-9d52-b51b7b588d1f", - "outputId": 
"b3b974a6-9bfa-4c04-c568-aab06c6828d4" + "outputId": "a4f24af8-fc91-4459-f4c0-3f0ced62e0f9" }, "outputs": [ { @@ -1460,227 +1919,380 @@ "text": [ "Config:\n", "dataset_type = 'ReIDDataset'\n", - "img_norm_cfg = dict(\n", - " mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)\n", + "data_root = 'data/MOT17/'\n", "train_pipeline = [\n", - " dict(type='LoadMultiImagesFromFile', to_float32=True),\n", - " dict(\n", - " type='SeqResize',\n", - " img_scale=(128, 256),\n", - " share_params=False,\n", - " keep_ratio=False,\n", - " bbox_clip_border=False,\n", - " override=False),\n", - " dict(\n", - " type='SeqRandomFlip',\n", - " share_params=False,\n", - " flip_ratio=0.5,\n", - " direction='horizontal'),\n", " dict(\n", - " type='SeqNormalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='VideoCollect', keys=['img', 'gt_label']),\n", - " dict(type='ReIDFormatBundle')\n", + " type='TransformBroadcaster',\n", + " share_random_params=False,\n", + " transforms=[\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(\n", + " type='mmdet.Resize',\n", + " scale=(128, 256),\n", + " keep_ratio=False,\n", + " clip_object_border=False),\n", + " dict(type='RandomFlip', prob=0.5, direction='horizontal')\n", + " ]),\n", + " dict(type='PackReIDInputs')\n", "]\n", "test_pipeline = [\n", - " dict(type='LoadImageFromFile'),\n", - " dict(type='Resize', img_scale=(128, 256), keep_ratio=False),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'], meta_keys=[])\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(type='mmdet.Resize', scale=(128, 256), keep_ratio=False),\n", + " dict(type='PackReIDInputs')\n", "]\n", - "data_root = 'data/MOT17_tiny/'\n", - "data = dict(\n", - " samples_per_gpu=1,\n", - " workers_per_gpu=2,\n", - " train=dict(\n", + "train_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", " type='ReIDDataset',\n", + " data_root='data/MOT17_tiny/',\n", " triplet_sampler=dict(num_ids=8, ins_per_id=4),\n", - " data_prefix='data/MOT17_tiny/reid/imgs',\n", - " ann_file='data/MOT17_tiny/reid/meta/train_9.txt',\n", - " pipeline=[\n", - " dict(type='LoadMultiImagesFromFile', to_float32=True),\n", - " dict(\n", - " type='SeqResize',\n", - " img_scale=(128, 256),\n", - " share_params=False,\n", - " keep_ratio=False,\n", - " bbox_clip_border=False,\n", - " override=False),\n", - " dict(\n", - " type='SeqRandomFlip',\n", - " share_params=False,\n", - " flip_ratio=0.5,\n", - " direction='horizontal'),\n", - " dict(\n", - " type='SeqNormalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='VideoCollect', keys=['img', 'gt_label']),\n", - " dict(type='ReIDFormatBundle')\n", - " ]),\n", - " val=dict(\n", - " type='ReIDDataset',\n", - " triplet_sampler=None,\n", - " data_prefix='data/MOT17_tiny/reid/imgs',\n", - " ann_file='data/MOT17_tiny/reid/meta/val_20.txt',\n", - " pipeline=[\n", - " dict(type='LoadImageFromFile'),\n", - " dict(type='Resize', img_scale=(128, 256), keep_ratio=False),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " 
to_rgb=True),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'], meta_keys=[])\n", - " ]),\n", - " test=dict(\n", - " type='ReIDDataset',\n", - " triplet_sampler=None,\n", - " data_prefix='data/MOT17_tiny/reid/imgs',\n", - " ann_file='data/MOT17_tiny/reid/meta/val_20.txt',\n", + " data_prefix=dict(img_path='reid/imgs'),\n", + " ann_file='reid/meta/train_9.txt',\n", " pipeline=[\n", - " dict(type='LoadImageFromFile'),\n", - " dict(type='Resize', img_scale=(128, 256), keep_ratio=False),\n", " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img'], meta_keys=[])\n", + " type='TransformBroadcaster',\n", + " share_random_params=False,\n", + " transforms=[\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(\n", + " type='mmdet.Resize',\n", + " scale=(128, 256),\n", + " keep_ratio=False,\n", + " clip_object_border=False),\n", + " dict(type='RandomFlip', prob=0.5, direction='horizontal')\n", + " ]),\n", + " dict(type='PackReIDInputs')\n", " ]))\n", - "evaluation = dict(interval=1, metric='mAP')\n", - "optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)\n", - "optimizer_config = dict(grad_clip=None)\n", - "checkpoint_config = dict(interval=1)\n", - "log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])\n", - "dist_params = dict(backend='nccl')\n", + "val_dataloader = None\n", + "test_dataloader = None\n", + "val_evaluator = None\n", + "test_evaluator = None\n", + "default_scope = 'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='TrackVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='TrackLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='mot_reid_visualizer')\n", "log_level = 'INFO'\n", "load_from = None\n", - "resume_from = None\n", - "workflow = [('train', 1)]\n", - "opencv_num_threads = 0\n", - "mp_start_method = 'fork'\n", - "TRAIN_REID = True\n", + "resume = False\n", "model = dict(\n", - " reid=dict(\n", - " type='BaseReID',\n", - " backbone=dict(\n", - " type='ResNet',\n", - " depth=50,\n", - " num_stages=4,\n", - " out_indices=(3, ),\n", - " style='pytorch'),\n", - " neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),\n", - " head=dict(\n", - " type='LinearReIDHead',\n", - " num_fcs=1,\n", - " in_channels=2048,\n", - " fc_channels=1024,\n", - " out_channels=128,\n", - " num_classes=380,\n", - " loss=dict(type='CrossEntropyLoss', loss_weight=1.0),\n", - " loss_pairwise=dict(\n", - " type='TripletLoss', margin=0.3, loss_weight=1.0),\n", - " norm_cfg=dict(type='BN1d'),\n", - " act_cfg=dict(type='ReLU')),\n", - " init_cfg=dict(\n", - " type='Pretrained',\n", - " checkpoint=\n", - " 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'\n", - " )))\n", - "lr_config = dict(\n", - " policy='step',\n", - " 
warmup='linear',\n", - " warmup_iters=200,\n", - " warmup_ratio=0.005,\n", - " step=[1])\n", - "total_epochs = 2\n", + " type='BaseReID',\n", + " data_preprocessor=dict(\n", + " type='mmcls.ClsDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " to_rgb=True),\n", + " backbone=dict(\n", + " type='mmcls.ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(3, ),\n", + " style='pytorch'),\n", + " neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),\n", + " head=dict(\n", + " type='LinearReIDHead',\n", + " num_fcs=1,\n", + " in_channels=2048,\n", + " fc_channels=1024,\n", + " out_channels=128,\n", + " num_classes=380,\n", + " loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),\n", + " loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),\n", + " norm_cfg=dict(type='BN1d'),\n", + " act_cfg=dict(type='ReLU')),\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'\n", + " ))\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " clip_grad=None,\n", + " optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001))\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR', start_factor=0.005, by_epoch=False, begin=0, end=200),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=2,\n", + " by_epoch=True,\n", + " milestones=[1],\n", + " gamma=0.1)\n", + "]\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_begin=3)\n", + "val_cfg = None\n", + "test_cfg = None\n", "work_dir = './tutorial_exps/reid'\n", - "seed = 0\n", + "randomness = dict(seed=0, deterministic=False)\n", "gpu_ids = range(0, 1)\n", "\n" ] } ], "source": [ - "import mmcv\n", - "from mmdet.apis import set_random_seed\n", - "cfg = mmcv.Config.fromfile('./configs/reid/resnet50_b32x8_MOT17.py')\n", - "cfg.data_root = 'data/MOT17_tiny/'\n", - "cfg.data.test.ann_file = cfg.data.test.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.ann_file = 'data/MOT17_tiny/reid/meta/train_9.txt'\n", - "cfg.data.val.ann_file = cfg.data.val.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", + "import mmengine\n", "\n", - "cfg.data.test.data_prefix = cfg.data.test.data_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.data_prefix = cfg.data.train.data_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.val.data_prefix = cfg.data.val.data_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", + "cfg = mmengine.Config.fromfile('./configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py')\n", + "cfg.train_dataloader.dataset.data_root = 'data/MOT17_tiny/'\n", + "cfg.train_dataloader.dataset.ann_file = 'reid/meta/train_9.txt'\n", + "cfg.test_dataloader = cfg.test_cfg = cfg.test_evaluator = None\n", + "cfg.val_dataloader = cfg.val_cfg = cfg.val_evaluator = None\n", + "cfg.visualizer.name = 'mot_reid_visualizer'\n", "\n", "# learning policy\n", - "cfg.lr_config = dict(\n", - " policy='step',\n", - " warmup='linear',\n", - " warmup_iters=200,\n", - " warmup_ratio=1.0 / 200,\n", - " step=[1])\n", - "cfg.total_epochs = 2\n", + "cfg.param_scheduler = [\n", + " dict(\n", + " type='LinearLR',\n", + " start_factor=1.0 / 200,\n", + " by_epoch=False,\n", + " begin=0,\n", + " end=200),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=2,\n", + " by_epoch=True,\n", + " milestones=[1],\n", + 
" gamma=0.1)\n", + "]\n", + "cfg.train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_begin=3)\n", "\n", "cfg.work_dir = './tutorial_exps/reid'\n", - "cfg.seed = 0\n", - "set_random_seed(0, deterministic=False)\n", + "cfg.randomness = dict(seed=0, deterministic=False)\n", "cfg.gpu_ids = range(1)\n", "print(f'Config:\\n{cfg.pretty_text}')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "12f2b54e-7e5f-4d95-9e27-528115717e03", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ - "1fc5eb2eaa6a4ed88a005a081bc24a70", - "1f5d5858bede4242b51d1cb635c3f610", - "9344070d5d004c0fb5c0bdf22632e343", - "c4f0834d72224880b5a12217d95cb086", - "92345bd0ebed40f4bf0a9ac4aad2bc5b", - "4c38742132624fa38216e67e3771894f", - "6b83dd17433745a3ae3d2ef9b4b932ad", - "738b0850c1a4489a9147a63f3a587dd6", - "9604ece7c2bb4e8ba8914d4430a72876", - "7d5fd07e913046a6b5445a791197a287", - "5a4441a529664fdfb5dc4d8de70b3421" + "34594c7fd26c4f61af5e4ec5b34b6b02", + "8727613cf45f4cd4a79557afd1973632", + "28ef474b2c7c466baa81c1196f8e7505", + "7b268f5a3b334d2c89ae8d43fda6eb53", + "9d467daaa30e44288dd49152948db460", + "9793eaee82db45beaddf9e80cab6d0a1", + "e75e2e078b4246a9984fa0841de27a15", + "504570ec6de24f40b824fb8eacc65447", + "b9470c130a69411b8be481f4aa045d26", + "0b4dd8ace9b5467e9e7c0214df6df1a9", + "e495e81fecf844a4ae4a15ca9cb0329c" ] }, "id": "12f2b54e-7e5f-4d95-9e27-528115717e03", - "outputId": "b3a6f9b1-5095-4b6f-82de-4c9f81e15e80" + "outputId": "c9952c03-33c9-450b-d325-a526821a93ee" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "09/06 09:16:26 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.7.13 (default, Apr 24 2022, 01:04:09) [GCC 7.5.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 0\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.1, V11.1.105\n", + " GCC: x86_64-linux-gnu-gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0\n", + " PyTorch: 1.10.0+cu111\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 7.3\n", + " - C++ Version: 201402\n", + " - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\n", + " - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.1\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\n", + " - CuDNN 8.0.5\n", + " - Magma 2.5.2\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n", + "\n", + " TorchVision: 0.11.0+cu111\n", + " OpenCV: 4.6.0\n", + " MMEngine: 0.1.0\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 0\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "09/06 09:16:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Config:\n", + "dataset_type = 'ReIDDataset'\n", + "data_root = 'data/MOT17/'\n", + "train_pipeline = [\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=False,\n", + " transforms=[\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(\n", + " type='mmdet.Resize',\n", + " scale=(128, 256),\n", + " keep_ratio=False,\n", + " clip_object_border=False),\n", + " dict(type='RandomFlip', prob=0.5, direction='horizontal')\n", + " ]),\n", + " dict(type='PackReIDInputs')\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(type='mmdet.Resize', scale=(128, 256), keep_ratio=False),\n", + " dict(type='PackReIDInputs')\n", + "]\n", + "train_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " sampler=dict(type='DefaultSampler', shuffle=True),\n", + " dataset=dict(\n", + " type='ReIDDataset',\n", + " data_root='data/MOT17_tiny/',\n", + " triplet_sampler=dict(num_ids=8, ins_per_id=4),\n", + " data_prefix=dict(img_path='reid/imgs'),\n", + " ann_file='reid/meta/train_9.txt',\n", + " pipeline=[\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=False,\n", + " transforms=[\n", + " 
dict(type='LoadImageFromFile', to_float32=True),\n", + " dict(\n", + " type='mmdet.Resize',\n", + " scale=(128, 256),\n", + " keep_ratio=False,\n", + " clip_object_border=False),\n", + " dict(type='RandomFlip', prob=0.5, direction='horizontal')\n", + " ]),\n", + " dict(type='PackReIDInputs')\n", + " ]))\n", + "val_dataloader = None\n", + "test_dataloader = None\n", + "val_evaluator = None\n", + "test_evaluator = None\n", + "default_scope = 'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='TrackVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='TrackLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='mot_reid_visualizer')\n", + "log_level = 'INFO'\n", + "load_from = None\n", + "resume = False\n", + "model = dict(\n", + " type='BaseReID',\n", + " data_preprocessor=dict(\n", + " type='mmcls.ClsDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " to_rgb=True),\n", + " backbone=dict(\n", + " type='mmcls.ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(3, ),\n", + " style='pytorch'),\n", + " neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),\n", + " head=dict(\n", + " type='LinearReIDHead',\n", + " num_fcs=1,\n", + " in_channels=2048,\n", + " fc_channels=1024,\n", + " out_channels=128,\n", + " num_classes=380,\n", + " loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),\n", + " loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),\n", + " norm_cfg=dict(type='BN1d'),\n", + " act_cfg=dict(type='ReLU')),\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint=\n", + " 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'\n", + " ))\n", + "optim_wrapper = dict(\n", + " type='OptimWrapper',\n", + " clip_grad=None,\n", + " optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001))\n", + "param_scheduler = [\n", + " dict(\n", + " type='LinearLR', start_factor=0.005, by_epoch=False, begin=0, end=200),\n", + " dict(\n", + " type='MultiStepLR',\n", + " begin=0,\n", + " end=2,\n", + " by_epoch=True,\n", + " milestones=[1],\n", + " gamma=0.1)\n", + "]\n", + "train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=2, val_begin=3)\n", + "val_cfg = None\n", + "test_cfg = None\n", + "work_dir = './tutorial_exps/reid'\n", + "randomness = dict(seed=0, deterministic=False)\n", + "gpu_ids = range(0, 1)\n", + "\n", + "Result has been saved to /content/mmtracking/tutorial_exps/reid/modules_statistic_results.json\n", + "09/06 09:16:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "09/06 09:16:29 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth\n", + "09/06 09:16:29 - mmengine - 
\u001b[4m\u001b[37mINFO\u001b[0m - http loads checkpoint from path: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-20 05:13:47,675 - mmtrack - INFO - initialize BaseReID with init_cfg {'type': 'Pretrained', 'checkpoint': 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'}\n", - "2022-04-20 05:13:47,678 - mmcv - INFO - load model from: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth\n", - "2022-04-20 05:13:47,680 - mmcv - INFO - load checkpoint from http path: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth\n", "Downloading: \"https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth\" to /root/.cache/torch/hub/checkpoints/resnet50_batch256_imagenet_20200708-cfb998bf.pth\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1fc5eb2eaa6a4ed88a005a081bc24a70", + "model_id": "34594c7fd26c4f61af5e4ec5b34b6b02", "version_major": 2, "version_minor": 0 }, @@ -1692,140 +2304,322 @@ "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2022-04-20 05:13:57,491 - mmcv - WARNING - The model and loaded state dict do not match exactly\n", + "09/06 09:16:40 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", "\n", "unexpected key in source state_dict: head.fc.weight, head.fc.bias\n", "\n", "missing keys in source state_dict: head.fcs.0.fc.weight, head.fcs.0.fc.bias, head.fcs.0.bn.weight, head.fcs.0.bn.bias, head.fcs.0.bn.running_mean, head.fcs.0.bn.running_var, head.fc_out.weight, head.fc_out.bias, head.bn.weight, head.bn.bias, head.bn.running_mean, head.bn.running_var, head.classifier.weight, head.classifier.bias\n", "\n", - "/usr/local/lib/python3.7/dist-packages/mmdet/apis/train.py:135: UserWarning: config is now expected to have a `runner` section, please set `runner` in your config.\n", - " 'please set `runner` in your config.', UserWarning)\n", - "2022-04-20 05:13:57,552 - mmdet - INFO - Start running, host: root@597380361c27, work_dir: /content/mmtracking/tutorial_exps/reid\n", - "2022-04-20 05:13:57,553 - mmdet - INFO - Hooks will be executed in the following order:\n", - "before_run:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(NORMAL ) CheckpointHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_train_epoch:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_train_iter:\n", - "(VERY_HIGH ) StepLrUpdaterHook \n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - "after_train_iter:\n", - "(ABOVE_NORMAL) OptimizerHook \n", - "(NORMAL ) CheckpointHook \n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "after_train_epoch:\n", - "(NORMAL ) CheckpointHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_val_epoch:\n", - "(LOW ) IterTimerHook \n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "before_val_iter:\n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - "after_val_iter:\n", - "(LOW ) IterTimerHook \n", - " -------------------- \n", - 
"after_val_epoch:\n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "after_run:\n", - "(VERY_LOW ) TextLoggerHook \n", - " -------------------- \n", - "2022-04-20 05:13:57,557 - mmdet - INFO - workflow: [('train', 1)], max: 2 epochs\n", - "2022-04-20 05:13:57,559 - mmdet - INFO - Checkpoints will be saved to /content/mmtracking/tutorial_exps/reid by HardDiskBackend.\n", - "2022-04-20 05:14:14,675 - mmdet - INFO - Epoch [1][50/1576]\tlr: 2.488e-02, eta: 0:17:31, time: 0.339, data_time: 0.052, memory: 4077, triplet_loss: 0.1025, ce_loss: 0.8317, top-1: 91.1875, loss: 0.9342\n", - "2022-04-20 05:14:29,582 - mmdet - INFO - Epoch [1][100/1576]\tlr: 4.975e-02, eta: 0:16:12, time: 0.298, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0003, top-1: 100.0000, loss: 0.0003\n", - "2022-04-20 05:14:44,624 - mmdet - INFO - Epoch [1][150/1576]\tlr: 7.463e-02, eta: 0:15:38, time: 0.301, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:14:59,453 - mmdet - INFO - Epoch [1][200/1576]\tlr: 9.950e-02, eta: 0:15:11, time: 0.297, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:15:14,151 - mmdet - INFO - Epoch [1][250/1576]\tlr: 1.000e-01, eta: 0:14:47, time: 0.294, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:15:28,850 - mmdet - INFO - Epoch [1][300/1576]\tlr: 1.000e-01, eta: 0:14:26, time: 0.294, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:15:43,596 - mmdet - INFO - Epoch [1][350/1576]\tlr: 1.000e-01, eta: 0:14:07, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:15:58,405 - mmdet - INFO - Epoch [1][400/1576]\tlr: 1.000e-01, eta: 0:13:50, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:16:13,239 - mmdet - INFO - Epoch [1][450/1576]\tlr: 1.000e-01, eta: 0:13:33, time: 0.297, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:16:27,987 - mmdet - INFO - Epoch [1][500/1576]\tlr: 1.000e-01, eta: 0:13:16, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:16:42,705 - mmdet - INFO - Epoch [1][550/1576]\tlr: 1.000e-01, eta: 0:13:00, time: 0.294, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:16:57,480 - mmdet - INFO - Epoch [1][600/1576]\tlr: 1.000e-01, eta: 0:12:44, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:17:12,310 - mmdet - INFO - Epoch [1][650/1576]\tlr: 1.000e-01, eta: 0:12:28, time: 0.297, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:17:27,116 - mmdet - INFO - Epoch [1][700/1576]\tlr: 1.000e-01, eta: 0:12:13, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:17:41,884 - mmdet - INFO - Epoch [1][750/1576]\tlr: 1.000e-01, eta: 0:11:57, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, 
loss: 0.0001\n", - "2022-04-20 05:17:56,633 - mmdet - INFO - Epoch [1][800/1576]\tlr: 1.000e-01, eta: 0:11:42, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:18:11,409 - mmdet - INFO - Epoch [1][850/1576]\tlr: 1.000e-01, eta: 0:11:27, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:18:26,189 - mmdet - INFO - Epoch [1][900/1576]\tlr: 1.000e-01, eta: 0:11:11, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:18:40,972 - mmdet - INFO - Epoch [1][950/1576]\tlr: 1.000e-01, eta: 0:10:56, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:18:55,750 - mmdet - INFO - Epoch [1][1000/1576]\tlr: 1.000e-01, eta: 0:10:41, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:19:10,534 - mmdet - INFO - Epoch [1][1050/1576]\tlr: 1.000e-01, eta: 0:10:26, time: 0.296, data_time: 0.009, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:19:25,299 - mmdet - INFO - Epoch [1][1100/1576]\tlr: 1.000e-01, eta: 0:10:11, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0001, top-1: 100.0000, loss: 0.0001\n", - "2022-04-20 05:19:40,061 - mmdet - INFO - Epoch [1][1150/1576]\tlr: 1.000e-01, eta: 0:09:55, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:19:54,816 - mmdet - INFO - Epoch [1][1200/1576]\tlr: 1.000e-01, eta: 0:09:40, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:20:09,564 - mmdet - INFO - Epoch [1][1250/1576]\tlr: 1.000e-01, eta: 0:09:25, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:20:24,351 - mmdet - INFO - Epoch [1][1300/1576]\tlr: 1.000e-01, eta: 0:09:10, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:20:39,116 - mmdet - INFO - Epoch [1][1350/1576]\tlr: 1.000e-01, eta: 0:08:55, time: 0.295, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:20:53,899 - mmdet - INFO - Epoch [1][1400/1576]\tlr: 1.000e-01, eta: 0:08:40, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:21:08,662 - mmdet - INFO - Epoch [1][1450/1576]\tlr: 1.000e-01, eta: 0:08:25, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:21:23,452 - mmdet - INFO - Epoch [1][1500/1576]\tlr: 1.000e-01, eta: 0:08:10, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:21:38,229 - mmdet - INFO - Epoch [1][1550/1576]\tlr: 1.000e-01, eta: 0:07:55, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:21:45,866 - mmdet - INFO - Saving checkpoint at 1 epochs\n", - "2022-04-20 05:22:04,077 - mmdet - INFO - Epoch 
[2][50/1576]\tlr: 1.000e-02, eta: 0:07:28, time: 0.341, data_time: 0.051, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:22:18,897 - mmdet - INFO - Epoch [2][100/1576]\tlr: 1.000e-02, eta: 0:07:13, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:22:33,683 - mmdet - INFO - Epoch [2][150/1576]\tlr: 1.000e-02, eta: 0:06:59, time: 0.296, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:22:48,454 - mmdet - INFO - Epoch [2][200/1576]\tlr: 1.000e-02, eta: 0:06:44, time: 0.295, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:23:03,198 - mmdet - INFO - Epoch [2][250/1576]\tlr: 1.000e-02, eta: 0:06:29, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:23:17,976 - mmdet - INFO - Epoch [2][300/1576]\tlr: 1.000e-02, eta: 0:06:15, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:23:32,734 - mmdet - INFO - Epoch [2][350/1576]\tlr: 1.000e-02, eta: 0:06:00, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:23:47,520 - mmdet - INFO - Epoch [2][400/1576]\tlr: 1.000e-02, eta: 0:05:45, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:24:02,320 - mmdet - INFO - Epoch [2][450/1576]\tlr: 1.000e-02, eta: 0:05:31, time: 0.296, data_time: 0.009, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:24:17,108 - mmdet - INFO - Epoch [2][500/1576]\tlr: 1.000e-02, eta: 0:05:16, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:24:31,876 - mmdet - INFO - Epoch [2][550/1576]\tlr: 1.000e-02, eta: 0:05:01, time: 0.295, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:24:46,655 - mmdet - INFO - Epoch [2][600/1576]\tlr: 1.000e-02, eta: 0:04:47, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:25:01,427 - mmdet - INFO - Epoch [2][650/1576]\tlr: 1.000e-02, eta: 0:04:32, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:25:16,196 - mmdet - INFO - Epoch [2][700/1576]\tlr: 1.000e-02, eta: 0:04:17, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:25:30,965 - mmdet - INFO - Epoch [2][750/1576]\tlr: 1.000e-02, eta: 0:04:03, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:25:45,738 - mmdet - INFO - Epoch [2][800/1576]\tlr: 1.000e-02, eta: 0:03:48, time: 0.295, data_time: 0.007, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:26:00,530 - mmdet - INFO - Epoch [2][850/1576]\tlr: 1.000e-02, eta: 0:03:33, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - 
"2022-04-20 05:26:15,302 - mmdet - INFO - Epoch [2][900/1576]\tlr: 1.000e-02, eta: 0:03:18, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:26:30,094 - mmdet - INFO - Epoch [2][950/1576]\tlr: 1.000e-02, eta: 0:03:04, time: 0.296, data_time: 0.009, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:26:44,876 - mmdet - INFO - Epoch [2][1000/1576]\tlr: 1.000e-02, eta: 0:02:49, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:26:59,644 - mmdet - INFO - Epoch [2][1050/1576]\tlr: 1.000e-02, eta: 0:02:34, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:27:14,441 - mmdet - INFO - Epoch [2][1100/1576]\tlr: 1.000e-02, eta: 0:02:20, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:27:29,222 - mmdet - INFO - Epoch [2][1150/1576]\tlr: 1.000e-02, eta: 0:02:05, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:27:43,996 - mmdet - INFO - Epoch [2][1200/1576]\tlr: 1.000e-02, eta: 0:01:50, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:27:58,760 - mmdet - INFO - Epoch [2][1250/1576]\tlr: 1.000e-02, eta: 0:01:36, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:28:13,523 - mmdet - INFO - Epoch [2][1300/1576]\tlr: 1.000e-02, eta: 0:01:21, time: 0.295, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:28:28,302 - mmdet - INFO - Epoch [2][1350/1576]\tlr: 1.000e-02, eta: 0:01:06, time: 0.296, data_time: 0.009, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:28:43,082 - mmdet - INFO - Epoch [2][1400/1576]\tlr: 1.000e-02, eta: 0:00:51, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:28:57,869 - mmdet - INFO - Epoch [2][1450/1576]\tlr: 1.000e-02, eta: 0:00:37, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:29:12,650 - mmdet - INFO - Epoch [2][1500/1576]\tlr: 1.000e-02, eta: 0:00:22, time: 0.296, data_time: 0.009, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:29:27,444 - mmdet - INFO - Epoch [2][1550/1576]\tlr: 1.000e-02, eta: 0:00:07, time: 0.296, data_time: 0.008, memory: 4077, triplet_loss: 0.0000, ce_loss: 0.0002, top-1: 100.0000, loss: 0.0002\n", - "2022-04-20 05:29:35,087 - mmdet - INFO - Saving checkpoint at 2 epochs\n" + "09/06 09:16:40 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Checkpoints will be saved to /content/mmtracking/tutorial_exps/reid by HardDiskBackend.\n", + "09/06 09:16:56 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][50/1576] lr: 2.5000e-02 eta: 0:16:34 time: 0.3139 data_time: 0.0053 memory: 4097 triplet_loss: 0.0000 ce_loss: 0.0004 accuracy_top-1: 100.0000 loss: 0.0006\n", + "09/06 09:17:12 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][100/1576] lr: 
5.0000e-02 eta: 0:16:09 time: 0.3246 data_time: 0.0059 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:17:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][150/1576] lr: 7.5000e-02 eta: 0:15:49 time: 0.3163 data_time: 0.0052 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:17:44 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][200/1576] lr: 1.0000e-01 eta: 0:15:37 time: 0.3179 data_time: 0.0054 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:18:00 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][250/1576] lr: 1.0000e-01 eta: 0:15:23 time: 0.3169 data_time: 0.0057 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:18:15 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][300/1576] lr: 1.0000e-01 eta: 0:15:06 time: 0.3144 data_time: 0.0054 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:18:31 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][350/1576] lr: 1.0000e-01 eta: 0:14:49 time: 0.3130 data_time: 0.0055 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:18:47 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][400/1576] lr: 1.0000e-01 eta: 0:14:33 time: 0.3278 data_time: 0.0051 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:19:03 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][450/1576] lr: 1.0000e-01 eta: 0:14:17 time: 0.3153 data_time: 0.0066 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:19:19 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][500/1576] lr: 1.0000e-01 eta: 0:14:02 time: 0.3160 data_time: 0.0062 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:19:35 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][550/1576] lr: 1.0000e-01 eta: 0:13:48 time: 0.3330 data_time: 0.0112 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:19:51 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][600/1576] lr: 1.0000e-01 eta: 0:13:32 time: 0.3242 data_time: 0.0061 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:20:07 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][650/1576] lr: 1.0000e-01 eta: 0:13:16 time: 0.3155 data_time: 0.0070 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:20:23 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][700/1576] lr: 1.0000e-01 eta: 0:13:00 time: 0.3125 data_time: 0.0067 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:20:39 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][750/1576] lr: 1.0000e-01 eta: 0:12:43 time: 0.3165 data_time: 0.0063 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:20:54 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][800/1576] lr: 1.0000e-01 eta: 0:12:27 time: 0.3162 data_time: 0.0056 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 
0.0001\n", + "09/06 09:21:10 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][850/1576] lr: 1.0000e-01 eta: 0:12:11 time: 0.3189 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:21:26 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][900/1576] lr: 1.0000e-01 eta: 0:11:56 time: 0.3293 data_time: 0.0081 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:21:42 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][950/1576] lr: 1.0000e-01 eta: 0:11:40 time: 0.3159 data_time: 0.0067 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:21:58 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: reid_r50_8xb32-6e_mot17train80_test-mot17val20_20220906_091626\n", + "09/06 09:21:58 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1000/1576] lr: 1.0000e-01 eta: 0:11:24 time: 0.3142 data_time: 0.0055 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:22:14 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1050/1576] lr: 1.0000e-01 eta: 0:11:08 time: 0.3153 data_time: 0.0052 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0001\n", + "09/06 09:22:30 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1100/1576] lr: 1.0000e-01 eta: 0:10:52 time: 0.3137 data_time: 0.0059 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0001 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:22:46 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1150/1576] lr: 1.0000e-01 eta: 0:10:36 time: 0.3151 data_time: 0.0068 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:23:02 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1200/1576] lr: 1.0000e-01 eta: 0:10:20 time: 0.3174 data_time: 0.0059 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:23:18 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1250/1576] lr: 1.0000e-01 eta: 0:10:04 time: 0.3185 data_time: 0.0065 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:23:33 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1300/1576] lr: 1.0000e-01 eta: 0:09:48 time: 0.3165 data_time: 0.0061 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:23:50 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1350/1576] lr: 1.0000e-01 eta: 0:09:33 time: 0.3163 data_time: 0.0066 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:24:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1400/1576] lr: 1.0000e-01 eta: 0:09:17 time: 0.3148 data_time: 0.0064 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:24:21 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1450/1576] lr: 1.0000e-01 eta: 0:09:01 time: 0.3157 data_time: 0.0061 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:24:37 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1500/1576] lr: 1.0000e-01 eta: 0:08:45 time: 0.3160 data_time: 0.0067 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 
100.0000 loss: 0.0002\n", + "09/06 09:24:53 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [1][1550/1576] lr: 1.0000e-01 eta: 0:08:29 time: 0.3164 data_time: 0.0055 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:25:01 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: reid_r50_8xb32-6e_mot17train80_test-mot17val20_20220906_091626\n", + "09/06 09:25:01 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 1 epochs\n", + "09/06 09:25:19 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][50/1576] lr: 1.0000e-02 eta: 0:08:03 time: 0.3158 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:25:35 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][100/1576] lr: 1.0000e-02 eta: 0:07:47 time: 0.3160 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:25:51 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][150/1576] lr: 1.0000e-02 eta: 0:07:31 time: 0.3164 data_time: 0.0062 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:26:07 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][200/1576] lr: 1.0000e-02 eta: 0:07:15 time: 0.3159 data_time: 0.0062 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:26:23 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][250/1576] lr: 1.0000e-02 eta: 0:07:00 time: 0.3163 data_time: 0.0065 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:26:38 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][300/1576] lr: 1.0000e-02 eta: 0:06:44 time: 0.3149 data_time: 0.0056 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:26:54 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][350/1576] lr: 1.0000e-02 eta: 0:06:28 time: 0.3167 data_time: 0.0063 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:27:10 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][400/1576] lr: 1.0000e-02 eta: 0:06:12 time: 0.3148 data_time: 0.0057 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:27:18 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: reid_r50_8xb32-6e_mot17train80_test-mot17val20_20220906_091626\n", + "09/06 09:27:26 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][450/1576] lr: 1.0000e-02 eta: 0:05:56 time: 0.3127 data_time: 0.0057 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:27:42 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][500/1576] lr: 1.0000e-02 eta: 0:05:40 time: 0.3186 data_time: 0.0064 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:27:58 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][550/1576] lr: 1.0000e-02 eta: 0:05:24 time: 0.3151 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:28:13 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][600/1576] lr: 1.0000e-02 eta: 0:05:09 time: 0.3158 data_time: 0.0058 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 
accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:28:29 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][650/1576] lr: 1.0000e-02 eta: 0:04:53 time: 0.3162 data_time: 0.0063 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:28:45 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][700/1576] lr: 1.0000e-02 eta: 0:04:37 time: 0.3156 data_time: 0.0066 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:29:01 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][750/1576] lr: 1.0000e-02 eta: 0:04:21 time: 0.3164 data_time: 0.0060 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:29:17 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][800/1576] lr: 1.0000e-02 eta: 0:04:05 time: 0.3172 data_time: 0.0067 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:29:33 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][850/1576] lr: 1.0000e-02 eta: 0:03:49 time: 0.3134 data_time: 0.0054 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:29:48 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][900/1576] lr: 1.0000e-02 eta: 0:03:34 time: 0.3172 data_time: 0.0066 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:30:04 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][950/1576] lr: 1.0000e-02 eta: 0:03:18 time: 0.3129 data_time: 0.0059 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:30:20 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1000/1576] lr: 1.0000e-02 eta: 0:03:02 time: 0.3145 data_time: 0.0059 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:30:36 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1050/1576] lr: 1.0000e-02 eta: 0:02:46 time: 0.3142 data_time: 0.0066 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:30:52 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1100/1576] lr: 1.0000e-02 eta: 0:02:30 time: 0.3169 data_time: 0.0063 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:31:08 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1150/1576] lr: 1.0000e-02 eta: 0:02:14 time: 0.3158 data_time: 0.0064 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:31:23 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1200/1576] lr: 1.0000e-02 eta: 0:01:59 time: 0.3150 data_time: 0.0067 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:31:39 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1250/1576] lr: 1.0000e-02 eta: 0:01:43 time: 0.3169 data_time: 0.0056 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:31:55 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1300/1576] lr: 1.0000e-02 eta: 0:01:27 time: 0.3153 data_time: 0.0061 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:32:11 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) 
[2][1350/1576] lr: 1.0000e-02 eta: 0:01:11 time: 0.3148 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:32:27 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1400/1576] lr: 1.0000e-02 eta: 0:00:55 time: 0.3171 data_time: 0.0057 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:32:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: reid_r50_8xb32-6e_mot17train80_test-mot17val20_20220906_091626\n", + "09/06 09:32:42 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1450/1576] lr: 1.0000e-02 eta: 0:00:39 time: 0.3148 data_time: 0.0051 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:32:58 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1500/1576] lr: 1.0000e-02 eta: 0:00:24 time: 0.3157 data_time: 0.0053 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:33:14 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(train) [2][1550/1576] lr: 1.0000e-02 eta: 0:00:08 time: 0.3156 data_time: 0.0068 memory: 3519 triplet_loss: 0.0000 ce_loss: 0.0002 accuracy_top-1: 100.0000 loss: 0.0002\n", + "09/06 09:33:22 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Exp name: reid_r50_8xb32-6e_mot17train80_test-mot17val20_20220906_091626\n", + "09/06 09:33:22 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Saving checkpoint at 2 epochs\n" ] + }, + { + "data": { + "text/plain": [ + "BaseReID(\n", + " (data_preprocessor): ClsDataPreprocessor()\n", + " (backbone): ResNet(\n", + " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n", + " (layer1): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (drop_path): Identity()\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): 
Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " )\n", + " (layer2): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (drop_path): Identity()\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " )\n", + " (layer3): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(512, 256, 
kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (drop_path): Identity()\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (3): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (4): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (5): Bottleneck(\n", + " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): 
BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " )\n", + " (layer4): ResLayer(\n", + " (0): Bottleneck(\n", + " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (downsample): Sequential(\n", + " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", + " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + " (drop_path): Identity()\n", + " )\n", + " (1): Bottleneck(\n", + " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " (2): Bottleneck(\n", + " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", + " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", + " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (relu): ReLU(inplace=True)\n", + " (drop_path): Identity()\n", + " )\n", + " )\n", + " )\n", + " init_cfg=[{'type': 'Kaiming', 'layer': ['Conv2d']}, {'type': 'Constant', 'val': 1, 'layer': ['_BatchNorm', 'GroupNorm']}]\n", + " (neck): GlobalAveragePooling(\n", + " (gap): AvgPool2d(kernel_size=(8, 4), stride=1, padding=0)\n", + " )\n", + " (head): LinearReIDHead(\n", + " (loss_cls): CrossEntropyLoss()\n", + " (loss_triplet): TripletLoss(\n", + " (ranking_loss): MarginRankingLoss()\n", + " )\n", + " (fcs): ModuleList(\n", + " (0): FcModule(\n", + " (fc): Linear(in_features=2048, out_features=1024, bias=True)\n", + " (bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activate): ReLU(inplace=True)\n", + " )\n", + " init_cfg={'type': 'Kaiming', 'layer': 'Linear'}\n", + " )\n", + " (fc_out): Linear(in_features=1024, 
out_features=128, bias=True)\n", + " (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (classifier): Linear(in_features=128, out_features=380, bias=True)\n", + " )\n", + " init_cfg={'type': 'Normal', 'layer': 'Linear', 'mean': 0, 'std': 0.01, 'bias': 0}\n", + ")\n", + "init_cfg={'type': 'Pretrained', 'checkpoint': 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from mmtrack.datasets import build_dataset\n", - "from mmdet.apis import train_detector as train_model\n", - "from mmtrack.models import build_reid as build_model\n", - "\n", + "import os.path as osp\n", "\n", - "model = build_model(cfg.model.reid)\n", - "model.init_weights()\n", - "datasets = [build_dataset(cfg.data.train)]\n", - "model.CLASSES = datasets[0].CLASSES\n", + "from mmengine.utils import mkdir_or_exist\n", + "from mmengine.runner import Runner\n", "\n", - "train_model(model, datasets, cfg)" + "mkdir_or_exist(osp.abspath(cfg.work_dir))\n", + "runner = Runner.from_cfg(cfg)\n", + "runner.train()" ] }, { @@ -1841,14 +2635,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "c837c657-cc81-426d-8060-ad19f5494461", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "c837c657-cc81-426d-8060-ad19f5494461", - "outputId": "d33fa793-8dc1-4186-dac0-49d97355c69a" + "outputId": "9950328d-6ef9-4add-f79d-ceb576eef936" }, "outputs": [ { @@ -1857,8 +2651,16 @@ "text": [ "Config:\n", "model = dict(\n", + " data_preprocessor=dict(\n", + " type='TrackDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " bgr_to_rgb=True,\n", + " rgb_to_bgr=False,\n", + " pad_size_divisor=32),\n", " detector=dict(\n", " type='FasterRCNN',\n", + " _scope_='mmdet',\n", " backbone=dict(\n", " type='ResNet',\n", " depth=50,\n", @@ -1975,8 +2777,9 @@ " motion=dict(type='KalmanFilter', center_only=False),\n", " reid=dict(\n", " type='BaseReID',\n", + " data_preprocessor=None,\n", " backbone=dict(\n", - " type='ResNet',\n", + " type='mmcls.ResNet',\n", " depth=50,\n", " num_stages=4,\n", " out_indices=(3, ),\n", @@ -1989,15 +2792,14 @@ " fc_channels=1024,\n", " out_channels=128,\n", " num_classes=380,\n", - " loss=dict(type='CrossEntropyLoss', loss_weight=1.0),\n", - " loss_pairwise=dict(\n", - " type='TripletLoss', margin=0.3, loss_weight=1.0),\n", + " loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),\n", + " loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),\n", " norm_cfg=dict(type='BN1d'),\n", " act_cfg=dict(type='ReLU')),\n", " init_cfg=dict(\n", " type='Pretrained', checkpoint='./tutorial_exps/reid/epoch_2.pth')),\n", " tracker=dict(\n", - " type='SortTracker',\n", + " type='SORTTracker',\n", " obj_score_thr=0.5,\n", " reid=dict(\n", " num_samples=10,\n", @@ -2009,208 +2811,119 @@ " num_tentatives=2,\n", " num_frames_retain=100))\n", "dataset_type = 'MOTChallengeDataset'\n", - "img_norm_cfg = dict(\n", - " mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)\n", + "data_root = 'data/MOT17/'\n", "train_pipeline = [\n", - " dict(type='LoadMultiImagesFromFile', to_float32=True),\n", - " dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),\n", - " dict(\n", - " type='SeqResize',\n", - " img_scale=(1088, 1088),\n", - " share_params=True,\n", - " ratio_range=(0.8, 1.2),\n", - " 
keep_ratio=True,\n", - " bbox_clip_border=False),\n", - " dict(type='SeqPhotoMetricDistortion', share_params=True),\n", - " dict(\n", - " type='SeqRandomCrop',\n", - " share_params=False,\n", - " crop_size=(1088, 1088),\n", - " bbox_clip_border=False),\n", - " dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),\n", - " dict(\n", - " type='SeqNormalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='SeqPad', size_divisor=32),\n", - " dict(type='MatchInstances', skip_nomatch=True),\n", " dict(\n", - " type='VideoCollect',\n", - " keys=[\n", - " 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',\n", - " 'gt_instance_ids'\n", - " ]),\n", - " dict(type='SeqDefaultFormatBundle', ref_prefix='ref')\n", - "]\n", - "test_pipeline = [\n", - " dict(type='LoadImageFromFile'),\n", - " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", + " type='TransformBroadcaster',\n", + " share_random_params=True,\n", " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='VideoCollect', keys=['img'])\n", - " ])\n", - "]\n", - "data_root = 'data/MOT17_tiny/'\n", - "data = dict(\n", - " samples_per_gpu=2,\n", - " workers_per_gpu=2,\n", - " train=dict(\n", - " type='MOTChallengeDataset',\n", - " visibility_thr=-1,\n", - " ann_file='data/MOT17_tiny/annotations/half-val_cocoformat.json',\n", - " img_prefix='data/MOT17_tiny/train',\n", - " ref_img_sampler=dict(\n", - " num_ref_imgs=1,\n", - " frame_range=10,\n", - " filter_key_img=True,\n", - " method='uniform'),\n", - " pipeline=[\n", - " dict(type='LoadMultiImagesFromFile', to_float32=True),\n", - " dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", " dict(\n", - " type='SeqResize',\n", - " img_scale=(1088, 1088),\n", - " share_params=True,\n", + " type='mmdet.RandomResize',\n", + " scale=(1088, 1088),\n", " ratio_range=(0.8, 1.2),\n", " keep_ratio=True,\n", - " bbox_clip_border=False),\n", - " dict(type='SeqPhotoMetricDistortion', share_params=True),\n", - " dict(\n", - " type='SeqRandomCrop',\n", - " share_params=False,\n", - " crop_size=(1088, 1088),\n", - " bbox_clip_border=False),\n", - " dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),\n", - " dict(\n", - " type='SeqNormalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='SeqPad', size_divisor=32),\n", - " dict(type='MatchInstances', skip_nomatch=True),\n", - " dict(\n", - " type='VideoCollect',\n", - " keys=[\n", - " 'img', 'gt_bboxes', 'gt_labels', 'gt_match_indices',\n", - " 'gt_instance_ids'\n", - " ]),\n", - " dict(type='SeqDefaultFormatBundle', ref_prefix='ref')\n", + " clip_object_border=False),\n", + " dict(type='mmdet.PhotoMetricDistortion')\n", " ]),\n", - " val=dict(\n", - " type='MOTChallengeDataset',\n", - " ann_file='data/MOT17_tiny/annotations/half-val_cocoformat.json',\n", - " img_prefix='data/MOT17_tiny/train',\n", - " ref_img_sampler=None,\n", - " pipeline=[\n", - " dict(type='LoadImageFromFile'),\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " 
share_random_params=False,\n", + " transforms=[\n", " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='VideoCollect', keys=['img'])\n", - " ])\n", + " type='mmdet.RandomCrop',\n", + " crop_size=(1088, 1088),\n", + " bbox_clip_border=False)\n", " ]),\n", - " test=dict(\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=True,\n", + " transforms=[dict(type='mmdet.RandomFlip', prob=0.5)]),\n", + " dict(type='PackTrackInputs', ref_prefix='ref', num_key_frames=1)\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", + " dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True),\n", + " dict(type='PackTrackInputs', pack_single_img=True)\n", + "]\n", + "train_dataloader = None\n", + "val_dataloader = None\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='VideoSampler'),\n", + " dataset=dict(\n", " type='MOTChallengeDataset',\n", - " ann_file='data/MOT17_tiny/annotations/half-val_cocoformat.json',\n", - " img_prefix='data/MOT17_tiny/train',\n", + " data_root='data/MOT17_tiny/',\n", + " ann_file='annotations/half-val_cocoformat.json',\n", + " data_prefix=dict(img_path='train'),\n", " ref_img_sampler=None,\n", + " load_as_video=True,\n", + " test_mode=True,\n", " pipeline=[\n", " dict(type='LoadImageFromFile'),\n", - " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(1088, 1088),\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(\n", - " type='Normalize',\n", - " mean=[123.675, 116.28, 103.53],\n", - " std=[58.395, 57.12, 57.375],\n", - " to_rgb=True),\n", - " dict(type='Pad', size_divisor=32),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='VideoCollect', keys=['img'])\n", - " ])\n", - " ],\n", - " test_mode=True))\n", - "optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)\n", - "optimizer_config = dict(grad_clip=None)\n", - "checkpoint_config = dict(interval=1)\n", - "log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])\n", - "dist_params = dict(backend='nccl')\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", + " dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True),\n", + " dict(type='PackTrackInputs', pack_single_img=True)\n", + " ]))\n", + "val_evaluator = None\n", + "test_evaluator = dict(\n", + " type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity'])\n", + "default_scope = 'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='TrackVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " 
dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='TrackLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='deepsort_visualizer')\n", "log_level = 'INFO'\n", "load_from = None\n", - "resume_from = None\n", - "workflow = [('train', 1)]\n", - "opencv_num_threads = 0\n", - "mp_start_method = 'fork'\n", - "lr_config = dict(\n", - " policy='step',\n", - " warmup='linear',\n", - " warmup_iters=100,\n", - " warmup_ratio=0.01,\n", - " step=[3])\n", - "total_epochs = 4\n", - "evaluation = dict(metric=['bbox', 'track'], interval=1)\n", - "search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']\n", + "resume = False\n", + "train_cfg = None\n", + "val_cfg = None\n", + "test_cfg = dict(type='TestLoop')\n", "work_dir = './tutorial_exps'\n", - "seed = 0\n", + "randomness = dict(seed=0, deterministic=False)\n", "gpu_ids = range(0, 1)\n", "\n" ] } ], "source": [ - "import mmcv\n", - "from mmdet.apis import set_random_seed\n", - "cfg = mmcv.Config.fromfile('./configs/mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py')\n", - "cfg.data_root = 'data/MOT17_tiny/'\n", - "cfg.data.test.ann_file = cfg.data.test.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.ann_file = cfg.data.test.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.val.ann_file = cfg.data.val.ann_file.replace('data/MOT17/','data/MOT17_tiny/')\n", - "\n", - "cfg.data.test.img_prefix = cfg.data.test.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.train.img_prefix = cfg.data.train.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", - "cfg.data.val.img_prefix = cfg.data.val.img_prefix.replace('data/MOT17/','data/MOT17_tiny/')\n", + "import mmengine\n", "\n", + "cfg = mmengine.Config.fromfile('./configs/mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')\n", + "cfg.test_dataloader.dataset.data_root = 'data/MOT17_tiny/'\n", + "cfg.test_dataloader.dataset.test_mode = True\n", + "cfg.train_dataloader = cfg.train_cfg = None\n", + "cfg.val_dataloader = cfg.val_cfg = cfg.val_evaluator = None\n", + "cfg.visualizer.name = 'deepsort_visualizer'\n", "cfg.model.detector.init_cfg.checkpoint = './tutorial_exps/detector/epoch_4.pth'\n", "cfg.model.reid.init_cfg.checkpoint = './tutorial_exps/reid/epoch_2.pth'\n", "\n", "cfg.work_dir = './tutorial_exps'\n", - "cfg.seed = 0\n", - "set_random_seed(0, deterministic=False)\n", + "cfg.randomness = dict(seed=0, deterministic=False)\n", "cfg.gpu_ids = range(1)\n", - "cfg.data.test.test_mode = True\n", "print(f'Config:\\n{cfg.pretty_text}')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "29c9f531-3ea9-42d3-9fb3-bcdd13594406", "metadata": { "colab": { @@ -2218,37 +2931,340 @@ "height": 1000 }, "id": "29c9f531-3ea9-42d3-9fb3-bcdd13594406", - "outputId": "7c5837e7-78bb-41c7-f0c7-e4dadada5a6c" + "outputId": "6d133c91-8323-45c0-e031-5bc8be89485d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "09/06 09:37:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: linux\n", + " Python: 3.7.13 (default, Apr 24 2022, 01:04:09) [GCC 7.5.0]\n", + " CUDA available: True\n", + " numpy_random_seed: 0\n", + " GPU 0: Tesla T4\n", + " CUDA_HOME: /usr/local/cuda\n", + " NVCC: Cuda compilation tools, release 11.1, V11.1.105\n", + " GCC: 
x86_64-linux-gnu-gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0\n", + " PyTorch: 1.10.0+cu111\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 7.3\n", + " - C++ Version: 201402\n", + " - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications\n", + " - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\n", + " - OpenMP 201511 (a.k.a. OpenMP 4.5)\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: AVX2\n", + " - CUDA Runtime 11.1\n", + " - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86\n", + " - CuDNN 8.0.5\n", + " - Magma 2.5.2\n", + " - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.1, CUDNN_VERSION=8.0.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n", + "\n", + " TorchVision: 0.11.0+cu111\n", + " OpenCV: 4.6.0\n", + " MMEngine: 0.1.0\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 0\n", + " deterministic: False\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "09/06 09:37:05 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Config:\n", + "model = dict(\n", + " data_preprocessor=dict(\n", + " type='TrackDataPreprocessor',\n", + " mean=[123.675, 116.28, 103.53],\n", + " std=[58.395, 57.12, 57.375],\n", + " bgr_to_rgb=True,\n", + " rgb_to_bgr=False,\n", + " pad_size_divisor=32),\n", + " detector=dict(\n", + " type='FasterRCNN',\n", + " _scope_='mmdet',\n", + " backbone=dict(\n", + " type='ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(0, 1, 2, 3),\n", + " frozen_stages=1,\n", + " norm_cfg=dict(type='BN', requires_grad=True),\n", + " norm_eval=True,\n", + " style='pytorch',\n", + " init_cfg=dict(\n", + " type='Pretrained', checkpoint='torchvision://resnet50')),\n", + " neck=dict(\n", + " type='FPN',\n", + " in_channels=[256, 512, 1024, 2048],\n", + " out_channels=256,\n", + " num_outs=5),\n", + " 
rpn_head=dict(\n", + " type='RPNHead',\n", + " in_channels=256,\n", + " feat_channels=256,\n", + " anchor_generator=dict(\n", + " type='AnchorGenerator',\n", + " scales=[8],\n", + " ratios=[0.5, 1.0, 2.0],\n", + " strides=[4, 8, 16, 32, 64]),\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[1.0, 1.0, 1.0, 1.0],\n", + " clip_border=False),\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),\n", + " loss_bbox=dict(\n", + " type='SmoothL1Loss', beta=0.1111111111111111,\n", + " loss_weight=1.0)),\n", + " roi_head=dict(\n", + " type='StandardRoIHead',\n", + " bbox_roi_extractor=dict(\n", + " type='SingleRoIExtractor',\n", + " roi_layer=dict(\n", + " type='RoIAlign', output_size=7, sampling_ratio=0),\n", + " out_channels=256,\n", + " featmap_strides=[4, 8, 16, 32]),\n", + " bbox_head=dict(\n", + " type='Shared2FCBBoxHead',\n", + " in_channels=256,\n", + " fc_out_channels=1024,\n", + " roi_feat_size=7,\n", + " num_classes=1,\n", + " bbox_coder=dict(\n", + " type='DeltaXYWHBBoxCoder',\n", + " target_means=[0.0, 0.0, 0.0, 0.0],\n", + " target_stds=[0.1, 0.1, 0.2, 0.2],\n", + " clip_border=False),\n", + " reg_class_agnostic=False,\n", + " loss_cls=dict(\n", + " type='CrossEntropyLoss',\n", + " use_sigmoid=False,\n", + " loss_weight=1.0),\n", + " loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),\n", + " train_cfg=dict(\n", + " rpn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.7,\n", + " neg_iou_thr=0.3,\n", + " min_pos_iou=0.3,\n", + " match_low_quality=True,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=256,\n", + " pos_fraction=0.5,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=False),\n", + " allowed_border=-1,\n", + " pos_weight=-1,\n", + " debug=False),\n", + " rpn_proposal=dict(\n", + " nms_pre=2000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " assigner=dict(\n", + " type='MaxIoUAssigner',\n", + " pos_iou_thr=0.5,\n", + " neg_iou_thr=0.5,\n", + " min_pos_iou=0.5,\n", + " match_low_quality=False,\n", + " ignore_iof_thr=-1),\n", + " sampler=dict(\n", + " type='RandomSampler',\n", + " num=512,\n", + " pos_fraction=0.25,\n", + " neg_pos_ub=-1,\n", + " add_gt_as_proposals=True),\n", + " pos_weight=-1,\n", + " debug=False)),\n", + " test_cfg=dict(\n", + " rpn=dict(\n", + " nms_pre=1000,\n", + " max_per_img=1000,\n", + " nms=dict(type='nms', iou_threshold=0.7),\n", + " min_bbox_size=0),\n", + " rcnn=dict(\n", + " score_thr=0.05,\n", + " nms=dict(type='nms', iou_threshold=0.5),\n", + " max_per_img=100)),\n", + " init_cfg=dict(\n", + " type='Pretrained',\n", + " checkpoint='./tutorial_exps/detector/epoch_4.pth')),\n", + " type='DeepSORT',\n", + " motion=dict(type='KalmanFilter', center_only=False),\n", + " reid=dict(\n", + " type='BaseReID',\n", + " data_preprocessor=None,\n", + " backbone=dict(\n", + " type='mmcls.ResNet',\n", + " depth=50,\n", + " num_stages=4,\n", + " out_indices=(3, ),\n", + " style='pytorch'),\n", + " neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),\n", + " head=dict(\n", + " type='LinearReIDHead',\n", + " num_fcs=1,\n", + " in_channels=2048,\n", + " fc_channels=1024,\n", + " out_channels=128,\n", + " num_classes=380,\n", + " loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),\n", + " loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),\n", + " 
norm_cfg=dict(type='BN1d'),\n", + " act_cfg=dict(type='ReLU')),\n", + " init_cfg=dict(\n", + " type='Pretrained', checkpoint='./tutorial_exps/reid/epoch_2.pth')),\n", + " tracker=dict(\n", + " type='SORTTracker',\n", + " obj_score_thr=0.5,\n", + " reid=dict(\n", + " num_samples=10,\n", + " img_scale=(256, 128),\n", + " img_norm_cfg=None,\n", + " match_score_thr=2.0),\n", + " match_iou_thr=0.5,\n", + " momentums=None,\n", + " num_tentatives=2,\n", + " num_frames_retain=100))\n", + "dataset_type = 'MOTChallengeDataset'\n", + "data_root = 'data/MOT17/'\n", + "train_pipeline = [\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=True,\n", + " transforms=[\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", + " dict(\n", + " type='mmdet.RandomResize',\n", + " scale=(1088, 1088),\n", + " ratio_range=(0.8, 1.2),\n", + " keep_ratio=True,\n", + " clip_object_border=False),\n", + " dict(type='mmdet.PhotoMetricDistortion')\n", + " ]),\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=False,\n", + " transforms=[\n", + " dict(\n", + " type='mmdet.RandomCrop',\n", + " crop_size=(1088, 1088),\n", + " bbox_clip_border=False)\n", + " ]),\n", + " dict(\n", + " type='TransformBroadcaster',\n", + " share_random_params=True,\n", + " transforms=[dict(type='mmdet.RandomFlip', prob=0.5)]),\n", + " dict(type='PackTrackInputs', ref_prefix='ref', num_key_frames=1)\n", + "]\n", + "test_pipeline = [\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", + " dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True),\n", + " dict(type='PackTrackInputs', pack_single_img=True)\n", + "]\n", + "train_dataloader = None\n", + "val_dataloader = None\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " num_workers=2,\n", + " persistent_workers=True,\n", + " drop_last=False,\n", + " sampler=dict(type='VideoSampler'),\n", + " dataset=dict(\n", + " type='MOTChallengeDataset',\n", + " data_root='data/MOT17_tiny/',\n", + " ann_file='annotations/half-val_cocoformat.json',\n", + " data_prefix=dict(img_path='train'),\n", + " ref_img_sampler=None,\n", + " load_as_video=True,\n", + " test_mode=True,\n", + " pipeline=[\n", + " dict(type='LoadImageFromFile'),\n", + " dict(type='LoadTrackAnnotations', with_instance_id=True),\n", + " dict(type='mmdet.Resize', scale=(1088, 1088), keep_ratio=True),\n", + " dict(type='PackTrackInputs', pack_single_img=True)\n", + " ]))\n", + "val_evaluator = None\n", + "test_evaluator = dict(\n", + " type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity'])\n", + "default_scope = 'mmtrack'\n", + "default_hooks = dict(\n", + " timer=dict(type='IterTimerHook'),\n", + " logger=dict(type='LoggerHook', interval=50),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " checkpoint=dict(type='CheckpointHook', interval=1),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " visualization=dict(type='TrackVisualizationHook', draw=False))\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),\n", + " dist_cfg=dict(backend='nccl'))\n", + "vis_backends = [dict(type='LocalVisBackend')]\n", + "visualizer = dict(\n", + " type='TrackLocalVisualizer',\n", + " vis_backends=[dict(type='LocalVisBackend')],\n", + " name='deepsort_visualizer')\n", + "log_level = 'INFO'\n", + "load_from = None\n", + "resume = False\n", + "train_cfg = None\n", + "val_cfg = 
None\n", + "test_cfg = dict(type='TestLoop')\n", + "work_dir = './tutorial_exps'\n", + "randomness = dict(seed=0, deterministic=False)\n", + "gpu_ids = range(0, 1)\n", + "\n", + "Result has been saved to /content/mmtracking/tutorial_exps/modules_statistic_results.json\n", + "09/06 09:37:06 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "09/06 09:37:06 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: ./tutorial_exps/detector/epoch_4.pth\n", + "09/06 09:37:06 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - local loads checkpoint from path: ./tutorial_exps/detector/epoch_4.pth\n", + "09/06 09:37:06 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std\n", + "\n", + "09/06 09:37:06 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - load model from: ./tutorial_exps/reid/epoch_2.pth\n", + "09/06 09:37:06 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - local loads checkpoint from path: ./tutorial_exps/reid/epoch_2.pth\n", + "09/06 09:37:06 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - The model and loaded state dict do not match exactly\n", + "\n", + "unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std\n", + "\n", "loading annotations into memory...\n", - "Done (t=0.12s)\n", + "Done (t=10.59s)\n", "creating index...\n", - "index created!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-04-20 05:29:38,354 - mmtrack - INFO - initialize FasterRCNN with init_cfg {'type': 'Pretrained', 'checkpoint': './tutorial_exps/detector/epoch_4.pth'}\n", - "2022-04-20 05:29:38,356 - mmcv - INFO - load model from: ./tutorial_exps/detector/epoch_4.pth\n", - "2022-04-20 05:29:38,359 - mmcv - INFO - load checkpoint from local path: ./tutorial_exps/detector/epoch_4.pth\n", - "2022-04-20 05:29:38,667 - mmtrack - INFO - initialize BaseReID with init_cfg {'type': 'Pretrained', 'checkpoint': './tutorial_exps/reid/epoch_2.pth'}\n", - "2022-04-20 05:29:38,669 - mmcv - INFO - load model from: ./tutorial_exps/reid/epoch_2.pth\n", - "2022-04-20 05:29:38,672 - mmcv - INFO - load checkpoint from local path: ./tutorial_exps/reid/epoch_2.pth\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: The model doesn't have classes\n", - "[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 823/823, 5.1 task/s, elapsed: 162s, ETA: 0sEvaluate CLEAR MOT results.\n", + "index created!\n", + "09/06 09:37:28 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [50/823] eta: 0:02:33 time: 0.1980 data_time: 0.0045 memory: 3519 \n", + "09/06 09:37:38 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [100/823] eta: 0:02:15 time: 0.1873 data_time: 0.0042 memory: 2148 \n", + "09/06 09:37:49 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [150/823] eta: 0:02:50 time: 0.2529 data_time: 0.0073 memory: 2148 \n", + "09/06 09:38:03 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [200/823] eta: 0:02:47 time: 0.2690 data_time: 0.0107 memory: 2190 \n", + "09/06 09:38:16 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [250/823] eta: 0:02:24 time: 0.2518 data_time: 0.0070 memory: 2190 \n", + "09/06 09:38:28 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [300/823] eta: 0:02:05 time: 
0.2391 data_time: 0.0063 memory: 2236 \n", + "09/06 09:38:40 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [350/823] eta: 0:01:42 time: 0.2171 data_time: 0.0048 memory: 2237 \n", + "09/06 09:38:51 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [400/823] eta: 0:01:35 time: 0.2247 data_time: 0.0056 memory: 2224 \n", + "09/06 09:39:03 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [450/823] eta: 0:01:27 time: 0.2335 data_time: 0.0058 memory: 2218 \n", + "09/06 09:39:14 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [500/823] eta: 0:01:10 time: 0.2188 data_time: 0.0106 memory: 2212 \n", + "09/06 09:39:25 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [550/823] eta: 0:01:01 time: 0.2267 data_time: 0.0098 memory: 2206 \n", + "09/06 09:39:36 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [600/823] eta: 0:00:49 time: 0.2198 data_time: 0.0072 memory: 2200 \n", + "09/06 09:39:48 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [650/823] eta: 0:00:41 time: 0.2403 data_time: 0.0043 memory: 2264 \n", + "09/06 09:40:00 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [700/823] eta: 0:00:29 time: 0.2418 data_time: 0.0047 memory: 2372 \n", + "09/06 09:40:12 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [750/823] eta: 0:00:17 time: 0.2427 data_time: 0.0041 memory: 2246 \n", + "09/06 09:40:26 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [800/823] eta: 0:00:05 time: 0.2434 data_time: 0.0041 memory: 2317 \n", "\n", "Eval Config:\n", "USE_PARALLEL : False \n", @@ -2267,10 +3283,10 @@ "PLOT_CURVES : True \n", "\n", "MotChallenge2DBox Config:\n", - "GT_FOLDER : data/MOT17_tiny/train \n", - "TRACKERS_FOLDER : /tmp/tmpbbh8obye \n", + "GT_FOLDER : /tmp/tmpw97bk3y0/gt \n", + "TRACKERS_FOLDER : /tmp/tmpw97bk3y0 \n", "OUTPUT_FOLDER : None \n", - "TRACKERS_TO_EVAL : ['track'] \n", + "TRACKERS_TO_EVAL : ['default-tracker'] \n", "CLASSES_TO_EVAL : ['pedestrian'] \n", "BENCHMARK : MOT17 \n", "SPLIT_TO_EVAL : train \n", @@ -2281,37 +3297,107 @@ "OUTPUT_SUB_FOLDER : \n", "TRACKER_DISPLAY_NAMES : None \n", "SEQMAP_FOLDER : None \n", - "SEQMAP_FILE : /tmp/tmpbbh8obye/videoseq.txt \n", - "SEQ_INFO : None \n", - "GT_LOC_FORMAT : {gt_folder}/{seq}/gt/gt_half-val.txt\n", + "SEQMAP_FILE : /tmp/tmpw97bk3y0/default-tracker/videoseq.txt\n", + "SEQ_INFO : {'MOT17-02-FRCNN': 299, 'MOT17-04-FRCNN': 524}\n", + "GT_LOC_FORMAT : {gt_folder}/{seq}.txt \n", "SKIP_SPLIT_FOL : True \n", "\n", - "Evaluating 1 tracker(s) on 2 sequence(s) for 1 class(es) on MotChallenge2DBox dataset using the following metrics: HOTA, Count\n", + "CLEAR Config:\n", + "METRICS : ['CLEAR'] \n", + "THRESHOLD : 0.5 \n", + "PRINT_CONFIG : True \n", + "\n", + "Identity Config:\n", + "METRICS : ['Identity'] \n", + "THRESHOLD : 0.5 \n", + "PRINT_CONFIG : True \n", + "\n", + "Evaluating 1 tracker(s) on 2 sequence(s) for 1 class(es) on MotChallenge2DBox dataset using the following metrics: HOTA, CLEAR, Identity, Count\n", "\n", "\n", - "Evaluating track\n", + "Evaluating default-tracker\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/trackeval/datasets/mot_challenge_2d_box.py:228: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " time_data = np.asarray(read_data[time_key], dtype=np.float)\n", + "/usr/local/lib/python3.7/dist-packages/trackeval/datasets/mot_challenge_2d_box.py:359: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " to_remove_tracker = np.array([], np.int)\n", + "/usr/local/lib/python3.7/dist-packages/trackeval/metrics/hota.py:31: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " res[field] = np.zeros((len(self.array_labels)), dtype=np.float)\n", + "/usr/local/lib/python3.7/dist-packages/trackeval/metrics/identity.py:83: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " res['IDFN'] = fn_mat[match_rows, match_cols].sum().astype(np.int)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 eval_sequence(MOT17-02-FRCNN, default-tracker) 0.4670 sec\n", + "2 eval_sequence(MOT17-04-FRCNN, default-tracker) 1.3160 sec\n", + "\n", + "All sequences for default-tracker finished in 1.78 seconds\n", "\n", - "1 eval_sequence(MOT17-02-FRCNN, track) 0.4733 sec\n", - "2 eval_sequence(MOT17-04-FRCNN, track) 1.1452 sec\n", + "HOTA: default-tracker-pedestrian HOTA DetA AssA DetRe DetPr AssRe AssPr LocA RHOTA HOTA(0) LocA(0) HOTALocA(0)\n", + "MOT17-02-FRCNN 26.07 39.469 17.888 46.443 63.395 20.298 59.499 79.439 28.497 34.31 68.423 23.476 \n", + "MOT17-04-FRCNN 54.514 66.438 45.771 70.914 82.049 51.423 62.818 84.856 56.637 64.719 81.721 52.889 \n", + "COMBINED 47.481 57.937 40.091 63.815 77.25 45.144 62.333 83.649 50.152 57.312 78.488 44.983 \n", "\n", - "All sequences for track finished in 1.62 seconds\n", + "CLEAR: default-tracker-pedestrian MOTA MOTP MODA CLR_Re CLR_Pr MTR PTR MLR sMOTA CLR_TP CLR_FN CLR_FP IDSW MT PT ML Frag \n", + "MOT17-02-FRCNN 28.279 77.975 37.287 55.273 75.449 20.755 60.377 18.868 16.105 5461 4419 1777 890 11 32 10 346 \n", + "MOT17-04-FRCNN 77.19 82.679 80.937 83.684 96.823 65.217 31.884 2.8986 62.695 20233 3945 664 906 45 22 2 285 \n", + "COMBINED 63.001 81.679 68.275 75.442 91.324 45.902 44.262 9.8361 49.18 25694 8364 2441 1796 56 54 12 631 \n", "\n", - "HOTA: track-pedestrian HOTA DetA AssA DetRe DetPr AssRe AssPr LocA RHOTA HOTA(0) LocA(0) HOTALocA(0)\n", - "MOT17-02-FRCNN 26.163 39.63 18.033 46.541 63.74 20.987 55.171 79.643 28.582 33.942 69.284 23.516 \n", - "MOT17-04-FRCNN 50.13 66.595 38.789 71.095 82.128 44.817 58.181 84.906 52.104 59.437 81.81 48.625 \n", - "COMBINED 44.066 58.107 34.579 63.972 77.414 40.031 57.716 83.727 46.545 53.072 78.788 41.815 \n", + "Identity: default-tracker-pedestrianIDF1 IDR IDP IDTP IDFN IDFP \n", + "MOT17-02-FRCNN 29.186 25.283 34.512 2498 7382 4740 \n", + "MOT17-04-FRCNN 59.448 55.414 64.114 13398 10780 7499 \n", + "COMBINED 51.118 46.673 56.499 15896 18162 12239 \n", "\n", - "Count: track-pedestrian Dets GT_Dets IDs GT_IDs \n", - "MOT17-02-FRCNN 7214 9880 353 53 \n", - "MOT17-04-FRCNN 20930 24178 164 69 \n", - "COMBINED 28144 34058 517 122 \n", - " IDF1 IDP IDR Rcll Prcn GT MT PT ML FP FN IDs FM MOTA MOTP IDt IDa IDm HOTA\n", - "MOT17-02-FRCNN 29.6% 35.1% 25.6% 55.1% 75.5% 53 12 29 12 1766 4432 1021 343 26.9% 0.219 382 201 8 0.261632\n", - "MOT17-04-FRCNN 52.8% 56.9% 49.2% 84.2% 97.2% 69 48 20 1 583 3831 1267 332 76.5% 0.172 414 74 7 0.501298\n", - "OVERALL 46.4% 51.3% 42.4% 75.7% 91.7% 122 60 49 13 2349 8263 2288 675 62.1% 0.182 796 275 15 0.440663\n", - "{'IDF1': 0.464, 'IDP': 0.513, 'IDR': 0.424, 'Rcll': 0.757, 'Prcn': 0.917, 'GT': 122, 'MT': 60, 'PT': 49, 'ML': 13, 'FP': 2349, 'FN': 8263, 'IDs': 2288, 'FM': 675, 'MOTA': 0.621, 'MOTP': 0.182, 'IDt': 796, 'IDa': 275, 'IDm': 15, 'HOTA': 0.441}\n" + "Count: default-tracker-pedestrian Dets GT_Dets IDs GT_IDs \n", + "MOT17-02-FRCNN 7238 9880 434 53 \n", + "MOT17-04-FRCNN 20897 24178 188 69 \n", + "COMBINED 28135 34058 622 122 \n", + "09/06 09:40:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Evaluating HOTA Metrics...\n", + "09/06 09:40:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Evaluating CLEAR 
Metrics...\n", + "09/06 09:40:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Evaluating Identity Metrics...\n", + "09/06 09:40:34 - mmengine - \u001b[4m\u001b[37mINFO\u001b[0m - Epoch(test) [823/823] motchallenge-metric/HOTA: 0.4748 motchallenge-metric/AssA: 0.4009 motchallenge-metric/DetA: 0.5794 motchallenge-metric/MOTA: 0.6300 motchallenge-metric/MOTP: 0.8168 motchallenge-metric/IDSW: 1796.0000 motchallenge-metric/TP: 25694.0000 motchallenge-metric/FP: 2441.0000 motchallenge-metric/FN: 8364.0000 motchallenge-metric/Frag: 631.0000 motchallenge-metric/MT: 56.0000 motchallenge-metric/ML: 12.0000 motchallenge-metric/IDF1: 0.5112 motchallenge-metric/IDTP: 15896.0000 motchallenge-metric/IDFN: 18162.0000 motchallenge-metric/IDFP: 12239.0000 motchallenge-metric/IDP: 0.5650 motchallenge-metric/IDR: 0.4667\n" ] }, + { + "data": { + "text/plain": [ + "{'motchallenge-metric/HOTA': 0.47480527965525604,\n", + " 'motchallenge-metric/AssA': 0.40090624838232497,\n", + " 'motchallenge-metric/DetA': 0.5793687250260392,\n", + " 'motchallenge-metric/MOTA': 0.6300135063714839,\n", + " 'motchallenge-metric/MOTP': 0.8167920002689131,\n", + " 'motchallenge-metric/IDSW': 1796.0,\n", + " 'motchallenge-metric/TP': 25694.0,\n", + " 'motchallenge-metric/FP': 2441.0,\n", + " 'motchallenge-metric/FN': 8364.0,\n", + " 'motchallenge-metric/Frag': 631.0,\n", + " 'motchallenge-metric/MT': 56.0,\n", + " 'motchallenge-metric/ML': 12.0,\n", + " 'motchallenge-metric/IDF1': 0.5111829305548856,\n", + " 'motchallenge-metric/IDTP': 15896.0,\n", + " 'motchallenge-metric/IDFN': 18162.0,\n", + " 'motchallenge-metric/IDFP': 12239.0,\n", + " 'motchallenge-metric/IDP': 0.5649902256975298,\n", + " 'motchallenge-metric/IDR': 0.46673321980151505}" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, { "data": { "text/plain": [ @@ -2323,37 +3409,17 @@ } ], "source": [ - "from mmtrack.datasets import build_dataloader\n", - "from mmtrack.apis import init_model\n", - "from mmcv.parallel import MMDataParallel\n", - "from mmtrack.apis import single_gpu_test\n", - "from mmtrack.datasets import build_dataset\n", + "from mmengine.model import is_model_wrapper\n", + "from mmengine.runner import Runner\n", "\n", - "dataset = build_dataset(cfg.data.test)\n", - "data_loader = build_dataloader(\n", - " dataset,\n", - " samples_per_gpu=1,\n", - " workers_per_gpu=cfg.data.workers_per_gpu,\n", - " dist=False,\n", - " shuffle=False)\n", + "runner = Runner.from_cfg(cfg)\n", "\n", - "# build the model and load checkpoint\n", - "model = init_model(cfg)\n", + "if is_model_wrapper(runner.model):\n", + " runner.model.module.init_weights()\n", + "else:\n", + " runner.model.init_weights()\n", "\n", - "model = MMDataParallel(model, device_ids=cfg.gpu_ids)\n", - "outputs = single_gpu_test(model, data_loader)\n", - "\n", - "eval_kwargs = cfg.get('evaluation', {}).copy()\n", - "# hard-code way to remove EvalHook args\n", - "eval_hook_args = [\n", - " 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',\n", - " 'rule', 'by_epoch'\n", - "]\n", - "for key in eval_hook_args:\n", - " eval_kwargs.pop(key, None)\n", - "eval_kwargs.update(dict(metric=['track']))\n", - "metric = dataset.evaluate(outputs, **eval_kwargs)\n", - "print(metric)" + "runner.test()" ] } ], @@ -2361,7 +3427,6 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "Copy of Copy of MMTracking_Tutorial.ipynb", "provenance": [] }, "kernelspec": { @@ -2383,7 +3448,7 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - 
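For readers who want to rerun this final evaluation outside the notebook, the sketch below is a minimal standalone version of the same flow shown in the last two cells (load the config, apply the tutorial's overrides, initialize weights from `init_cfg`, then call `Runner.test()`). It only reuses names that already appear above — the DeepSORT config path, the fine-tuned checkpoints under `./tutorial_exps/`, and the `MOT17_tiny` data root — and the use of `Config.merge_from_dict` with dotted keys is a convenience assumption on my part; the notebook itself sets the attributes directly.

```python
# Hedged sketch, not the notebook's own cell: a script-style rerun of the test
# step above. It assumes the files produced earlier in the tutorial exist
# (./tutorial_exps/detector/epoch_4.pth, ./tutorial_exps/reid/epoch_2.pth,
# and data/MOT17_tiny/); adjust paths if your layout differs.
from mmengine.config import Config
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner

cfg = Config.fromfile(
    './configs/mot/deepsort/'
    'deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')

# The same overrides the notebook applies attribute-by-attribute, expressed as
# one dotted-key dict (the format also accepted by --cfg-options).
cfg.merge_from_dict({
    'test_dataloader.dataset.data_root': 'data/MOT17_tiny/',
    'test_dataloader.dataset.test_mode': True,
    'model.detector.init_cfg.checkpoint': './tutorial_exps/detector/epoch_4.pth',
    'model.reid.init_cfg.checkpoint': './tutorial_exps/reid/epoch_2.pth',
    'work_dir': './tutorial_exps',
    'randomness': dict(seed=0, deterministic=False),
})
# Testing only: disable the train/val parts of the pipeline.
cfg.train_dataloader = cfg.train_cfg = None
cfg.val_dataloader = cfg.val_cfg = cfg.val_evaluator = None

runner = Runner.from_cfg(cfg)

# As in the cell above, init_weights() is called explicitly because the
# detector and ReID weights are injected through init_cfg (type='Pretrained')
# rather than through load_from.
model = runner.model.module if is_model_wrapper(runner.model) else runner.model
model.init_weights()

# Runner.test() returns the evaluator's metrics dict, e.g. keys such as
# 'motchallenge-metric/HOTA' as shown in the output above.
metrics = runner.test()
print(metrics.get('motchallenge-metric/HOTA'))
```

The only behavioral difference from the notebook cells is cosmetic: `cfg.visualizer.name` is left at its default here, whereas the notebook renames it to avoid clashing with a visualizer instance created earlier in the same session.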
"01fa8d3fb39b483195f7cfc293c6b3ba": { + "0b4dd8ace9b5467e9e7c0214df6df1a9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -2435,29 +3500,7 @@ "width": null } }, - "091f72866bc64f8bba239684fe9ddeb6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c20f8918d08d4e07a544ac69541038fd", - "IPY_MODEL_be103dd0cc5543be9dea3dd8c7890187", - "IPY_MODEL_5be59a3bfc0148478a662f436e1677a8" - ], - "layout": "IPY_MODEL_4c616e4740f24416bbea27ba49709d50" - } - }, - "0b656d0fd04b4c05825748fdf9a34d48": { + "1a722030c2344bf6a5ba1941c9e056de": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", @@ -2472,129 +3515,53 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_68c48555d8dd49f49e26956904746e60", + "layout": "IPY_MODEL_c1a6f0f21a864752bb77a435e5633c5f", "placeholder": "​", - "style": "IPY_MODEL_758197b8b211425f864204242e518fed", - "value": "100%" + "style": "IPY_MODEL_f4510cc71cf54df785fbe760b7dfc32c", + "value": " 160M/160M [00:17<00:00, 9.42MB/s]" } }, - "0ce49a6424414d268a0224b7e57739b6": { + "20b5b5e82f5548c98037ee236732ccfe": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HBoxModel", + "model_name": "ProgressStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0b656d0fd04b4c05825748fdf9a34d48", - "IPY_MODEL_aaa43b3573904c72af7fb76fb981bf36", - "IPY_MODEL_c2d658f5adc3416e9be42fa615e2061d" - ], - "layout": "IPY_MODEL_cbddeebf0f8f464ca15e5899a8cee2a7" - } - }, - "12bedcdf97a94489a2356e89d393d107": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", + "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"15eafa933541440d822371cc6afce196": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3187d7f735ab45cfad16f5bb3471dc0b", - "placeholder": "​", - "style": "IPY_MODEL_24da18975ff0468a9bc0c294c45b8a16", - "value": " 170M/170M [00:18<00:00, 8.73MB/s]" + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" } }, - "1f5d5858bede4242b51d1cb635c3f610": { + "28ef474b2c7c466baa81c1196f8e7505": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", + "_view_name": "ProgressView", + "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_4c38742132624fa38216e67e3771894f", - "placeholder": "​", - "style": "IPY_MODEL_6b83dd17433745a3ae3d2ef9b4b932ad", - "value": "100%" + "layout": "IPY_MODEL_504570ec6de24f40b824fb8eacc65447", + "max": 102491894, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b9470c130a69411b8be481f4aa045d26", + "value": 102491894 } }, - "1fc5eb2eaa6a4ed88a005a081bc24a70": { + "2db8224b7bb849c0a3c8a281bb57eb22": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", @@ -2609,133 +3576,35 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_1f5d5858bede4242b51d1cb635c3f610", - "IPY_MODEL_9344070d5d004c0fb5c0bdf22632e343", - "IPY_MODEL_c4f0834d72224880b5a12217d95cb086" + "IPY_MODEL_b811cd2dd72040fe86cdc5238b522d06", + "IPY_MODEL_378b791183a24c558b2ce4afc1fe8a57", + "IPY_MODEL_9f46a04413cf4cce9f56a710aebe1424" ], - "layout": "IPY_MODEL_92345bd0ebed40f4bf0a9ac4aad2bc5b" + "layout": "IPY_MODEL_fb5e2db40e004e21a852c9f28740615c" } }, - "24da18975ff0468a9bc0c294c45b8a16": { + "33267c31afd64c58b153a357aaa4569b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", + "model_name": "HTMLModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2bb53ac1784d421ca2050b5aa6701a0d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": 
null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3187d7f735ab45cfad16f5bb3471dc0b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_936d523d65254c10a9c64033283ca0f9", + "placeholder": "​", + "style": "IPY_MODEL_4ea9a77bcea840e686d3525d4ef86cc8", + "value": " 170M/170M [00:18<00:00, 10.1MB/s]" } }, - "321a9bbadb4b4167ac37f84320bdf76f": { + "34594c7fd26c4f61af5e4ec5b34b6b02": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", @@ -2750,170 +3619,54 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_f69886a5f4984ee09804aaa411660955", - "IPY_MODEL_fc977a68f1494ad68c203f5544ce1521", - "IPY_MODEL_995bca8653914372874f8861a38f0185" + "IPY_MODEL_8727613cf45f4cd4a79557afd1973632", + "IPY_MODEL_28ef474b2c7c466baa81c1196f8e7505", + "IPY_MODEL_7b268f5a3b334d2c89ae8d43fda6eb53" ], - "layout": "IPY_MODEL_f636c4fc5cfe4489a47a14afc88a0b9c" + "layout": "IPY_MODEL_9d467daaa30e44288dd49152948db460" } }, - "3590a55482b14c379f1a63433c140cfb": { + "34b3be1af1704bd4a1a25992ff3a2c33": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "36dd9bf50205454493fc50a1cef64d23": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3d277b18e9d2486492614f9093bf19a4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3ed731e3194e4465aa7b79579f8eaaca": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", + "_view_name": "ProgressView", + "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_750088e53a064a13872ed5fa5616face", - "placeholder": "​", - "style": "IPY_MODEL_eb8e5c96ffea40e298a38db2b5a38ace", - "value": " 160M/160M [00:12<00:00, 13.2MB/s]" + "layout": "IPY_MODEL_9c5e87a7ba0f4b8395a3fd0027508930", + "max": 167290877, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a83c0a2e5ee8432d9ecb05468194f3ed", + "value": 167290877 } - }, - "4c38742132624fa38216e67e3771894f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - 
"max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + }, + "357be7f73b2848f89efe394758edc839": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" } }, - "4c616e4740f24416bbea27ba49709d50": { + "36f603cf7dab445ab6120b805799c1d0": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -2965,7 +3718,31 @@ "width": null } }, - "4ea1a24a8ffc4269b46dc1dc7539a0f3": { + "378b791183a24c558b2ce4afc1fe8a57": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3942fdab3efa47e98ef1398b27eade57", + "max": 182353916, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_357be7f73b2848f89efe394758edc839", + "value": 182353916 + } + }, + "3942fdab3efa47e98ef1398b27eade57": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3017,7 +3794,7 @@ "width": null } }, - "535faa89033349c08b3bd31ff5be0018": { + "3a09540c2a4247debd7c35b384e7ebed": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3069,7 +3846,22 @@ "width": null } }, - "57bc05e1d2b64e899fd524c13cd47f5a": { + "3d25971bbb3444b797b0faac0a934ffb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "40b5616f3b3741d68210206a1d852418": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3121,59 +3913,50 @@ "width": null } }, - "591fb75b51d14989abfd7a5325afaff6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "5a4441a529664fdfb5dc4d8de70b3421": { + "4679ff15b0db45fb9931757e0328d296": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": 
"DescriptionStyleModel", + "model_name": "HTMLModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d7f321bb1fe4f32b3c1733a846ef8f5", + "placeholder": "​", + "style": "IPY_MODEL_dc309139c6764cefa17ef542ebb36d3e", + "value": "100%" } }, - "5be59a3bfc0148478a662f436e1677a8": { + "4757d999925e44748b3b4b8476ff7069": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_01fa8d3fb39b483195f7cfc293c6b3ba", - "placeholder": "​", - "style": "IPY_MODEL_ac746c3a37c145c0bc0d40fa7cabfd2d", - "value": " 97.8M/97.8M [00:00<00:00, 224MB/s]" + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_64133dab9fa94eb9ad7819da4740a90f", + "IPY_MODEL_34b3be1af1704bd4a1a25992ff3a2c33", + "IPY_MODEL_1a722030c2344bf6a5ba1941c9e056de" + ], + "layout": "IPY_MODEL_e0b78a16c9d8488eb6f2b78788bc1987" } }, - "5c2860d7559940a7be431ab1228bbe6f": { + "482683c5f93f41108ef0added3850519": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", @@ -3188,7 +3971,7 @@ "description_width": "" } }, - "5f08eb6751fc4438a5de45051218530f": { + "4ea9a77bcea840e686d3525d4ef86cc8": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", @@ -3203,59 +3986,28 @@ "description_width": "" } }, - "6159a01b9931442c8b8968464517f4d5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", + "4eb851325f4b4cab82b8cbd979237520": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": 
null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ec62bc9001d349e696b8d82942e5b8c4", + "placeholder": "​", + "style": "IPY_MODEL_b55031500bb345b58d0676c47e3c7843", + "value": "100%" } }, - "683972d6d4c4489dae2394c222f020d4": { + "4ebcc9b78ac64aedb146685a2437612c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3307,7 +4059,7 @@ "width": null } }, - "68c48555d8dd49f49e26956904746e60": { + "504570ec6de24f40b824fb8eacc65447": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3359,7 +4111,7 @@ "width": null } }, - "6ad0273eb02440109ba7c2cde43739f9": { + "5d7f321bb1fe4f32b3c1733a846ef8f5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3411,74 +4163,94 @@ "width": null } }, - "6b83dd17433745a3ae3d2ef9b4b932ad": { + "64133dab9fa94eb9ad7819da4740a90f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", + "model_name": "HTMLModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_36f603cf7dab445ab6120b805799c1d0", + "placeholder": "​", + "style": "IPY_MODEL_f572376e2dbb437c95e1f62820d6f0d3", + "value": "100%" } }, - "738b0850c1a4489a9147a63f3a587dd6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", + "64e629d4b1c74d2ab6c9ddc52ff4a2a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, 
- "right": null, - "top": null, - "visibility": null, - "width": null + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a09540c2a4247debd7c35b384e7ebed", + "max": 102530333, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_20b5b5e82f5548c98037ee236732ccfe", + "value": 102530333 + } + }, + "7b268f5a3b334d2c89ae8d43fda6eb53": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0b4dd8ace9b5467e9e7c0214df6df1a9", + "placeholder": "​", + "style": "IPY_MODEL_e495e81fecf844a4ae4a15ca9cb0329c", + "value": " 97.7M/97.7M [00:10<00:00, 10.1MB/s]" + } + }, + "8727613cf45f4cd4a79557afd1973632": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9793eaee82db45beaddf9e80cab6d0a1", + "placeholder": "​", + "style": "IPY_MODEL_e75e2e078b4246a9984fa0841de27a15", + "value": "100%" } }, - "750088e53a064a13872ed5fa5616face": { + "936d523d65254c10a9c64033283ca0f9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3530,22 +4302,7 @@ "width": null } }, - "758197b8b211425f864204242e518fed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7a4444bb015249ae9caa2b14c21fa0fd": { + "9793eaee82db45beaddf9e80cab6d0a1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3597,28 +4354,7 @@ "width": null } }, - "7aacc564ceed400582af233b42822f7d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_36dd9bf50205454493fc50a1cef64d23", - "placeholder": "​", - "style": "IPY_MODEL_8ea164a027b044278794e87a396a548a", - "value": "100%" - } - }, - "7d5fd07e913046a6b5445a791197a287": { + "98f236bb20444b919f136766394978ff": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ 
-3670,28 +4406,23 @@ "width": null } }, - "7da6f24364594f56a573d31665913645": { + "9c5adcbe51ef438db298bff6a09c2065": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "ProgressStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "ProgressStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_12bedcdf97a94489a2356e89d393d107", - "placeholder": "​", - "style": "IPY_MODEL_3d277b18e9d2486492614f9093bf19a4", - "value": " 158M/158M [00:18<00:00, 7.12MB/s]" + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" } }, - "7dce393a07d14ccbb1d43554976aeb1c": { + "9c5e87a7ba0f4b8395a3fd0027508930": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3743,84 +4474,7 @@ "width": null } }, - "7f20458c3bf7422fba34beba519c27bd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9e5d67b0f3fb4baaa7254a102464057b", - "IPY_MODEL_f750ac53345746ee8c0364b5f850a56d", - "IPY_MODEL_3ed731e3194e4465aa7b79579f8eaaca" - ], - "layout": "IPY_MODEL_7a4444bb015249ae9caa2b14c21fa0fd" - } - }, - "82f81af913424d3c96ba2cb8d905258a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "89b71747f37a4213be4750c562a7ccd9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bc127741c92f4b65ada91a4c4890ec87", - "max": 165672927, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_f140214cae9c45f39c0ca28d856fe596", - "value": 165672927 - } - }, - "8ea164a027b044278794e87a396a548a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - 
"description_width": "" - } - }, - "92345bd0ebed40f4bf0a9ac4aad2bc5b": { + "9d467daaa30e44288dd49152948db460": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -3872,23 +4526,7 @@ "width": null } }, - "9244d2f08f8c4e2a80e4f2f4921f1b0b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9344070d5d004c0fb5c0bdf22632e343": { + "9f2fb13e136642728fbc97c8d7390118": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", @@ -3904,107 +4542,58 @@ "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_738b0850c1a4489a9147a63f3a587dd6", - "max": 102491894, + "layout": "IPY_MODEL_dd2d50cb741b4a79a13f0bf4a2b2242b", + "max": 177862517, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_9604ece7c2bb4e8ba8914d4430a72876", - "value": 102491894 + "style": "IPY_MODEL_9c5adcbe51ef438db298bff6a09c2065", + "value": 177862517 } }, - "939e2242b3eb4388aa4e30ebccd020cc": { + "9f46a04413cf4cce9f56a710aebe1424": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", + "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", + "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", + "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_2bb53ac1784d421ca2050b5aa6701a0d", - "max": 177862517, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_82f81af913424d3c96ba2cb8d905258a", - "value": 177862517 - } - }, - "95eaa4b589df4f4da736e0c104759fd1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + "layout": "IPY_MODEL_b704c75bb577455dbac176c728b5540b", + "placeholder": "​", + 
"style": "IPY_MODEL_e34931d882f44879aee48bfd46d62dce", + "value": " 174M/174M [00:20<00:00, 9.14MB/s]" } }, - "9604ece7c2bb4e8ba8914d4430a72876": { + "a3fc7d320ed3417d833db4e9db0fce5c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", + "model_name": "HBoxModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "HBoxModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4eb851325f4b4cab82b8cbd979237520", + "IPY_MODEL_9f2fb13e136642728fbc97c8d7390118", + "IPY_MODEL_33267c31afd64c58b153a357aaa4569b" + ], + "layout": "IPY_MODEL_98f236bb20444b919f136766394978ff" } }, - "9718af278d2d45e99d1355cafc2c7518": { + "a5c4355973c342acbae02ff5c32a677f": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4052,47 +4641,11 @@ "padding": null, "right": null, "top": null, - "visibility": null, - "width": null - } - }, - "995bca8653914372874f8861a38f0185": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_535faa89033349c08b3bd31ff5be0018", - "placeholder": "​", - "style": "IPY_MODEL_5c2860d7559940a7be431ab1228bbe6f", - "value": " 174M/174M [00:20<00:00, 8.81MB/s]" - } - }, - "9a95f05e979e4a9bbad4675883350b1d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "visibility": null, + "width": null } }, - "9bdb4cdea53b429cad0e8c418fd7e715": { + "a83c0a2e5ee8432d9ecb05468194f3ed": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", @@ -4108,28 +4661,22 @@ "description_width": "" } }, - "9e5d67b0f3fb4baaa7254a102464057b": { + "b55031500bb345b58d0676c47e3c7843": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "DescriptionStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9718af278d2d45e99d1355cafc2c7518", - "placeholder": "​", - "style": "IPY_MODEL_cb11dac7ded84ad59f12a03b04660b3c", - "value": "100%" + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" } }, - "a1106afabced4d418406b7bbb4f826cc": { + "b704c75bb577455dbac176c728b5540b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4181,61 +4728,44 @@ "width": null } }, - "aaa43b3573904c72af7fb76fb981bf36": { + "b811cd2dd72040fe86cdc5238b522d06": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", + "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", + "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", + "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_6159a01b9931442c8b8968464517f4d5", - "max": 103221797, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_591fb75b51d14989abfd7a5325afaff6", - "value": 103221797 - } - }, - "ab7f46e98c794292893599541f343047": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "layout": "IPY_MODEL_40b5616f3b3741d68210206a1d852418", + "placeholder": "​", + "style": "IPY_MODEL_482683c5f93f41108ef0added3850519", + "value": "100%" } }, - "ac746c3a37c145c0bc0d40fa7cabfd2d": { + "b9470c130a69411b8be481f4aa045d26": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", + "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", + "bar_color": null, "description_width": "" } }, - "bc127741c92f4b65ada91a4c4890ec87": { + "c1a6f0f21a864752bb77a435e5633c5f": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4287,52 +4817,7 @@ "width": null } }, - "be103dd0cc5543be9dea3dd8c7890187": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_57bc05e1d2b64e899fd524c13cd47f5a", - "max": 102530333, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9244d2f08f8c4e2a80e4f2f4921f1b0b", - "value": 102530333 - } - }, - "c20f8918d08d4e07a544ac69541038fd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6ad0273eb02440109ba7c2cde43739f9", - "placeholder": "​", - "style": "IPY_MODEL_5f08eb6751fc4438a5de45051218530f", - "value": "100%" - } - }, - "c2cdc25a1b9644b0ac9a45a5e5609314": { + "d8911837396446c19d76bc22c5be63e2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", @@ -4347,56 +4832,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_7aacc564ceed400582af233b42822f7d", - "IPY_MODEL_89b71747f37a4213be4750c562a7ccd9", - "IPY_MODEL_7da6f24364594f56a573d31665913645" + "IPY_MODEL_4679ff15b0db45fb9931757e0328d296", + "IPY_MODEL_64e629d4b1c74d2ab6c9ddc52ff4a2a9", + "IPY_MODEL_e00476c38c4d473191eeed5d6c42ecce" ], - "layout": "IPY_MODEL_95eaa4b589df4f4da736e0c104759fd1" - } - }, - "c2d658f5adc3416e9be42fa615e2061d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_683972d6d4c4489dae2394c222f020d4", - "placeholder": "​", - "style": "IPY_MODEL_ab7f46e98c794292893599541f343047", - "value": " 98.4M/98.4M [00:08<00:00, 12.0MB/s]" - } - }, - "c4f0834d72224880b5a12217d95cb086": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7d5fd07e913046a6b5445a791197a287", - "placeholder": "​", - "style": "IPY_MODEL_5a4441a529664fdfb5dc4d8de70b3421", - "value": " 97.7M/97.7M [00:08<00:00, 11.6MB/s]" + "layout": "IPY_MODEL_a5c4355973c342acbae02ff5c32a677f" } }, - "cb11dac7ded84ad59f12a03b04660b3c": { + "dc309139c6764cefa17ef542ebb36d3e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", @@ -4411,7 +4854,7 @@ "description_width": "" } }, - "cbddeebf0f8f464ca15e5899a8cee2a7": { + "dd2d50cb741b4a79a13f0bf4a2b2242b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4463,44 +4906,28 @@ "width": null } }, - "d189fcaddeff41fcb9cd7b27969d53de": { + "e00476c38c4d473191eeed5d6c42ecce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HBoxModel", + "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", + "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - 
"IPY_MODEL_f507f59b242a4a5ebb204c4bb84a8241", - "IPY_MODEL_939e2242b3eb4388aa4e30ebccd020cc", - "IPY_MODEL_15eafa933541440d822371cc6afce196" - ], - "layout": "IPY_MODEL_4ea1a24a8ffc4269b46dc1dc7539a0f3" - } - }, - "ddfcad5fc8854b5cbe0fe8840013ffe5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ebcc9b78ac64aedb146685a2437612c", + "placeholder": "​", + "style": "IPY_MODEL_3d25971bbb3444b797b0faac0a934ffb", + "value": " 97.8M/97.8M [00:00<00:00, 165MB/s]" } }, - "e8b957c1eadc45ba9f7092df0d080f08": { + "e0b78a16c9d8488eb6f2b78788bc1987": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4552,7 +4979,7 @@ "width": null } }, - "eb8e5c96ffea40e298a38db2b5a38ace": { + "e34931d882f44879aee48bfd46d62dce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", @@ -4567,23 +4994,37 @@ "description_width": "" } }, - "f140214cae9c45f39c0ca28d856fe596": { + "e495e81fecf844a4ae4a15ca9cb0329c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", + "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e75e2e078b4246a9984fa0841de27a15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", - "bar_color": null, "description_width": "" } }, - "f2613e8b5e1e453db11090a1aa888799": { + "ec62bc9001d349e696b8d82942e5b8c4": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4635,28 +5076,37 @@ "width": null } }, - "f507f59b242a4a5ebb204c4bb84a8241": { + "f4510cc71cf54df785fbe760b7dfc32c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "HTMLModel", + "model_name": "DescriptionStyleModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a1106afabced4d418406b7bbb4f826cc", - "placeholder": "​", - "style": "IPY_MODEL_9a95f05e979e4a9bbad4675883350b1d", - "value": "100%" + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "f572376e2dbb437c95e1f62820d6f0d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" } }, - "f636c4fc5cfe4489a47a14afc88a0b9c": { + "fb5e2db40e004e21a852c9f28740615c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", @@ -4707,75 +5157,6 @@ "visibility": null, "width": null } - }, - "f69886a5f4984ee09804aaa411660955": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7dce393a07d14ccbb1d43554976aeb1c", - "placeholder": "​", - "style": "IPY_MODEL_ddfcad5fc8854b5cbe0fe8840013ffe5", - "value": "100%" - } - }, - "f750ac53345746ee8c0364b5f850a56d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f2613e8b5e1e453db11090a1aa888799", - "max": 167290877, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9bdb4cdea53b429cad0e8c418fd7e715", - "value": 167290877 - } - }, - "fc977a68f1494ad68c203f5544ce1521": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e8b957c1eadc45ba9f7092df0d080f08", - "max": 182353916, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3590a55482b14c379f1a63433c140cfb", - "value": 182353916 - } } } } diff --git a/demo/README.md b/demo/README.md deleted file mode 100644 index b13df396e..000000000 --- a/demo/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Demo - -Please refer to the inference of [VID](../docs/en/quick_run.md#inference-vid-models), [MOT](../docs/en/quick_run.md#inference-motvis-models), [SOT](../docs/en/quick_run.md#inference-sot-models) and [VIS](../docs/en/quick_run.md#inference-motvis-models) to run demos. diff --git a/demo/demo_mot_pose_vis.py b/demo/demo_mot_pose_vis.py new file mode 100644 index 000000000..9024bd98c --- /dev/null +++ b/demo/demo_mot_pose_vis.py @@ -0,0 +1,361 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import tempfile +from argparse import ArgumentParser + +import mmcv +import mmengine + +from mmtrack.apis import inference_mot, init_model +from mmpose.structures import merge_data_samples +from mmtrack.registry import VISUALIZERS +from mmtrack.utils import register_all_modules +from mmpose.utils import register_all_modules as register_all_modules_pose + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('config', help='config file') + parser.add_argument('--input', help='input video file or folder') + parser.add_argument( + '--output', help='output video file (mp4 format) or folder') + parser.add_argument('--checkpoint', help='checkpoint file') + parser.add_argument( + '--score-thr', + type=float, + default=0.0, + help='The threshold of score to filter bboxes.') + parser.add_argument( + '--device', default='cuda:0', help='device used for inference') + parser.add_argument( + '--show', + action='store_true', + help='whether show the results on the fly') + parser.add_argument('--fps', help='FPS of the output video') + args = parser.parse_args() + return args + + +def draw_image(pose, img): + print('number of keypoints:', len(pose)) + import cv2 + + for k in range(len(pose)): + landmarks = pose[k].pred_instances.keypoints.reshape(-1, 2) + + for i in range(landmarks.shape[0]): + center_coordinates = (int(landmarks[i][0]), int(landmarks[i][1])) + radius = 3 + color = (100, 255, 100) + thickness = 1 + img = cv2.circle(img, center_coordinates, radius, color, thickness) + + cv2.imwrite('image2.jpg', img) + + +def main(args): + assert args.output or args.show + # load images + if osp.isdir(args.input): + imgs = sorted( + filter(lambda x: x.endswith(('.jpg', '.png', '.jpeg')), + os.listdir(args.input)), + key=lambda x: int(x.split('.')[0])) + IN_VIDEO = False + else: + imgs = mmcv.VideoReader(args.input) + IN_VIDEO = True + + # define output + OUT_VIDEO = False + if args.output is not None: + if args.output.endswith('.mp4'): + OUT_VIDEO = True + out_dir = tempfile.TemporaryDirectory() + out_path = out_dir.name + _out = args.output.rsplit(os.sep, 1) + if len(_out) > 1: + os.makedirs(_out[0], exist_ok=True) + else: + out_path = args.output + os.makedirs(out_path, exist_ok=True) + + fps = args.fps + if args.show or OUT_VIDEO: + if fps is None and IN_VIDEO: + fps = imgs.fps + if not fps: + raise ValueError('Please set the FPS for the output video.') + fps = int(fps) + + register_all_modules(init_default_scope=True) + register_all_modules_pose(init_default_scope=False) + + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint, device=args.device) + + # build the visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = get_dataset_meta() + + # build the pose visualizer + from mmpose.visualization import PoseLocalVisualizer + pose_visualizer = VISUALIZERS.build( + dict( + type='PoseLocalVisualizer', + name='visualizer', + radius=3, + line_width=1)) + + prog_bar = mmengine.ProgressBar(len(imgs)) + # test and show/save the images + for i, img in enumerate(imgs): + if isinstance(img, str): + img_path = osp.join(args.input, img) + img = mmcv.imread(img_path) + + print() + print('origin image:', img.shape) + result = inference_mot(model, img, frame_id=i) + + if args.output is not None: + if IN_VIDEO or OUT_VIDEO: + out_file = osp.join(out_path, f'{i:06d}.jpg') + else: + out_file = osp.join(out_path, img.rsplit(os.sep, 1)[-1]) + else: + out_file = None + + pose_result 
= result.pred_track_instances.pose + # draw keypoints + draw_image(pose_result, img.copy()) + data_samples = merge_data_samples(pose_result) + + # show the results + visualizer.add_datasample( + 'mot', + img[..., ::-1], + data_sample=result, + show=args.show, + draw_gt=False, + out_file=out_file, + wait_time=float(1 / int(fps)) if fps else 0, + pred_score_thr=args.score_thr, + step=i) + + prog_bar.update() + + if args.output and OUT_VIDEO: + print(f'making the output video at {args.output} with a FPS of {fps}') + mmcv.frames2video(out_path, args.output, fps=fps, fourcc='mp4v') + out_dir.cleanup() + + +def get_dataset_meta(): + dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict( + name='nose', id=0, color=[51, 153, 255], type='upper', + swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict( + link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict( + link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, + 255]), + 6: + dict( + link=('right_shoulder', 'right_hip'), + id=6, + color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict( + link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + 
link=('right_shoulder', 'right_elbow'), + id=9, + color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict( + link=('right_elbow', 'right_wrist'), + id=11, + color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict( + link=('left_ear', 'left_shoulder'), + id=17, + color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), + id=18, + color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, + 1.5, 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, + 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) + return dataset_info + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/demo/demo_mot_vis.py b/demo/demo_mot_vis.py index 31c8fdb6b..191808862 100644 --- a/demo/demo_mot_vis.py +++ b/demo/demo_mot_vis.py @@ -5,11 +5,15 @@ from argparse import ArgumentParser import mmcv +import mmengine from mmtrack.apis import inference_mot, init_model +from mmtrack.registry import VISUALIZERS +from mmtrack.utils import register_all_modules +from mmpose.utils import register_all_modules as register_all_modules_pose -def main(): +def parse_args(): parser = ArgumentParser() parser.add_argument('config', help='config file') parser.add_argument('--input', help='input video file or folder') @@ -27,13 +31,12 @@ def main(): '--show', action='store_true', help='whether show the results on the fly') - parser.add_argument( - '--backend', - choices=['cv2', 'plt'], - default='cv2', - help='the backend to visualize the results') parser.add_argument('--fps', help='FPS of the output video') args = parser.parse_args() + return args + + +def main(args): assert args.output or args.show # load images if osp.isdir(args.input): @@ -45,7 +48,9 @@ def main(): else: imgs = mmcv.VideoReader(args.input) IN_VIDEO = True + # define output + OUT_VIDEO = False if args.output is not None: if args.output.endswith('.mp4'): OUT_VIDEO = True @@ -55,7 +60,6 @@ def main(): if len(_out) > 1: os.makedirs(_out[0], exist_ok=True) else: - OUT_VIDEO = False out_path = args.output os.makedirs(out_path, exist_ok=True) @@ -67,15 +71,24 @@ def main(): raise ValueError('Please set the FPS for the output video.') fps = int(fps) + register_all_modules(init_default_scope=True) + register_all_modules_pose(init_default_scope=False) + # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) - prog_bar = mmcv.ProgressBar(len(imgs)) + # build the visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + prog_bar = mmengine.ProgressBar(len(imgs)) # test and show/save the images for i, img in enumerate(imgs): if isinstance(img, str): - img = osp.join(args.input, img) + img_path = osp.join(args.input, img) + img = mmcv.imread(img_path) result = inference_mot(model, img, frame_id=i) + if args.output is not None: if IN_VIDEO or OUT_VIDEO: out_file = osp.join(out_path, f'{i:06d}.jpg') @@ -83,14 +96,19 @@ def main(): out_file = osp.join(out_path, img.rsplit(os.sep, 1)[-1]) else: out_file = None - 
model.show_result( - img, - result, - score_thr=args.score_thr, + + # show the results + visualizer.add_datasample( + 'mot', + img[..., ::-1], + data_sample=result, show=args.show, - wait_time=int(1000. / fps) if fps else 0, + draw_gt=False, out_file=out_file, - backend=args.backend) + wait_time=float(1 / int(fps)) if fps else 0, + pred_score_thr=args.score_thr, + step=i) + prog_bar.update() if args.output and OUT_VIDEO: @@ -100,4 +118,5 @@ def main(): if __name__ == '__main__': - main() + args = parse_args() + main(args) diff --git a/demo/demo_sot.py b/demo/demo_sot.py index ebe001837..4f8360631 100644 --- a/demo/demo_sot.py +++ b/demo/demo_sot.py @@ -6,11 +6,14 @@ import cv2 import mmcv +import mmengine from mmtrack.apis import inference_sot, init_model +from mmtrack.registry import VISUALIZERS +from mmtrack.utils import register_all_modules -def main(): +def parse_args(): parser = ArgumentParser() parser.add_argument('config', help='Config file') parser.add_argument('--input', help='input video file') @@ -23,14 +26,13 @@ def main(): action='store_true', default=False, help='whether to show visualizations.') - parser.add_argument( - '--color', default=(0, 255, 0), help='Color of tracked bbox lines.') - parser.add_argument( - '--thickness', default=3, type=int, help='Thickness of bbox lines.') parser.add_argument('--fps', help='FPS of the output video') parser.add_argument('--gt_bbox_file', help='The path of gt_bbox file') args = parser.parse_args() + return args + +def main(args): # load images if osp.isdir(args.input): imgs = sorted( @@ -42,8 +44,8 @@ def main(): imgs = mmcv.VideoReader(args.input) IN_VIDEO = True - OUT_VIDEO = False # define output + OUT_VIDEO = False if args.output is not None: if args.output.endswith('.mp4'): OUT_VIDEO = True @@ -55,18 +57,24 @@ def main(): else: out_path = args.output os.makedirs(out_path, exist_ok=True) - fps = int(args.fps) - if args.show or OUT_VIDEO: + fps = args.fps + if OUT_VIDEO: if fps is None and IN_VIDEO: fps = imgs.fps if not fps: raise ValueError('Please set the FPS for the output video.') fps = int(fps) + register_all_modules(init_default_scope=True) + # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) - prog_bar = mmcv.ProgressBar(len(imgs)) + # build the visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + prog_bar = mmengine.ProgressBar(len(imgs)) # test and show/save the images for i, img in enumerate(imgs): if isinstance(img, str): @@ -74,10 +82,11 @@ def main(): img = mmcv.imread(img_path) if i == 0: if args.gt_bbox_file is not None: - bboxes = mmcv.list_from_file(args.gt_bbox_file) + bboxes = mmengine.list_from_file(args.gt_bbox_file) init_bbox = list(map(float, bboxes[0].split(','))) else: init_bbox = list(cv2.selectROI(args.input, img, False, False)) + cv2.destroyAllWindows() # convert (x1, y1, w, h) to (x1, y1, x2, y2) init_bbox[2] += init_bbox[0] @@ -91,13 +100,19 @@ def main(): out_file = osp.join(out_path, img_path.rsplit(os.sep, 1)[-1]) else: out_file = None - model.show_result( - img, - result, + + # show the results + visualizer.add_datasample( + 'sot', + img[..., ::-1], + data_sample=result, show=args.show, - wait_time=int(1000. 
/ fps) if fps else 0, + draw_gt=False, out_file=out_file, - thickness=args.thickness) + wait_time=float(1 / int(fps)) if fps else 0, + pred_score_thr=-100, + step=i) + prog_bar.update() if args.output and OUT_VIDEO: @@ -108,4 +123,5 @@ def main(): if __name__ == '__main__': - main() + args = parse_args() + main(args) diff --git a/demo/demo_vid.py b/demo/demo_vid.py index f768e615b..c85502d45 100644 --- a/demo/demo_vid.py +++ b/demo/demo_vid.py @@ -5,11 +5,14 @@ from argparse import ArgumentParser import mmcv +import mmengine from mmtrack.apis import inference_vid, init_model +from mmtrack.registry import VISUALIZERS +from mmtrack.utils import register_all_modules -def main(): +def parse_args(): parser = ArgumentParser() parser.add_argument('config', help='Config file') parser.add_argument('--input', help='input video file') @@ -23,16 +26,17 @@ def main(): default=False, help='whether to show visualizations.') parser.add_argument( - '--score-thr', type=float, default=0.8, help='bbox score threshold') - parser.add_argument( - '--thickness', default=3, type=int, help='Thickness of bbox lines.') + '--score_thr', type=float, default=0.8, help='bbox score threshold') parser.add_argument('--fps', help='FPS of the output video') args = parser.parse_args() + return args + +def main(args): # load images if osp.isdir(args.input): imgs = sorted( - filter(lambda x: x.endswith(('.jpg', '.png', '.jpeg')), + filter(lambda x: x.endswith(('.jpg', '.png', '.jpeg', '.JPEG')), os.listdir(args.input)), key=lambda x: int(x.split('.')[0])) IN_VIDEO = False @@ -41,6 +45,7 @@ def main(): IN_VIDEO = True # define output + OUT_VIDEO = False if args.output is not None: if args.output.endswith('.mp4'): OUT_VIDEO = True @@ -50,7 +55,6 @@ def main(): if len(_out) > 1: os.makedirs(_out[0], exist_ok=True) else: - OUT_VIDEO = False out_path = args.output os.makedirs(out_path, exist_ok=True) fps = args.fps @@ -61,15 +65,21 @@ def main(): raise ValueError('Please set the FPS for the output video.') fps = int(fps) + register_all_modules(init_default_scope=True) + # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) - prog_bar = mmcv.ProgressBar(len(imgs)) + # build the visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + prog_bar = mmengine.ProgressBar(len(imgs)) # test and show/save the images for i, img in enumerate(imgs): if isinstance(img, str): - img = osp.join(args.input, img) - img = mmcv.imread(img) + img_path = osp.join(args.input, img) + img = mmcv.imread(img_path) result = inference_vid(model, img, frame_id=i) if args.output is not None: @@ -79,14 +89,19 @@ def main(): out_file = osp.join(out_path, img.rsplit(os.sep, 1)[-1]) else: out_file = None - model.show_result( - img, - result, - score_thr=args.score_thr, + + # show the results + visualizer.add_datasample( + 'vid', + img[..., ::-1], + data_sample=result, show=args.show, - wait_time=int(1000. 
/ fps) if fps else 0, + draw_gt=False, out_file=out_file, - thickness=args.thickness) + wait_time=float(1 / int(fps)) if fps else 0, + pred_score_thr=args.score_thr, + step=i) + prog_bar.update() if args.output and OUT_VIDEO: @@ -97,4 +112,5 @@ def main(): if __name__ == '__main__': - main() + args = parse_args() + main(args) diff --git a/demo/test1.mp4 b/demo/test1.mp4 new file mode 100644 index 000000000..b380b018c Binary files /dev/null and b/demo/test1.mp4 differ diff --git a/demo/test2.mp4 b/demo/test2.mp4 new file mode 100644 index 000000000..c5a9a5bf1 Binary files /dev/null and b/demo/test2.mp4 differ diff --git a/docs/en/advanced_guides/10_add_datasets.md b/docs/en/advanced_guides/10_add_datasets.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/11_add_transforms.md b/docs/en/advanced_guides/11_add_transforms.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/12_add_metrics.md b/docs/en/advanced_guides/12_add_metrics.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/13_customize_runtime.md b/docs/en/advanced_guides/13_customize_runtime.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/1_data_flow.md b/docs/en/advanced_guides/1_data_flow.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/2_structures.md b/docs/en/advanced_guides/2_structures.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/3_models.md b/docs/en/advanced_guides/3_models.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/4_datasets.md b/docs/en/advanced_guides/4_datasets.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/5_transforms.md b/docs/en/advanced_guides/5_transforms.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/6_evaluation.md b/docs/en/advanced_guides/6_evaluation.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/7_engine.md b/docs/en/advanced_guides/7_engine.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/8_convention.md b/docs/en/advanced_guides/8_convention.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/9_add_modules.md b/docs/en/advanced_guides/9_add_modules.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/advanced_guides/index.rst b/docs/en/advanced_guides/index.rst new file mode 100644 index 000000000..22c8fefd3 --- /dev/null +++ b/docs/en/advanced_guides/index.rst @@ -0,0 +1,27 @@ +Basic Concepts +*************** + + +.. toctree:: + :maxdepth: 1 + + 1_data_flow.md + 2_structures.md + 3_models.md + 4_datasets.md + 5_transforms.md + 6_evaluation.md + 7_engine.md + 8_conventions.md + +Component Customization +************************ + +.. toctree:: + :maxdepth: 1 + + 9_add_modules.md + 10_add_datasets.md + 11_add_transforms.md + 12_add_metrics.md + 13_customize_runtime.md diff --git a/docs/en/api.rst b/docs/en/api.rst index 8b2f1ec69..a1de14f73 100644 --- a/docs/en/api.rst +++ b/docs/en/api.rst @@ -3,89 +3,81 @@ mmtrack.apis .. automodule:: mmtrack.apis :members: -mmtrack.core --------------- +mmtrack.datasets +---------------- -anchor +datasets ^^^^^^^^^^ -.. automodule:: mmtrack.core.anchor +.. automodule:: mmtrack.datasets :members: -evaluation +api_wrappers ^^^^^^^^^^ -.. automodule:: mmtrack.core.evaluation +.. 
automodule:: mmtrack.datasets.api_wrappers :members: -motion +samplers ^^^^^^^^^^ -.. automodule:: mmtrack.core.motion +.. automodule:: mmtrack.datasets.samplers :members: -optimizer +transforms ^^^^^^^^^^ -.. automodule:: mmtrack.core.optimizer +.. automodule:: mmtrack.datasets.transforms :members: -track +mmtrack.engine +---------------- + +hooks ^^^^^^^^^^ -.. automodule:: mmtrack.core.track +.. automodule:: mmtrack.engine.hooks :members: -utils +schedulers ^^^^^^^^^^ -.. automodule:: mmtrack.core.utils +.. automodule:: mmtrack.engine.schedulers :members: -mmtrack.datasets +mmtrack.evaluation ---------------- -datasets -^^^^^^^^^^ -.. automodule:: mmtrack.datasets - :members: - -parsers +functional ^^^^^^^^^^ -.. automodule:: mmtrack.datasets.parsers +.. automodule:: mmtrack.evaluation.functional :members: -pipelines +metrics ^^^^^^^^^^ -.. automodule:: mmtrack.datasets.pipelines +.. automodule:: mmtrack.evaluation.metrics :members: -samplers -^^^^^^^^^^ -.. automodule:: mmtrack.datasets.samplers - :members: - - mmtrack.models -------------- -mot +aggregators ^^^^^^^^^^ -.. automodule:: mmtrack.models.mot +.. automodule:: mmtrack.models.aggregators :members: -sot +backbones ^^^^^^^^^^ -.. automodule:: mmtrack.models.sot +.. automodule:: mmtrack.models.backbones :members: -vid +data_preprocessors ^^^^^^^^^^ -.. automodule:: mmtrack.models.vid +.. automodule:: mmtrack.models.data_preprocessors :members: -aggregators +filter ^^^^^^^^^^^ -.. automodule:: mmtrack.models.aggregators +.. automodule:: mmtrack.models.filter :members: -backbones +layers ^^^^^^^^^^ -.. automodule:: mmtrack.models.backbones +.. automodule:: mmtrack.models.layers :members: losses @@ -93,6 +85,11 @@ losses .. automodule:: mmtrack.models.losses :members: +mot +^^^^^^^^^^ +.. automodule:: mmtrack.models.mot + :members: + motion ^^^^^^^^^^ .. automodule:: mmtrack.models.motion @@ -104,21 +101,65 @@ reid :members: roi_heads -^^^^^^^^^^ +^^^^^^^^^^^ .. automodule:: mmtrack.models.roi_heads :members: +sot +^^^^^^^^^^^ +.. automodule:: mmtrack.models.sot + :members: + +task_modules +^^^^^^^^^^^ +.. automodule:: mmtrack.models.task_modules + :members: + track_heads ^^^^^^^^^^^ .. automodule:: mmtrack.models.track_heads :members: -builder +trackers +^^^^^^^^^^^ +.. automodule:: mmtrack.models.trackers + :members: + +vid ^^^^^^^^^^^ -.. automodule:: mmtrack.models +.. automodule:: mmtrack.models.vid + :members: + +vis +^^^^^^^^^^^ +.. automodule:: mmtrack.models.vis + :members: + +mmtrack.structures +---------------- + +structures +^^^^^^^^^^ +.. automodule:: mmtrack.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmtrack.structures.bbox :members: mmtrack.utils --------------- +---------------- + +utils +^^^^^^^^^^ .. automodule:: mmtrack.utils :members: + +mmtrack.visualiztion +---------------- + +visualiztion +^^^^^^^^^^ +.. 
automodule:: mmtrack.visualiztion + :members: diff --git a/docs/en/dataset_zoo.md b/docs/en/dataset_zoo.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/en/get_started.md b/docs/en/get_started.md new file mode 100644 index 000000000..9e0d256fa --- /dev/null +++ b/docs/en/get_started.md @@ -0,0 +1,192 @@ +## Prerequisites + +- Linux | macOS | Windows +- Python 3.6+ +- PyTorch 1.6+ +- CUDA 9.2+ (If you build PyTorch from source, CUDA 9.0 is also compatible) +- GCC 5+ +- [MMCV](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) +- [MMEngine](https://mmengine.readthedocs.io/en/latest/get_started/installation.html) +- [MMDetection](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) + +The compatible MMTracking, MMEngine, MMCV, and MMDetection versions are as below. Please install the correct version to avoid installation issues. + +| MMTracking version | MMEngine version | MMCV version | MMDetection version | +| :----------------: | :--------------: | :--------------------: | :---------------------: | +| 1.x | mmengine>=0.1.0 | mmcv>=2.0.0rc1,\<2.0.0 | mmdet>=3.0.0rc0,\<3.0.0 | +| 1.0.0rc1 | mmengine>=0.1.0 | mmcv>=2.0.0rc1,\<2.0.0 | mmdet>=3.0.0rc0,\<3.0.0 | + +## Installation + +### Detailed Instructions + +1. Create a conda virtual environment and activate it. + + ```shell + conda create -n open-mmlab python=3.9 -y + conda activate open-mmlab + ``` + +2. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/). Here we use PyTorch 1.10.0 and CUDA 11.1. + You may also switch to other version by specifying the version number. + + **Install with conda** + + ```shell + conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch + ``` + + **Install with pip** + + ```shell + pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + ``` + +3. Install MMEngine + + ```shell + pip install mmengine + ``` + +4. Install mmcv, we recommend you to install the pre-build package as below. + + ```shell + pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html + ``` + + mmcv is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv compiled with PyTorch 1.x.0 and it usually works well. + + ```shell + # We can ignore the micro version of PyTorch + pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html + ``` + + See [here](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) for different versions of MMCV compatible to different PyTorch and CUDA versions. + Optionally you can choose to compile mmcv from source by the following command + + ```shell + git clone -b 2.x https://github.com/open-mmlab/mmcv.git + cd mmcv + MMCV_WITH_OPS=1 pip install -e . # package mmcv, which contains cuda ops, will be installed after this step + # pip install -e . # package mmcv, which contains no cuda ops, will be installed after this step + cd .. + ``` + + **Important**: You need to run pip uninstall mmcv-lite first if you have mmcv installed. Because if mmcv-lite and mmcv are both installed, there will be ModuleNotFoundError. + +5. 
Install MMDetection + + ```shell + pip install 'mmdet>=3.0.0rc0' + ``` + + Optionally, you can also build MMDetection from source in case you want to modify the code: + + ```shell + git clone -b 3.x https://github.com/open-mmlab/mmdetection.git + cd mmdetection + pip install -r requirements/build.txt + pip install -v -e . # or "python setup.py develop" + ``` + +6. Clone the MMTracking repository. + + ```shell + git clone -b 1.x https://github.com/open-mmlab/mmtracking.git + cd mmtracking + ``` + +7. Install build requirements and then install MMTracking. + + ```shell + pip install -r requirements/build.txt + pip install -v -e . # or "python setup.py develop" + ``` + +8. Install extra dependencies + +- For MOT evaluation (required): + + ```shell + pip install git+https://github.com/JonathonLuiten/TrackEval.git + ``` + +- For VOT evaluation (optional) + + ```shell + pip install git+https://github.com/votchallenge/toolkit.git + ``` + +- For LVIS evaluation (optional): + + ```shell + pip install git+https://github.com/lvis-dataset/lvis-api.git + ``` + +- For TAO evaluation (optional): + + ```shell + pip install git+https://github.com/TAO-Dataset/tao.git + ``` + +Note: + +a. Following the above instructions, MMTracking is installed on `dev` mode +, any local modifications made to the code will take effect without the need to reinstall it. + +b. If you would like to use `opencv-python-headless` instead of `opencv-python`, +you can install it before installing MMCV. + +### A from-scratch setup script + +Assuming that you already have CUDA 10.1 installed, here is a full script for setting up MMTracking with conda. + +```shell +conda create -n open-mmlab python=3.9 -y +conda activate open-mmlab + +conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch + +pip install mmengine + +# install the latest mmcv +pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html + +# install mmdetection +pip install 'mmdet>=3.0.0rc0' + +# install mmtracking +git clone -b 1.x https://github.com/open-mmlab/mmtracking.git +cd mmtracking +pip install -r requirements/build.txt +pip install -v -e . +pip install git+https://github.com/JonathonLuiten/TrackEval.git +pip install git+https://github.com/votchallenge/toolkit.git (optional) +pip install git+https://github.com/lvis-dataset/lvis-api.git (optional) +pip install git+https://github.com/TAO-Dataset/tao.git (optional) +``` + +### Developing with multiple MMTracking versions + +The train and test scripts already modify the `PYTHONPATH` to ensure the script use the MMTracking in the current directory. 
+ +To use the default MMTracking installed in the environment rather than that you are working with, you can remove the following line in those scripts + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` + +## Verification + +To verify whether MMTracking and the required environment are installed correctly, we can run **one of** MOT, VIS, VID and SOT [demo scripts](https://github.com/open-mmlab/mmtracking/blob/1.x/demo/): + +Here is an example for MOT demo: + +```shell +python demo/demo_mot_vis.py \ + configs/mot/deepsort/deepsort_faster-rcnn-r50-fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --output mot.mp4 +``` + +If you want to run more other demos, you can refer to [inference guides](./user_guides/3_inference.md) diff --git a/docs/en/index.rst b/docs/en/index.rst index 0f2547366..b0cf56237 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -1,53 +1,59 @@ Welcome to MMTracking's documentation! ======================================= -You can switch between Chinese and English documents in the lower-left corner of the layout. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Get Started - install.md - modelzoo_statistics.md - model_zoo.md + overview.md + get_started.md .. toctree:: :maxdepth: 2 - :caption: Quick run - - dataset.md - quick_run.md + :caption: User Guides + user_guides/index.rst .. toctree:: :maxdepth: 2 - :caption: Tutorials + :caption: Advanced Guides - tutorials/index.rst + advanced_guides/index.rst .. toctree:: - :maxdepth: 2 - :caption: Useful Tools and Scripts + :maxdepth: 1 + :caption: Migration - useful_tools_scripts.md + migration.md .. toctree:: - :maxdepth: 2 - :caption: Notes + :maxdepth: 1 + :caption: API Reference - changelog.md + api.rst +.. toctree:: + :maxdepth: 1 + :caption: Model Zoo + + model_zoo.md .. toctree:: - :caption: Switch Language + :maxdepth: 1 + :caption: Notes - switch_language.md + notes/contribution_guide.md + notes/projects.md + notes/changelog.md + notes/faq.md .. toctree:: - :caption: API Reference + :caption: Switch Language + + switch_language.md - api.rst Indices and tables diff --git a/docs/en/migration.md b/docs/en/migration.md new file mode 100644 index 000000000..11761480d --- /dev/null +++ b/docs/en/migration.md @@ -0,0 +1,169 @@ +# Migration from MMTracking 0.xx + +Compared with the 0.xx versions of MMTracking, the latest 1.xx version of MMTracking has the following important modifications. + +## Overall Structures + +The `core` in the old versions of MMTracking is splited into `engine`, `evaluation`, `structures`, `visualization` and `model/task_moduls` in the 1.xx version of MMTracking. Details can be seen in the [user guides](../../docs/en/user_guides). 
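+
+As a rough orientation, the sketch below shows where the old `mmtrack.core` functionality now roughly lives (module names are taken from this PR's `docs/en/api.rst`; treat it as an illustrative mapping rather than an exhaustive one):
+
+```python
+# A rough map from the old `mmtrack.core` sub-packages to their 1.xx homes.
+import mmtrack.engine               # hooks, schedulers (runtime components)
+import mmtrack.evaluation           # functional, metrics (was mmtrack.core.evaluation)
+import mmtrack.structures           # TrackDataSample, bbox utilities
+import mmtrack.visualization        # TrackLocalVisualizer, DetLocalVisualizer
+import mmtrack.models.task_modules  # task-specific modules split out of `core`
+```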
+ +## Configs + +### file names + +**old**: `deepsort_faster-rcnn_fpn_4e_mot17-private-half.py` + +**new**: `deepsort_faster-rcnn-resnet50-fpn_8x2bs-4e_mot17halftrain_test-mot17halfval.py` + +### keys of dataset loader + +**old** + +```python +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + persistent_workers=True, + samples_per_epoch=60000, + train=dict( + type='GOT10kDataset', + ann_file=data_root + + 'got10k/annotations/got10k_train_infos.txt', + img_prefix=data_root + 'got10k', + pipeline=train_pipeline, + split='train', + test_mode=False), + val=dict( + type='GOT10kDataset', + ann_file=data_root + 'got10k/annotations/got10k_test_infos.txt', + img_prefix=data_root + 'got10k', + pipeline=test_pipeline, + split='test', + test_mode=True), + test=dict( + type='GOT10kDataset', + ann_file=data_root + 'got10k/annotations/got10k_test_infos.txt', + img_prefix=data_root + 'got10k', + pipeline=test_pipeline, + split='test', + test_mode=True)) +``` + +**new** + +```python +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='QuotaSampler', samples_per_epoch=60000), + dataset=dict( + type='GOT10kDataset', + data_root=data_root, + ann_file='GOT10k/annotations/got10k_train_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=train_pipeline, + test_mode=False)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='VideoSampler'), + dataset=dict( + type='GOT10kDataset', + data_root='data/', + ann_file='GOT10k/annotations/got10k_test_infos.txt', + data_prefix=dict(img_path='GOT10k'), + pipeline=test_pipeline, + test_mode=True)) +test_dataloader = val_dataloader +``` + +### keys of optimizer + +**old** + +```python +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys=dict(backbone=dict(lr_mult=0.1, decay_mult=1.0)))) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +``` + +**new** + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys=dict(backbone=dict(lr_mult=0.1, decay_mult=1.0)))) +``` + +### keys of learning scheduler + +**old** + +```python +lr_config = dict(policy='step', step=[400]) +``` + +**new** + +```python +param_scheduler = dict(type='MultiStepLR', milestones=[400], gamma=0.1) +``` + +## Model + +### Data preprocessor + +The 1.xx versions of MMtracking add [TrackDataPreprocessor](../../mmtrack/models/data_preprocessors/data_preprocessor.py). The data out from the data pipeline is transformed by this module and then fed into the model. + +### Train + +The training forward of models and heads is performed by calling `loss` function in their respective classes. The arguments of `loss` function in models contain a dict of `Tensor` and a list of `TrackDataSample`. + +### Test + +The test forward of models and heads is performed by calling `predict` function in their respective classes. The arguments of `predict` function in models contain a dict of `Tensor` and a list of `TrackDataSample`. + +## Data + +### data structure + +The 1.xx versions of MMtracking add two new data structure: [TrackDataSample](../../mmtrack/structures/track_data_sample.py) and [ReIDDataSample](../../mmtrack/structures/reid_data_sample.py). 
These data structures wrap the annotations and predictions from one image (sequence) and are used as interfaces between different components.
+
+### dataset class
+
+The 1.xx versions of MMTracking add two base dataset classes which inherit from the `BaseDataset` in MMEngine: `BaseSOTDataset` and `BaseVideoDataset`. The former is only used in SOT and the latter is used for all other tasks.
+
+### data pipeline
+
+1. Most of the transforms on image sequences in the old MMTracking are refactored in the latest MMTracking. Specifically, we use `TransformBroadcaster` to wrap the transforms of a single image.
+
+Some transforms on image sequences, such as `SeqCropLikeStark`, are retained since `TransformBroadcaster` doesn't support setting different arguments for each image in the sequence.
+
+2. We pack the `VideoCollect`, `ConcatSameTypeFrames` and `SeqDefaultFormatBundle` in the old MMTracking into `PackTrackInputs` in the latest MMTracking.
+
+3. The normalization in the pipeline in the old MMTracking is removed, and this operation is implemented in the model forward.
+
+### data sampler
+
+The 1.xx versions of MMTracking add the `DATA_SAMPLERS` registry. You can customize different dataset samplers in the configs. Details about the samplers can be seen [here](../../mmtrack/datasets/samplers).
+
+## Evaluation
+
+The old versions of MMTracking implement evaluation in the dataset classes. In the 1.xx versions of MMTracking, we add the `METRICS` registry. All evaluation is implemented in the metric classes registered in `METRICS`. Details can be seen [here](../../mmtrack/evaluation/metrics).
+
+## Visualization
+
+The 1.xx versions of MMTracking add `TrackLocalVisualizer` and `DetLocalVisualizer`, which are registered in `VISUALIZER`. Compared with the 0.xx versions of MMTracking, we support the visualization of images and feature maps. Details can be seen [here](../../mmtrack/visualization/local_visualizer.py).
+
+## Engine
+
+The runner, hooks, logging and optimizers used in training, evaluation and testing are refactored in the 1.xx versions of MMTracking. Details can be seen in MMEngine.
diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md
index 0114e831e..9f5c892bc 100644
--- a/docs/en/model_zoo.md
+++ b/docs/en/model_zoo.md
@@ -8,7 +8,7 @@
- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows.
-- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script `tools/analysis/benchmark.py` which computes the average time on 2000 images.
+- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script `tools/analysis_tools/benchmark.py` which computes the average time on 2000 images.
- Speed benchmark environments
@@ -29,50 +29,66 @@
### DFF (CVPR 2017)
-Please refer to [DFF](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/dff) for details.
+Please refer to [DFF](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vid/dff) for details.
### FGFA (ICCV 2017)
-Please refer to [FGFA](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/fgfa) for details.
+Please refer to [FGFA](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vid/fgfa) for details.
### SELSA (ICCV 2019) -Please refer to [SELSA](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/selsa) for details. +Please refer to [SELSA](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vid/selsa) for details. ### Temporal RoI Align (AAAI 2021) -Please refer to [Temporal RoI Align](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/temporal_roi_align) for details. +Please refer to [Temporal RoI Align](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vid/temporal_roi_align) for details. ## Baselines of multiple object tracking -### SORT/DeepSORT (ICIP 2016/2017) +### SORT (ICIP 2016) -Please refer to [SORT/DeepSORT](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/deepsort) for details. +Please refer to [SORT](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/sort) for details. + +### DeepSORT (ICIP 2017) + +Please refer to [DeepSORT](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/deepsort) for details. ### Tracktor (ICCV 2019) -Please refer to [Tracktor](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/tracktor) for details. +Please refer to [Tracktor](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/tracktor) for details. ### QDTrack (CVPR 2021) -Please refer to [QDTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/qdtrack) for details. +Please refer to [QDTrack](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/qdtrack) for details. + +### ByteTrack (ECCV 2022) + +Please refer to [ByteTrack](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/bytetrack) for details. -### ByteTrack (arXiv 2021) +### StrongSORT (arvix 2022) -Please refer to [ByteTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/bytetrack) for details. +Please refer to [StrongSORT](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/mot/strongsort) for details ## Baselines of single object tracking ### SiameseRPN++ (CVPR 2019) -Please refer to [SiameseRPN++](https://github.com/open-mmlab/mmtracking/blob/master/configs/sot/siamese_rpn) for details. +Please refer to [SiameseRPN++](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/sot/siamese_rpn) for details. + +### PrDiMP (CVPR 2020) + +Please refer to [PrDiMP](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/sot/prdimp) for details. ### STARK (ICCV 2021) -Please refer to [STARK](https://github.com/open-mmlab/mmtracking/blob/master/configs/sot/stark) for details. +Please refer to [STARK](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/sot/stark) for details. ## Baselines of video instance segmentation ### MaskTrack R-CNN (ICCV 2019) -Please refer to [MaskTrack R-CNN](https://github.com/open-mmlab/mmtracking/blob/master/configs/vis/masktrack_rcnn) for details. +Please refer to [MaskTrack R-CNN](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vis/masktrack_rcnn) for details + +### Mask2Former (CVPR 2022) + +Please refer to [Mask2Former](https://github.com/open-mmlab/mmtracking/blob/1.x/configs/vis/mask2former) for details. diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md new file mode 100644 index 000000000..af5558e4f --- /dev/null +++ b/docs/en/notes/changelog.md @@ -0,0 +1,204 @@ +## Changelog + +### v1.0.0rc1 (10/10/2022) + +MMTracking 1.0.0rc1 is the 2-nd version of MMTracking 1.x, a part of the OpenMMLab 2.0 projects. 
+ +Built upon the new [training engine](https://github.com/open-mmlab/mmengine), MMTracking 1.x unifies the interfaces of datasets, models, evaluation, and visualization. + +And there are some BC-breaking changes. Please check [the migration tutorial](https://mmtracking.readthedocs.io/en/1.x/migration.html) for more details. + +We also support more methods in MMTracking 1.x, such as StrongSORT for MOT, Mask2Former for VIS, PrDiMP for SOT. + +### v0.13.0 (29/04/2022) + +#### Highlights + +- Support tracking colab tutorial ([#511](https://github.com/open-mmlab/mmtracking/pull/511)) + +#### New Features + +- Refactor the training datasets of SiamRPN++ ([#496](https://github.com/open-mmlab/mmtracking/pull/496)), ([#518](https://github.com/open-mmlab/mmtracking/pull/518)) + +- Support loading data from ceph for SOT datasets ([#494](https://github.com/open-mmlab/mmtracking/pull/494)) + +- Support loading data from ceph for MOT challenge dataset ([#517](https://github.com/open-mmlab/mmtracking/pull/517)) + +- Support evaluation metric for VIS task ([#501](https://github.com/open-mmlab/mmtracking/pull/501)) + +#### Bug Fixes + +- Fix a bug in the LaSOT datasets and update the pretrained models of STARK ([#483](https://github.com/open-mmlab/mmtracking/pull/483)), ([#503](https://github.com/open-mmlab/mmtracking/pull/503)) + +- Fix a bug in the format_results function of VIS task ([#504](https://github.com/open-mmlab/mmtracking/pull/504)) + +### v0.12.0 (01/04/2022) + +#### Highlights + +- Support QDTrack algorithm in MOT ([#433](https://github.com/open-mmlab/mmtracking/pull/433)), ([#451](https://github.com/open-mmlab/mmtracking/pull/451)), ([#461](https://github.com/open-mmlab/mmtracking/pull/461)), ([#469](https://github.com/open-mmlab/mmtracking/pull/469)) + +#### Bug Fixes + +- Support empty tensor for selsa aggregator ([#463](https://github.com/open-mmlab/mmtracking/pull/463)) + +### v0.11.0 (04/03/2022) + +#### Highlights + +- Support STARK algorithm in SOT ([#443](https://github.com/open-mmlab/mmtracking/pull/443)), ([#440](https://github.com/open-mmlab/mmtracking/pull/440)), ([#434](https://github.com/open-mmlab/mmtracking/pull/434)), ([#438](https://github.com/open-mmlab/mmtracking/pull/438)), ([#435](https://github.com/open-mmlab/mmtracking/pull/435)), ([#426](https://github.com/open-mmlab/mmtracking/pull/426)) + +- Support HOTA evaluation metrics for MOT ([#417](https://github.com/open-mmlab/mmtracking/pull/417)) + +#### New Features + +- Support TAO dataset in MOT ([#415](https://github.com/open-mmlab/mmtracking/pull/415)) + +### v0.10.0 (10/02/2022) + +#### New Features + +- Support CPU training ([#404](https://github.com/open-mmlab/mmtracking/pull/404)) + +#### Improvements + +- Refactor SOT datasets ([#401](https://github.com/open-mmlab/mmtracking/pull/401)), ([#402](https://github.com/open-mmlab/mmtracking/pull/402)), ([#393](https://github.com/open-mmlab/mmtracking/pull/393)) + +### v0.9.0 (05/01/2022) + +#### Highlights + +- Support arXiv 2021 manuscript 'ByteTrack: Multi-Object Tracking by Associating Every Detection Box' ([#385](https://github.com/open-mmlab/mmtracking/pull/385)), ([#383](https://github.com/open-mmlab/mmtracking/pull/383)), ([#372](https://github.com/open-mmlab/mmtracking/pull/372)) +- Support ICCV 2019 paper 'Video Instance Segmentation' ([#304](https://github.com/open-mmlab/mmtracking/pull/304)), ([#303](https://github.com/open-mmlab/mmtracking/pull/303)), ([#298](https://github.com/open-mmlab/mmtracking/pull/298)), 
([#292](https://github.com/open-mmlab/mmtracking/pull/292)) + +#### New Features + +- Support CrowdHuman dataset for MOT ([#366](https://github.com/open-mmlab/mmtracking/pull/366)) +- Support VOT2018 dataset for SOT ([#305](https://github.com/open-mmlab/mmtracking/pull/305)) +- Support YouTube-VIS dataset for VIS ([#290](https://github.com/open-mmlab/mmtracking/pull/290)) + +#### Bug Fixes + +- Fix two significant bugs in SOT and provide new SOT pretrained models ([#349](https://github.com/open-mmlab/mmtracking/pull/349)) + +#### Improvements + +- Refactor LaSOT, TrackingNet dataset and support GOT-10K datasets ([#296](https://github.com/open-mmlab/mmtracking/pull/296)) +- Support persisitent workers ([#348](https://github.com/open-mmlab/mmtracking/pull/348)) + +### v0.8.0 (03/10/2021) + +#### New Features + +- Support OTB100 dataset in SOT ([#271](https://github.com/open-mmlab/mmtracking/pull/271)) +- Support TrackingNet dataset in SOT ([#268](https://github.com/open-mmlab/mmtracking/pull/268)) +- Support UAV123 dataset in SOT ([#260](https://github.com/open-mmlab/mmtracking/pull/260)) + +#### Bug Fixes + +- Fix a bug in mot_param_search.py ([#270](https://github.com/open-mmlab/mmtracking/pull/270)) + +#### Improvements + +- Use PyTorch sphinx theme ([#274](https://github.com/open-mmlab/mmtracking/pull/274)) +- Use pycocotools instead of mmpycocotools ([#263](https://github.com/open-mmlab/mmtracking/pull/263)) + +### v0.7.0 (03/09/2021) + +#### Highlights + +- Release code of AAAI 2021 paper 'Temporal ROI Align for Video Object Recognition' ([#247](https://github.com/open-mmlab/mmtracking/pull/247)) +- Refactor English documentations ([#243](https://github.com/open-mmlab/mmtracking/pull/243)) +- Add Chinese documentations ([#248](https://github.com/open-mmlab/mmtracking/pull/248)), ([#250](https://github.com/open-mmlab/mmtracking/pull/250)) + +#### New Features + +- Support fp16 training and testing ([#230](https://github.com/open-mmlab/mmtracking/pull/230)) +- Release model using ResNeXt-101 as backbone for all VID methods ([#254](https://github.com/open-mmlab/mmtracking/pull/254)) +- Support the results of Tracktor on MOT15, MOT16 and MOT20 datasets ([#217](https://github.com/open-mmlab/mmtracking/pull/217)) +- Support visualization for single gpu test ([#216](https://github.com/open-mmlab/mmtracking/pull/216)) + +#### Bug Fixes + +- Fix a bug in MOTP evaluation ([#235](https://github.com/open-mmlab/mmtracking/pull/235)) +- Fix two bugs in reid training and testing ([#249](https://github.com/open-mmlab/mmtracking/pull/249)) + +#### Improvements + +- Refactor anchor in SiameseRPN++ ([#229](https://github.com/open-mmlab/mmtracking/pull/229)) +- Unify model initialization ([#235](https://github.com/open-mmlab/mmtracking/pull/235)) +- Refactor unittest ([#231](https://github.com/open-mmlab/mmtracking/pull/231)) + +### v0.6.0 (30/07/2021) + +#### Highlights + +- Fix training bugs of all three tasks ([#219](https://github.com/open-mmlab/mmtracking/pull/219)), ([#221](https://github.com/open-mmlab/mmtracking/pull/221)) + +#### New Features + +- Support error visualization for mot task ([#212](https://github.com/open-mmlab/mmtracking/pull/212)) + +#### Bug Fixes + +- Fix a bug in SOT demo ([#213](https://github.com/open-mmlab/mmtracking/pull/213)) + +#### Improvements + +- Use MMCV registry ([#220](https://github.com/open-mmlab/mmtracking/pull/220)) +- Add README.md for reid training ([#210](https://github.com/open-mmlab/mmtracking/pull/210)) +- Modify dict keys of the outputs of SOT 
([#223](https://github.com/open-mmlab/mmtracking/pull/223)) +- Add Chinese docs including install.md, quick_run.md, model_zoo.md, dataset.md ([#205](https://github.com/open-mmlab/mmtracking/pull/205)), ([#214](https://github.com/open-mmlab/mmtracking/pull/214)) + +### v0.5.3 (01/07/2021) + +#### New Features + +- Support ReID training ([#177](https://github.com/open-mmlab/mmtracking/pull/177)), ([#179](https://github.com/open-mmlab/mmtracking/pull/179)), ([#180](https://github.com/open-mmlab/mmtracking/pull/180)), ([#181](https://github.com/open-mmlab/mmtracking/pull/181)), +- Support MIM ([#158](https://github.com/open-mmlab/mmtracking/pull/158)) + +#### Bug Fixes + +- Fix evaluation hook ([#176](https://github.com/open-mmlab/mmtracking/pull/176)) +- Fix a typo in vid config ([#171](https://github.com/open-mmlab/mmtracking/pull/171)) + +#### Improvements + +- Refactor nms config ([#167](https://github.com/open-mmlab/mmtracking/pull/167)) + +### v0.5.2 (03/06/2021) + +#### Improvements + +- Fixed typos ([#104](https://github.com/open-mmlab/mmtracking/commit/3ccc9b79ce6e14e013268d0dbb53462c0432f357), [#121](https://github.com/open-mmlab/mmtracking/commit/fadcd811df095781fbbdc7c47f8dac1305555461), [#145](https://github.com/open-mmlab/mmtracking/commit/48a47868abd9a0d96c010fc3f85cba1bd2854a9b)) +- Added conference reference ([#111](https://github.com/open-mmlab/mmtracking/commit/9a3c463b087cdee201a9345f270f6c01e116cf2c)) +- Updated the link of CONTRIBUTING to mmcv ([#112](https://github.com/open-mmlab/mmtracking/commit/b725e63463b1bd795fd3c3000b30ef37832a844d)) +- Adapt updates in mmcv (FP16Hook) ([#114](https://github.com/open-mmlab/mmtracking/commit/49f910878345250d22fd5da1104f1fb227244939), [#119](https://github.com/open-mmlab/mmtracking/commit/f1df53dd8e571f4674867919d1886b9fb2024bf9)) +- Added bibtex and links to other codebases ([#122](https://github.com/open-mmlab/mmtracking/commit/1b456423e0aeddb52e7c29e5b0ec3d48e058c615)) +- Added docker files ([#124](https://github.com/open-mmlab/mmtracking/commit/a01c3e8fff97a2b8eebc8d28e3e9d9a360ffbc3c)) +- Used `collect_env` in mmcv ([#129](https://github.com/open-mmlab/mmtracking/commit/0055947c4d19c8921c32ce128ae0314d61e593d2)) +- Added and updated Chinese README ([#135](https://github.com/open-mmlab/mmtracking/commit/ecc83b5e6523582b92196095eb21d72d654322f2), [#147](https://github.com/open-mmlab/mmtracking/commit/19004b6eeca594a2179d8b3a3622764e1753aa4d), [#148](https://github.com/open-mmlab/mmtracking/commit/dc367868453fdcb528041176a59ede368f0e2053)) + +### v0.5.1 (01/02/2021) + +#### Bug Fixes + +- Fixed ReID checkpoint loading ([#80](https://github.com/open-mmlab/mmtracking/pull/80)) +- Fixed empty tensor in `track_result` ([#86](https://github.com/open-mmlab/mmtracking/pull/86)) +- Fixed `wait_time` in MOT demo script ([#92](https://github.com/open-mmlab/mmtracking/pull/92)) + +#### Improvements + +- Support single-stage detector for DeepSORT ([#100](https://github.com/open-mmlab/mmtracking/pull/100)) + +### v0.5.0 (04/01/2021) + +#### Highlights + +- MMTracking is released! 
+
+#### New Features
+
+- Support video object detection methods: [DFF](https://arxiv.org/abs/1611.07715), [FGFA](https://arxiv.org/abs/1703.10025), [SELSA](https://arxiv.org/abs/1907.06390)
+- Support multiple object tracking methods: [SORT](https://arxiv.org/abs/1602.00763)/[DeepSORT](https://arxiv.org/abs/1703.07402), [Tracktor](https://arxiv.org/abs/1903.05625)
+- Support single object tracking methods: [SiameseRPN++](https://arxiv.org/abs/1812.11703)
diff --git a/docs/en/notes/contribution_guide.md b/docs/en/notes/contribution_guide.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/en/notes/projects.md b/docs/en/notes/projects.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/en/overview.md b/docs/en/overview.md
new file mode 100644
index 000000000..caef3b49a
--- /dev/null
+++ b/docs/en/overview.md
@@ -0,0 +1,73 @@
+## Introduction
+
+MMTracking is an open source video perception toolbox based on [PyTorch](https://pytorch.org/). It is a part of the [OpenMMLab](https://openmmlab.com) project.
+
+It supports 4 video tasks:
+
+- Video object detection (VID)
+- Single object tracking (SOT)
+- Multiple object tracking (MOT)
+- Video instance segmentation (VIS)
+
+## Major features
+
+- **The First Unified Video Perception Platform**
+
+  We are the first open source toolbox that unifies versatile video perception tasks including video object detection, multiple object tracking, single object tracking and video instance segmentation.
+
+- **Modular Design**
+
+  We decompose the video perception framework into different components, and one can easily construct a customized method by combining different modules.
+
+- **Simple, Fast and Strong**
+
+  **Simple**: MMTracking interacts with other OpenMMLab projects. It is built upon [MMDetection](https://github.com/open-mmlab/mmdetection), so any detector can be used simply by modifying the configs.
+
+  **Fast**: All operations run on GPUs. The training and inference speeds are faster than or comparable to other implementations.
+
+  **Strong**: We reproduce state-of-the-art models and some of them even outperform the official implementations.
+
+## Getting Started
+
+Please refer to [get_started.md](./get_started.md) for the basic usage of MMTracking.
+
+A Colab tutorial is provided. You may preview the notebook [here](../../demo/MMTracking_Tutorial.ipynb) or directly run it on [Colab](https://colab.research.google.com/github/open-mmlab/mmtracking/blob/master/demo/MMTracking_Tutorial.ipynb).
+
+## User Guides
+
+There are some basic [usage guides](./user_guides/), including:
+
+- [configs](./user_guides/1_config.md)
+- [dataset preparation](./user_guides/2_dataset_prepare.md)
+- [inference](./user_guides/3_inference.md)
+- [train and test](./user_guides/4_train_test.md)
+- [visualization](./user_guides/5_visualization.md)
+- [analysis tools](./user_guides/6_analysis_tools.md)
+
+If you want to learn more, you can refer to the [advanced guides](./advanced_guides):
+
+- [data flow](./advanced_guides/1_data_flow.md)
+- [structures](./advanced_guides/2_structures.md)
+- [models](./advanced_guides/3_models.md)
+- [datasets](./advanced_guides/4_datasets.md)
+- [transforms](./advanced_guides/5_transforms.md)
+- [evaluation](./advanced_guides/6_evaluation.md)
+- [engine](./advanced_guides/7_engine.md)
+- [convention](./advanced_guides/8_convention.md)
+- [add modules](./advanced_guides/9_add_modules.md)
+- [add datasets](./advanced_guides/10_add_datasets.md)
+- [add transforms](./advanced_guides/11_add_transforms.md)
+- [add metrics](./advanced_guides/12_add_metrics.md)
+- [customized runtime](./advanced_guides/13_custime_runtime.md)
+
+## Benchmark and model zoo
+
+Results and models are available in the [model zoo](./model_zoo.md).
+
+## Contributing
+
+We appreciate all contributions to improve MMTracking. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) for the contributing guidelines and [this discussion](https://github.com/open-mmlab/mmtracking/issues/73) for the development roadmap.
+
+## FAQ
+
+If you encounter any problems while using MMTracking, you can first refer to the [FAQ](./notes/faq.md). If the problem is not solved, you can post an [issue](https://github.com/open-mmlab/mmtracking/issues/) and we will respond as soon as possible.
diff --git a/docs/en/switch_language.md b/docs/en/swich_language.md
similarity index 100%
rename from docs/en/switch_language.md
rename to docs/en/swich_language.md
diff --git a/docs/en/user_guides/1_config.md b/docs/en/user_guides/1_config.md
new file mode 100644
index 000000000..3141536c1
--- /dev/null
+++ b/docs/en/user_guides/1_config.md
@@ -0,0 +1,120 @@
+# Learn about Configs
+
+We use Python files as our config system. You can find all the provided configs under $MMTracking/configs.
+
+We incorporate modular and inheritance design into our config system,
+which makes it convenient to conduct various experiments.
+If you wish to inspect the config file,
+you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config.
+
+## A brief description of a complete config
+
+A complete config usually contains the following primary fields:
+
+- `model`: the basic config of the model, which may contain `data_preprocessor`, modules (e.g., `detector`, `motion`), `train_cfg`, `test_cfg`, etc.
+- `train_dataloader`: the config of the training dataloader, which usually contains `batch_size`, `num_workers`, `sampler`, `dataset`, etc.
+- `val_dataloader`: the config of the validation dataloader, which is similar to `train_dataloader`.
+- `test_dataloader`: the config of the testing dataloader, which is similar to `train_dataloader`.
+- `val_evaluator`: the config of the validation evaluator. For example, `type='CocoVideoMetric'` for the VID task on the ILSVRC benchmark, and `type='MOTChallengeMetrics'` for the MOT task on the MOTChallenge benchmarks.
+- `test_evaluator`: the config of the testing evaluator, which is similar to `val_evaluator`.
+- `train_cfg`: the config of the training loop. For example, `type='EpochBasedTrainLoop'`.
+- `val_cfg`: the config of the validation loop. For example, `type='ValLoop'`.
+- `test_cfg`: the config of the testing loop. For example, `type='TestLoop'`.
+- `default_hooks`: the config of default hooks, which may include hooks for timer, logger, param_scheduler, checkpoint, sampler_seed, visualization, etc.
+- `vis_backends`: the config of visualization backends, which uses `type='LocalVisBackend'` by default.
+- `visualizer`: the config of the visualizer. For example, `type='DetLocalVisualizer'` for the VID task, and `type='TrackLocalVisualizer'` for the MOT, VIS, SOT and VOS tasks.
+- `param_scheduler`: the config of the parameter scheduler, which usually sets the learning rate scheduler.
+- `optim_wrapper`: the config of the optimizer wrapper, which contains optimization-related information, for example the optimizer, gradient clipping, etc.
+- `load_from`: load a model from the given path as a pre-trained model.
+- `resume`: If `True`, resume checkpoints from `load_from`, and the training will be resumed from the epoch when the checkpoint was saved.
+
+## Modify config through script arguments
+
+When submitting jobs using `tools/train.py` or `tools/test.py`,
+you may specify `--cfg-options` to modify the config in place.
+We present several examples as follows.
+For more details, please refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md).
+
+- **Update config keys of dict chains.**
+
+  The config options can be specified following the order of the dict keys in the original config.
+  For example, `--cfg-options model.detector.backbone.norm_eval=False` changes all the BN modules in the model backbone to train mode.
+
+- **Update keys inside a list of configs.**
+
+  Some config dicts are composed as a list in your config.
+  For example, the testing pipeline `test_dataloader.dataset.pipeline` is normally a list, e.g., `[dict(type='LoadImageFromFile'), ...]`.
+  If you want to change `LoadImageFromFile` to `LoadImageFromWebcam` in the pipeline,
+  you may specify `--cfg-options test_dataloader.dataset.pipeline.0.type=LoadImageFromWebcam`.
+
+- **Update values of list/tuples.**
+
+  The value to be updated may be a list or a tuple.
+  For example, you can change the key `mean` of `data_preprocessor` by specifying `--cfg-options model.data_preprocessor.mean=[0,0,0]`.
+  Note that **NO** white space is allowed inside the specified value.
+
+## Config File Structure
+
+There are 3 basic component types under `configs/_base_`, i.e., dataset, model and default_runtime.
+Many methods could be easily constructed with one of each, such as DFF, FGFA, SELSA, SORT and DeepSORT.
+The configs that are composed of components from `_base_` are called *primitive*.
+
+For all configs under the same folder, it is recommended to have only **one** *primitive* config.
+All other configs should inherit from the *primitive* config.
+In this way, the maximum inheritance level is 3.
+
+For easy understanding, we recommend contributors to inherit from existing methods.
+For example, if some modification is made based on Faster R-CNN,
+users may first inherit the basic Faster R-CNN structure
+by specifying `_base_ = ../../_base_/models/faster-rcnn_r50-dc5.py`,
+then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods,
+you may create a folder `method_name` under `configs`.
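+
+For illustration only, a minimal child config that inherits from a primitive config could look like the sketch below. The `_base_` path follows the pattern mentioned above, while the overridden keys (`detector.rpn_head.anchor_generator.scales`) are a hypothetical example rather than a recommended setting:
+
+```python
+# Inherit everything from the primitive Faster R-CNN based config.
+_base_ = ['../../_base_/models/faster-rcnn_r50-dc5.py']
+
+# Only the fields that differ from the base config are written here;
+# all other fields are taken from the inherited file.
+model = dict(
+    detector=dict(
+        rpn_head=dict(
+            anchor_generator=dict(scales=[4, 8, 16]))))
+```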
+
+Please refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md) for detailed documentation.
+
+## Config Name Style
+
+We follow the style below to name config files. Contributors are advised to follow the same style.
+
+```shell
+{method}_{module}_{train_cfg}_{train_data}_{test_data}
+```
+
+- `{method}`: method name, like `dff`, `deepsort`, `siamese_rpn`.
+- `{module}`: basic modules of the method, like `faster-rcnn_r50_fpn`.
+- `{train_cfg}`: training config, which usually contains batch size, epochs, etc., like `8xb4-80e`.
+- `{train_data}`: training data, like `mot17halftrain`.
+- `{test_data}`: testing data, like `test-mot17halfval`.
+
+## FAQ
+
+**Ignore some fields in the base configs**
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in the base configs.
+You may refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md) for a simple illustration.
+
+**Use intermediate variables in configs**
+
+Some intermediate variables are used in the config files, like `train_pipeline`/`test_pipeline` in datasets.
+It's worth noting that when modifying intermediate variables in the children configs,
+users need to pass the intermediate variables into the corresponding fields again.
+For example, we would like to use the adaptive stride testing strategy to test SELSA.
+`ref_img_sampler` is the intermediate variable we would like to modify.
+
+```python
+_base_ = ['./selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py']
+# dataset settings
+ref_img_sampler=dict(
+    _delete_=True,
+    num_ref_imgs=14,
+    frame_range=[-7, 7],
+    method='test_with_adaptive_stride')
+val_dataloader = dict(
+    dataset=dict(ref_img_sampler=ref_img_sampler))
+test_dataloader = dict(
+    dataset=dict(ref_img_sampler=ref_img_sampler))
+```
+
+We first define the new `ref_img_sampler` and then pass it into `val_dataloader` and `test_dataloader`.
diff --git a/docs/en/user_guides/2_dataset_prepare.md b/docs/en/user_guides/2_dataset_prepare.md
new file mode 100644
index 000000000..f36f0bd85
--- /dev/null
+++ b/docs/en/user_guides/2_dataset_prepare.md
@@ -0,0 +1,702 @@
+## Dataset Preparation
+
+This page provides the instructions for dataset preparation on existing benchmarks, including:
+
+- Video Object Detection
+  - [ILSVRC](http://image-net.org/challenges/LSVRC/2017/)
+- Multiple Object Tracking
+  - [MOT Challenge](https://motchallenge.net/)
+  - [CrowdHuman](https://www.crowdhuman.org/)
+  - [LVIS](https://www.lvisdataset.org/)
+  - [TAO](https://taodataset.org/)
+  - [DanceTrack](https://dancetrack.github.io)
+- Single Object Tracking
+  - [LaSOT](http://vision.cs.stonybrook.edu/~lasot/)
+  - [UAV123](https://cemse.kaust.edu.sa/ivul/uav123/)
+  - [TrackingNet](https://tracking-net.org/)
+  - [OTB100](http://www.visual-tracking.net/)
+  - [GOT10k](http://got-10k.aitestunion.com/)
+  - [VOT2018](https://www.votchallenge.net/vot2018/)
+- Video Instance Segmentation
+  - [YouTube-VIS](https://youtube-vos.org/dataset/vis/)
+
+### 1. Download Datasets
+
+Please download the datasets from the official websites. It is recommended to symlink the root of the datasets to `$MMTRACKING/data`.
+
+#### 1.1 Video Object Detection
+
+- For the training and testing of the video object detection task, only the ILSVRC dataset is needed.
+
+- The `Lists` folder under `ILSVRC` contains the txt files from [here](https://github.com/msracver/Flow-Guided-Feature-Aggregation/tree/master/data/ILSVRC2015/ImageSets).
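+
+For example, if the ILSVRC dataset has already been downloaded to another location, it can be linked into the expected place instead of being copied. The source path below is only a placeholder:
+
+```shell
+# assume ILSVRC is stored under /path/to/ILSVRC (placeholder path)
+mkdir -p data
+ln -s /path/to/ILSVRC ./data/ILSVRC
+```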
+ +#### 1.2 Multiple Object Tracking + +- For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17, TAO and DanceTrack) are needed, CrowdHuman and LVIS can be served as comlementary dataset. + +- The `annotations` under `tao` contains the official annotations from [here](https://github.com/TAO-Dataset/annotations). + +- The `annotations` under `lvis` contains the official annotations of lvis-v0.5 which can be downloaded according to [here](https://github.com/lvis-dataset/lvis-api/issues/23#issuecomment-894963957). The synset mapping file `coco_to_lvis_synset.json` used in `./tools/dataset_converters/tao/merge_coco_with_lvis.py` script can be found [here](https://github.com/TAO-Dataset/tao/tree/master/data). + +#### 1.3 Single Object Tracking + +- For the training and testing of single object tracking task, the MSCOCO, ILSVRC, LaSOT, UAV123, TrackingNet, OTB100, GOT10k and VOT2018 datasets are needed. + +- For OTB100 dataset, you don't need to download the dataset from the official website manually, since we provide a script to download it. + +```shell +# download OTB100 dataset by web crawling +python ./tools/dataset_converters/otb100/download_otb100.py -o ./data/OTB100/zips -p 8 +``` + +- For VOT2018, we use the official downloading script. + +```shell +# download VOT2018 dataset by web crawling +python ./tools/dataset_converters/vot/download_vot.py --dataset vot2018 --save_path ./data/VOT2018/data +``` + +#### 1.4 Video Instance Segmentation + +- For the training and testing of video instance segmetatioon task, only one of YouTube-VIS datasets (e.g. YouTube-VIS 2019) is needed. + +#### 1.5 Data Structure + +If your folder structure is different from the following, you may need to change the corresponding paths in config files. + +``` +mmtracking +├── mmtrack +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +│ ├── ILSVRC +│ │ ├── Data +│ │ │ ├── DET +| │ │ │ ├── train +| │ │ │ ├── val +| │ │ │ ├── test +│ │ │ ├── VID +| │ │ │ ├── train +| │ │ │ ├── val +| │ │ │ ├── test +│ │ ├── Annotations +│ │ │ ├── DET +| │ │ │ ├── train +| │ │ │ ├── val +│ │ │ ├── VID +| │ │ │ ├── train +| │ │ │ ├── val +│ │ ├── Lists +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | ├── test +│ │ +| ├── DanceTrack +| | ├── train +| | ├── val +| | ├── test +| | +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ +│ ├── lvis +│ │ ├── train (the same as coco/train2017) +│ │ ├── val (the same as coco/val2017) +│ │ ├── test (the same as coco/test2017) +│ │ ├── annotations +│ │ │ ├── coco_to_lvis_synset.json +│ │ │ ├── lvis_v0.5_train.json +│ │ │ ├── lvis_v0.5_val.json +│ │ │ ├── lvis_v1_train.json +│ │ │ ├── lvis_v1_val.json +│ │ │ ├── lvis_v1_image_info_test_challenge.json +│ │ │ ├── lvis_v1_image_info_test_dev.json +│ │ +│ ├── tao +│ │ ├── annotations +│ │ │ ├── test_without_annotations.json +│ │ │ ├── train.json +│ │ │ ├── validation.json +│ │ │ ├── ...... +│ │ ├── test +│ │ │ ├── ArgoVerse +│ │ │ ├── AVA +│ │ │ ├── BDD +│ │ │ ├── Charades +│ │ │ ├── HACS +│ │ │ ├── LaSOT +│ │ │ ├── YFCC100M +│ │ ├── train +│ │ ├── val +│ │ +│ ├── LaSOT_full +│ │ ├── LaSOTBenchmark +│ │ │ ├── airplane +| │ │ │ ├── airplane-1 +| │ │ │ ├── airplane-2 +| │ │ │ ├── ...... 
+│ │ │ ├── ...... +│ │ +│ ├── UAV123 +│ │ ├── data_seq +│ │ │ ├── UAV123 +│ │ │ │ ├── bike1 +│ │ │ │ ├── boat1 +│ │ │ │ ├── ...... +│ │ ├── anno +│ │ │ ├── UAV123 +│ │ +│ ├── TrackingNet +│ │ ├── TEST.zip +│ │ ├── TRAIN_0.zip +│ │ ├── ...... +│ │ ├── TRAIN_11.zip +│ │ +│ ├── OTB100 +│ │ │── zips +│ │ │ │── Basketball.zip +│ │ │ │── Biker.zip +│ │ │ │── +│ │ +│ ├── GOT10k +│ │ │── full_data +│ │ │ │── train_data +│ │ │ │ ├── GOT-10k_Train_split_01.zip +│ │ │ │ ├── ...... +│ │ │ │ ├── GOT-10k_Train_split_19.zip +│ │ │ │ ├── list.txt +│ │ │ │── test_data.zip +│ │ │ │── val_data.zip +│ │ +| ├── VOT2018 +| | ├── data +| | | ├── ants1 +| │ │ │ ├──color +│ │ +│ ├── youtube_vis_2019 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── train.json (the official annotation files) +│ │ │── valid.json (the official annotation files) +│ │ │── test.json (the official annotation files) +│ │ +│ ├── youtube_vis_2021 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +``` + +### 2. Convert Annotations + +We use [CocoVID](https://github.com/open-mmlab/mmtracking/blob/master/mmtrack/datasets/parsers/coco_video_parser.py) to maintain all datasets in this codebase. +In this case, you need to convert the official annotations to this style. We provide scripts and the usages are as following: + +```shell +# ImageNet DET +python ./tools/dataset_converters/ilsvrc/imagenet2coco_det.py -i ./data/ILSVRC -o ./data/ILSVRC/annotations + +# ImageNet VID +python ./tools/dataset_converters/ilsvrc/imagenet2coco_vid.py -i ./data/ILSVRC -o ./data/ILSVRC/annotations + +# MOT17 +# The processing of other MOT Challenge dataset is the same as MOT17 +python ./tools/dataset_converters/mot/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det +python ./tools/dataset_converters/mot/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3 + +# DanceTrack +python ./tools/dataset_converters/dancetrack/dancetrack2coco.py -i ./data/DanceTrack ./data/DanceTrack/annotations + +# CrowdHuman +python ./tools/dataset_converters/mot/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations + +# LVIS +# Merge annotations from LVIS and COCO for training QDTrack +python ./tools/dataset_converters/tao/merge_coco_with_lvis.py --lvis ./data/lvis/annotations/lvis_v0.5_train.json --coco ./data/coco/annotations/instances_train2017.json --mapping ./data/lvis/annotations/coco_to_lvis_synset.json --output-json ./data/lvis/annotations/lvisv0.5+coco_train.json + +# TAO +# Generate filtered json file for QDTrack +python ./tools/dataset_converters/tao/tao2coco.py -i ./data/tao/annotations --filter-classes + +# LaSOT +python ./tools/dataset_converters/lasot/gen_lasot_infos.py -i ./data/LaSOT_full/LaSOTBenchmark -o ./data/LaSOT_full/annotations + +# UAV123 +# download annotations +# due to the annotations of all videos in UAV123 are inconsistent, we just download the information file generated in advance. 
+wget https://download.openmmlab.com/mmtracking/data/uav123_infos.txt -P data/UAV123/annotations + +# TrackingNet +# unzip files in 'data/trackingnet/*.zip' +bash ./tools/dataset_converters/trackingnet/unzip_trackingnet.sh ./data/TrackingNet +# generate annotations +python ./tools/dataset_converters/trackingnet/gen_trackingnet_infos.py -i ./data/TrackingNet -o ./data/TrackingNet/annotations + +# OTB100 +# unzip files in 'data/otb100/zips/*.zip' +bash ./tools/dataset_converters/otb100/unzip_otb100.sh ./data/OTB100 +# download annotations +# due to the annotations of all videos in OTB100 are inconsistent, we just need to download the information file generated in advance. +wget https://download.openmmlab.com/mmtracking/data/otb100_infos.txt -P data/OTB100/annotations + +# GOT10k +# unzip 'data/GOT10k/full_data/test_data.zip', 'data/GOT10k/full_data/val_data.zip' and files in 'data/GOT10k/full_data/train_data/*.zip' +bash ./tools/dataset_converters/got10k/unzip_got10k.sh ./data/GOT10k +# generate annotations +python ./tools/dataset_converters/got10k/gen_got10k_infos.py -i ./data/GOT10k -o ./data/GOT10k/annotations + +# VOT2018 +python ./tools/dataset_converters/vot/gen_vot_infos.py -i ./data/VOT2018 -o ./data/VOT2018/annotations --dataset_type vot2018 + +# YouTube-VIS 2019 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2019 -o ./data/youtube_vis_2019/annotations --version 2019 + +# YouTube-VIS 2021 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2021 -o ./data/youtube_vis_2021/annotations --version 2021 +``` + +The folder structure will be as following after your run these scripts: + +``` +mmtracking +├── mmtrack +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +│ ├── ILSVRC +│ │ ├── Data +│ │ │ ├── DET +| │ │ │ ├── train +| │ │ │ ├── val +| │ │ │ ├── test +│ │ │ ├── VID +| │ │ │ ├── train +| │ │ │ ├── val +| │ │ │ ├── test +│ │ ├── Annotations (the official annotation files) +│ │ │ ├── DET +| │ │ │ ├── train +| │ │ │ ├── val +│ │ │ ├── VID +| │ │ │ ├── train +| │ │ │ ├── val +│ │ ├── Lists +│ │ ├── annotations (the converted annotation files) +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | ├── test +| | ├── annotations +| | ├── reid +│ │ │ ├── imgs +│ │ │ ├── meta +│ │ +│ ├── DanceTrack +│ │ ├── train +│ │ ├── val +│ │ ├── test +│ │ ├── annotations +│ │ +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ ├── annotations +│ │ │ ├── crowdhuman_train.json +│ │ │ ├── crowdhuman_val.json +│ │ +│ ├── lvis +│ │ ├── train (the same as coco/train2017) +│ │ ├── val (the same as coco/val2017) +│ │ ├── test (the same as coco/test2017) +│ │ ├── annotations +│ │ │ ├── coco_to_lvis_synset.json +│ │ │ ├── lvisv0.5+coco_train.json +│ │ │ ├── lvis_v0.5_train.json +│ │ │ ├── lvis_v0.5_val.json +│ │ │ ├── lvis_v1_train.json +│ │ │ ├── lvis_v1_val.json +│ │ │ ├── lvis_v1_image_info_test_challenge.json +│ │ │ ├── lvis_v1_image_info_test_dev.json +│ │ +│ ├── tao +│ │ ├── annotations +│ │ │ ├── test_482_classes.json +│ │ │ ├── test_without_annotations.json +│ │ │ ├── train.json +│ │ │ ├── train_482_classes.json +│ │ │ ├── validation.json +│ │ │ ├── validation_482_classes.json +│ │ │ ├── ...... 
+│ │ ├── test +│ │ │ ├── ArgoVerse +│ │ │ ├── AVA +│ │ │ ├── BDD +│ │ │ ├── Charades +│ │ │ ├── HACS +│ │ │ ├── LaSOT +│ │ │ ├── YFCC100M +│ │ ├── train +│ │ ├── val +│ │ +│ ├── LaSOT_full +│ │ ├── LaSOTBenchmark +│ │ │ ├── airplane +| │ │ │ ├── airplane-1 +| │ │ │ ├── airplane-2 +| │ │ │ ├── ...... +│ │ │ ├── ...... +│ │ ├── annotations +│ │ +│ ├── UAV123 +│ │ ├── data_seq +│ │ │ ├── UAV123 +│ │ │ │ ├── bike1 +│ │ │ │ ├── boat1 +│ │ │ │ ├── ...... +│ │ ├── anno (the official annotation files) +│ │ │ ├── UAV123 +│ │ ├── annotations (the converted annotation file) +│ │ +│ ├── TrackingNet +│ │ ├── TEST +│ │ │ ├── anno (the official annotation files) +│ │ │ ├── zips +│ │ │ ├── frames (the unzipped folders) +│ │ │ │ ├── 0-6LB4FqxoE_0 +│ │ │ │ ├── 07Ysk1C0ZX0_0 +│ │ │ │ ├── ...... +│ │ ├── TRAIN_0 +│ │ │ ├── anno (the official annotation files) +│ │ │ ├── zips +│ │ │ ├── frames (the unzipped folders) +│ │ │ │ ├── -3TIfnTSM6c_2 +│ │ │ │ ├── a1qoB1eERn0_0 +│ │ │ │ ├── ...... +│ │ ├── ...... +│ │ ├── TRAIN_11 +│ │ ├── annotations (the converted annotation file) +│ │ +│ ├── OTB100 +│ │ ├── zips +│ │ │ ├── Basketball.zip +│ │ │ ├── Biker.zip +│ │ │ │── ...... +│ │ ├── annotations +│ │ ├── data +│ │ │ ├── Basketball +│ │ │ │ ├── img +│ │ │ ├── ...... +│ │ +│ ├── GOT10k +│ │ │── full_data +│ │ │ │── train_data +│ │ │ │ ├── GOT-10k_Train_split_01.zip +│ │ │ │ ├── ...... +│ │ │ │ ├── GOT-10k_Train_split_19.zip +│ │ │ │ ├── list.txt +│ │ │ │── test_data.zip +│ │ │ │── val_data.zip +│ │ │── train +│ │ │ ├── GOT-10k_Train_000001 +│ │ │ │ ├── ...... +│ │ │ ├── GOT-10k_Train_009335 +│ │ │ ├── list.txt +│ │ │── test +│ │ │ ├── GOT-10k_Test_000001 +│ │ │ │ ├── ...... +│ │ │ ├── GOT-10k_Test_000180 +│ │ │ ├── list.txt +│ │ │── val +│ │ │ ├── GOT-10k_Val_000001 +│ │ │ │ ├── ...... +│ │ │ ├── GOT-10k_Val_000180 +│ │ │ ├── list.txt +│ │ │── annotations +│ │ +| ├── VOT2018 +| | ├── data +| | | ├── ants1 +| │ │ │ ├──color +| | ├── annotations +│ │ │ ├── ...... +│ │ +│ ├── youtube_vis_2019 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── train.json (the official annotation files) +│ │ │── valid.json (the official annotation files) +│ │ │── test.json (the official annotation files) +│ │ │── annotations (the converted annotation file) +│ │ +│ ├── youtube_vis_2021 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── annotations (the converted annotation file) +``` + +#### The folder of annotations in ILSVRC + +There are 3 JSON files in `data/ILSVRC/annotations`: + +`imagenet_det_30plus1cls.json`: JSON file containing the annotations information of the training set in ImageNet DET dataset. The `30` in `30plus1cls` denotes the overlapped 30 categories in ImageNet VID dataset, and the `1cls` means we take the other 170 categories in ImageNet DET dataset as a category, named as `other_categeries`. + +`imagenet_vid_train.json`: JSON file containing the annotations information of the training set in ImageNet VID dataset. + +`imagenet_vid_val.json`: JSON file containing the annotations information of the validation set in ImageNet VID dataset. 
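+
+As a quick sanity check, the converted annotation files can be inspected with plain Python. The path below is one of the files listed above, and the keys are only printed if they exist, since the exact layout depends on the converter output:
+
+```python
+import json
+
+# Load one converted CocoVID-style annotation file.
+with open('data/ILSVRC/annotations/imagenet_vid_val.json') as f:
+    anno = json.load(f)
+
+# Show the top-level fields and how many entries each one holds.
+print(sorted(anno.keys()))
+for key in ('videos', 'images', 'annotations', 'categories'):
+    if key in anno:
+        print(key, len(anno[key]))
+```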
+
+#### The folder of annotations and reid in MOT15/MOT16/MOT17/MOT20
+
+We take the MOT17 dataset as an example; the other datasets share a similar structure.
+
+There are 8 JSON files in `data/MOT17/annotations`:
+
+`train_cocoformat.json`: JSON file containing the annotations information of the training set in the MOT17 dataset.
+
+`train_detections.pkl`: Pickle file containing the public detections of the training set in the MOT17 dataset.
+
+`test_cocoformat.json`: JSON file containing the annotations information of the testing set in the MOT17 dataset.
+
+`test_detections.pkl`: Pickle file containing the public detections of the testing set in the MOT17 dataset.
+
+`half-train_cocoformat.json`, `half-train_detections.pkl`, `half-val_cocoformat.json` and `half-val_detections.pkl` have similar meanings to `train_cocoformat.json` and `train_detections.pkl`. The `half` means we split each video in the training set into two halves. The first half of each video is denoted as the `half-train` set, and the second half is denoted as the `half-val` set.
+
+The structure of `data/MOT17/reid` is as follows:
+
+```
+reid
+├── imgs
+│   ├── MOT17-02-FRCNN_000002
+│   │   ├── 000000.jpg
+│   │   ├── 000001.jpg
+│   │   ├── ...
+│   ├── MOT17-02-FRCNN_000003
+│   │   ├── 000000.jpg
+│   │   ├── 000001.jpg
+│   │   ├── ...
+├── meta
+│   ├── train_80.txt
+│   ├── val_20.txt
+```
+
+The `80` in `train_80.txt` means the proportion of the training dataset to the whole ReID dataset is 80%, while the proportion of the validation dataset is 20%.
+
+For training, we provide an annotation list `train_80.txt`. Each line of the list contains a filename and its corresponding ground-truth label. The format is as follows:
+
+```
+MOT17-05-FRCNN_000110/000018.jpg 0
+MOT17-13-FRCNN_000146/000014.jpg 1
+MOT17-05-FRCNN_000088/000004.jpg 2
+MOT17-02-FRCNN_000009/000081.jpg 3
+```
+
+`MOT17-05-FRCNN_000110` denotes the 110-th person in the `MOT17-05-FRCNN` video.
+
+For validation, the annotation list `val_20.txt` follows the same format as above.
+
+Images in `reid/imgs` are cropped from the raw images in `MOT17/train` according to the corresponding `gt.txt`. The value of the ground-truth labels should fall in the range `[0, num_classes - 1]`.
+
+#### The folder of annotations in crowdhuman
+
+There are 2 JSON files in `data/crowdhuman/annotations`:
+
+`crowdhuman_train.json`: JSON file containing the annotations information of the training set in the CrowdHuman dataset.
+`crowdhuman_val.json`: JSON file containing the annotations information of the validation set in the CrowdHuman dataset.
+
+#### The folder of annotations in lvis
+
+There are 8 JSON files in `data/lvis/annotations`:
+
+`coco_to_lvis_synset.json`: JSON file containing the mapping relationship between COCO and LVIS categories.
+
+`lvisv0.5+coco_train.json`: JSON file containing the merged annotations.
+
+`lvis_v0.5_train.json`: JSON file containing the annotations information of the training set in lvisv0.5.
+
+`lvis_v0.5_val.json`: JSON file containing the annotations information of the validation set in lvisv0.5.
+
+`lvis_v1_train.json`: JSON file containing the annotations information of the training set in lvisv1.
+
+`lvis_v1_val.json`: JSON file containing the annotations information of the validation set in lvisv1.
+
+`lvis_v1_image_info_test_challenge.json`: JSON file containing the annotations information of the testing set in lvisv1 available for year-round evaluation.
+
+`lvis_v1_image_info_test_dev.json`: JSON file containing the annotations information of the testing set in lvisv1 available only once a year for the LVIS Challenge.
+
+#### The folder of annotations in tao
+
+There are 9 JSON files in `data/tao/annotations`:
+
+`test_categories.json`: JSON file containing a list of categories that will be evaluated on the TAO test set.
+
+`test_without_annotations.json`: JSON for test videos. The 'images' and 'videos' fields contain the images and videos that will be evaluated on the test set.
+
+`test_482_classes.json`: JSON file containing the converted results for the test set.
+
+`train.json`: JSON file containing annotations for LVIS categories in TAO train.
+
+`train_482_classes.json`: JSON file containing the converted results for the train set.
+
+`train_with_freeform.json`: JSON file containing annotations for all categories in TAO train.
+
+`validation.json`: JSON file containing annotations for LVIS categories in TAO validation.
+
+`validation_482_classes.json`: JSON file containing the converted results for the validation set.
+
+`validation_with_freeform.json`: JSON file containing annotations for all categories in TAO validation.
+
+#### The folder of annotations in LaSOT
+
+There are 2 JSON files in `data/LaSOT_full/annotations`:
+
+`lasot_train.json`: JSON file containing the annotations information of the training set in the LaSOT dataset.
+`lasot_test.json`: JSON file containing the annotations information of the testing set in the LaSOT dataset.
+
+There are 2 TEXT files in `data/LaSOT_full/annotations`:
+
+`lasot_train_infos.txt`: TEXT file containing the annotations information of the training set in the LaSOT dataset.
+`lasot_test_infos.txt`: TEXT file containing the annotations information of the testing set in the LaSOT dataset.
+
+#### The folder of annotations in UAV123
+
+There is only 1 JSON file in `data/UAV123/annotations`:
+
+`uav123.json`: JSON file containing the annotations information of the UAV123 dataset.
+
+There is only 1 TEXT file in `data/UAV123/annotations`:
+
+`uav123_infos.txt`: TEXT file containing the information of the UAV123 dataset.
+
+#### The folder of frames and annotations in TrackingNet
+
+There are 511 video directories of the TrackingNet test set in `data/TrackingNet/TEST/frames`, and each video directory contains all images of the video. Similar file structures can be seen in `data/TrackingNet/TRAIN_{*}/frames`.
+
+There are 2 JSON files in `data/TrackingNet/annotations`:
+
+`trackingnet_test.json`: JSON file containing the annotations information of the testing set in the TrackingNet dataset.
+`trackingnet_train.json`: JSON file containing the annotations information of the training set in the TrackingNet dataset.
+
+There are 2 TEXT files in `data/TrackingNet/annotations`:
+
+`trackingnet_test_infos.txt`: TEXT file containing the information of the testing set in the TrackingNet dataset.
+`trackingnet_train_infos.txt`: TEXT file containing the information of the training set in the TrackingNet dataset.
+
+#### The folder of data and annotations in OTB100
+
+There are 98 video directories of the OTB100 dataset in `data/OTB100/data`, and the `img` folder under each video directory contains all images of the video.
+
+There is only 1 JSON file in `data/OTB100/annotations`:
+
+`otb100.json`: JSON file containing the annotations information of the OTB100 dataset.
+
+There is only 1 TEXT file in `data/OTB100/annotations`:
+
+`otb100_infos.txt`: TEXT file containing the information of the OTB100 dataset.
+
+#### The folder of frames and annotations in GOT10k
+
+There are training video directories in `data/GOT10k/train`, and each video directory contains all images of the video. Similar file structures can be seen in `data/GOT10k/test` and `data/GOT10k/val`.
+
+There are 3 JSON files in `data/GOT10k/annotations`:
+
+`got10k_train.json`: JSON file containing the annotations information of the training set in the GOT10k dataset.
+
+`got10k_test.json`: JSON file containing the annotations information of the testing set in the GOT10k dataset.
+
+`got10k_val.json`: JSON file containing the annotations information of the validation set in the GOT10k dataset.
+
+There are 5 TEXT files in `data/GOT10k/annotations`:
+
+`got10k_train_infos.txt`: TEXT file containing the information of the training set in the GOT10k dataset.
+
+`got10k_test_infos.txt`: TEXT file containing the information of the testing set in the GOT10k dataset.
+
+`got10k_val_infos.txt`: TEXT file containing the information of the validation set in the GOT10k dataset.
+
+`got10k_train_vot_infos.txt`: TEXT file containing the information of the `train_vot` split in the GOT10k dataset.
+
+`got10k_val_vot_infos.txt`: TEXT file containing the information of the `val_vot` split in the GOT10k dataset.
+
+#### The folder of data and annotations in VOT2018
+
+There are 60 video directories of the VOT2018 dataset in `data/VOT2018/data`, and the `color` folder under each video directory contains all images of the video.
+
+There is only 1 JSON file in `data/VOT2018/annotations`:
+
+`vot2018.json`: JSON file containing the annotations information of the VOT2018 dataset.
+
+There is only 1 TEXT file in `data/VOT2018/annotations`:
+
+`vot2018_infos.txt`: TEXT file containing the information of the VOT2018 dataset.
+
+#### The folder of annotations in youtube_vis_2019/youtube_vis_2021
+
+There are 3 JSON files in `data/youtube_vis_2019/annotations` or `data/youtube_vis_2021/annotations`:
+
+`youtube_vis_2019_train.json`/`youtube_vis_2021_train.json`: JSON file containing the annotations information of the training set in the youtube_vis_2019/youtube_vis_2021 dataset.
+
+`youtube_vis_2019_valid.json`/`youtube_vis_2021_valid.json`: JSON file containing the annotations information of the validation set in the youtube_vis_2019/youtube_vis_2021 dataset.
+
+`youtube_vis_2019_test.json`/`youtube_vis_2021_test.json`: JSON file containing the annotations information of the testing set in the youtube_vis_2019/youtube_vis_2021 dataset.
diff --git a/docs/en/user_guides/3_inference.md b/docs/en/user_guides/3_inference.md
new file mode 100644
index 000000000..747af194b
--- /dev/null
+++ b/docs/en/user_guides/3_inference.md
@@ -0,0 +1,134 @@
+# Inference
+
+We provide demo scripts to run inference on a given video or on a folder that contains consecutive images. The source code is available [here](https://github.com/open-mmlab/mmtracking/tree/dev-1.x/demo/).
+
+Note that if you use a folder as the input, the image names there must be **sortable**, which means we can re-order the images according to the numbers contained in the filenames. We now only support reading images whose filenames end with `.jpg`, `.jpeg` and `.png`.
+
+## Inference VID models
+
+This script can perform inference on an input video with a video object detection model.
+
+```
+python demo/demo_vid.py \
+    ${CONFIG_FILE} \
+    --input ${INPUT} \
+    --checkpoint ${CHECKPOINT_FILE} \
+    [--output ${OUTPUT}] \
+    [--device ${DEVICE}] \
+    [--show]
+```
+
+The `INPUT` and `OUTPUT` support both _mp4 video_ format and the _folder_ format.
+
+Optional arguments:
+
+- `OUTPUT`: Output of the visualized demo. If not specified, the `--show` option is required to show the video on the fly.
+- `DEVICE`: The device for inference.
Options are `cpu` or `cuda:0`, etc. +- `--show`: Whether show the video on the fly. + +**Examples:** + +Assume that you have already downloaded the checkpoints to the directory `checkpoints/`, your video filename is `demo.mp4`, and your output path is the `./outputs/` + +```shell +python ./demo/demo_vid.py \ + configs/vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py \ +    --input ./demo.mp4 \ +    --checkpoint checkpoints/selsa_faster_rcnn_r101_dc5_1x_imagenetvid_20201218_172724-aa961bcc.pth \ +    --output ./outputs/ \ +    --show +``` + +## Inference MOT/VIS models + +This script can inference an input video / images with a multiple object tracking or video instance segmentation model. + +```shell +python demo/demo_mot_vis.py \ + ${CONFIG_FILE} \ + --input ${INPUT} \ + [--output ${OUTPUT}] \ + [--checkpoint ${CHECKPOINT_FILE}] \ + [--score-thr ${SCORE_THR} \ + [--device ${DEVICE}] \ + [--show] +``` + +The `INPUT` and `OUTPUT` support both _mp4 video_ format and the _folder_ format. + +**Important:** For `DeepSORT`, `SORT`, `Tracktor`, `StrongSORT`, they need both the weight of the `reid` and the weight of the `detector`. Therefore, we can't use `--checkpoint` to specify it. We need to use `init_cfg` in the configuration file to set the weight path. Other algorithms such as `ByteTrack`, `OCSORT` and `QDTrack` need not pay attention to this. + +Optional arguments: + +- `OUTPUT`: Output of the visualized demo. If not specified, the `--show` is obligate to show the video on the fly. +- `CHECKPOINT_FILE`: The checkpoint is optional in case that you already set up the pretrained models in the config by the key `init_cfg`. +- `SCORE_THR`: The threshold of score to filter bboxes. +- `DEVICE`: The device for inference. Options are `cpu` or `cuda:0`, etc. +- `--show`: Whether show the video on the fly. + +**Examples of running mot model:** + +```shell +# Example 1: do not specify --checkpoint to use the default init_cfg +python demo/demo_mot_vis.py \ + configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --output mot.mp4 + +# Example 2: use --checkpoint +python demo/demo_mot_vis.py \ + configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py \ + --input demo/demo.mp4 \ + --checkpoint checkpoints/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth \ + --output mot.mp4 +``` + +**Examples of running vis model:** + +Assume that you have already downloaded the checkpoints to the directory `checkpoints/`, your video filename is `demo.mp4`, and your output path is the `./outputs/` + +```shell +python demo/demo_mot_vis.py \ + configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py \ + --input demo.mp4 \ + --checkpoint checkpoints/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth \ + --output ./outputs/ \ + --show +``` + +## Inference SOT models + +This script can inference an input video with a single object tracking model. + +```shell +python demo/demo_sot.py \ + ${CONFIG_FILE}\ + --input ${INPUT} \ + --checkpoint ${CHECKPOINT_FILE} \ + [--output ${OUTPUT}] \ + [--device ${DEVICE}] \ + [--show] \ + [--gt_bbox_file ${GT_BBOX_FILE}] +``` + +The `INPUT` and `OUTPUT` support both _mp4 video_ format and the _folder_ format. + +Optional arguments: + +- `OUTPUT`: Output of the visualized demo. If not specified, the `--show` is obligate to show the video on the fly. +- `DEVICE`: The device for inference. 
Options are `cpu` or `cuda:0`, etc. +- `--show`: Whether show the video on the fly. +- `GT_BBOX_FILE`: The gt_bbox file path of the video. We only use the gt_bbox of the first frame. If not specified, you would draw init bbox of the video manually. + +**Examples:** + +Assume that you have already downloaded the checkpoints to the directory `checkpoints/` + +```shell +python ./demo/demo_sot.py \ +    configs/sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py \ + --input ${VIDEO_FILE} \ +    --checkpoint checkpoints/siamese_rpn_r50_1x_lasot_20211203_151612-da4b3c66.pth \ +    --output ${OUTPUT} \ +    --show +``` diff --git a/docs/en/user_guides/4_train_test.md b/docs/en/user_guides/4_train_test.md new file mode 100644 index 000000000..db318b5ed --- /dev/null +++ b/docs/en/user_guides/4_train_test.md @@ -0,0 +1,225 @@ +# Learn to train and test + +## Train + +This section will show how to train existing models on supported datasets. +The following training environments are supported: + +- CPU +- single GPU +- single node multiple GPUs +- multiple nodes + +You can also manage jobs with Slurm. + +Important: + +- You can change the evaluation interval during training by modifying the `train_cfg` as + `train_cfg = dict(val_interval=10)`. That means evaluating the model every 10 epochs. +- The default learning rate in all config files is for 8 GPUs. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), + you need to set the learning rate proportional to the batch size if you use different GPUs or images per GPU, + e.g., `lr=0.01` for 8 GPUs * 1 img/gpu and lr=0.04 for 16 GPUs * 2 imgs/gpu. +- During training, log files and checkpoints will be saved to the working directory, + which is specified by CLI argument `--work-dir`. It uses `./work_dirs/CONFIG_NAME` as default. +- If you want the mixed precision training, simply specify CLI argument `--amp`. + +#### 1. Train on CPU + +The model is default put on cuda device. +Only if there are no cuda devices, the model will be put on cpu. +So if you want to train the model on CPU, you need to `export CUDA_VISIBLE_DEVICES=-1` to disable GPU visibility first. +More details in [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850). + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +An example of training the VID model DFF on CPU: + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/train.py configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py +``` + +#### 2. Train on single GPU + +If you want to train the model on single GPU, you can directly use the `tools/train.py` as follows. + +```shell script +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +You can use `export CUDA_VISIBLE_DEVICES=$GPU_ID` to select the GPU. + +An example of training the MOT model ByteTrack on single GPU: + +```shell script +CUDA_VISIBLE_DEVICES=2 python tools/train.py configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py +``` + +#### 3. Train on single node multiple GPUs + +We provide `tools/dist_train.sh` to launch training on multiple GPUs. +The basic usage is as follows. 
+
+#### 1. Train on CPU
+
+By default the model is put on a CUDA device, and it falls back to CPU only if no CUDA device is available.
+So if you want to train the model on CPU, you need to `export CUDA_VISIBLE_DEVICES=-1` to disable GPU visibility first.
+See [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850) for more details.
+
+```shell script
+CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+An example of training the VID model DFF on CPU:
+
+```shell script
+CUDA_VISIBLE_DEVICES=-1 python tools/train.py configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py
+```
+
+#### 2. Train on single GPU
+
+If you want to train the model on a single GPU, you can directly use `tools/train.py` as follows.
+
+```shell script
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+You can use `export CUDA_VISIBLE_DEVICES=$GPU_ID` to select the GPU.
+
+An example of training the MOT model ByteTrack on a single GPU:
+
+```shell script
+CUDA_VISIBLE_DEVICES=2 python tools/train.py configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
+```
+
+#### 3. Train on single node multiple GPUs
+
+We provide `tools/dist_train.sh` to launch training on multiple GPUs.
+The basic usage is as follows.
+
+```shell script
+bash ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+If you would like to launch multiple jobs on a single machine,
+e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflicts.
+
+For example, you can set the port in the commands as follows.
+
+```shell script
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+An example of training the SOT model SiameseRPN++ on a single node with multiple GPUs:
+
+```shell script
+bash ./tools/dist_train.sh ./configs/sot/siamese_rpn/siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py 8
+```
+
+#### 4. Train on multiple nodes
+
+If you launch on multiple machines simply connected via Ethernet, you can run the following commands:
+
+On the first machine:
+
+```shell script
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
+```
+
+On the second machine:
+
+```shell script
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
+```
+
+This is usually slow if you do not have high-speed networking such as InfiniBand.
+
+#### 5. Train with Slurm
+
+[Slurm](https://slurm.schedmd.com/) is a widely used job scheduling system for computing clusters.
+On a cluster managed by Slurm, you can use `slurm_train.sh` to spawn training jobs.
+It supports both single-node and multi-node training.
+
+The basic usage is as follows.
+
+```shell script
+bash ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} ${GPUS}
+```
+
+An example of training the VIS model MaskTrack R-CNN with Slurm:
+
+```shell script
+PORT=29501 \
+GPUS_PER_NODE=8 \
+SRUN_ARGS="--quotatype=reserved" \
+bash ./tools/slurm_train.sh \
+mypartition \
+masktrack \
+configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py \
+./work_dirs/MaskTrack_RCNN \
+8
+```
+
+## Test
+
+This section will show how to test existing models on supported datasets.
+The following testing environments are supported:
+
+- CPU
+- single GPU
+- single node multiple GPUs
+- multiple nodes
+
+You can also manage jobs with Slurm.
+
+Important:
+
+- You can set the path for saving results by modifying the key `outfile_prefix` in the evaluator.
+  For example, `val_evaluator = dict(outfile_prefix='results/stark_st1_trackingnet')`.
+  Otherwise, a temporary file will be created and removed after evaluation.
+- If you just want the formatted results without evaluation, you can set `format_only=True`.
+  For example, `test_evaluator = dict(type='YouTubeVISMetric', metric='youtube_vis_ap', outfile_prefix='./youtube_vis_results', format_only=True)`
+
+#### 1. Test on CPU
+
+By default the model is put on a CUDA device, and it falls back to CPU only if no CUDA device is available.
+So if you want to test the model on CPU, you need to `export CUDA_VISIBLE_DEVICES=-1` to disable GPU visibility first.
+See [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850) for more details.
+ +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/test.py ${CONFIG_FILE} [optional arguments] +``` + +An example of testing the VID model DFF on CPU: + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/test.py configs/vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py --checkpoint https://download.openmmlab.com/mmtracking/vid/dff/dff_faster_rcnn_r50_dc5_1x_imagenetvid/dff_faster_rcnn_r50_dc5_1x_imagenetvid_20201227_213250-548911a4.pth +``` + +#### 2. Test on single GPU + +If you want to test the model on single GPU, you can directly use the `tools/test.py` as follows. + +```shell script +python tools/test.py ${CONFIG_FILE} [optional arguments] +``` + +You can use `export CUDA_VISIBLE_DEVICES=$GPU_ID` to select the GPU. + +An example of testing the MOT model ByteTrack on single GPU: + +```shell script +CUDA_VISIBLE_DEVICES=2 python tools/test.py configs/mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py --checkpoint https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth +``` + +#### 3. Test on single node multiple GPUs + +We provide `tools/dist_test.sh` to launch testing on multiple GPUs. +The basic usage is as follows. + +```shell script +bash ./tools/dist_test.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] +``` + +An example of testing the SOT model SiameseRPN++ on single node multiple GPUs: + +```shell script +bash ./tools/dist_test.sh ./configs/sot/siamese_rpn/siamese-rpn_r50_8xb16-20e_imagenetvid-imagenetdet-coco_test-otb100.py 8 --checkpoint https://download.openmmlab.com/mmtracking/sot/siamese_rpn/siamese_rpn_r50_1x_otb100/siamese_rpn_r50_20e_otb100_20220421_144232-6b8f1730.pth +``` + +#### 4. Test on multiple nodes + +You can test on multiple nodes, which is similar with "Train on multiple nodes". + +#### 5. Test with Slurm + +On a cluster managed by Slurm, you can use `slurm_test.sh` to spawn testing jobs. +It supports both single-node and multi-node testing. + +The basic usage is as follows. + +```shell script +bash ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${GPUS} +``` + +An example of testing the VIS model MaskTrack R-CNN with Slurm: + +```shell script +PORT=29501 \ +GPUS_PER_NODE=8 \ +SRUN_ARGS="--quotatype=reserved" \ +bash ./tools/slurm_test.sh \ +mypartition \ +masktrack \ +configs/vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py \ +8 \ +--checkpoint https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth +``` diff --git a/docs/en/user_guides/5_visualization.md b/docs/en/user_guides/5_visualization.md new file mode 100644 index 000000000..be7ae2ba7 --- /dev/null +++ b/docs/en/user_guides/5_visualization.md @@ -0,0 +1,67 @@ +# Learn about Visualization + +## Local Visualization + +This section will present how to visualize the detection/tracking results with local visualizer. + +If you want to draw prediction results, you can turn this feature on by setting `draw=True` in `TrackVisualizationHook` as follows. + +```shell script +default_hooks = dict(visualization=dict(type='TrackVisualizationHook', draw=True)) +``` + +Specifically, the `TrackVisualizationHook` has the following arguments: + +- `draw`: whether to draw prediction results. If it is False, it means that no drawing will be done. Defaults to False. +- `interval`: The interval of visualization. 
Defaults to 30.
+- `score_thr`: The threshold to visualize the bboxes and masks. Defaults to 0.3.
+- `show`: Whether to display the drawn image. Defaults to False.
+- `wait_time`: The interval of show in seconds. Defaults to 0.
+- `test_out_dir`: The directory where painted images are saved during testing.
+- `file_client_args`: Arguments to instantiate a FileClient. Defaults to `dict(backend='disk')`.
+
+In the `TrackVisualizationHook`, a visualizer is called to do the actual drawing,
+i.e., `DetLocalVisualizer` for the VID task and `TrackLocalVisualizer` for the MOT, VIS, SOT and VOS tasks.
+We present the details below.
+You can refer to MMEngine for more details about [Visualization](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) and [Hook](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md).
+
+#### Detection Visualization
+
+Detection visualization is implemented with the class `DetLocalVisualizer`.
+You can configure it as follows.
+
+```python
+visualizer = dict(type='DetLocalVisualizer')
+```
+
+It has the following arguments:
+
+- `name`: Name of the instance. Defaults to 'visualizer'.
+- `image`: The original image to draw on. The format should be RGB. Defaults to None.
+- `vis_backends`: Visual backend config list. Defaults to None.
+- `save_dir`: Save directory for all storage backends. If it is None, the backends will not save any data.
+- `bbox_color`: Color of bbox lines. The tuple of color should be in BGR order. Defaults to None.
+- `text_color`: Color of texts. The tuple of color should be in BGR order. Defaults to (200, 200, 200).
+- `line_width`: The width of lines. Defaults to 3.
+- `alpha`: The transparency of bboxes or masks. Defaults to 0.8.
+
+Here is a visualization example of DFF:
+
+![test_img_29](https://user-images.githubusercontent.com/99722489/186062793-623f6b1e-163e-4e1a-aa79-efea2d97a16d.png)
+
+#### Tracking Visualization
+
+Tracking visualization is implemented with the class `TrackLocalVisualizer`.
+You can configure it as follows.
+
+```python
+visualizer = dict(type='TrackLocalVisualizer')
+```
+
+It has the following arguments, which have the same meaning as those in `DetLocalVisualizer`:
+
+`name`, `image`, `vis_backends`, `save_dir`, `line_width`, `alpha`.
+
+Here is a visualization example of DeepSORT:
+
+![test_img_89](https://user-images.githubusercontent.com/99722489/186062929-6d0e4663-0d8e-4045-9ec8-67e0e41da876.png)
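+
+Putting the two pieces together, a typical test-time visualization setup turns on drawing in the hook and picks the visualizer that matches your task. The snippet below is a minimal sketch for a tracking task; `test_out_dir` is an assumed output folder name, and for the VID task you would use `DetLocalVisualizer` instead.
+
+```python
+# Sketch: draw tracking results on every image during testing
+# and save the painted images to ./vis_results.
+default_hooks = dict(
+    visualization=dict(
+        type='TrackVisualizationHook',
+        draw=True,
+        interval=1,
+        score_thr=0.3,
+        test_out_dir='vis_results'))
+visualizer = dict(type='TrackLocalVisualizer', name='visualizer')
+```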
diff --git a/docs/en/user_guides/6_analysis_tools.md b/docs/en/user_guides/6_analysis_tools.md
new file mode 100644
index 000000000..fff8c88b6
--- /dev/null
+++ b/docs/en/user_guides/6_analysis_tools.md
@@ -0,0 +1,235 @@
+**We provide lots of useful tools under the `tools/` directory.**
+
+## MOT Test-time Parameter Search
+
+`tools/analysis_tools/mot/mot_param_search.py` can search the parameters of the `tracker` in MOT models.
+It is used in the same manner as `tools/test.py` but **differs** in the configs.
+
+Here is an example that shows how to modify the configs:
+
+1. Define the desired evaluation metrics to record.
+
+   For example, you can define the `evaluator` as
+
+   ```python
+   test_evaluator=dict(type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity'])
+   ```
+
+   Of course, you can also customize the content of `metric` in `test_evaluator`. You are free to choose one or more of `['HOTA', 'CLEAR', 'Identity']`.
+
+2. Define the parameters and the values to search.
+
+   Assume you have a tracker like
+
+   ```python
+   model=dict(
+       tracker=dict(
+           type='BaseTracker',
+           obj_score_thr=0.5,
+           match_iou_thr=0.5
+       )
+   )
+   ```
+
+   If you want to search the parameters of the tracker, just change each value to a list as follows
+
+   ```python
+   model=dict(
+       tracker=dict(
+           type='BaseTracker',
+           obj_score_thr=[0.4, 0.5, 0.6],
+           match_iou_thr=[0.4, 0.5, 0.6, 0.7]
+       )
+   )
+   ```
+
+   Then the script will test all 12 cases and log the results.
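+
+To see where the 12 comes from: the search simply enumerates the Cartesian product of the candidate lists above (3 x 4 = 12 combinations). The snippet below only illustrates how the combinations expand; it is not the implementation of the search script itself.
+
+```python
+# Illustration only: expanding the searched tracker parameters into 12 cases.
+from itertools import product
+
+obj_score_thrs = [0.4, 0.5, 0.6]
+match_iou_thrs = [0.4, 0.5, 0.6, 0.7]
+
+for i, (score_thr, iou_thr) in enumerate(product(obj_score_thrs, match_iou_thrs)):
+    print(f'case {i}: obj_score_thr={score_thr}, match_iou_thr={iou_thr}')
+```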
+
+## MOT Error Visualize
+
+`tools/analysis_tools/mot/mot_error_visualize.py` can visualize errors for multiple object tracking.
+This script needs the inference results. By default, a **red** bounding box denotes a false positive, a **yellow** bounding box denotes a false negative and a **blue** bounding box denotes an ID switch.
+
+```shell
+python tools/analysis_tools/mot/mot_error_visualize.py \
+    ${CONFIG_FILE} \
+    --input ${INPUT} \
+    --result-dir ${RESULT_DIR} \
+    [--out-dir ${OUTPUT}] \
+    [--fps ${FPS}] \
+    [--show] \
+    [--backend ${BACKEND}]
+```
+
+The `RESULT_DIR` contains the inference results of all videos, and each inference result is a `txt` file.
+
+Optional arguments:
+
+- `OUTPUT`: Output of the visualized demo. If not specified, `--show` is required to show the video on the fly.
+- `FPS`: FPS of the output video.
+- `--show`: Whether to show the video on the fly.
+- `BACKEND`: The backend to visualize the boxes. Options are `cv2` and `plt`.
+
+## SiameseRPN++ Test-time Parameter Search
+
+`tools/analysis_tools/sot/sot_siamrpn_param_search.py` can search the test-time tracking parameters in SiameseRPN++: `penalty_k`, `lr` and `window_influence`. You need to pass the search range of each parameter to the argument parser.
+
+**Example on UAV123 dataset:**
+
+```shell
+./tools/analysis_tools/sot/dist_sot_siamrpn_param_search.sh [${CONFIG_FILE}] [$GPUS] \
+[--checkpoint ${CHECKPOINT}] [--log ${LOG_FILENAME}] [--eval ${EVAL}] \
+[--penalty-k-range 0.01,0.22,0.05] [--lr-range 0.4,0.61,0.05] [--win-infu-range 0.01,0.22,0.05]
+```
+
+**Example on OTB100 dataset:**
+
+```shell
+./tools/analysis_tools/sot/dist_sot_siamrpn_param_search.sh [${CONFIG_FILE}] [$GPUS] \
+[--checkpoint ${CHECKPOINT}] [--log ${LOG_FILENAME}] [--eval ${EVAL}] \
+[--penalty-k-range 0.3,0.45,0.02] [--lr-range 0.35,0.5,0.02] [--win-infu-range 0.46,0.55,0.02]
+```
+
+**Example on VOT2018 dataset:**
+
+```shell
+./tools/analysis_tools/sot/dist_sot_siamrpn_param_search.sh [${CONFIG_FILE}] [$GPUS] \
+[--checkpoint ${CHECKPOINT}] [--log ${LOG_FILENAME}] [--eval ${EVAL}] \
+[--penalty-k-range 0.01,0.31,0.05] [--lr-range 0.2,0.51,0.05] [--win-infu-range 0.3,0.56,0.05]
+```
+
+## Log Analysis
+
+`tools/analysis_tools/analyze_logs.py` plots loss/mAP curves given a training log file.
+
+```shell
+python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
+```
+
+**Examples:**
+
+- Plot the classification loss of a run.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+  ```
+
+- Plot the classification and regression loss of a run, and save the figure to a PDF.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf
+  ```
+
+- Compare the bbox mAP of two runs in the same figure.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2
+  ```
+
+- Compute the average training speed.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers]
+  ```
+
+  The output is expected to be like the following.
+
+  ```text
+  -----Analyze train time of work_dirs/some_exp/20190611_192040.log.json-----
+  slowest epoch 11, average time is 1.2024
+  fastest epoch 1, average time is 1.1909
+  time std over epochs is 0.0028
+  average iter time: 1.1959 s/iter
+  ```
+
+## Browse dataset
+
+`tools/analysis_tools/browse_dataset.py` can visualize the training dataset to check whether the dataset configuration is correct.
+
+**Examples:**
+
+```shell
+python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [--show-interval ${SHOW_INTERVAL}]
+```
+
+Optional arguments:
+
+- `SHOW_INTERVAL`: The display interval in seconds.
+- `--not-show`: Do not show the images on the fly.
+
+## Show SOT evaluation results at the video level
+
+The SOT evaluation results are sorted at the video level from largest to smallest by the Success metric.
+You can selectively show the performance of good cases or bad cases by setting `eval_show_video_indices`.
+
+```python
+test_evaluator=dict(
+    type='SOTMetric',
+    options_after_eval=dict(eval_show_video_indices=10))
+```
+
+Here, `eval_show_video_indices` is used to index a `numpy.ndarray`.
+It can be an `int` (positive or negative) or a `list`. A positive number `k` means the top-k
+results, while a negative number means the bottom-k results.
+
+## Save SOT evaluation results and plot them
+
+Save the SOT evaluation results by setting the `SOTMetric` in the config.
+
+```python
+test_evaluator = dict(
+    type='SOTMetric',
+    options_after_eval=dict(tracker_name='SiamRPN++', saved_eval_res_file='./results/sot_results.json'))
+```
+
+The saved result is a dict in the format:
+
+```python
+dict(tracker_name=dict(
+    success=np.ndarray,
+    norm_precision=np.ndarray,
+    precision=np.ndarray))
+```
+
+The metrics have shape (M, ), where M is the number of values corresponding to different thresholds.
+
+Given the saved results, you can plot them using the following command:
+
+```shell
+python ./tools/analysis_tools/sot/sot_plot_curve.py ./results --plot_save_path ./results
+```
+
+## Save tracked results and play them back
+
+Save the tracked results by setting the `SOTMetric` in the config.
+ +```python +test_evaluator = dict( + type='SOTMetric', + options_after_eval = dict(saved_track_res_path='./tracked_results')) +``` + +Playback the tracked results using the following command: + +```shell +python ./tools/analysis_tools/sot/sot_playback.py data/OTB100/data/Basketball/img/ tracked_results/basketball.txt --show --output results/basketball.mp4 --fps 20 --gt_bboxes data/OTB100/data/Basketball/groundtruth_rect.txt +``` + +## Visualization of feature map + +Here is an example of calling the Visualizer in MMEngine: + +```python +# call visualizer at any position +visualizer = Visualizer.get_current_instance() +# set the image as background +visualizer.set_image(image=image) +# draw feature map on the image +drawn_img = visualizer.draw_featmap(feature_map, image, channel_reduction='squeeze_mean') +# show +visualizer.show(drawn_img) +# saved as ${saved_dir}/vis_data/vis_image/feat_0.png +visualizer.add_image('feature_map', drawn_img) +``` + +More details about visualization of feature map can be seen in [visualizer docs](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/visualization.md) and [draw_featmap function](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/visualizer.py#L864) diff --git a/docs/en/user_guides/index.rst b/docs/en/user_guides/index.rst new file mode 100644 index 000000000..edb298a73 --- /dev/null +++ b/docs/en/user_guides/index.rst @@ -0,0 +1,19 @@ +Train & Test +************** + +.. toctree:: + :maxdepth: 1 + + 1_config.md + 2_dataset_prepare.md + 3_inference.md + 4_train_test.md + +Useful Tools +************* + +.. toctree:: + :maxdepth: 1 + + 5_visualization.md + 6_analysis_tools.md diff --git a/docs/zh_cn/Makefile b/docs_deprecated/en/Makefile similarity index 100% rename from docs/zh_cn/Makefile rename to docs_deprecated/en/Makefile diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs_deprecated/en/_static/css/readthedocs.css similarity index 100% rename from docs/zh_cn/_static/css/readthedocs.css rename to docs_deprecated/en/_static/css/readthedocs.css diff --git a/docs/zh_cn/_static/image/mmtrack-logo.png b/docs_deprecated/en/_static/image/mmtrack-logo.png similarity index 100% rename from docs/zh_cn/_static/image/mmtrack-logo.png rename to docs_deprecated/en/_static/image/mmtrack-logo.png diff --git a/docs/zh_cn/api.rst b/docs_deprecated/en/api.rst similarity index 97% rename from docs/zh_cn/api.rst rename to docs_deprecated/en/api.rst index 8b2f1ec69..eb8b02fa5 100644 --- a/docs/zh_cn/api.rst +++ b/docs_deprecated/en/api.rst @@ -46,7 +46,7 @@ datasets parsers ^^^^^^^^^^ -.. automodule:: mmtrack.datasets.parsers +.. automodule:: mmtrack.datasets.api_wrappers :members: pipelines diff --git a/docs/en/changelog.md b/docs_deprecated/en/changelog.md similarity index 100% rename from docs/en/changelog.md rename to docs_deprecated/en/changelog.md diff --git a/docs_deprecated/en/conf.py b/docs_deprecated/en/conf.py new file mode 100644 index 000000000..37f02d91d --- /dev/null +++ b/docs_deprecated/en/conf.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'MMTracking' +copyright = '2018-2021, OpenMMLab' +author = 'MMTracking Authors' +version_file = '../../mmtrack/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'recommonmark', + 'sphinx_markdown_tables', + 'sphinx_copybutton', +] + +autodoc_mock_imports = [ + 'matplotlib', 'pycocotools', 'terminaltables', 'mmtrack.version', + 'seaborn', 'motmetrics', 'torchvision', 'pandas', 'scipy' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The master toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'sphinx_rtd_theme' +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = { + # 'logo_url': 'https://mmtracking.readthedocs.io/en/latest/', + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmtracking' + }, + { + 'name': + 'Upstream', + 'children': [ + { + 'name': 'MMCV', + 'url': 'https://github.com/open-mmlab/mmcv', + }, + { + 'name': 'MMDetection', + 'url': 'https://github.com/open-mmlab/mmdetection', + }, + ] + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] +html_css_files = ['css/readthedocs.css'] + +language = 'en' + +# Enable ::: for my_st +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + + +def builder_inited_handler(app): + subprocess.run(['./stat.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/en/dataset.md b/docs_deprecated/en/dataset.md similarity index 93% rename from docs/en/dataset.md rename to docs_deprecated/en/dataset.md index bfe45a296..2548508eb 100644 --- a/docs/en/dataset.md +++ b/docs_deprecated/en/dataset.md @@ -46,14 +46,14 @@ Please download the datasets from the official websites. It is recommended to sy ```shell # download OTB100 dataset by web crawling -python ./tools/convert_datasets/otb100/download_otb100.py -o ./data/otb100/zips -p 8 +python ./tools/convert_datasets/otb100/download_otb100.py -o ./data/OTB100/zips -p 8 ``` - For VOT2018, we use the official downloading script. ```shell # download VOT2018 dataset by web crawling -python ./tools/convert_datasets/vot/download_vot.py --dataset vot2018 --save_path ./data/vot2018/data +python ./tools/convert_datasets/vot/download_vot.py --dataset vot2018 --save_path ./data/VOT2018/data ``` #### 1.4 Video Instance Segmentation @@ -146,7 +146,7 @@ mmtracking │ │ ├── train │ │ ├── val │ │ -│ ├── lasot +│ ├── LaSOT_full │ │ ├── LaSOTBenchmark │ │ │ ├── airplane | │ │ │ ├── airplane-1 @@ -163,19 +163,19 @@ mmtracking │ │ ├── anno │ │ │ ├── UAV123 │ │ -│ ├── trackingnet +│ ├── TrackingNet │ │ ├── TEST.zip │ │ ├── TRAIN_0.zip │ │ ├── ...... │ │ ├── TRAIN_11.zip │ │ -│ ├── otb100 +│ ├── OTB100 │ │ │── zips │ │ │ │── Basketball.zip │ │ │ │── Biker.zip │ │ │ │── │ │ -│ ├── got10k +│ ├── GOT10k │ │ │── full_data │ │ │ │── train_data │ │ │ │ ├── GOT-10k_Train_split_01.zip @@ -185,7 +185,7 @@ mmtracking │ │ │ │── test_data.zip │ │ │ │── val_data.zip │ │ -| ├── vot2018 +| ├── VOT2018 | | ├── data | | | ├── ants1 | │ │ │ ├──color @@ -251,34 +251,34 @@ python ./tools/convert_datasets/tao/merge_coco_with_lvis.py --lvis ./data/lvis/a python ./tools/convert_datasets/tao/tao2coco.py -i ./data/tao/annotations --filter-classes # LaSOT -python ./tools/convert_datasets/lasot/gen_lasot_infos.py -i ./data/lasot/LaSOTBenchmark -o ./data/lasot/annotations +python ./tools/convert_datasets/lasot/gen_lasot_infos.py -i ./data/LaSOT_full/LaSOTBenchmark -o ./data/LaSOT_full/annotations # UAV123 # download annotations # due to the annotations of all videos in UAV123 are inconsistent, we just download the information file generated in advance. 
-wget https://download.openmmlab.com/mmtracking/data/uav123_infos.txt -P data/uav123/annotations +wget https://download.openmmlab.com/mmtracking/data/uav123_infos.txt -P data/UAV123/annotations # TrackingNet # unzip files in 'data/trackingnet/*.zip' -bash ./tools/convert_datasets/trackingnet/unzip_trackingnet.sh ./data/trackingnet +bash ./tools/convert_datasets/trackingnet/unzip_trackingnet.sh ./data/TrackingNet # generate annotations -python ./tools/convert_datasets/trackingnet/gen_trackingnet_infos.py -i ./data/trackingnet -o ./data/trackingnet/annotations +python ./tools/convert_datasets/trackingnet/gen_trackingnet_infos.py -i ./data/TrackingNet -o ./data/TrackingNet/annotations # OTB100 # unzip files in 'data/otb100/zips/*.zip' -bash ./tools/convert_datasets/otb100/unzip_otb100.sh ./data/otb100 +bash ./tools/convert_datasets/otb100/unzip_otb100.sh ./data/OTB100 # download annotations # due to the annotations of all videos in OTB100 are inconsistent, we just need to download the information file generated in advance. -wget https://download.openmmlab.com/mmtracking/data/otb100_infos.txt -P data/otb100/annotations +wget https://download.openmmlab.com/mmtracking/data/otb100_infos.txt -P data/OTB100/annotations # GOT10k -# unzip 'data/got10k/full_data/test_data.zip', 'data/got10k/full_data/val_data.zip' and files in 'data/got10k/full_data/train_data/*.zip' -bash ./tools/convert_datasets/got10k/unzip_got10k.sh ./data/got10k +# unzip 'data/GOT10k/full_data/test_data.zip', 'data/GOT10k/full_data/val_data.zip' and files in 'data/GOT10k/full_data/train_data/*.zip' +bash ./tools/convert_datasets/got10k/unzip_got10k.sh ./data/GOT10k # generate annotations -python ./tools/convert_datasets/got10k/gen_got10k_infos.py -i ./data/got10k -o ./data/got10k/annotations +python ./tools/convert_datasets/got10k/gen_got10k_infos.py -i ./data/GOT10k -o ./data/GOT10k/annotations # VOT2018 -python ./tools/convert_datasets/vot/gen_vot_infos.py -i ./data/vot2018 -o ./data/vot2018/annotations --dataset_type vot2018 +python ./tools/convert_datasets/vot/gen_vot_infos.py -i ./data/VOT2018 -o ./data/VOT2018/annotations --dataset_type vot2018 # YouTube-VIS 2019 python ./tools/convert_datasets/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2019 -o ./data/youtube_vis_2019/annotations --version 2019 @@ -384,7 +384,7 @@ mmtracking │ │ ├── train │ │ ├── val │ │ -│ ├── lasot +│ ├── LaSOT_full │ │ ├── LaSOTBenchmark │ │ │ ├── airplane | │ │ │ ├── airplane-1 @@ -403,7 +403,7 @@ mmtracking │ │ │ ├── UAV123 │ │ ├── annotations (the converted annotation file) │ │ -│ ├── trackingnet +│ ├── TrackingNet │ │ ├── TEST │ │ │ ├── anno (the official annotation files) │ │ │ ├── zips @@ -422,7 +422,7 @@ mmtracking │ │ ├── TRAIN_11 │ │ ├── annotations (the converted annotation file) │ │ -│ ├── otb100 +│ ├── OTB100 │ │ ├── zips │ │ │ ├── Basketball.zip │ │ │ ├── Biker.zip @@ -433,7 +433,7 @@ mmtracking │ │ │ │ ├── img │ │ │ ├── ...... │ │ -│ ├── got10k +│ ├── GOT10k │ │ │── full_data │ │ │ │── train_data │ │ │ │ ├── GOT-10k_Train_split_01.zip @@ -459,7 +459,7 @@ mmtracking │ │ │ ├── list.txt │ │ │── annotations │ │ -| ├── vot2018 +| ├── VOT2018 | | ├── data | | | ├── ants1 | │ │ │ ├──color @@ -607,14 +607,14 @@ There are 9 JSON files in `data/tao/annotations`: `validation_with_freeform.json`: JSON file containing annotations for all categories in TAO validation. 
-#### The folder of annotations in lasot +#### The folder of annotations in LaSOT -There are 2 JSON files in `data/lasot/annotations`: +There are 2 JSON files in `data/LaSOT_full/annotations`: `lasot_train.json`: JSON file containing the annotations information of the training set in LaSOT dataset. `lasot_test.json`: JSON file containing the annotations information of the testing set in LaSOT dataset. -There are 2 TEXT files in `data/lasot/annotations`: +There are 2 TEXT files in `data/LaSOT_full/annotations`: `lasot_train_infos.txt`: TEXT file containing the annotations information of the training set in LaSOT dataset. `lasot_test_infos.txt`: TEXT file containing the annotations information of the testing set in LaSOT dataset. @@ -631,35 +631,35 @@ There are only 1 TEXT files in `data/UAV123/annotations`: #### The folder of frames and annotations in TrackingNet -There are 511 video directories of TrackingNet testset in `data/trackingnet/TEST/frames`, and each video directory contains all images of the video. Similar file structures can be seen in `data/trackingnet/TRAIN_{*}/frames`. +There are 511 video directories of TrackingNet testset in `data/TrackingNet/TEST/frames`, and each video directory contains all images of the video. Similar file structures can be seen in `data/TrackingNet/TRAIN_{*}/frames`. -There are 2 JSON files in `data/trackingnet/annotations`: +There are 2 JSON files in `data/TrackingNet/annotations`: `trackingnet_test.json`: JSON file containing the annotations information of the testing set in TrackingNet dataset. `trackingnet_train.json`: JSON file containing the annotations information of the training set in TrackingNet dataset. -There are 2 TEXT files in `data/trackingnet/annotations`: +There are 2 TEXT files in `data/TrackingNet/annotations`: `trackingnet_test_infos.txt`: TEXT file containing the information of the testing set in TrackingNet dataset. `trackingnet_train_infos.txt`: TEXT file containing the information of the training set in TrackingNet dataset. #### The folder of data and annotations in OTB100 -There are 98 video directories of OTB100 dataset in `data/otb100/data`, and the `img` folder under each video directory contains all images of the video. +There are 98 video directories of OTB100 dataset in `data/OTB100/data`, and the `img` folder under each video directory contains all images of the video. -There are only 1 JSON files in `data/otb100/annotations`: +There are only 1 JSON files in `data/OTB100/annotations`: `otb100.json`: JSON file containing the annotations information of the OTB100 dataset. -There are only 1 TEXT files in `data/otb100/annotations`: +There are only 1 TEXT files in `data/OTB100/annotations`: `otb100_infos.txt`: TEXT file containing the information of the OTB100 dataset. #### The folder of frames and annotations in GOT10k -There are training video directories in `data/got10k/train`, and each video directory contains all images of the video. Similar file structures can be seen in `data/got10k/test` and `data/got10k/val`. +There are training video directories in `data/GOT10k/train`, and each video directory contains all images of the video. Similar file structures can be seen in `data/GOT10k/test` and `data/GOT10k/val`. -There are 3 JSON files in `data/got10k/annotations`: +There are 3 JSON files in `data/GOT10k/annotations`: `got10k_train.json`: JSON file containing the annotations information of the training set in GOT10k dataset. 
@@ -667,7 +667,7 @@ There are 3 JSON files in `data/got10k/annotations`: `got10k_val.json`: JSON file containing the annotations information of the valuation set in GOT10k dataset. -There are 5 TEXT files in `data/got10k/annotations`: +There are 5 TEXT files in `data/GOT10k/annotations`: `got10k_train_infos.txt`: TEXT file containing the information of the training set in GOT10k dataset. @@ -681,13 +681,13 @@ There are 5 TEXT files in `data/got10k/annotations`: #### The folder of data and annotations in VOT2018 -There are 60 video directories of VOT2018 dataset in `data/vot2018/data`, and the `color` folder under each video directory contains all images of the video. +There are 60 video directories of VOT2018 dataset in `data/VOT2018/data`, and the `color` folder under each video directory contains all images of the video. -There are only 1 JSON files in `data/vot2018/annotations`: +There are only 1 JSON files in `data/VOT2018/annotations`: `vot2018.json`: JSON file containing the annotations information of the VOT2018 dataset. -There are only 1 TEXT files in `data/vot2018/annotations`: +There are only 1 TEXT files in `data/VOT2018/annotations`: `vot2018_infos.txt`: TEXT file containing the information of the VOT2018 dataset. diff --git a/docs_deprecated/en/index.rst b/docs_deprecated/en/index.rst new file mode 100644 index 000000000..0f2547366 --- /dev/null +++ b/docs_deprecated/en/index.rst @@ -0,0 +1,57 @@ +Welcome to MMTracking's documentation! +======================================= + +You can switch between Chinese and English documents in the lower-left corner of the layout. + +.. toctree:: + :maxdepth: 2 + :caption: Get Started + + install.md + modelzoo_statistics.md + model_zoo.md + +.. toctree:: + :maxdepth: 2 + :caption: Quick run + + dataset.md + quick_run.md + + +.. toctree:: + :maxdepth: 2 + :caption: Tutorials + + tutorials/index.rst + +.. toctree:: + :maxdepth: 2 + :caption: Useful Tools and Scripts + + useful_tools_scripts.md + +.. toctree:: + :maxdepth: 2 + :caption: Notes + + changelog.md + + +.. toctree:: + :caption: Switch Language + + switch_language.md + + +.. toctree:: + :caption: API Reference + + api.rst + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/en/install.md b/docs_deprecated/en/install.md similarity index 100% rename from docs/en/install.md rename to docs_deprecated/en/install.md diff --git a/docs/zh_cn/make.bat b/docs_deprecated/en/make.bat similarity index 100% rename from docs/zh_cn/make.bat rename to docs_deprecated/en/make.bat diff --git a/docs_deprecated/en/model_zoo.md b/docs_deprecated/en/model_zoo.md new file mode 100644 index 000000000..1f66c94b3 --- /dev/null +++ b/docs_deprecated/en/model_zoo.md @@ -0,0 +1,82 @@ +# Benchmark and Model Zoo + +## Common settings + +- We use distributed training. + +- All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo. + +- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows. + +- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script `tools/analysis/benchmark.py` which computes the average time on 2000 images. 
+ +- Speed benchmark environments + + HardWare + + - 8 NVIDIA Tesla V100 (32G) GPUs + - Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + + Software environment + + - Python 3.7 + - PyTorch 1.5 + - CUDA 10.1 + - CUDNN 7.6.03 + - NCCL 2.4.08 + +## Baselines of video object detection + +### DFF (CVPR 2017) + +Please refer to [DFF](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/dff) for details. + +### FGFA (ICCV 2017) + +Please refer to [FGFA](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/fgfa) for details. + +### SELSA (ICCV 2019) + +Please refer to [SELSA](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/selsa) for details. + +### Temporal RoI Align (AAAI 2021) + +Please refer to [Temporal RoI Align](https://github.com/open-mmlab/mmtracking/blob/master/configs/vid/temporal_roi_align) for details. + +## Baselines of multiple object tracking + +### SORT/DeepSORT (ICIP 2016/2017) + +Please refer to [SORT/DeepSORT](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/deepsort) for details. + +### Tracktor (ICCV 2019) + +Please refer to [Tracktor](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/tracktor) for details. + +### QDTrack (CVPR 2021) + +Please refer to [QDTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/qdtrack) for details. + +### ByteTrack (ECCV 2022) + +Please refer to [ByteTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/bytetrack) for details. + +### StrongSORT (arvix 2022) + +Please refer to [StrongSORT](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/strongsort) for details + +## Baselines of single object tracking + +### SiameseRPN++ (CVPR 2019) + +Please refer to [SiameseRPN++](https://github.com/open-mmlab/mmtracking/blob/master/configs/sot/siamese_rpn) for details. + +### STARK (ICCV 2021) + +Please refer to [STARK](https://github.com/open-mmlab/mmtracking/blob/master/configs/sot/stark) for details. + +## Baselines of video instance segmentation + +### MaskTrack R-CNN (ICCV 2019) + +Please refer to [MaskTrack R-CNN](https://github.com/open-mmlab/mmtracking/blob/master/configs/vis/masktrack_rcnn) for details. diff --git a/docs/en/quick_run.md b/docs_deprecated/en/quick_run.md similarity index 100% rename from docs/en/quick_run.md rename to docs_deprecated/en/quick_run.md diff --git a/docs_deprecated/en/stat.py b/docs_deprecated/en/stat.py new file mode 100644 index 000000000..e4f764cab --- /dev/null +++ b/docs_deprecated/en/stat.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# Copyright (c) OpenMMLab. All rights reserved. 
+import functools as func +import glob +import os.path as osp +import re + +import numpy as np + +url_prefix = 'https://github.com/open-mmlab/mmtracking/blob/master/' + +files = sorted(glob.glob('../../configs/*/*/README.md')) + +stats = [] +titles = [] +num_ckpts = 0 + +for f in files: + url = osp.dirname(f.replace('../../', url_prefix)) + + with open(f, 'r') as content_file: + content = content_file.read() + + title = content.split('\n')[0].replace('#', '').strip() + ckpts = set(x.lower().strip() + for x in re.findall(r'https?://download.*\.pth', content) + if 'mmtracking' in x) + if len(ckpts) == 0: + continue + + _papertype = [x for x in re.findall(r'\s*\[([A-Z]*?)\]\s*', content)] + assert len(_papertype) > 0 + papertype = _papertype[0] + + paper = set([(papertype, title)]) + + titles.append(title) + num_ckpts += len(ckpts) + statsmsg = f""" +\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts) +""" + stats.append((paper, ckpts, statsmsg)) + +allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats]) +msglist = '\n'.join(x for _, _, x in stats) + +papertypes, papercounts = np.unique([t for t, _ in allpapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +modelzoo = f""" +# Model Zoo Statistics +* Number of papers: {len(set(titles))} +{countstr} +* Number of checkpoints: {num_ckpts} +{msglist} +""" + +with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) diff --git a/docs/zh_cn/switch_language.md b/docs_deprecated/en/switch_language.md similarity index 100% rename from docs/zh_cn/switch_language.md rename to docs_deprecated/en/switch_language.md diff --git a/docs/en/tutorials/config.md b/docs_deprecated/en/tutorials/config.md similarity index 100% rename from docs/en/tutorials/config.md rename to docs_deprecated/en/tutorials/config.md diff --git a/docs/en/tutorials/config_mot.md b/docs_deprecated/en/tutorials/config_mot.md similarity index 100% rename from docs/en/tutorials/config_mot.md rename to docs_deprecated/en/tutorials/config_mot.md diff --git a/docs/en/tutorials/config_sot.md b/docs_deprecated/en/tutorials/config_sot.md similarity index 100% rename from docs/en/tutorials/config_sot.md rename to docs_deprecated/en/tutorials/config_sot.md diff --git a/docs/en/tutorials/config_vid.md b/docs_deprecated/en/tutorials/config_vid.md similarity index 100% rename from docs/en/tutorials/config_vid.md rename to docs_deprecated/en/tutorials/config_vid.md diff --git a/docs/en/tutorials/customize_data_pipeline.md b/docs_deprecated/en/tutorials/customize_data_pipeline.md similarity index 98% rename from docs/en/tutorials/customize_data_pipeline.md rename to docs_deprecated/en/tutorials/customize_data_pipeline.md index 4612c0896..260c0bf5c 100644 --- a/docs/en/tutorials/customize_data_pipeline.md +++ b/docs_deprecated/en/tutorials/customize_data_pipeline.md @@ -63,7 +63,7 @@ These sequential pipelines are generally inherited from the pipeline in MMDetect ```python from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import LoadImageFromFile +from mmdet.datasets.transforms import LoadImageFromFile @PIPELINES.register_module() class LoadMultiImagesFromFile(LoadImageFromFile): diff --git a/docs/en/tutorials/customize_dataset.md b/docs_deprecated/en/tutorials/customize_dataset.md similarity index 100% rename from docs/en/tutorials/customize_dataset.md rename to docs_deprecated/en/tutorials/customize_dataset.md diff --git a/docs/en/tutorials/customize_mot_model.md 
b/docs_deprecated/en/tutorials/customize_mot_model.md similarity index 98% rename from docs/en/tutorials/customize_mot_model.md rename to docs_deprecated/en/tutorials/customize_mot_model.md index 11307fc5c..9a816804b 100644 --- a/docs/en/tutorials/customize_mot_model.md +++ b/docs_deprecated/en/tutorials/customize_mot_model.md @@ -78,7 +78,7 @@ Create a new file `mmtrack/models/motion/my_flownet.py`. You can inherit the motion model from `BaseModule` in `mmcv.runner` if it is a deep learning module, and from `object` if not. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import MOTION @@ -129,7 +129,7 @@ motion=dict( Create a new file `mmtrack/models/reid/my_reid.py`. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import REID @@ -180,7 +180,7 @@ reid=dict( Create a new file `mmtrack/models/track_heads/my_head.py`. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models import HEADS diff --git a/docs/en/tutorials/customize_runtime.md b/docs_deprecated/en/tutorials/customize_runtime.md similarity index 100% rename from docs/en/tutorials/customize_runtime.md rename to docs_deprecated/en/tutorials/customize_runtime.md diff --git a/docs/en/tutorials/customize_sot_model.md b/docs_deprecated/en/tutorials/customize_sot_model.md similarity index 96% rename from docs/en/tutorials/customize_sot_model.md rename to docs_deprecated/en/tutorials/customize_sot_model.md index f1f2887ef..3aa0ca378 100644 --- a/docs/en/tutorials/customize_sot_model.md +++ b/docs_deprecated/en/tutorials/customize_sot_model.md @@ -17,7 +17,7 @@ Create a new file `mmtrack/models/backbones/mobilenet.py`. ```python import torch.nn as nn -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models.builder import BACKBONES @@ -69,7 +69,7 @@ model = dict( Create a new file `mmtrack/models/necks/my_fpn.py`. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models.builder import NECKS @@ -118,7 +118,7 @@ neck=dict( Create a new file `mmtrack/models/track_heads/my_head.py`. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models import HEADS diff --git a/docs/en/tutorials/customize_vid_model.md b/docs_deprecated/en/tutorials/customize_vid_model.md similarity index 96% rename from docs/en/tutorials/customize_vid_model.md rename to docs_deprecated/en/tutorials/customize_vid_model.md index e8cd7e9e4..1461cfb45 100644 --- a/docs/en/tutorials/customize_vid_model.md +++ b/docs_deprecated/en/tutorials/customize_vid_model.md @@ -17,7 +17,7 @@ Please refer to [tutorial in mmdetection](https://mmdetection.readthedocs.io/en/ Create a new file `mmtrack/models/motion/my_flownet.py`. ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import MOTION @@ -68,7 +68,7 @@ motion=dict( Create a new file `mmtrack/models/aggregators/my_aggregator.py`. 
```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import AGGREGATORS diff --git a/docs/en/tutorials/index.rst b/docs_deprecated/en/tutorials/index.rst similarity index 100% rename from docs/en/tutorials/index.rst rename to docs_deprecated/en/tutorials/index.rst diff --git a/docs/en/useful_tools_scripts.md b/docs_deprecated/en/useful_tools_scripts.md similarity index 100% rename from docs/en/useful_tools_scripts.md rename to docs_deprecated/en/useful_tools_scripts.md diff --git a/docs_deprecated/zh_cn/Makefile b/docs_deprecated/zh_cn/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/docs_deprecated/zh_cn/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs_deprecated/zh_cn/_static/css/readthedocs.css b/docs_deprecated/zh_cn/_static/css/readthedocs.css new file mode 100644 index 000000000..e6a9e814c --- /dev/null +++ b/docs_deprecated/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,6 @@ +.header-logo { + background-image: url("../image/mmtrack-logo.png"); + background-size: 145px 40px; + height: 40px; + width: 145px; +} diff --git a/docs_deprecated/zh_cn/_static/image/mmtrack-logo.png b/docs_deprecated/zh_cn/_static/image/mmtrack-logo.png new file mode 100644 index 000000000..beec8f582 Binary files /dev/null and b/docs_deprecated/zh_cn/_static/image/mmtrack-logo.png differ diff --git a/docs_deprecated/zh_cn/api.rst b/docs_deprecated/zh_cn/api.rst new file mode 100644 index 000000000..eb8b02fa5 --- /dev/null +++ b/docs_deprecated/zh_cn/api.rst @@ -0,0 +1,124 @@ +mmtrack.apis +-------------- +.. automodule:: mmtrack.apis + :members: + +mmtrack.core +-------------- + +anchor +^^^^^^^^^^ +.. automodule:: mmtrack.core.anchor + :members: + +evaluation +^^^^^^^^^^ +.. automodule:: mmtrack.core.evaluation + :members: + +motion +^^^^^^^^^^ +.. automodule:: mmtrack.core.motion + :members: + +optimizer +^^^^^^^^^^ +.. automodule:: mmtrack.core.optimizer + :members: + +track +^^^^^^^^^^ +.. automodule:: mmtrack.core.track + :members: + +utils +^^^^^^^^^^ +.. automodule:: mmtrack.core.utils + :members: + +mmtrack.datasets +---------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmtrack.datasets + :members: + +parsers +^^^^^^^^^^ +.. automodule:: mmtrack.datasets.api_wrappers + :members: + +pipelines +^^^^^^^^^^ +.. automodule:: mmtrack.datasets.pipelines + :members: + +samplers +^^^^^^^^^^ +.. automodule:: mmtrack.datasets.samplers + :members: + + +mmtrack.models +-------------- + +mot +^^^^^^^^^^ +.. automodule:: mmtrack.models.mot + :members: + +sot +^^^^^^^^^^ +.. automodule:: mmtrack.models.sot + :members: + +vid +^^^^^^^^^^ +.. automodule:: mmtrack.models.vid + :members: + +aggregators +^^^^^^^^^^^ +.. automodule:: mmtrack.models.aggregators + :members: + +backbones +^^^^^^^^^^ +.. automodule:: mmtrack.models.backbones + :members: + +losses +^^^^^^^^^^ +.. 
automodule:: mmtrack.models.losses + :members: + +motion +^^^^^^^^^^ +.. automodule:: mmtrack.models.motion + :members: + +reid +^^^^^^^^^^ +.. automodule:: mmtrack.models.reid + :members: + +roi_heads +^^^^^^^^^^ +.. automodule:: mmtrack.models.roi_heads + :members: + +track_heads +^^^^^^^^^^^ +.. automodule:: mmtrack.models.track_heads + :members: + +builder +^^^^^^^^^^^ +.. automodule:: mmtrack.models + :members: + +mmtrack.utils +-------------- +.. automodule:: mmtrack.utils + :members: diff --git a/docs/zh_cn/changelog.md b/docs_deprecated/zh_cn/changelog.md similarity index 100% rename from docs/zh_cn/changelog.md rename to docs_deprecated/zh_cn/changelog.md diff --git a/docs/zh_cn/conf.py b/docs_deprecated/zh_cn/conf.py similarity index 100% rename from docs/zh_cn/conf.py rename to docs_deprecated/zh_cn/conf.py diff --git a/docs/zh_cn/dataset.md b/docs_deprecated/zh_cn/dataset.md similarity index 93% rename from docs/zh_cn/dataset.md rename to docs_deprecated/zh_cn/dataset.md index 7c24aa2c3..d00caaa1b 100644 --- a/docs/zh_cn/dataset.md +++ b/docs_deprecated/zh_cn/dataset.md @@ -46,14 +46,14 @@ ```shell # 通过网页爬虫下载 OTB100 数据集 -python ./tools/convert_datasets/otb100/download_otb100.py -o ./data/otb100/zips -p 8 +python ./tools/convert_datasets/otb100/download_otb100.py -o ./data/OTB100/zips -p 8 ``` - 对于 VOT2018, 我们使用官方的下载脚本。 ```shell # 通过网页爬虫下载 VOT2018 数据集 -python ./tools/convert_datasets/vot/download_vot.py --dataset vot2018 --save_path ./data/vot2018/data +python ./tools/convert_datasets/vot/download_vot.py --dataset vot2018 --save_path ./data/VOT2018/data ``` #### 1.4 视频实例分割 @@ -146,7 +146,7 @@ mmtracking │ │ ├── train │ │ ├── val │ │ -│ ├── lasot +│ ├── LaSOT_full │ │ ├── LaSOTBenchmark │ │ │ ├── airplane | │ │ │ ├── airplane-1 @@ -163,19 +163,19 @@ mmtracking │ │ ├── anno │ │ │ ├── UAV123 │ │ -│ ├── trackingnet +│ ├── TrackingNet │ │ ├── TEST.zip │ │ ├── TRAIN_0.zip │ │ ├── ...... 
│ │ ├── TRAIN_11.zip │ │ -│ ├── otb100 +│ ├── OTB100 │ │ │── zips │ │ │ │── Basketball.zip │ │ │ │── Biker.zip │ │ │ │── │ │ -│ ├── got10k +│ ├── GOT10k │ │ │── full_data │ │ │ │── train_data │ │ │ │ ├── GOT-10k_Train_split_01.zip @@ -185,7 +185,7 @@ mmtracking │ │ │ │── test_data.zip │ │ │ │── val_data.zip │ │ -| ├── vot2018 +| ├── VOT2018 | | ├── data | | | ├── ants1 | │ │ │ ├──color @@ -252,34 +252,34 @@ python ./tools/convert_datasets/tao/merge_coco_with_lvis.py --lvis ./data/lvis/a python ./tools/convert_datasets/tao/tao2coco.py -i ./data/tao/annotations --filter-classes # LaSOT -python ./tools/convert_datasets/lasot/gen_lasot_infos.py -i ./data/lasot/LaSOTBenchmark -o ./data/lasot/annotations +python ./tools/convert_datasets/lasot/gen_lasot_infos.py -i ./data/LaSOT_full/LaSOTBenchmark -o ./data/LaSOT_full/annotations # UAV123 # 下载标注 # 由于UAV123数据集的所有视频的标注信息不具有统一性,我们仅需下载提前生成的数据信息文件即可。 -wget https://download.openmmlab.com/mmtracking/data/uav123_infos.txt -P data/uav123/annotations +wget https://download.openmmlab.com/mmtracking/data/uav123_infos.txt -P data/UAV123/annotations # TrackingNet # 解压目录 'data/trackingnet/' 下的所有 '*.zip' 文件 -bash ./tools/convert_datasets/trackingnet/unzip_trackingnet.sh ./data/trackingnet +bash ./tools/convert_datasets/trackingnet/unzip_trackingnet.sh ./data/TrackingNet # 生成标注 -python ./tools/convert_datasets/trackingnet/gen_trackingnet_infos.py -i ./data/trackingnet -o ./data/trackingnet/annotations +python ./tools/convert_datasets/trackingnet/gen_trackingnet_infos.py -i ./data/TrackingNet -o ./data/TrackingNet/annotations # OTB100 # 解压目录 'data/otb100/zips' 下的所有 '*.zip' 文件 -bash ./tools/convert_datasets/otb100/unzip_otb100.sh ./data/otb100 +bash ./tools/convert_datasets/otb100/unzip_otb100.sh ./data/OTB100 # 下载标注 # 由于UAV123数据集的所有视频的标注信息不具有统一性,我们仅需下载提前生成的数据信息文件即可。 -wget https://download.openmmlab.com/mmtracking/data/otb100_infos.txt -P data/otb100/annotations +wget https://download.openmmlab.com/mmtracking/data/otb100_infos.txt -P data/OTB100/annotations # GOT10k -# 解压 'data/got10k/full_data/test_data.zip', 'data/got10k/full_data/val_data.zip' 和 目录'data/got10k/full_data/train_data/' 下的所有 '*.zip' 文件 -bash ./tools/convert_datasets/got10k/unzip_got10k.sh ./data/got10k +# 解压 'data/GOT10k/full_data/test_data.zip', 'data/got10k/full_data/val_data.zip' 和 目录'data/GOT10k/full_data/train_data/' 下的所有 '*.zip' 文件 +bash ./tools/convert_datasets/got10k/unzip_got10k.sh ./data/GOT10k # 生成标注 -python ./tools/convert_datasets/got10k/gen_got10k_infos.py -i ./data/got10k -o ./data/got10k/annotations +python ./tools/convert_datasets/got10k/gen_got10k_infos.py -i ./data/GOT10k -o ./data/GOT10k/annotations # VOT2018 -python ./tools/convert_datasets/vot/gen_vot_infos.py -i ./data/vot2018 -o ./data/vot2018/annotations --dataset_type vot2018 +python ./tools/convert_datasets/vot/gen_vot_infos.py -i ./data/VOT2018 -o ./data/VOT2018/annotations --dataset_type vot2018 # YouTube-VIS 2019 python ./tools/convert_datasets/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2019 -o ./data/youtube_vis_2019/annotations --version 2019 @@ -385,7 +385,7 @@ mmtracking │ │ ├── train │ │ ├── val │ │ -│ ├── lasot +│ ├── LaSOT_full │ │ ├── LaSOTBenchmark │ │ │ ├── airplane | │ │ │ ├── airplane-1 @@ -404,7 +404,7 @@ mmtracking │ │ │ ├── UAV123 │ │ ├── annotations (the converted annotation file) │ │ -│ ├── trackingnet +│ ├── TrackingNet │ │ ├── TEST │ │ │ ├── anno (the official annotation files) │ │ │ ├── zips @@ -423,7 +423,7 @@ mmtracking │ │ ├── TRAIN_11 │ │ ├── annotations (the converted annotation file) │ 
│ -│ ├── otb100 +│ ├── OTB100 │ │ ├── zips │ │ │ ├── Basketball.zip │ │ │ ├── Biker.zip @@ -434,7 +434,7 @@ mmtracking │ │ │ │ ├── img │ │ │ ├── ...... │ │ -│ ├── got10k +│ ├── GOT10k │ │ │── full_data │ │ │ │── train_data │ │ │ │ ├── GOT-10k_Train_split_01.zip @@ -460,7 +460,7 @@ mmtracking │ │ │ ├── list.txt │ │ │── annotations │ │ -| ├── vot2018 +| ├── VOT2018 | | ├── data | | | ├── ants1 | │ │ │ ├──color @@ -613,9 +613,9 @@ MOT17-02-FRCNN_000009/000081.jpg 3 `validation_with_freeform.json`: 包含 TAO 验证集所有类别标注的 JSON 文件。 -#### lasot 的标注文件夹 +#### LaSOT 的标注文件夹 -在 `data/lasot/annotations` 中有 2 个 JSON 文件: +在 `data/LaSOT_full/annotations` 中有 2 个 JSON 文件: `lasot_train.json`: 包含 LaSOT 训练集标注信息的 JSON 文件。 `lasot_test.json`: 包含 LaSOT 测试集标注信息的 JSON 文件。 @@ -637,9 +637,9 @@ MOT17-02-FRCNN_000009/000081.jpg 3 #### TrackingNet 的标注和视频帧文件夹 -在 `data/trackingnet/TEST/frames` 文件夹下有 TrackingNet 测试集的 511 个视频目录, 每个视频目录下面包含该视频所有图片。`data/trackingnet/TRAIN_{*}/frames` 下具有类似的文件目录结构。 +在 `data/TrackingNet/TEST/frames` 文件夹下有 TrackingNet 测试集的 511 个视频目录, 每个视频目录下面包含该视频所有图片。`data/TrackingNet/TRAIN_{*}/frames` 下具有类似的文件目录结构。 -在 `data/trackingnet/annotations` 中有 2 个 JSON 文件: +在 `data/TrackingNet/annotations` 中有 2 个 JSON 文件: `trackingnet_train.json`: 包含 TrackingNet 训练集标注信息的 JSON 文件。 `trackingnet_test.json`: 包含 TrackingNet 测试集标注信息的 JSON 文件。 @@ -651,21 +651,21 @@ MOT17-02-FRCNN_000009/000081.jpg 3 #### OTB100 的标注和视频帧文件夹 -在 `data/otb100/data` 文件夹下有 OTB100 数据集的 98 个视频目录, 每个视频目录下的 `img` 文件夹包含该视频所有图片。 +在 `data/OTB100/data` 文件夹下有 OTB100 数据集的 98 个视频目录, 每个视频目录下的 `img` 文件夹包含该视频所有图片。 -在 `data/otb100/data/annotations` 中只有 1 个 JSON 文件: +在 `data/OTB100/data/annotations` 中只有 1 个 JSON 文件: `otb100.json`: 包含 OTB100 数据集标注信息的 JSON 文件 -在 `data/otb100/annotations` 中有 1 个 TEXT 文件: +在 `data/OTB100/annotations` 中有 1 个 TEXT 文件: `otb100_infos.txt`: 包含 OTB100 数据信息的 TEXT 文件。 #### GOT10k 的标注和视频帧文件夹 -在 `data/got10k/train` 文件夹下有 GOT10k 训练集的视频目录, 每个视频目录下面包含该视频所有图片。`data/got10k/test` 和 `data/got10k/val` 下具有类似的文件目录结构。 +在 `data/GOT10k/train` 文件夹下有 GOT10k 训练集的视频目录, 每个视频目录下面包含该视频所有图片。`data/GOT10k/test` 和 `data/GOT10k/val` 下具有类似的文件目录结构。 -在 `data/got10k/annotations` 中有 3 个 JSON 文件: +在 `data/GOT10k/annotations` 中有 3 个 JSON 文件: `got10k_train.json`: 包含 GOT10k 训练集标注信息的 JSON 文件。 @@ -673,7 +673,7 @@ MOT17-02-FRCNN_000009/000081.jpg 3 `got10k_val.json`: 包含 GOT10k 验证集标注信息的 JSON 文件。 -在 `data/got10k/annotations` 中有 5 个 TEXT 文件: +在 `data/GOT10k/annotations` 中有 5 个 TEXT 文件: `got10k_train_infos.txt`: 包含 GOT10k 训练集信息的 TEXT 文件。 @@ -685,15 +685,15 @@ MOT17-02-FRCNN_000009/000081.jpg 3 `got10k_val_vot_infos.txt`: 包含 GOT10k `val_vot` 划分集信息的 TEXT 文件。 -#### VOT2018的标注和视频帧文件夹 +#### VOT2018 的标注和视频帧文件夹 -在 `data/vot2018/data` 文件夹下有 VOT2018 数据集的 60 个视频目录, 每个视频目录下的 `color` 文件夹包含该视频所有图片。 +在 `data/VOT2018/data` 文件夹下有 VOT2018 数据集的 60 个视频目录, 每个视频目录下的 `color` 文件夹包含该视频所有图片。 -在 `data/vot2018/data/annotations` 中只有一个 JSON 文件: +在 `data/VOT2018/data/annotations` 中只有一个 JSON 文件: `vot2018.json`: 包含 VOT2018 数据集标注信息的 JSON 文件。 -在 `data/vot2018/data/annotations` 中只有一个 TEXT 文件: +在 `data/VOT2018/data/annotations` 中只有一个 TEXT 文件: `vot2018_infos.txt`: 包含 VOT2018 数据集信息的 TEXT 文件。 diff --git a/docs/zh_cn/index.rst b/docs_deprecated/zh_cn/index.rst similarity index 100% rename from docs/zh_cn/index.rst rename to docs_deprecated/zh_cn/index.rst diff --git a/docs/zh_cn/install.md b/docs_deprecated/zh_cn/install.md similarity index 100% rename from docs/zh_cn/install.md rename to docs_deprecated/zh_cn/install.md diff --git a/docs_deprecated/zh_cn/make.bat b/docs_deprecated/zh_cn/make.bat new file mode 100644 index 
000000000..922152e96 --- /dev/null +++ b/docs_deprecated/zh_cn/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/zh_cn/model_zoo.md b/docs_deprecated/zh_cn/model_zoo.md similarity index 93% rename from docs/zh_cn/model_zoo.md rename to docs_deprecated/zh_cn/model_zoo.md index 2173c6be4..538cc9e44 100644 --- a/docs/zh_cn/model_zoo.md +++ b/docs_deprecated/zh_cn/model_zoo.md @@ -57,10 +57,14 @@ 详情请参考 [QDTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/qdtrack/README.md)。 -### ByteTrack (arXiv 2021) +### ByteTrack (ECCV 2022) 详情请参考 [ByteTrack](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/bytetrack)。 +### StrongSORT (arxiv 2022) + +详情请参考 [StrongSORT](https://github.com/open-mmlab/mmtracking/blob/master/configs/mot/strongsort)。 + ## 单目标跟踪基线 ### SiameseRPN++ (CVPR 2019) diff --git a/docs/zh_cn/quick_run.md b/docs_deprecated/zh_cn/quick_run.md similarity index 100% rename from docs/zh_cn/quick_run.md rename to docs_deprecated/zh_cn/quick_run.md diff --git a/docs/zh_cn/stat.py b/docs_deprecated/zh_cn/stat.py old mode 100755 new mode 100644 similarity index 100% rename from docs/zh_cn/stat.py rename to docs_deprecated/zh_cn/stat.py diff --git a/docs_deprecated/zh_cn/switch_language.md b/docs_deprecated/zh_cn/switch_language.md new file mode 100644 index 000000000..745fb420d --- /dev/null +++ b/docs_deprecated/zh_cn/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/docs/zh_cn/tutorials/config.md b/docs_deprecated/zh_cn/tutorials/config.md similarity index 100% rename from docs/zh_cn/tutorials/config.md rename to docs_deprecated/zh_cn/tutorials/config.md diff --git a/docs/zh_cn/tutorials/config_mot.md b/docs_deprecated/zh_cn/tutorials/config_mot.md similarity index 100% rename from docs/zh_cn/tutorials/config_mot.md rename to docs_deprecated/zh_cn/tutorials/config_mot.md diff --git a/docs/zh_cn/tutorials/config_sot.md b/docs_deprecated/zh_cn/tutorials/config_sot.md similarity index 100% rename from docs/zh_cn/tutorials/config_sot.md rename to docs_deprecated/zh_cn/tutorials/config_sot.md diff --git a/docs/zh_cn/tutorials/config_vid.md b/docs_deprecated/zh_cn/tutorials/config_vid.md similarity index 100% rename from docs/zh_cn/tutorials/config_vid.md rename to docs_deprecated/zh_cn/tutorials/config_vid.md diff --git a/docs/zh_cn/tutorials/customize_data_pipeline.md b/docs_deprecated/zh_cn/tutorials/customize_data_pipeline.md similarity index 98% rename from docs/zh_cn/tutorials/customize_data_pipeline.md rename to docs_deprecated/zh_cn/tutorials/customize_data_pipeline.md index 0393adc14..a0958b971 100644 --- a/docs/zh_cn/tutorials/customize_data_pipeline.md +++ b/docs_deprecated/zh_cn/tutorials/customize_data_pipeline.md @@ -58,7 +58,7 @@ class 
CocoVideoDataset(CocoDataset): ```python from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import LoadImageFromFile +from mmdet.datasets.transforms import LoadImageFromFile @PIPELINES.register_module() class LoadMultiImagesFromFile(LoadImageFromFile): diff --git a/docs/zh_cn/tutorials/customize_dataset.md b/docs_deprecated/zh_cn/tutorials/customize_dataset.md similarity index 100% rename from docs/zh_cn/tutorials/customize_dataset.md rename to docs_deprecated/zh_cn/tutorials/customize_dataset.md diff --git a/docs/zh_cn/tutorials/customize_mot_model.md b/docs_deprecated/zh_cn/tutorials/customize_mot_model.md similarity index 98% rename from docs/zh_cn/tutorials/customize_mot_model.md rename to docs_deprecated/zh_cn/tutorials/customize_mot_model.md index bc7939c4e..e939cbc05 100644 --- a/docs/zh_cn/tutorials/customize_mot_model.md +++ b/docs_deprecated/zh_cn/tutorials/customize_mot_model.md @@ -72,7 +72,7 @@ tracker=dict( 如果该运动估计模型是一个深度学习模块,你可以继承 `mmcv.runner` 的 `BaseModule`,否则继承 `Object`。 ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import MOTION @@ -121,7 +121,7 @@ motion=dict( 新建一个文件 `mmtrack/models/motion/my_flownet.py`。 ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import REID @@ -171,7 +171,7 @@ motion=dict( 新建一个文件 `mmtrack/models/track_heads/my_head.py`。 ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models import HEADS diff --git a/docs/zh_cn/tutorials/customize_runtime.md b/docs_deprecated/zh_cn/tutorials/customize_runtime.md similarity index 100% rename from docs/zh_cn/tutorials/customize_runtime.md rename to docs_deprecated/zh_cn/tutorials/customize_runtime.md diff --git a/docs/zh_cn/tutorials/customize_sot_model.md b/docs_deprecated/zh_cn/tutorials/customize_sot_model.md similarity index 96% rename from docs/zh_cn/tutorials/customize_sot_model.md rename to docs_deprecated/zh_cn/tutorials/customize_sot_model.md index 285c61d28..81f9669a8 100644 --- a/docs/zh_cn/tutorials/customize_sot_model.md +++ b/docs_deprecated/zh_cn/tutorials/customize_sot_model.md @@ -17,7 +17,7 @@ ```python import torch.nn as nn -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models.builder import BACKBONES @@ -67,7 +67,7 @@ model = dict( 创建一个新文件 `mmtrack/models/necks/my_fpn.py` ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models.builder import NECKS @@ -114,7 +114,7 @@ neck=dict( 创建一个新文件 `mmtrack/models/track_heads/my_head.py` ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmdet.models import HEADS diff --git a/docs/zh_cn/tutorials/customize_vid_model.md b/docs_deprecated/zh_cn/tutorials/customize_vid_model.md similarity index 96% rename from docs/zh_cn/tutorials/customize_vid_model.md rename to docs_deprecated/zh_cn/tutorials/customize_vid_model.md index 1aedf4cd4..a3479bcca 100644 --- a/docs/zh_cn/tutorials/customize_vid_model.md +++ b/docs_deprecated/zh_cn/tutorials/customize_vid_model.md @@ -17,7 +17,7 @@ 新建一个文件 `mmtrack/models/motion/my_flownet.py`。 ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import MOTION @@ -66,7 +66,7 @@ motion=dict( 创建一个新文件 `mmtrack/models/aggregators/my_aggregator.py`。 ```python -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from ..builder import AGGREGATORS diff --git 
a/docs/zh_cn/tutorials/index.rst b/docs_deprecated/zh_cn/tutorials/index.rst similarity index 100% rename from docs/zh_cn/tutorials/index.rst rename to docs_deprecated/zh_cn/tutorials/index.rst diff --git a/docs/zh_cn/useful_tools_scripts.md b/docs_deprecated/zh_cn/useful_tools_scripts.md similarity index 100% rename from docs/zh_cn/useful_tools_scripts.md rename to docs_deprecated/zh_cn/useful_tools_scripts.md diff --git a/mmtrack/__init__.py b/mmtrack/__init__.py index 64d5212ee..b81baa3cf 100644 --- a/mmtrack/__init__.py +++ b/mmtrack/__init__.py @@ -7,10 +7,11 @@ from .version import __version__, version_info -MMCV_MIN = '1.3.17' -MMCV_MAX = '1.6.0' +MMCV_MIN = '2.0.0rc1' +MMCV_MAX = '2.0.0' -MMDET_MIN = '2.19.1' +MMDET_MIN = '3.0.0rc0' +MMDET_MAX = '3.0.0' def digit_version(version_str: str, length: int = 4): @@ -58,16 +59,17 @@ def digit_version(version_str: str, length: int = 4): mmcv_version = digit_version(mmcv.__version__) -assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \ +assert (mmcv_min_version <= mmcv_version < mmcv_max_version), \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ - f'Please install mmcv>={MMCV_MIN}, <={MMCV_MAX}.' + f'Please install mmcv>={MMCV_MIN}, <{MMCV_MAX}.' mmdet_min_version = digit_version(MMDET_MIN) +mmdet_max_version = digit_version(MMDET_MAX) mmdet_version = digit_version(mmdet.__version__) -assert (mmdet_min_version <= mmdet_version), \ +assert (mmdet_min_version <= mmdet_version < mmdet_max_version), \ f'MMDet=={mmdet.__version__} is used but incompatible. ' \ - f'Please install mmdet>={MMDET_MIN}.' + f'Please install mmdet>={MMDET_MIN}, <{MMDET_MAX}.' __all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/mmtrack/apis/__init__.py b/mmtrack/apis/__init__.py index 2b5c3eaba..8fdbca9bc 100644 --- a/mmtrack/apis/__init__.py +++ b/mmtrack/apis/__init__.py @@ -1,9 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. from .inference import inference_mot, inference_sot, inference_vid, init_model -from .test import multi_gpu_test, single_gpu_test -from .train import init_random_seed, train_model -__all__ = [ - 'init_model', 'multi_gpu_test', 'single_gpu_test', 'train_model', - 'inference_mot', 'inference_sot', 'inference_vid', 'init_random_seed' -] +__all__ = ['init_model', 'inference_mot', 'inference_sot', 'inference_vid'] diff --git a/mmtrack/apis/inference.py b/mmtrack/apis/inference.py index a6f93ab7d..08f3e3631 100644 --- a/mmtrack/apis/inference.py +++ b/mmtrack/apis/inference.py @@ -1,48 +1,54 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy import logging import os import tempfile +import warnings +from typing import Optional, Union -import mmcv +import mmengine import numpy as np import torch from mmcv.ops import RoIPool -from mmcv.parallel import collate, scatter -from mmcv.runner import load_checkpoint -from mmdet.datasets.pipelines import Compose +from mmengine.dataset import Compose +from mmengine.logging import MMLogger +from mmengine.runner import load_checkpoint +from torch import nn -from mmtrack.models import build_model +from mmtrack.registry import MODELS +from mmtrack.utils import SampleList -def init_model(config, - checkpoint=None, - device='cuda:0', - cfg_options=None, - verbose_init_params=False): +def init_model(config: Union[str, mmengine.Config], + checkpoint: Optional[str] = None, + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, + verbose_init_params: bool = False) -> nn.Module: """Initialize a model from config file. 
Args: - config (str or :obj:`mmcv.Config`): Config file path or the config + config (str or :obj:`mmengine.Config`): Config file path or the config object. - checkpoint (str, optional): Checkpoint path. Default as None. - cfg_options (dict, optional): Options to override some settings in - the used config. Default to None. + checkpoint (Optional[str], optional): Checkpoint path. Defaults to + None. + device (str, optional): The device that the model inferences on. + Defaults to `cuda:0`. + cfg_options (Optional[dict], optional): Options to override some + settings in the used config. Defaults to None. verbose_init_params (bool, optional): Whether to print the information - of initialized parameters to the console. Default to False. + of initialized parameters to the console. Defaults to False. Returns: - nn.Module: The constructed detector. + nn.Module: The constructed model. """ if isinstance(config, str): - config = mmcv.Config.fromfile(config) - elif not isinstance(config, mmcv.Config): + config = mmengine.Config.fromfile(config) + elif not isinstance(config, mmengine.Config): raise TypeError('config must be a filename or Config object, ' f'but got {type(config)}') if cfg_options is not None: config.merge_from_dict(cfg_options) - if 'detector' in config.model: - config.model.detector.pretrained = None - model = build_model(config.model) + model = MODELS.build(config.model) if not verbose_init_params: # Creating a temporary file to record the information of initialized @@ -51,12 +57,13 @@ def init_model(config, # `mmcv.runner.BaseModule.init_weights`. tmp_file = tempfile.NamedTemporaryFile(delete=False) file_handler = logging.FileHandler(tmp_file.name, mode='w') - model.logger.addHandler(file_handler) + logger = MMLogger.get_current_instance() + logger.addHandler(file_handler) # We need call `init_weights()` to load pretained weights in MOT # task. model.init_weights() file_handler.close() - model.logger.removeHandler(file_handler) + logger.removeHandler(file_handler) tmp_file.close() os.remove(tmp_file.name) else: @@ -65,108 +72,105 @@ def init_model(config, if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') - if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: - model.CLASSES = checkpoint['meta']['CLASSES'] - if not hasattr(model, 'CLASSES'): - if hasattr(model, 'detector') and hasattr(model.detector, 'CLASSES'): - model.CLASSES = model.detector.CLASSES - else: - print("Warning: The model doesn't have classes") - model.CLASSES = None + # Weights converted from elsewhere may not have meta fields. + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmtrack 1.x + model.dataset_meta = checkpoint_meta['dataset_meta'] + elif 'CLASSES' in checkpoint_meta: + # < mmtrack 1.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'CLASSES': classes} + + # Some methods don't load checkpoints or checkpoints don't contain + # 'dataset_meta' + if not hasattr(model, 'dataset_meta'): + warnings.simplefilter('once') + warnings.warn('dataset_meta or class names are missed, ' + 'use None by default.') + model.dataset_meta = {'CLASSES': None} + model.cfg = config # save the config in the model for convenience model.to(device) model.eval() return model -def inference_mot(model, img, frame_id): +def inference_mot(model: nn.Module, img: np.ndarray, + frame_id: int) -> SampleList: """Inference image(s) with the mot model. 
Args: model (nn.Module): The loaded mot model. - img (str | ndarray): Either image name or loaded image. + img (np.ndarray): Loaded image. frame_id (int): frame id. Returns: - dict[str : ndarray]: The tracking results. + SampleList: The tracking data samples. """ cfg = model.cfg - device = next(model.parameters()).device # model device - # prepare data - if isinstance(img, np.ndarray): - # directly add img - data = dict(img=img, img_info=dict(frame_id=frame_id), img_prefix=None) - cfg = cfg.copy() - # set loading pipeline type - cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' - else: - # add information into dict - data = dict( - img_info=dict(filename=img, frame_id=frame_id), img_prefix=None) - # build the data pipeline - test_pipeline = Compose(cfg.data.test.pipeline) + data = dict( + img=img.astype(np.float32), frame_id=frame_id, ori_shape=img.shape[:2]) + # remove the "LoadImageFromFile" and "LoadTrackAnnotations" in pipeline + test_pipeline = Compose(cfg.test_dataloader.dataset.pipeline[2:]) data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: + + if not next(model.parameters()).is_cuda: for m in model.modules(): assert not isinstance( m, RoIPool ), 'CPU inference with RoIPool is not supported currently.' - # just get the actual data from DataContainer - data['img_metas'] = data['img_metas'][0].data + # forward the model with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) + data = mmengine.dataset.default_collate([data]) + result = model.test_step(data)[0] return result -def inference_sot(model, image, init_bbox, frame_id): +def inference_sot(model: nn.Module, image: np.ndarray, init_bbox: np.ndarray, + frame_id: int) -> SampleList: """Inference image with the single object tracker. Args: model (nn.Module): The loaded tracker. - image (ndarray): Loaded images. - init_bbox (ndarray): The target needs to be tracked. + image (np.ndarray): Loaded images. + init_bbox (np.ndarray): The target needs to be tracked. frame_id (int): frame id. Returns: - dict[str : ndarray]: The tracking results. + SampleList: The tracking data samples. """ cfg = model.cfg - device = next(model.parameters()).device # model device - data = dict( img=image.astype(np.float32), gt_bboxes=np.array(init_bbox).astype(np.float32), - img_info=dict(frame_id=frame_id)) + frame_id=frame_id, + ori_shape=image.shape[:2]) # remove the "LoadImageFromFile" and "LoadAnnotations" in pipeline - test_pipeline = Compose(cfg.data.test.pipeline[2:]) + test_pipeline = Compose(cfg.test_dataloader.dataset.pipeline[2:]) data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: + + if not next(model.parameters()).is_cuda: for m in model.modules(): assert not isinstance( m, RoIPool ), 'CPU inference with RoIPool is not supported currently.' 
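A minimal usage sketch of the refactored `init_model` and `inference_mot` helpers above; the config, checkpoint and image paths are placeholders rather than files from this patch, and a CUDA device is assumed. The rewritten `inference_sot` continues directly below.

```python
import mmcv

from mmtrack.apis import inference_mot, init_model

# Placeholder paths; substitute a real MOT config and checkpoint.
mot_config = 'configs/mot/my_mot_config.py'
mot_checkpoint = 'checkpoints/my_mot_checkpoint.pth'

# `init_model` builds the model through the MODELS registry and, when the
# checkpoint carries it, attaches `dataset_meta` for later use.
model = init_model(mot_config, mot_checkpoint, device='cuda:0')

# `inference_mot` now takes a loaded ndarray plus a frame index and returns
# a tracking data sample rather than a result dict.
frame = mmcv.imread('demo_frame.jpg')
track_sample = inference_mot(model, frame, frame_id=0)
```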
- # just get the actual data from DataContainer - data['img_metas'] = data['img_metas'][0].data # forward the model with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) + data = mmengine.dataset.default_collate([data]) + result = model.test_step(data)[0] return result -def inference_vid(model, - image, - frame_id, - ref_img_sampler=dict(frame_stride=10, num_left_ref_imgs=10)): +def inference_vid( + model: nn.Module, + image: np.ndarray, + frame_id: int, + ref_img_sampler: dict = dict(frame_stride=2, num_left_ref_imgs=10) +) -> SampleList: """Inference image with the video object detector. Args: @@ -178,62 +182,58 @@ def inference_vid(model, dict(frame_stride=2, num_left_ref_imgs=10). Returns: - dict[str : ndarray]: The detection results. + SampleList: The detection results. """ cfg = model.cfg - device = next(model.parameters()).device # model device - if cfg.data.test.pipeline[0].type == 'LoadImageFromFile': + first_transform = cfg.test_dataloader.dataset.pipeline[0] + if first_transform.type == 'LoadImageFromFile': data = dict( img=image.astype(np.float32).copy(), - img_info=dict(frame_id=frame_id)) - + frame_id=frame_id, + ori_shape=image.shape[:2]) # remove the "LoadImageFromFile" in pipeline - test_pipeline = Compose(cfg.data.test.pipeline[1:]) - - elif cfg.data.test.pipeline[0].type == 'LoadMultiImagesFromFile': - data = [ - dict( - img=image.astype(np.float32).copy(), - img_info=dict(frame_id=frame_id)) - ] + test_pipeline = Compose(cfg.test_dataloader.dataset.pipeline[1:]) + elif first_transform.type == 'TransformBroadcaster': + assert first_transform.transforms[0].type == 'LoadImageFromFile' + # Only used under video detector of fgfa style. + data = dict( + img=[image.astype(np.float32).copy()], + frame_id=[frame_id], + ori_shape=[image.shape[:2]]) num_left_ref_imgs = ref_img_sampler.get('num_left_ref_imgs') frame_stride = ref_img_sampler.get('frame_stride') if frame_id == 0: for i in range(num_left_ref_imgs): - one_ref_img = dict( - img=image.astype(np.float32).copy(), - img_info=dict(frame_id=frame_id)) - data.append(one_ref_img) + data['img'].append(image.astype(np.float32).copy()) + data['frame_id'].append(frame_id) + data['ori_shape'].append(image.shape[:2]) elif frame_id % frame_stride == 0: - one_ref_img = dict( - img=image.astype(np.float32).copy(), - img_info=dict(frame_id=frame_id)) - data.append(one_ref_img) - - # remove the "LoadMultiImagesFromFile" in pipeline - test_pipeline = Compose(cfg.data.test.pipeline[1:]) - + data['img'].append(image.astype(np.float32).copy()) + data['frame_id'].append(frame_id) + data['ori_shape'].append(image.shape[:2]) + # In order to pop the LoadImageFromFile, test_pipeline[0] is + # `TransformBroadcaster` and test_pipeline[0].transforms[0] + # is 'LoadImageFromFile'. + test_pipeline = copy.deepcopy(cfg.test_dataloader.dataset.pipeline) + test_pipeline[0].transforms.pop(0) + test_pipeline = Compose(test_pipeline) else: print('Not supported loading data pipeline type: ' - f'{cfg.data.test.pipeline[0].type}') + f'{first_transform.type}') raise NotImplementedError data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: + + if not next(model.parameters()).is_cuda: for m in model.modules(): assert not isinstance( m, RoIPool ), 'CPU inference with RoIPool is not supported currently.' 
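A companion sketch for the rewritten `inference_sot` above, under the same placeholder-path caveat; the initial bounding box format is an assumption ((x1, y1, x2, y2) pixel coordinates), not something stated in this patch. The `inference_vid` helper concludes just below.

```python
import mmcv
import numpy as np

from mmtrack.apis import inference_sot, init_model

# Placeholder config/checkpoint; substitute a real single object tracker.
sot_model = init_model('configs/sot/my_sot_config.py',
                       'checkpoints/my_sot_checkpoint.pth',
                       device='cuda:0')

frame = mmcv.imread('demo_frame.jpg')
# Initial target box on the first frame, assumed (x1, y1, x2, y2) pixels.
init_bbox = np.array([100, 120, 300, 340], dtype=np.float32)

# Later frames call the same helper with an increasing `frame_id`; the box
# passed on frame 0 is the one used to initialize the target.
sot_sample = inference_sot(sot_model, frame, init_bbox, frame_id=0)
```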
- # just get the actual data from DataContainer - data['img_metas'] = data['img_metas'][0].data # forward the model with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) + data = mmengine.dataset.default_collate([data]) + result = model.test_step(data)[0] return result diff --git a/mmtrack/apis/test.py b/mmtrack/apis/test.py deleted file mode 100644 index b2eb04e99..000000000 --- a/mmtrack/apis/test.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import os.path as osp -import shutil -import tempfile -import time -from collections import defaultdict - -import mmcv -import torch -import torch.distributed as dist -from mmcv.image import tensor2imgs -from mmcv.runner import get_dist_info -from mmdet.core import encode_mask_results - - -def single_gpu_test(model, - data_loader, - show=False, - out_dir=None, - fps=3, - show_score_thr=0.3): - """Test model with single gpu. - - Args: - model (nn.Module): Model to be tested. - data_loader (nn.Dataloader): Pytorch data loader. - show (bool, optional): If True, visualize the prediction results. - Defaults to False. - out_dir (str, optional): Path of directory to save the - visualization results. Defaults to None. - fps (int, optional): FPS of the output video. - Defaults to 3. - show_score_thr (float, optional): The score threshold of visualization - (Only used in VID for now). Defaults to 0.3. - - Returns: - dict[str, list]: The prediction results. - """ - model.eval() - results = defaultdict(list) - dataset = data_loader.dataset - prev_img_meta = None - prog_bar = mmcv.ProgressBar(len(dataset)) - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - - batch_size = data['img'][0].size(0) - if show or out_dir: - assert batch_size == 1, 'Only support batch_size=1 when testing.' - img_tensor = data['img'][0] - img_meta = data['img_metas'][0].data[0][0] - img = tensor2imgs(img_tensor, **img_meta['img_norm_cfg'])[0] - - h, w, _ = img_meta['img_shape'] - img_show = img[:h, :w, :] - - ori_h, ori_w = img_meta['ori_shape'][:-1] - img_show = mmcv.imresize(img_show, (ori_w, ori_h)) - - if out_dir: - out_file = osp.join(out_dir, img_meta['ori_filename']) - else: - out_file = None - - model.module.show_result( - img_show, - result, - show=show, - out_file=out_file, - score_thr=show_score_thr) - - # Whether need to generate a video from images. - # The frame_id == 0 means the model starts processing - # a new video, therefore we can write the previous video. - # There are two corner cases. - # Case 1: prev_img_meta == None means there is no previous video. - # Case 2: i == len(dataset) means processing the last video - need_write_video = ( - prev_img_meta is not None and img_meta['frame_id'] == 0 - or i == len(dataset)) - if out_dir and need_write_video: - prev_img_prefix, prev_img_name = prev_img_meta[ - 'ori_filename'].rsplit(os.sep, 1) - prev_img_idx, prev_img_type = prev_img_name.split('.') - prev_filename_tmpl = '{:0' + str( - len(prev_img_idx)) + 'd}.' 
+ prev_img_type - prev_img_dirs = f'{out_dir}/{prev_img_prefix}' - prev_img_names = sorted(os.listdir(prev_img_dirs)) - prev_start_frame_id = int(prev_img_names[0].split('.')[0]) - prev_end_frame_id = int(prev_img_names[-1].split('.')[0]) - - mmcv.frames2video( - prev_img_dirs, - f'{prev_img_dirs}/out_video.mp4', - fps=fps, - fourcc='mp4v', - filename_tmpl=prev_filename_tmpl, - start=prev_start_frame_id, - end=prev_end_frame_id, - show_progress=False) - - prev_img_meta = img_meta - - for key in result: - if 'mask' in key: - result[key] = encode_mask_results(result[key]) - - for k, v in result.items(): - results[k].append(v) - - for _ in range(batch_size): - prog_bar.update() - - return results - - -def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): - """Test model with multiple gpus. - - This method tests model with multiple gpus and collects the results - under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' - it encodes results to gpu tensors and use gpu communication for results - collection. On cpu mode it saves the results on different gpus to 'tmpdir' - and collects them by the rank 0 worker. 'gpu_collect=True' is not - supported for now. - - Args: - model (nn.Module): Model to be tested. - data_loader (nn.Dataloader): Pytorch data loader. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. Defaults to None. - gpu_collect (bool): Option to use either gpu or cpu to collect results. - Defaults to False. - - Returns: - dict[str, list]: The prediction results. - """ - model.eval() - results = defaultdict(list) - dataset = data_loader.dataset - rank, world_size = get_dist_info() - if rank == 0: - prog_bar = mmcv.ProgressBar(len(dataset)) - time.sleep(2) # This line can prevent deadlock problem in some cases. - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - for key in result: - if 'mask' in key: - result[key] = encode_mask_results(result[key]) - - for k, v in result.items(): - results[k].append(v) - - if rank == 0: - batch_size = data['img'][0].size(0) - for _ in range(batch_size * world_size): - prog_bar.update() - - # collect results from all ranks - if gpu_collect: - raise NotImplementedError - else: - results = collect_results_cpu(results, tmpdir) - return results - - -def collect_results_cpu(result_part, tmpdir=None): - """Collect results on cpu mode. - - Saves the results on different gpus to 'tmpdir' and collects them by the - rank 0 worker. - - Args: - result_part (dict[list]): The part of prediction results. - tmpdir (str): Path of directory to save the temporary results from - different gpus under cpu mode. If is None, use `tempfile.mkdtemp()` - to make a temporary path. Defaults to None. - - Returns: - dict[str, list]: The prediction results. 
- """ - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - tmpdir = tempfile.mkdtemp() - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) - dist.barrier() - # collect all parts - if rank != 0: - return None - else: - # load results of all parts from tmp dir - part_list = defaultdict(list) - for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') - part_file = mmcv.load(part_file) - for k, v in part_file.items(): - part_list[k].extend(v) - shutil.rmtree(tmpdir) - return part_list diff --git a/mmtrack/apis/train.py b/mmtrack/apis/train.py deleted file mode 100644 index acd796cba..000000000 --- a/mmtrack/apis/train.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -import torch.distributed as dist -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, - build_optimizer, get_dist_info) -from mmcv.utils import build_from_cfg -from mmdet.datasets import build_dataset - -from mmtrack.core import DistEvalHook, EvalHook -from mmtrack.datasets import build_dataloader -from mmtrack.utils import get_root_logger - - -def init_random_seed(seed=None, device='cuda'): - """Initialize random seed. - - If the seed is not set, the seed will be automatically randomized, - and then broadcast to all processes to prevent some potential bugs. - Args: - seed (int, Optional): The seed. Default to None. - device (str): The device where the seed will be put on. - Default to 'cuda'. - Returns: - int: Seed to be used. - """ - if seed is not None: - return seed - - # Make sure all ranks share the same random seed to prevent - # some potential bugs. Please refer to - # https://github.com/open-mmlab/mmdetection/issues/6339 - rank, world_size = get_dist_info() - seed = np.random.randint(2**31) - if world_size == 1: - return seed - - if rank == 0: - random_num = torch.tensor(seed, dtype=torch.int32, device=device) - else: - random_num = torch.tensor(0, dtype=torch.int32, device=device) - dist.broadcast(random_num, src=0) - return random_num.item() - - -def train_model(model, - dataset, - cfg, - distributed=False, - validate=False, - timestamp=None, - meta=None): - """Train model entry function. - - Args: - model (nn.Module): The model to be trained. - dataset (:obj:`Dataset`): Train dataset. - cfg (dict): The config dict for training. - distributed (bool): Whether to use distributed training. - Default: False. - validate (bool): Whether to do evaluation. Default: False. - timestamp (str | None): Local time for runner. Default: None. - meta (dict | None): Meta dict to record some important information. - Default: None - """ - logger = get_root_logger(cfg.log_level) - - # prepare data loaders - dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - if 'imgs_per_gpu' in cfg.data: - logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' - 'Please use "samples_per_gpu" instead') - if 'samples_per_gpu' in cfg.data: - logger.warning( - f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' - f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' - f'={cfg.data.imgs_per_gpu} is used in this experiments') - else: - logger.warning( - 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' - f'{cfg.data.imgs_per_gpu} in this experiments') - cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu - - data_loaders = [ - build_dataloader( - ds, - cfg.data.samples_per_gpu, - cfg.data.workers_per_gpu, - # cfg.gpus will be ignored if distributed - len(cfg.gpu_ids), - samples_per_epoch=cfg.data.get('samples_per_epoch', None), - dist=distributed, - seed=cfg.seed, - persistent_workers=cfg.data.get('persistent_workers', False)) - for ds in dataset - ] - - # put model on gpus - if distributed: - find_unused_parameters = cfg.get('find_unused_parameters', False) - if find_unused_parameters: - logger.info('set find_unused_parameters = True in DDP') - # Sets the `find_unused_parameters` parameter in - # torch.nn.parallel.DistributedDataParallel - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False, - find_unused_parameters=find_unused_parameters) - else: - model = MMDataParallel(model, device_ids=cfg.gpu_ids) - - # build runner - optimizer = build_optimizer(model, cfg.optimizer) - runner = EpochBasedRunner( - model, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta) - # an ugly workaround to make .log and .log.json filenames the same - runner.timestamp = timestamp - - # fp16 setting - fp16_cfg = cfg.get('fp16', None) - optimizer_config = cfg.optimizer_config - if 'type' not in cfg.optimizer_config: - optimizer_config.type = 'Fp16OptimizerHook' \ - if fp16_cfg else 'OptimizerHook' - if fp16_cfg: - optimizer_config.update(fp16_cfg) - if 'Fp16' in optimizer_config.type: - optimizer_config.update(distributed=distributed) - - # register hooks - runner.register_training_hooks(cfg.lr_config, optimizer_config, - cfg.checkpoint_config, cfg.log_config, - cfg.get('momentum_config', None)) - if distributed: - runner.register_hook(DistSamplerSeedHook()) - - # register eval hooks - if validate: - val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) - val_dataloader = build_dataloader( - val_dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False, - persistent_workers=cfg.data.get('persistent_workers', False)) - eval_cfg = cfg.get('evaluation', {}) - eval_hook = DistEvalHook if distributed else EvalHook - runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) - - # user-defined hooks - if cfg.get('custom_hooks', None): - custom_hooks = cfg.custom_hooks - assert isinstance(custom_hooks, list), \ - f'custom_hooks expect list type, but got {type(custom_hooks)}' - for hook_cfg in cfg.custom_hooks: - assert isinstance(hook_cfg, dict), \ - 'Each item in custom_hooks expects dict type, but got ' \ - f'{type(hook_cfg)}' - hook_cfg = hook_cfg.copy() - priority = hook_cfg.pop('priority', 'NORMAL') - hook = build_from_cfg(hook_cfg, HOOKS) - runner.register_hook(hook, priority=priority) - - if cfg.resume_from: - runner.resume(cfg.resume_from) - elif cfg.load_from: - runner.load_checkpoint(cfg.load_from) - runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmtrack/core/__init__.py b/mmtrack/core/__init__.py deleted file mode 100644 index bb5260f22..000000000 --- a/mmtrack/core/__init__.py 
+++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .anchor import * # noqa: F401, F403 -from .bbox import * # noqa: F401, F403 -from .evaluation import * # noqa: F401, F403 -from .hook import * # noqa: F401, F403 -from .motion import * # noqa: F401, F403 -from .optimizer import * # noqa: F401, F403 -from .track import * # noqa: F401, F403 -from .utils import * # noqa: F401, F403 diff --git a/mmtrack/core/evaluation/__init__.py b/mmtrack/core/evaluation/__init__.py deleted file mode 100644 index ecd9a5924..000000000 --- a/mmtrack/core/evaluation/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .eval_hooks import DistEvalHook, EvalHook -from .eval_mot import eval_mot -from .eval_sot_ope import eval_sot_ope -from .eval_sot_vot import (bbox2region, eval_sot_accuracy_robustness, - eval_sot_eao) -from .eval_vis import eval_vis - -__all__ = [ - 'EvalHook', 'DistEvalHook', 'eval_mot', 'eval_sot_ope', 'bbox2region', - 'eval_sot_eao', 'eval_sot_accuracy_robustness', 'eval_vis' -] diff --git a/mmtrack/core/evaluation/eval_hooks.py b/mmtrack/core/evaluation/eval_hooks.py deleted file mode 100644 index aa50c8cc4..000000000 --- a/mmtrack/core/evaluation/eval_hooks.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp - -import torch.distributed as dist -from mmcv.runner import DistEvalHook as BaseDistEvalHook -from mmcv.runner import EvalHook as BaseEvalHook -from torch.nn.modules.batchnorm import _BatchNorm - - -class EvalHook(BaseEvalHook): - """Please refer to `mmcv.runner.hooks.evaluation.py:EvalHook` for detailed - docstring.""" - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - if not self._should_evaluate(runner): - return - - from mmtrack.apis import single_gpu_test - results = single_gpu_test(runner.model, self.dataloader, show=False) - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - if self.save_best: - self._save_ckpt(runner, key_score) - - -class DistEvalHook(BaseDistEvalHook): - """Please refer to `mmcv.runner.hooks.evaluation.py:DistEvalHook` for - detailed docstring.""" - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - # Synchronization of BatchNorm's buffer (running_mean - # and running_var) is not supported in the DDP of pytorch, - # which may cause the inconsistent performance of models in - # different ranks, so we broadcast BatchNorm's buffers - # of rank 0 to other ranks to avoid this. 
- if self.broadcast_bn_buffer: - model = runner.model - for name, module in model.named_modules(): - if isinstance(module, - _BatchNorm) and module.track_running_stats: - dist.broadcast(module.running_var, 0) - dist.broadcast(module.running_mean, 0) - - if not self._should_evaluate(runner): - return - - tmpdir = self.tmpdir - if tmpdir is None: - tmpdir = osp.join(runner.work_dir, '.eval_hook') - - from mmtrack.apis import multi_gpu_test - results = multi_gpu_test( - runner.model, - self.dataloader, - tmpdir=tmpdir, - gpu_collect=self.gpu_collect) - if runner.rank == 0: - print('\n') - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - - if self.save_best: - self._save_ckpt(runner, key_score) diff --git a/mmtrack/core/evaluation/eval_mot.py b/mmtrack/core/evaluation/eval_mot.py deleted file mode 100644 index ce8461ffe..000000000 --- a/mmtrack/core/evaluation/eval_mot.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import time -from multiprocessing import Pool - -import motmetrics as mm -import numpy as np -import pandas as pd -from mmcv.utils import print_log -from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps -from motmetrics.lap import linear_sum_assignment -from motmetrics.math_util import quiet_divide - -from mmtrack.core.track import outs2results - -METRIC_MAPS = { - 'idf1': 'IDF1', - 'mota': 'MOTA', - 'motp': 'MOTP', - 'num_false_positives': 'FP', - 'num_misses': 'FN', - 'num_switches': 'IDSw', - 'recall': 'Rcll', - 'precision': 'Prcn', - 'mostly_tracked': 'MT', - 'partially_tracked': 'PT', - 'mostly_lost': 'ML', - 'num_fragmentations': 'FM' -} - - -def bbox_distances(bboxes1, bboxes2, iou_thr=0.5): - """Calculate the IoU distances of two sets of boxes.""" - ious = bbox_overlaps(bboxes1, bboxes2, mode='iou') - distances = 1 - ious - distances = np.where(distances > iou_thr, np.nan, distances) - return distances - - -def acc_single_video(results, - gts, - iou_thr=0.5, - ignore_iof_thr=0.5, - ignore_by_classes=False): - """Accumulate results in a single video.""" - num_classes = len(results[0]) - accumulators = [ - mm.MOTAccumulator(auto_id=True) for i in range(num_classes) - ] - for result, gt in zip(results, gts): - if ignore_by_classes: - gt_ignore = outs2results( - bboxes=gt['bboxes_ignore'], - labels=gt['labels_ignore'], - num_classes=num_classes)['bbox_results'] - else: - gt_ignore = [gt['bboxes_ignore'] for i in range(num_classes)] - gt = outs2results( - bboxes=gt['bboxes'], - labels=gt['labels'], - ids=gt['instance_ids'], - num_classes=num_classes)['bbox_results'] - for i in range(num_classes): - gt_ids, gt_bboxes = gt[i][:, 0].astype(np.int), gt[i][:, 1:] - pred_ids, pred_bboxes = result[i][:, 0].astype( - np.int), result[i][:, 1:-1] - dist = bbox_distances(gt_bboxes, pred_bboxes, iou_thr) - if gt_ignore[i].shape[0] > 0: - # 1. assign gt and preds - fps = np.ones(pred_bboxes.shape[0]).astype(np.bool) - row, col = linear_sum_assignment(dist) - for m, n in zip(row, col): - if not np.isfinite(dist[m, n]): - continue - fps[n] = False - # 2. ignore by iof - iofs = bbox_overlaps(pred_bboxes, gt_ignore[i], mode='iof') - ignores = (iofs > ignore_iof_thr).any(axis=1) - # 3. 
filter preds - valid_inds = ~(fps & ignores) - pred_ids = pred_ids[valid_inds] - dist = dist[:, valid_inds] - if dist.shape != (0, 0): - accumulators[i].update(gt_ids, pred_ids, dist) - return accumulators - - -def aggregate_accs(accumulators, classes): - """Aggregate results from each class.""" - # accs for each class - items = list(classes) - names, accs = [[] for c in classes], [[] for c in classes] - for video_ind, _accs in enumerate(accumulators): - for cls_ind, acc in enumerate(_accs): - if len(acc._events['Type']) == 0: - continue - name = f'{classes[cls_ind]}_{video_ind}' - names[cls_ind].append(name) - accs[cls_ind].append(acc) - - # overall - items.append('OVERALL') - names.append([n for name in names for n in name]) - accs.append([a for acc in accs for a in acc]) - - return names, accs, items - - -def eval_single_class(names, accs): - """Evaluate CLEAR MOT results for each class.""" - mh = mm.metrics.create() - summary = mh.compute_many( - accs, names=names, metrics=METRIC_MAPS.keys(), generate_overall=True) - results = [v['OVERALL'] for k, v in summary.to_dict().items()] - motp_ind = list(METRIC_MAPS).index('motp') - if np.isnan(results[motp_ind]): - num_dets = mh.compute_many( - accs, - names=names, - metrics=['num_detections'], - generate_overall=True) - sum_motp = (summary['motp'] * num_dets['num_detections']).sum() - motp = quiet_divide(sum_motp, num_dets['num_detections']['OVERALL']) - results[motp_ind] = float(1 - motp) - else: - results[motp_ind] = 1 - results[motp_ind] - return results - - -def eval_mot(results, - annotations, - logger=None, - classes=None, - iou_thr=0.5, - ignore_iof_thr=0.5, - ignore_by_classes=False, - nproc=4): - """Evaluation CLEAR MOT metrics. - - Args: - results (list[list[list[ndarray]]]): The first list indicates videos, - The second list indicates images. The third list indicates - categories. The ndarray indicates the tracking results. - annotations (list[list[dict]]): The first list indicates videos, - The second list indicates images. The third list indicates - the annotations of each video. Keys of annotations are - - - `bboxes`: numpy array of shape (n, 4) - - `labels`: numpy array of shape (n, ) - - `instance_ids`: numpy array of shape (n, ) - - `bboxes_ignore` (optional): numpy array of shape (k, 4) - - `labels_ignore` (optional): numpy array of shape (k, ) - logger (logging.Logger | str | None, optional): The way to print the - evaluation results. Defaults to None. - classes (list, optional): Classes in the dataset. Defaults to None. - iou_thr (float, optional): IoU threshold for evaluation. - Defaults to 0.5. - ignore_iof_thr (float, optional): Iof threshold to ignore results. - Defaults to 0.5. - ignore_by_classes (bool, optional): Whether ignore the results by - classes or not. Defaults to False. - nproc (int, optional): Number of the processes. Defaults to 4. - - Returns: - dict[str, float]: Evaluation results. 
- """ - print_log('---CLEAR MOT Evaluation---', logger) - t = time.time() - gts = annotations.copy() - if classes is None: - classes = [i + 1 for i in range(len(results[0]))] - assert len(results) == len(gts) - metrics = METRIC_MAPS.keys() - - print_log('Accumulating...', logger) - - pool = Pool(nproc) - accs = pool.starmap( - acc_single_video, - zip(results, gts, [iou_thr for _ in range(len(gts))], - [ignore_iof_thr for _ in range(len(gts))], - [ignore_by_classes for _ in range(len(gts))])) - names, accs, items = aggregate_accs(accs, classes) - print_log('Evaluating...', logger) - eval_results = pd.DataFrame(columns=metrics) - summaries = pool.starmap(eval_single_class, zip(names, accs)) - pool.close() - - # category and overall results - for i, item in enumerate(items): - eval_results.loc[item] = summaries[i] - - dtypes = {m: type(d) for m, d in zip(metrics, summaries[0])} - # average results - avg_results = [] - for i, m in enumerate(metrics): - v = np.array([s[i] for s in summaries[:len(classes)]]) - v = np.nan_to_num(v, nan=0) - if dtypes[m] == int: - avg_results.append(int(v.sum())) - elif dtypes[m] == float: - avg_results.append(float(v.mean())) - else: - raise TypeError() - eval_results.loc['AVERAGE'] = avg_results - eval_results = eval_results.astype(dtypes) - - print_log('Rendering...', logger) - strsummary = mm.io.render_summary( - eval_results, - formatters=mm.metrics.create().formatters, - namemap=METRIC_MAPS) - - print_log('\n' + strsummary, logger) - print_log(f'Evaluation finishes with {(time.time() - t):.2f} s.', logger) - - eval_results = eval_results.to_dict() - out = {METRIC_MAPS[k]: v['OVERALL'] for k, v in eval_results.items()} - for k, v in out.items(): - out[k] = float(f'{(v):.3f}') if isinstance(v, float) else int(f'{v}') - for m in ['OVERALL', 'AVERAGE']: - out[f'track_{m}_copypaste'] = '' - for k in METRIC_MAPS.keys(): - v = eval_results[k][m] - v = f'{(v):.3f} ' if isinstance(v, float) else f'{v} ' - out[f'track_{m}_copypaste'] += v - - return out diff --git a/mmtrack/core/evaluation/eval_vis.py b/mmtrack/core/evaluation/eval_vis.py deleted file mode 100644 index 87bd83675..000000000 --- a/mmtrack/core/evaluation/eval_vis.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import contextlib -import io -from collections import OrderedDict - -from mmcv.utils import print_log - -from .ytvis import YTVIS -from .ytviseval import YTVISeval - - -def eval_vis(test_results, vis_anns, logger=None): - """Evaluation on VIS metrics. - - Args: - test_results (dict(list[dict])): Testing results of the VIS dataset. - vis_anns (dict(list[dict])): The annotation in the format - of YouTube-VIS. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - - Returns: - dict[str, float]: Evaluation results. 
- """ - ytvis = YTVIS(vis_anns) - - if len(ytvis.anns) == 0: - print_log('Annotations does not exist', logger=logger) - return - - ytvis_dets = ytvis.loadRes(test_results) - vid_ids = ytvis.getVidIds() - - iou_type = metric = 'segm' - eval_results = OrderedDict() - ytvisEval = YTVISeval(ytvis, ytvis_dets, iou_type) - ytvisEval.params.vidIds = vid_ids - ytvisEval.evaluate() - ytvisEval.accumulate() - - # Save coco summarize print information to logger - redirect_string = io.StringIO() - with contextlib.redirect_stdout(redirect_string): - ytvisEval.summarize() - print_log('\n' + redirect_string.getvalue(), logger=logger) - - metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'] - coco_metric_names = { - 'mAP': 0, - 'mAP_50': 1, - 'mAP_75': 2, - 'mAP_s': 3, - 'mAP_m': 4, - 'mAP_l': 5, - 'AR@1': 6, - 'AR@10': 7, - 'AR@100': 8, - 'AR_s@100': 9, - 'AR_m@100': 10, - 'AR_l@100': 11 - } - - for metric_item in metric_items: - key = f'{metric}_{metric_item}' - val = float(f'{ytvisEval.stats[coco_metric_names[metric_item]]:.3f}') - eval_results[key] = val - - ap = ytvisEval.stats[:6] - eval_results[f'{metric}_mAP_copypaste'] = ( - f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' - f'{ap[4]:.3f} {ap[5]:.3f}') - return eval_results diff --git a/mmtrack/core/hook/__init__.py b/mmtrack/core/hook/__init__.py deleted file mode 100644 index 32ffd1e88..000000000 --- a/mmtrack/core/hook/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .yolox_mode_switch_hook import YOLOXModeSwitchHook - -__all__ = ['YOLOXModeSwitchHook'] diff --git a/mmtrack/core/optimizer/__init__.py b/mmtrack/core/optimizer/__init__.py deleted file mode 100644 index 597bf2cd3..000000000 --- a/mmtrack/core/optimizer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .sot_lr_updater import SiameseRPNLrUpdaterHook -from .sot_optimizer_hook import (SiameseRPNFp16OptimizerHook, - SiameseRPNOptimizerHook) - -__all__ = [ - 'SiameseRPNOptimizerHook', 'SiameseRPNLrUpdaterHook', - 'SiameseRPNFp16OptimizerHook' -] diff --git a/mmtrack/core/optimizer/sot_lr_updater.py b/mmtrack/core/optimizer/sot_lr_updater.py deleted file mode 100644 index a64f54efc..000000000 --- a/mmtrack/core/optimizer/sot_lr_updater.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import numpy as np -from mmcv.runner.hooks import HOOKS, LrUpdaterHook - - -def step_lr_interval(start_lr_factor, end_lr_factor, start_epoch, end_epoch): - """Exponentially varying learning rate. - - Generator learning rate factor exponentially varying from `start_lr_factor` - to `end_lr_factor` in total `end_epoch - start_epoch` epochs. - - Args: - start_lr_factor (float): Start learning rate factor. - end_lr_factor (float): End learning rate factor. - start_epoch (int): Start epoch. - end_epoch (int): End epoch. - - Returns: - ndarray: The exponentially varying learning rate. - """ - epochs = end_epoch - start_epoch - mult = math.pow(end_lr_factor / start_lr_factor, 1. / (epochs)) - lr_intervals = start_lr_factor * (mult**np.arange(epochs)) - return lr_intervals - - -def log_lr_interval(start_lr_factor, end_lr_factor, start_epoch, end_epoch): - """Logarithmically varying learning rate. - - Generator learning rate factor logarithmically varying from - `start_lr_factor` to `end_lr_factor` in total `end_epoch - start_epoch` - epochs. - - Args: - start_lr_factor (float): Start learning rate factor. 
- end_lr_factor (float): End learning rate factor. - start_epoch (int): Start epoch. - end_epoch (int): End epoch. - - Returns: - ndarray: The logarithmically varying learning rate. - """ - epochs = end_epoch - start_epoch - lr_intervals = np.logspace( - math.log10(start_lr_factor), math.log10(end_lr_factor), epochs) - return lr_intervals - - -@HOOKS.register_module() -class SiameseRPNLrUpdaterHook(LrUpdaterHook): - """Learning rate updater for siamese rpn. - - Args: - lr_configs (list[dict]): List of dict where each dict denotes the - configuration of specifical learning rate updater and must have - 'type'. - """ - - lr_types = {'step': step_lr_interval, 'log': log_lr_interval} - - def __init__(self, - lr_configs=[ - dict( - type='step', - start_lr_factor=0.2, - end_lr_factor=1.0, - end_epoch=5), - dict( - type='log', - start_lr_factor=1.0, - end_lr_factor=0.1, - end_epoch=20), - ], - **kwargs): - super(SiameseRPNLrUpdaterHook, self).__init__(**kwargs) - assert self.by_epoch is True - self.lr_intervals = [] - - start_epoch = 0 - for lr_config in lr_configs: - lr_type = self.lr_types[lr_config.pop('type')] - lr_config['start_epoch'] = start_epoch - - lr_intervals = lr_type(**lr_config) - - self.lr_intervals.append(lr_intervals) - start_epoch = lr_config['end_epoch'] - self.lr_intervals = np.concatenate(self.lr_intervals) - - def get_lr(self, runner, base_lr): - """Get a specifical learning rate for each epoch.""" - return base_lr * self.lr_intervals[runner.epoch] diff --git a/mmtrack/core/optimizer/sot_optimizer_hook.py b/mmtrack/core/optimizer/sot_optimizer_hook.py deleted file mode 100644 index c3d24a928..000000000 --- a/mmtrack/core/optimizer/sot_optimizer_hook.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn -from mmcv.runner.hooks import HOOKS, Fp16OptimizerHook, OptimizerHook - - -@HOOKS.register_module() -class SiameseRPNOptimizerHook(OptimizerHook): - """Optimizer hook for siamese rpn. - - Args: - backbone_start_train_epoch (int): Start to train the backbone at - `backbone_start_train_epoch`-th epoch. Note the epoch in this - class counts from 0, while the epoch in the log file counts from 1. - backbone_train_layers (list(str)): List of str denoting the stages - needed be trained in backbone. - """ - - def __init__(self, backbone_start_train_epoch, backbone_train_layers, - **kwargs): - super(SiameseRPNOptimizerHook, self).__init__(**kwargs) - self.backbone_start_train_epoch = backbone_start_train_epoch - self.backbone_train_layers = backbone_train_layers - - def before_train_epoch(self, runner): - """If `runner.epoch >= self.backbone_start_train_epoch`, start to train - the backbone.""" - if runner.epoch >= self.backbone_start_train_epoch: - for layer in self.backbone_train_layers: - for param in getattr(runner.model.module.backbone, - layer).parameters(): - param.requires_grad = True - for m in getattr(runner.model.module.backbone, - layer).modules(): - if isinstance(m, nn.BatchNorm2d): - m.train() - - -@HOOKS.register_module() -class SiameseRPNFp16OptimizerHook(Fp16OptimizerHook): - """FP16Optimizer hook for siamese rpn. - - Args: - backbone_start_train_epoch (int): Start to train the backbone at - `backbone_start_train_epoch`-th epoch. Note the epoch in this - class counts from 0, while the epoch in the log file counts from 1. - backbone_train_layers (list(str)): List of str denoting the stages - needed be trained in backbone. 
- """ - - def __init__(self, backbone_start_train_epoch, backbone_train_layers, - **kwargs): - super(SiameseRPNFp16OptimizerHook, self).__init__(**kwargs) - self.backbone_start_train_epoch = backbone_start_train_epoch - self.backbone_train_layers = backbone_train_layers - - def before_train_epoch(self, runner): - """If `runner.epoch >= self.backbone_start_train_epoch`, start to train - the backbone.""" - if runner.epoch >= self.backbone_start_train_epoch: - for layer in self.backbone_train_layers: - for param in getattr(runner.model.module.backbone, - layer).parameters(): - param.requires_grad = True - for m in getattr(runner.model.module.backbone, - layer).modules(): - if isinstance(m, nn.BatchNorm2d): - m.train() diff --git a/mmtrack/core/track/__init__.py b/mmtrack/core/track/__init__.py deleted file mode 100644 index facaa041e..000000000 --- a/mmtrack/core/track/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .correlation import depthwise_correlation -from .interpolation import interpolate_tracks -from .similarity import embed_similarity -from .transforms import imrenormalize, outs2results, results2outs - -__all__ = [ - 'depthwise_correlation', 'outs2results', 'results2outs', - 'embed_similarity', 'imrenormalize', 'interpolate_tracks' -] diff --git a/mmtrack/core/track/interpolation.py b/mmtrack/core/track/interpolation.py deleted file mode 100644 index 1f7e66e71..000000000 --- a/mmtrack/core/track/interpolation.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np - - -def _interpolate_track(track, track_id, max_num_frames=20): - """Interpolate a track linearly to make the track more complete. - - Args: - track (ndarray): With shape (N, 7). Each row denotes - (frame_id, track_id, x1, y1, x2, y2, score). - max_num_frames (int, optional): The maximum disconnected length in the - track. Defaults to 20. - - Returns: - ndarray: The interpolated track with shape (N, 7). Each row denotes - (frame_id, track_id, x1, y1, x2, y2, score) - """ - assert (track[:, 1] == track_id).all(), \ - 'The track id should not changed when interpolate a track.' - - frame_ids = track[:, 0] - interpolated_track = np.zeros((0, 7)) - # perform interpolation for the disconnected frames in the track. - for i in np.where(np.diff(frame_ids) > 1)[0]: - left_frame_id = frame_ids[i] - right_frame_id = frame_ids[i + 1] - num_disconnected_frames = int(right_frame_id - left_frame_id) - - if 1 < num_disconnected_frames < max_num_frames: - left_bbox = track[i, 2:6] - right_bbox = track[i + 1, 2:6] - - # perform interpolation for two adjacent tracklets. - for j in range(1, num_disconnected_frames): - cur_bbox = j / (num_disconnected_frames) * ( - right_bbox - left_bbox) + left_bbox - cur_result = np.ones((7, )) - cur_result[0] = j + left_frame_id - cur_result[1] = track_id - cur_result[2:6] = cur_bbox - - interpolated_track = np.concatenate( - (interpolated_track, cur_result[None]), axis=0) - - interpolated_track = np.concatenate((track, interpolated_track), axis=0) - return interpolated_track - - -def interpolate_tracks(tracks, min_num_frames=5, max_num_frames=20): - """Interpolate tracks linearly to make tracks more complete. - - This function is proposed in - "ByteTrack: Multi-Object Tracking by Associating Every Detection Box." - `ByteTrack`_. - - Args: - tracks (ndarray): With shape (N, 7). Each row denotes - (frame_id, track_id, x1, y1, x2, y2, score). 
- min_num_frames (int, optional): The minimum length of a track that will - be interpolated. Defaults to 5. - max_num_frames (int, optional): The maximum disconnected length in - a track. Defaults to 20. - - Returns: - ndarray: The interpolated tracks with shape (N, 7). Each row denotes - (frame_id, track_id, x1, y1, x2, y2, score) - """ - max_track_id = int(np.max(tracks[:, 1])) - min_track_id = int(np.min(tracks[:, 1])) - - # perform interpolation for each track - interpolated_tracks = [] - for track_id in range(min_track_id, max_track_id + 1): - inds = tracks[:, 1] == track_id - track = tracks[inds] - num_frames = len(track) - if num_frames <= 2: - continue - - if num_frames > min_num_frames: - interpolated_track = _interpolate_track(track, track_id, - max_num_frames) - else: - interpolated_track = track - interpolated_tracks.append(interpolated_track) - - interpolated_tracks = np.concatenate(interpolated_tracks) - return interpolated_tracks[interpolated_tracks[:, 0].argsort()] diff --git a/mmtrack/core/track/transforms.py b/mmtrack/core/track/transforms.py deleted file mode 100644 index 6a7c02f10..000000000 --- a/mmtrack/core/track/transforms.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np -import torch -from mmdet.core import bbox2result - - -def imrenormalize(img, img_norm_cfg, new_img_norm_cfg): - """Re-normalize the image. - - Args: - img (Tensor | ndarray): Input image. If the input is a Tensor, the - shape is (1, C, H, W). If the input is a ndarray, the shape - is (H, W, C). - img_norm_cfg (dict): Original configuration for the normalization. - new_img_norm_cfg (dict): New configuration for the normalization. - - Returns: - Tensor | ndarray: Output image with the same type and shape of - the input. - """ - if isinstance(img, torch.Tensor): - assert img.ndim == 4 and img.shape[0] == 1 - new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) - new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) - new_img = new_img.transpose(2, 0, 1)[None] - return torch.from_numpy(new_img).to(img) - else: - return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) - - -def _imrenormalize(img, img_norm_cfg, new_img_norm_cfg): - """Re-normalize the image.""" - img_norm_cfg = img_norm_cfg.copy() - new_img_norm_cfg = new_img_norm_cfg.copy() - for k, v in img_norm_cfg.items(): - if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): - img_norm_cfg[k] = np.array(v, dtype=img.dtype) - # reverse cfg - if 'to_rgb' in img_norm_cfg: - img_norm_cfg['to_bgr'] = img_norm_cfg['to_rgb'] - img_norm_cfg.pop('to_rgb') - for k, v in new_img_norm_cfg.items(): - if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): - new_img_norm_cfg[k] = np.array(v, dtype=img.dtype) - img = mmcv.imdenormalize(img, **img_norm_cfg) - img = mmcv.imnormalize(img, **new_img_norm_cfg) - return img - - -def outs2results(bboxes=None, - labels=None, - masks=None, - ids=None, - num_classes=None, - **kwargs): - """Convert tracking/detection results to a list of numpy arrays. - - Args: - bboxes (torch.Tensor | np.ndarray): shape (n, 5) - labels (torch.Tensor | np.ndarray): shape (n, ) - masks (torch.Tensor | np.ndarray): shape (n, h, w) - ids (torch.Tensor | np.ndarray): shape (n, ) - num_classes (int): class number, not including background class - - Returns: - dict[str : list(ndarray) | list[list[np.ndarray]]]: tracking/detection - results of each class. 
It may contain keys as belows: - - - bbox_results (list[np.ndarray]): Each list denotes bboxes of one - category. - - mask_results (list[list[np.ndarray]]): Each outer list denotes masks - of one category. Each inner list denotes one mask belonging to - the category. Each mask has shape (h, w). - """ - assert labels is not None - assert num_classes is not None - - results = dict() - - if ids is not None: - valid_inds = ids > -1 - ids = ids[valid_inds] - labels = labels[valid_inds] - - if bboxes is not None: - if ids is not None: - bboxes = bboxes[valid_inds] - if bboxes.shape[0] == 0: - bbox_results = [ - np.zeros((0, 6), dtype=np.float32) - for i in range(num_classes) - ] - else: - if isinstance(bboxes, torch.Tensor): - bboxes = bboxes.cpu().numpy() - labels = labels.cpu().numpy() - ids = ids.cpu().numpy() - bbox_results = [ - np.concatenate( - (ids[labels == i, None], bboxes[labels == i, :]), - axis=1) for i in range(num_classes) - ] - else: - bbox_results = bbox2result(bboxes, labels, num_classes) - results['bbox_results'] = bbox_results - - if masks is not None: - if ids is not None: - masks = masks[valid_inds] - if isinstance(masks, torch.Tensor): - masks = masks.detach().cpu().numpy() - masks_results = [[] for _ in range(num_classes)] - for i in range(bboxes.shape[0]): - masks_results[labels[i]].append(masks[i]) - results['mask_results'] = masks_results - - return results - - -def results2outs(bbox_results=None, - mask_results=None, - mask_shape=None, - **kwargs): - """Restore the results (list of results of each category) into the results - of the model forward. - - Args: - bbox_results (list[np.ndarray]): Each list denotes bboxes of one - category. - mask_results (list[list[np.ndarray]]): Each outer list denotes masks of - one category. Each inner list denotes one mask belonging to - the category. Each mask has shape (h, w). - mask_shape (tuple[int]): The shape (h, w) of mask. - - Returns: - tuple: tracking results of each class. It may contain keys as belows: - - - bboxes (np.ndarray): shape (n, 5) - - labels (np.ndarray): shape (n, ) - - masks (np.ndarray): shape (n, h, w) - - ids (np.ndarray): shape (n, ) - """ - outputs = dict() - - if bbox_results is not None: - labels = [] - for i, bbox in enumerate(bbox_results): - labels.extend([i] * bbox.shape[0]) - labels = np.array(labels, dtype=np.int64) - outputs['labels'] = labels - - bboxes = np.concatenate(bbox_results, axis=0).astype(np.float32) - if bboxes.shape[1] == 5: - outputs['bboxes'] = bboxes - elif bboxes.shape[1] == 6: - ids = bboxes[:, 0].astype(np.int64) - bboxes = bboxes[:, 1:] - outputs['bboxes'] = bboxes - outputs['ids'] = ids - else: - raise NotImplementedError( - f'Not supported bbox shape: (N, {bboxes.shape[1]})') - - if mask_results is not None: - assert mask_shape is not None - mask_height, mask_width = mask_shape - mask_results = mmcv.concat_list(mask_results) - if len(mask_results) == 0: - masks = np.zeros((0, mask_height, mask_width)).astype(bool) - else: - masks = np.stack(mask_results, axis=0) - outputs['masks'] = masks - - return outputs diff --git a/mmtrack/core/utils/__init__.py b/mmtrack/core/utils/__init__.py deleted file mode 100644 index 7e4c0e777..000000000 --- a/mmtrack/core/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
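# --- Illustrative sketch, not part of the patch: the removed outs2results helper
# groups detections per class; when track ids are supplied, each class gets an
# (n, 6) array laid out as (id, x1, y1, x2, y2, score), and results2outs simply
# splits that first column back out as `ids`. All values below are made up.
import numpy as np

bboxes = np.array([[10., 10., 50., 60., 0.9],
                   [20., 30., 80., 90., 0.8]], dtype=np.float32)
labels = np.array([0, 1])
ids = np.array([3, 7])
num_classes = 2

# per-class grouping, mirroring the removed outs2results
bbox_results = [
    np.concatenate((ids[labels == i, None], bboxes[labels == i, :]), axis=1)
    for i in range(num_classes)
]
assert bbox_results[0].shape == (1, 6)  # class 0 holds track id 3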
-from .image import crop_image -from .misc import setup_multi_processes -from .visualization import imshow_mot_errors, imshow_tracks - -__all__ = [ - 'crop_image', 'imshow_tracks', 'imshow_mot_errors', 'setup_multi_processes' -] diff --git a/mmtrack/core/utils/image.py b/mmtrack/core/utils/image.py deleted file mode 100644 index 500172d1b..000000000 --- a/mmtrack/core/utils/image.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import cv2 -import numpy as np - - -def crop_image(image, crop_region, crop_size, padding=(0, 0, 0)): - """Crop image based on `crop_region` and `crop_size`. - - Args: - image (ndarray): of shape (H, W, 3). - crop_region (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - crop_size (int): Crop size. - padding (tuple | ndarray): of shape (3, ) denoting the padding values. - - Returns: - ndarray: Cropped image of shape (crop_size, crop_size, 3). - """ - a = crop_size / (crop_region[2] - crop_region[0]) - b = crop_size / (crop_region[3] - crop_region[1]) - c = -a * crop_region[0] - d = -b * crop_region[1] - mapping = np.array([[a, 0, c], [0, b, d]]).astype(np.float32) - crop_image = cv2.warpAffine( - image, - mapping, (crop_size, crop_size), - borderMode=cv2.BORDER_CONSTANT, - borderValue=padding) - return crop_image diff --git a/mmtrack/core/utils/misc.py b/mmtrack/core/utils/misc.py deleted file mode 100644 index 8cf7b325d..000000000 --- a/mmtrack/core/utils/misc.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import multiprocessing as mp -import os -import platform -import warnings - -import cv2 - - -def setup_multi_processes(cfg): - # set multi-process start method as `fork` to speed up the training - if platform.system() != 'Windows': - mp_start_method = cfg.get('mp_start_method', 'fork') - mp.set_start_method(mp_start_method) - - # disable opencv multithreading to avoid system being overloaded - opencv_num_threads = cfg.get('opencv_num_threads', 0) - cv2.setNumThreads(opencv_num_threads) - - # setup OMP threads - # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa - if ('OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1): - omp_num_threads = 1 - warnings.warn( - f'Setting OMP_NUM_THREADS environment variable for each process ' - f'to be {omp_num_threads} in default, to avoid your system being ' - f'overloaded, please further tune the variable for optimal ' - f'performance in your application as needed.') - os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) - - # setup MKL threads - if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: - mkl_num_threads = 1 - warnings.warn( - f'Setting MKL_NUM_THREADS environment variable for each process ' - f'to be {mkl_num_threads} in default, to avoid your system being ' - f'overloaded, please further tune the variable for optimal ' - f'performance in your application as needed.') - os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmtrack/datasets/__init__.py b/mmtrack/datasets/__init__.py index caa53634a..ba0e75f24 100644 --- a/mmtrack/datasets/__init__.py +++ b/mmtrack/datasets/__init__.py @@ -1,9 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
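# --- Illustrative sketch, not part of the patch: the removed crop_image builds a
# 2x3 affine matrix that maps crop_region (x1, y1, x2, y2) onto a square
# crop_size x crop_size patch before handing it to cv2.warpAffine. Numbers are
# made up for illustration.
import numpy as np

crop_region = np.array([100., 50., 300., 250.])  # (x1, y1, x2, y2)
crop_size = 127
a = crop_size / (crop_region[2] - crop_region[0])
b = crop_size / (crop_region[3] - crop_region[1])
mapping = np.array([[a, 0., -a * crop_region[0]],
                    [0., b, -b * crop_region[1]]], dtype=np.float32)

# the region's top-left corner lands on (0, 0) and its bottom-right on
# (crop_size, crop_size) in the output patch
top_left = mapping @ np.array([crop_region[0], crop_region[1], 1.])
bottom_right = mapping @ np.array([crop_region[2], crop_region[3], 1.])
assert np.allclose(top_left, [0., 0.])
assert np.allclose(bottom_right, [crop_size, crop_size])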
-from mmdet.datasets.builder import DATASETS, build_dataset - from .base_sot_dataset import BaseSOTDataset -from .builder import build_dataloader -from .coco_video_dataset import CocoVideoDataset +from .base_video_dataset import BaseVideoDataset from .dancetrack_dataset import DanceTrackDataset from .dataset_wrappers import RandomSampleConcatDataset from .got10k_dataset import GOT10kDataset @@ -11,13 +8,10 @@ from .lasot_dataset import LaSOTDataset from .mot_challenge_dataset import MOTChallengeDataset from .otb_dataset import OTB100Dataset -from .parsers import CocoVID -from .pipelines import PIPELINES from .reid_dataset import ReIDDataset +from .samplers import EntireVideoBatchSampler, QuotaSampler, VideoSampler from .sot_coco_dataset import SOTCocoDataset from .sot_imagenet_vid_dataset import SOTImageNetVIDDataset -from .sot_test_dataset import SOTTestDataset -from .sot_train_dataset import SOTTrainDataset from .tao_dataset import TaoDataset from .trackingnet_dataset import TrackingNetDataset from .uav123_dataset import UAV123Dataset @@ -25,11 +19,10 @@ from .youtube_vis_dataset import YouTubeVISDataset __all__ = [ - 'DATASETS', 'PIPELINES', 'build_dataloader', 'build_dataset', 'CocoVID', - 'CocoVideoDataset', 'ImagenetVIDDataset', 'MOTChallengeDataset', - 'ReIDDataset', 'SOTTrainDataset', 'SOTTestDataset', 'LaSOTDataset', - 'UAV123Dataset', 'TrackingNetDataset', 'OTB100Dataset', - 'YouTubeVISDataset', 'GOT10kDataset', 'VOTDataset', 'BaseSOTDataset', - 'SOTCocoDataset', 'SOTImageNetVIDDataset', 'RandomSampleConcatDataset', - 'TaoDataset', 'DanceTrackDataset' + 'BaseVideoDataset', 'MOTChallengeDataset', 'BaseSOTDataset', + 'LaSOTDataset', 'ReIDDataset', 'GOT10kDataset', 'SOTCocoDataset', + 'SOTImageNetVIDDataset', 'TrackingNetDataset', 'YouTubeVISDataset', + 'ImagenetVIDDataset', 'RandomSampleConcatDataset', 'TaoDataset', + 'UAV123Dataset', 'VOTDataset', 'OTB100Dataset', 'DanceTrackDataset', + 'VideoSampler', 'QuotaSampler', 'EntireVideoBatchSampler' ] diff --git a/mmtrack/datasets/parsers/__init__.py b/mmtrack/datasets/api_wrappers/__init__.py similarity index 64% rename from mmtrack/datasets/parsers/__init__.py rename to mmtrack/datasets/api_wrappers/__init__.py index 9985b074c..73602d6ac 100644 --- a/mmtrack/datasets/parsers/__init__.py +++ b/mmtrack/datasets/api_wrappers/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .coco_video_parser import CocoVID +from .coco_video_api import CocoVID __all__ = ['CocoVID'] diff --git a/mmtrack/datasets/parsers/coco_video_parser.py b/mmtrack/datasets/api_wrappers/coco_video_api.py similarity index 100% rename from mmtrack/datasets/parsers/coco_video_parser.py rename to mmtrack/datasets/api_wrappers/coco_video_api.py diff --git a/mmtrack/datasets/base_sot_dataset.py b/mmtrack/datasets/base_sot_dataset.py index 823661e35..a7a6fcb24 100644 --- a/mmtrack/datasets/base_sot_dataset.py +++ b/mmtrack/datasets/base_sot_dataset.py @@ -1,112 +1,75 @@ # Copyright (c) OpenMMLab. All rights reserved. 
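# --- Hedged sketch, not part of the patch: with build_dataset/build_dataloader
# removed, datasets and samplers are declared in configs and built through the
# MMEngine registry (see `from mmtrack.registry import DATASETS` in the files
# below) or by the Runner. The dataset type and sampler come from the __all__
# list above; the paths, batch size and empty pipeline are placeholders.
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    sampler=dict(type='VideoSampler'),      # video-aware test-time sampler
    dataset=dict(
        type='MOTChallengeDataset',
        data_root='data/MOT17/',            # placeholder path
        ann_file='annotations/half-val_cocoformat.json',
        data_prefix=dict(img_path='train'),
        test_mode=True,
        pipeline=[]))                       # placeholder pipeline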
import os.path as osp import random -from abc import ABCMeta, abstractmethod +from abc import ABCMeta from io import StringIO +from typing import Any, Optional, Sequence, Union -import mmcv import numpy as np from addict import Dict -from mmcv.utils import print_log -from mmdet.datasets.pipelines import Compose -from torch.utils.data import Dataset +from mmengine.dataset import BaseDataset, force_full_init +from mmengine.fileio.file_client import FileClient -from mmtrack.core.evaluation import eval_sot_ope -from mmtrack.datasets import DATASETS +from mmtrack.registry import DATASETS @DATASETS.register_module() -class BaseSOTDataset(Dataset, metaclass=ABCMeta): - """Dataset of single object tracking. The dataset can both support training - and testing mode. +class BaseSOTDataset(BaseDataset, metaclass=ABCMeta): + """Base dataset for SOT task. The dataset can both support training and + testing mode. Args: - img_prefix (str): Prefix in the paths of image files. - pipeline (list[dict]): Processing pipeline. - split (str): Dataset split. - ann_file (str, optional): The file contains data information. It will - be loaded and parsed in the `self.load_data_infos` function. - test_mode (bool, optional): Default to False. bbox_min_size (int, optional): Only bounding boxes whose sizes are - larger than `bbox_min_size` can be regarded as valid. Default to 0. + larger than ``bbox_min_size`` can be regarded as valid. + Default to 0. only_eval_visible (bool, optional): Whether to only evaluate frames where object are visible. Default to False. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. Default: dict(backend='disk'). """ - # Compatible with MOT and VID Dataset class. The 'CLASSES' attribute will - # be called in tools/train.py. - CLASSES = None + META = dict(CLASSES=None) def __init__(self, - img_prefix, - pipeline, - split, - ann_file=None, - test_mode=False, - bbox_min_size=0, - only_eval_visible=False, - file_client_args=dict(backend='disk'), + bbox_min_size: int = 0, + only_eval_visible: bool = False, + *args, **kwargs): - self.img_prefix = img_prefix - self.split = split - self.pipeline = Compose(pipeline) - self.ann_file = ann_file - self.test_mode = test_mode self.bbox_min_size = bbox_min_size self.only_eval_visible = only_eval_visible - self.file_client_args = file_client_args - self.file_client = mmcv.FileClient(**file_client_args) - # 'self.load_as_video' must be set to True in order to using + # ``self.load_as_video`` must be set to True in order to using # distributed video sampler to load dataset when testing. self.load_as_video = True - ''' The self.data_info is a list, which the length is the - number of videos. The default content is in the following format: - [ - { - 'video_path': the video path - 'ann_path': the annotation path - 'start_frame_id': the starting frame ID number contained in - the image name - 'end_frame_id': the ending frame ID number contained in the - image name - 'framename_template': the template of image name - }, - ... - ] - ''' - self.data_infos = self.load_data_infos(split=self.split) - self.num_frames_per_video = [ - self.get_len_per_video(video_ind) - for video_ind in range(len(self.data_infos)) - ] + super().__init__(*args, **kwargs) + # used to record the video information at the beginning of the video # test. Thus, we can avoid reloading the files of video information # repeatedly in all frames of one video. 
self.test_memo = Dict() - def __getitem__(self, ind): - if self.test_mode: - assert isinstance(ind, tuple) - # the first element in the tuple is the video index and the second - # element in the tuple is the frame index - return self.prepare_test_data(ind[0], ind[1]) - else: - return self.prepare_train_data(ind) + def _loadtxt(self, + filepath: str, + dtype=np.float32, + delimiter: Optional[str] = None, + skiprows: int = 0, + return_ndarray: bool = True) -> Union[np.ndarray, str]: + """Load TEXT file. - @abstractmethod - def load_data_infos(self, split='train'): - pass + Args: + filepath (str): The path of file. + dtype (data-type, optional): Data-type of the resulting array. + Defaults to np.float32. + delimiter (str, optional): The string used to separate values. + Defaults to None. + skiprows (int, optional): Skip the first ``skiprows`` lines, + including comments. Defaults to 0. + return_ndarray (bool, optional): Whether to return the ``ndarray`` + type. Defaults to True. - def loadtxt(self, - filepath, - dtype=float, - delimiter=None, - skiprows=0, - return_array=True): - file_string = self.file_client.get_text(filepath) - if return_array: + Returns: + Union[np.ndarray, str]: Contents of the file. + """ + file_client = FileClient.infer_client(uri=filepath) + file_string = file_client.get_text(filepath) + if return_ndarray: return np.loadtxt( StringIO(file_string), dtype=dtype, @@ -115,24 +78,25 @@ def loadtxt(self, else: return file_string.strip() - def get_bboxes_from_video(self, video_ind): + def get_bboxes_from_video(self, video_idx: int) -> np.ndarray: """Get bboxes annotation about the instance in a video. Args: - video_ind (int): video index + video_idx (int): video index Returns: - ndarray: in [N, 4] shape. The N is the number of bbox and the bbox - is in (x, y, w, h) format. + np.ndarray: In [N, 4] shape. The N is the number of bbox and + the bbox is in (x, y, w, h) format. """ - bbox_path = osp.join(self.img_prefix, - self.data_infos[video_ind]['ann_path']) - bboxes = self.loadtxt(bbox_path, dtype=float, delimiter=',') + meta_video_info = self.get_data_info(video_idx) + bbox_path = osp.join(self.data_prefix['img_path'], + meta_video_info['ann_path']) + bboxes = self._loadtxt(bbox_path, dtype=float, delimiter=',') if len(bboxes.shape) == 1: bboxes = np.expand_dims(bboxes, axis=0) - end_frame_id = self.data_infos[video_ind]['end_frame_id'] - start_frame_id = self.data_infos[video_ind]['start_frame_id'] + end_frame_id = meta_video_info['end_frame_id'] + start_frame_id = meta_video_info['start_frame_id'] if not self.test_mode: assert len(bboxes) == ( @@ -140,193 +104,204 @@ def get_bboxes_from_video(self, video_ind): ), f'{len(bboxes)} is not equal to {end_frame_id}-{start_frame_id}+1' # noqa return bboxes - def get_len_per_video(self, video_ind): - """Get the number of frames in a video.""" - return self.data_infos[video_ind]['end_frame_id'] - self.data_infos[ - video_ind]['start_frame_id'] + 1 + def get_len_per_video(self, video_idx: int) -> int: + """Get the number of frames in a video. + + Args: + video_idx (int): The index of video. + + Returns: + int: The length of the video. + """ + return self.get_data_info( + video_idx)['end_frame_id'] - self.get_data_info( + video_idx)['start_frame_id'] + 1 + + def get_visibility_from_video(self, video_idx: int) -> dict: + """Get the visible information of instance in a video. + + Args: + video_idx (int): The index of video. 
- def get_visibility_from_video(self, video_ind): - """Get the visible information of instance in a video.""" - visible = np.array([True] * self.get_len_per_video(video_ind)) + Returns: + dict: The visibilities of each object in the video. + """ + visible = np.array([True] * self.get_len_per_video(video_idx)) return dict(visible=visible) - def get_masks_from_video(self, video_ind): + def get_masks_from_video(self, video_idx: int) -> Any: + """Get the mask information of instance in a video. + + Args: + video_idx (int): The index of video. + + Returns: + Any: Not implemented yet. + """ pass - def get_ann_infos_from_video(self, video_ind): - """Get annotation information in a video. + def get_img_infos_from_video(self, video_idx: int) -> dict: + """Get the information of images in a video. Args: - video_ind (int): video index + video_idx (int): The index of video. Returns: - dict: {'bboxes': ndarray in (N, 4) shape, 'bboxes_isvalid': - ndarray, 'visible':ndarray}. The annotation information in some - datasets may contain 'visible_ratio'. The bbox is in - (x1, y1, x2, y2) format. + dict: { + 'video_id': int, + 'frame_ids': np.ndarray, + 'img_paths': list[str], + 'video_length': int + } """ - bboxes = self.get_bboxes_from_video(video_ind) + img_paths = [] + meta_video_info = self.get_data_info(video_idx) + start_frame_id = meta_video_info['start_frame_id'] + end_frame_id = meta_video_info['end_frame_id'] + framename_template = meta_video_info['framename_template'] + for frame_id in range(start_frame_id, end_frame_id + 1): + img_paths.append( + osp.join(self.data_prefix['img_path'], + meta_video_info['video_path'], + framename_template % frame_id)) + video_len = self.get_len_per_video(video_idx) + frame_ids = np.arange(video_len) + + img_infos = dict( + video_id=video_idx, + frame_ids=frame_ids, + img_paths=img_paths, + video_length=video_len) + return img_infos + + def get_ann_infos_from_video(self, video_idx: int) -> dict: + """Get the information of annotations in a video. + + Args: + video_idx (int): The index of video. + + Returns: + dict: { + 'bboxes': np.ndarray in (N, 4) shape, + 'bboxes_isvalid': np.ndarray, + 'visible': np.ndarray + }. + The annotation information in some datasets may contain + 'visible_ratio'. The bbox is in (x1, y1, x2, y2) format. + """ + bboxes = self.get_bboxes_from_video(video_idx) # The visible information in some datasets may contain # 'visible_ratio'. - visible_info = self.get_visibility_from_video(video_ind) + visible_info = self.get_visibility_from_video(video_idx) bboxes_isvalid = (bboxes[:, 2] > self.bbox_min_size) & ( bboxes[:, 3] > self.bbox_min_size) visible_info['visible'] = visible_info['visible'] & bboxes_isvalid bboxes[:, 2:] += bboxes[:, :2] + ann_infos = dict( bboxes=bboxes, bboxes_isvalid=bboxes_isvalid, **visible_info) return ann_infos - def get_img_infos_from_video(self, video_ind): - """Get image information in a video. + def prepare_test_data(self, video_idx: int, frame_idx: int) -> dict: + """Get testing data of one frame. We parse one video, get one frame + from it and pass the frame information to the pipeline. Args: - video_ind (int): video index + video_idx (int): The index of video. + frame_idx (int): The index of frame. Returns: - dict: {'filename': list[str], 'frame_ids':ndarray, 'video_id':int} + dict: Testing data of one frame. 
""" - img_names = [] - start_frame_id = self.data_infos[video_ind]['start_frame_id'] - end_frame_id = self.data_infos[video_ind]['end_frame_id'] - framename_template = self.data_infos[video_ind]['framename_template'] - for frame_id in range(start_frame_id, end_frame_id + 1): - img_names.append( - osp.join(self.data_infos[video_ind]['video_path'], - framename_template % frame_id)) - frame_ids = np.arange(self.get_len_per_video(video_ind)) - img_infos = dict( - filename=img_names, frame_ids=frame_ids, video_id=video_ind) - return img_infos + # Avoid reloading the files of video information + # repeatedly in all frames of one video. + if self.test_memo.get('video_idx', None) != video_idx: + self.test_memo.video_idx = video_idx + ann_infos = self.get_ann_infos_from_video(video_idx) + img_infos = self.get_img_infos_from_video(video_idx) + self.test_memo.video_infos = dict(**img_infos, **ann_infos) + assert 'video_idx' in self.test_memo and 'video_infos'\ + in self.test_memo + + results = {} + results['img_path'] = self.test_memo.video_infos['img_paths'][ + frame_idx] + results['frame_id'] = frame_idx + results['video_id'] = video_idx + results['video_length'] = self.test_memo.video_infos['video_length'] + + results['instances'] = [] + instance = {} + instance['bbox'] = self.test_memo.video_infos['bboxes'][frame_idx] + instance['visible'] = self.test_memo.video_infos['visible'][frame_idx] + instance['bbox_label'] = np.array([0], dtype=np.int32) + results['instances'].append(instance) - def prepare_test_data(self, video_ind, frame_ind): - """Get testing data of one frame. We parse one video, get one frame - from it and pass the frame information to the pipeline. + results = self.pipeline(results) + return results + + def prepare_train_data(self, video_idx: int) -> dict: + """Get training data sampled from some videos. We firstly sample two + videos from the dataset and then parse the data information in the + subsequent pipeline. The first operation in the training pipeline must + be frames sampling. Args: - video_ind (int): video index - frame_ind (int): frame index + video_idx (int): The index of video. Returns: - dict: testing data of one frame. + dict: Training data pairs, triplets or groups. """ - if self.test_memo.get('video_ind', None) != video_ind: - self.test_memo.video_ind = video_ind - self.test_memo.ann_infos = self.get_ann_infos_from_video(video_ind) - self.test_memo.img_infos = self.get_img_infos_from_video(video_ind) - assert 'video_ind' in self.test_memo and 'ann_infos' in \ - self.test_memo and 'img_infos' in self.test_memo - - img_info = dict( - filename=self.test_memo.img_infos['filename'][frame_ind], - frame_id=frame_ind) - ann_info = dict( - bboxes=self.test_memo.ann_infos['bboxes'][frame_ind], - visible=self.test_memo.ann_infos['visible'][frame_ind]) - - results = dict(img_info=img_info, ann_info=ann_info) - self.pre_pipeline(results) - results = self.pipeline(results) + video_idxes = random.choices(list(range(len(self))), k=2) + pair_video_infos = [] + for video_idx in video_idxes: + ann_infos = self.get_ann_infos_from_video(video_idx) + img_infos = self.get_img_infos_from_video(video_idx) + video_infos = dict(**img_infos, **ann_infos) + pair_video_infos.append(video_infos) + + results = self.pipeline(pair_video_infos) return results - def prepare_train_data(self, video_ind): - """Get training data sampled from some videos. We firstly sample two - videos from the dataset and then parse the data information. The first - operation in the training pipeline is frames sampling. 
+ def prepare_data(self, idx: Union[Sequence[int], int]) -> Any: + """Get data processed by ``self.pipeline``. Args: - video_ind (int): video index + idx (int): The index of ``data_info``. Returns: - dict: training data pairs, triplets or groups. - """ - while True: - video_inds = random.choices(list(range(len(self))), k=2) - pair_video_infos = [] - for video_index in video_inds: - ann_infos = self.get_ann_infos_from_video(video_index) - img_infos = self.get_img_infos_from_video(video_index) - video_infos = dict(**ann_infos, **img_infos) - self.pre_pipeline(video_infos) - pair_video_infos.append(video_infos) - - results = self.pipeline(pair_video_infos) - if results is not None: - return results - - def pre_pipeline(self, results): - """Prepare results dict for pipeline. - - The following keys in dict will be called in the subsequent pipeline. + Any: Depends on ``self.pipeline``. """ - results['img_prefix'] = self.img_prefix - results['bbox_fields'] = [] - results['mask_fields'] = [] - results['seg_fields'] = [] - - def __len__(self): if self.test_mode: - return sum(self.num_frames_per_video) + assert isinstance(idx, Sequence) and len(idx) == 2 + # the first element in the ``Sequence`` is the video index and the + # second element in the ``Sequence`` is the frame index + return self.prepare_test_data(idx[0], idx[1]) else: - return len(self.data_infos) + assert isinstance(idx, int) + return self.prepare_train_data(idx) - def evaluate(self, results, metric=['track'], logger=None): - """Default evaluation standard is OPE. + @property + def num_videos(self) -> int: + """Get the number of videos in the dataset. - Args: - results (dict(list[ndarray])): tracking results. The ndarray is in - (x1, y1, x2, y2, score) format. - metric (list, optional): defaults to ['track']. - logger (logging.Logger | str | None, optional): defaults to None. + Returns: + int: The number of videos. """ + num_videos = len(self.data_address) if self.serialize_data else len( + self.data_list) + return num_videos + + @force_full_init + def __len__(self) -> int: + """Get the length of filtered dataset and automatically call + ``full_init`` if the dataset has not been fully init. - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] + Returns: + int: The length of filtered dataset. 
+ """ + if self.test_mode: + return sum( + self.get_len_per_video(idx) for idx in range(self.num_videos)) else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - # get all test annotations - gt_bboxes = [] - visible_infos = [] - for video_ind in range(len(self.data_infos)): - video_anns = self.get_ann_infos_from_video(video_ind) - gt_bboxes.append(video_anns['bboxes']) - visible_infos.append(video_anns['visible']) - - # tracking_bboxes converting code - eval_results = dict() - if 'track' in metrics: - assert len(self) == len( - results['track_bboxes'] - ), f"{len(self)} == {len(results['track_bboxes'])}" - print_log('Evaluate OPE Benchmark...', logger=logger) - track_bboxes = [] - start_ind = end_ind = 0 - for num in self.num_frames_per_video: - end_ind += num - track_bboxes.append( - list( - map(lambda x: x[:-1], - results['track_bboxes'][start_ind:end_ind]))) - start_ind += num - - if not self.only_eval_visible: - visible_infos = None - # evaluation - track_eval_results = eval_sot_ope( - results=track_bboxes, - annotations=gt_bboxes, - visible_infos=visible_infos) - eval_results.update(track_eval_results) - - for k, v in eval_results.items(): - if isinstance(v, float): - eval_results[k] = float(f'{(v):.3f}') - print_log(eval_results, logger=logger) - return eval_results + return self.num_videos diff --git a/mmtrack/datasets/base_video_dataset.py b/mmtrack/datasets/base_video_dataset.py new file mode 100644 index 000000000..c79e2aab5 --- /dev/null +++ b/mmtrack/datasets/base_video_dataset.py @@ -0,0 +1,515 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import random +from typing import Any, List, Tuple + +from mmdet.datasets.api_wrappers import COCO +from mmengine.dataset import BaseDataset, force_full_init +from mmengine.fileio import FileClient +from mmengine.logging import MMLogger + +from mmtrack.registry import DATASETS +from .api_wrappers import CocoVID + + +@DATASETS.register_module() +class BaseVideoDataset(BaseDataset): + """Base video dataset for VID, MOT and VIS tasks, except for SOT tasks. + + Args: + load_as_video (bool, optional): Load data as videos or images. + Defaults to True. + key_img_sampler (dict, optional): Configuration of sampling key images. + Defaults to dict(interval=1). + ref_img_sampler (dict, optional): Configuration of sampling + reference images. + - num_ref_imgs (int, optional): The number of sampled reference + images. Defaults to 2. + - frame_range (List(int) | int, optional): The sampling range of + reference frames in the same video for key frame. + Defaults to 9. + - filter_key_img (bool, optional): If False, the key image will be + in the sampling reference candidates, otherwise, it is excluded. + Defaults to True. + - method (str, optional): The sampling method. Options are + 'uniform', 'bilateral_uniform', 'test_with_adaptive_stride', + 'test_with_fix_stride'. Defaults to 'bilateral_uniform'.
+ """ + META = dict(CLASSES=None) + + def __init__(self, + load_as_video: bool = True, + key_img_sampler: dict = dict(interval=1), + ref_img_sampler: dict = dict( + num_ref_imgs=2, + frame_range=9, + filter_key_img=True, + method='bilateral_uniform'), + *args, + **kwargs): + self.load_as_video = load_as_video + self.key_img_sampler = key_img_sampler + self.ref_img_sampler = ref_img_sampler + super().__init__(*args, **kwargs) + + def full_init(self): + """Load annotation file and set ``BaseDataset._fully_initialized`` to + True. + + If ``lazy_init=False``, ``full_init`` will be called during the + instantiation and ``self._fully_initialized`` will be set to True. If + ``obj._fully_initialized=False``, the class method decorated by + ``force_full_init`` will call ``full_init`` automatically. + + Several steps to initialize annotation: + + - load_data_list: Load annotations from annotation file. + - filter data information: Filter annotations according to + filter_cfg. + - serialize_data: Serialize ``self.data_list`` if + ``self.serialize_data`` is True. + """ + if self._fully_initialized: + return + # Load data information. + # We use `self.valid_data_inds` to record the ids of `data_list` used + # for training and testing. + self.data_list, self.valid_data_indices = self.load_data_list() + # Filter illegal data, such as data that has no annotations. + self.valid_data_indices = self.filter_data() + + # Serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def load_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file``. + Specifically, if self.load_as_video is True, it loads from the video + annotation file. Otherwise, from the image annotation file. + + Returns: + tuple(list[dict], list): A list of annotation and a list of + valid data indices. + """ + if self.load_as_video: + data_list, valid_data_indices = self._load_video_data_list() + else: + data_list, valid_data_indices = self._load_image_data_list() + + return data_list, valid_data_indices + + def _load_video_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from a video annotation file named as + ``self.ann_file``. + + Returns: + tuple(list[dict], list): A list of annotation and a list of + valid data indices. 
+ """ + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = CocoVID(local_path) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(coco.cat_img_map) + + data_list = [] + valid_data_indices = [] + data_id = 0 + vid_ids = coco.get_vid_ids() + for vid_id in vid_ids: + img_ids = coco.get_img_ids_from_vid(vid_id) + for img_id in img_ids: + # load img info + raw_img_info = coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + raw_img_info['video_length'] = len(img_ids) + + # load ann info + ann_ids = coco.get_ann_ids( + img_ids=[img_id], cat_ids=self.cat_ids) + raw_ann_info = coco.load_anns(ann_ids) + + if (self.key_img_sampler is not None) and ( + raw_img_info['frame_id'] % + self.key_img_sampler.get('interval', 1) == 0): + valid_data_indices.append(data_id) + # get data_info + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) + data_id += 1 + + return data_list, valid_data_indices + + def _load_image_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an image annotation file named as + ``self.ann_file``. + + Returns: + tuple(list[dict], list): A list of annotation and a list of + valid data indices. + """ + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = COCO(local_path) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(coco.cat_img_map) + + data_list = [] + valid_data_indices = [] + data_id = 0 + img_ids = coco.get_img_ids() + total_ann_ids = [] + for img_id in img_ids: + # load img info + raw_img_info = coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + # load ann info + ann_ids = coco.get_ann_ids(img_ids=[img_id], cat_ids=self.cat_ids) + raw_ann_info = coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + # load images for training + valid_data_indices.append(data_id) + + # get data_info + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) + data_id += 1 + + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + return data_list, valid_data_indices + + def parse_data_info(self, raw_data_info: dict) -> dict: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + dict: Parsed annotation. 
+ """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + data_info = {} + + data_info.update(img_info) + if self.data_prefix.get('img_path', None) is not None: + img_path = osp.join(self.data_prefix['img_path'], + img_info['file_name']) + else: + img_path = img_info['file_name'] + data_info['img_path'] = img_path + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + if ann.get('instance_id', None): + instance['instance_id'] = ann['instance_id'] + else: + # image dataset usually has no `instance_id`. + # Therefore, we set it to `i`. + instance['instance_id'] = i + if len(instance) > 0: + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[int]: + """Filter annotations according to filter_cfg. + + Returns: + list[int]: Filtered results. + """ + if self.test_mode: + return self.valid_data_indices + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list + if len(data_info['instances']) > 0) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_indices = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if self.filter_cfg is None: + if img_id not in ids_in_cat: + continue + if min(width, height) >= 32: + valid_data_indices.append(i) + else: + if self.filter_cfg.get('filter_empty_gt', + True) and img_id not in ids_in_cat: + continue + if min(width, height) >= self.filter_cfg.get('min_size', 32): + valid_data_indices.append(i) + + set_valid_data_indices = set(self.valid_data_indices) + valid_data_indices = [ + id for id in valid_data_indices if id in set_valid_data_indices + ] + return valid_data_indices + + def get_cat_ids(self, idx: int) -> List[int]: + """Get category ids by index. + + Args: + idx (int): Index of data. + + Returns: + List[int]: All categories in the image of specified index. + """ + + instances = self.get_data_info(idx)['instances'] + return [instance['bbox_label'] for instance in instances] + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by the index of `self.valid_data_indices` and + automatically call ``full_init`` if the dataset has not been fully + initialized. + + Args: + idx (int): The index of data in `self.valid_data_indices`. + + Returns: + dict: The idx-th annotation of the dataset. 
+ """ + ori_idx = self.valid_data_indices[idx] + data_info = super().get_data_info(ori_idx) + # Reset the `sample_idx` + # Some codebase needs `sample_idx` of data information. Here we convert + # the idx to a positive number and save it in data information. + if idx >= 0: + data_info['sample_idx'] = idx + else: + data_info['sample_idx'] = len(self) + idx + return data_info + + @force_full_init + def _get_ori_data_info(self, ori_idx: int) -> dict: + """Get annotation by the index of `self.data_list` and automatically + call ``full_init`` if the dataset has not been fully initialized. + + Args: + ori_idx (int): The index of data in `self.data_list`. + + Returns: + dict: The ori_idx-th annotation of the `self.data_list``. + """ + ori_data_info = super().get_data_info(ori_idx) + # delete the `sample_idx` key + ori_data_info.pop('sample_idx') + return ori_data_info + + @force_full_init + def __len__(self) -> int: + """Get the length of filtered dataset and automatically call + ``full_init`` if the dataset has not been fully init. + + Returns: + int: The length of filtered dataset. + """ + return len(self.valid_data_indices) + + def prepare_data(self, idx: int) -> Any: + """Get data processed by ``self.pipeline``. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. + """ + data_info = self.get_data_info(idx) + if self.ref_img_sampler is not None: + data_infos = self.ref_img_sampling(idx, data_info, + **self.ref_img_sampler) + for _data in data_infos: + if 'video_id' in data_infos[0]: + assert data_infos[0]['video_id'] == _data['video_id'] + _data['is_video_data'] = self.load_as_video + final_data_info = data_infos[0].copy() + # Collate data_list scatters (list of dict to dict of list) + for key in final_data_info.keys(): + final_data_info[key] = [_data[key] for _data in data_infos] + else: + final_data_info = data_info.copy() + final_data_info['is_video_data'] = self.load_as_video + + return self.pipeline(final_data_info) + + def ref_img_sampling(self, + idx: int, + data_info: dict, + frame_range: list, + stride: int = 1, + num_ref_imgs: int = 1, + filter_key_img: bool = True, + method: str = 'uniform') -> List[dict]: + """Sampling reference frames in the same video for key frame. + + Args: + idx (int): The index of `data_info`. + data_info (dict): The information of key frame. + frame_range (List(int) | int): The sampling range of reference + frames in the same video for key frame. + stride (int): The sampling frame stride when sampling reference + images. Default: 1. + num_ref_imgs (int): The number of sampled reference images. + Default: 1. + filter_key_img (bool): If False, the key image will be in the + sampling reference candidates, otherwise, it is exclude. + Default: True. + method (str): The sampling method. Options are 'uniform', + 'bilateral_uniform', 'test_with_adaptive_stride', + 'test_with_fix_stride'. 'uniform' denotes reference images are + randomly sampled from the nearby frames of key frame. + 'bilateral_uniform' denotes reference images are randomly + sampled from the two sides of the nearby frames of key frame. + 'test_with_adaptive_stride' is only used in testing, and + denotes the sampling frame stride is equal to (video length / + the number of reference images). test_with_fix_stride is only + used in testing with sampling frame stride equalling to + `stride`. Default: 'uniform'. + + Returns: + list[dict]: `data_info` and the reference images information. 
+ """ + assert isinstance(data_info, dict) + if isinstance(frame_range, int): + assert frame_range >= 0, 'frame_range can not be a negative value.' + frame_range = [-frame_range, frame_range] + elif isinstance(frame_range, list): + assert len(frame_range) == 2, 'The length must be 2.' + assert frame_range[0] <= 0 and frame_range[1] >= 0 + for i in frame_range: + assert isinstance(i, int), 'Each element must be int.' + else: + raise TypeError('The type of frame_range must be int or list.') + + if 'test' in method and \ + (frame_range[1] - frame_range[0]) != num_ref_imgs: + logger = MMLogger.get_current_instance() + logger.info( + 'Warning:' + "frame_range[1] - frame_range[0] isn't equal to num_ref_imgs." + 'Set num_ref_imgs to frame_range[1] - frame_range[0].') + self.ref_img_sampler[ + 'num_ref_imgs'] = frame_range[1] - frame_range[0] + + if (not self.load_as_video) or data_info.get('frame_id', -1) < 0 \ + or (frame_range[0] == 0 and frame_range[1] == 0): + ref_data_infos = [] + for i in range(num_ref_imgs): + ref_data_infos.append(data_info.copy()) + else: + frame_id = data_info['frame_id'] + left = max(0, frame_id + frame_range[0]) + right = min(frame_id + frame_range[1], + data_info['video_length'] - 1) + frame_ids = list(range(0, data_info['video_length'])) + + ref_frame_ids = [] + if method == 'uniform': + valid_ids = frame_ids[left:right + 1] + if filter_key_img and frame_id in valid_ids: + valid_ids.remove(frame_id) + num_samples = min(num_ref_imgs, len(valid_ids)) + ref_frame_ids.extend(random.sample(valid_ids, num_samples)) + elif method == 'bilateral_uniform': + assert num_ref_imgs % 2 == 0, \ + 'only support load even number of ref_imgs.' + for mode in ['left', 'right']: + if mode == 'left': + valid_ids = frame_ids[left:frame_id + 1] + else: + valid_ids = frame_ids[frame_id:right + 1] + if filter_key_img and frame_id in valid_ids: + valid_ids.remove(frame_id) + num_samples = min(num_ref_imgs // 2, len(valid_ids)) + sampled_inds = random.sample(valid_ids, num_samples) + ref_frame_ids.extend(sampled_inds) + elif method == 'test_with_adaptive_stride': + if frame_id == 0: + stride = float(len(frame_ids) - 1) / (num_ref_imgs - 1) + for i in range(num_ref_imgs): + ref_id = round(i * stride) + ref_frame_ids.append(frame_ids[ref_id]) + elif method == 'test_with_fix_stride': + if frame_id == 0: + for i in range(frame_range[0], 1): + ref_frame_ids.append(frame_ids[0]) + for i in range(1, frame_range[1] + 1): + ref_id = min(round(i * stride), len(frame_ids) - 1) + ref_frame_ids.append(frame_ids[ref_id]) + elif frame_id % stride == 0: + ref_id = min( + round(frame_id + frame_range[1] * stride), + len(frame_ids) - 1) + ref_frame_ids.append(frame_ids[ref_id]) + data_info['num_left_ref_imgs'] = abs(frame_range[0]) + data_info['frame_stride'] = stride + else: + raise NotImplementedError + + ref_data_infos = [] + for ref_frame_id in ref_frame_ids: + offset = ref_frame_id - frame_id + ref_data_info = self._get_ori_data_info( + self.valid_data_indices[idx] + offset) + + # We need data_info and ref_data_info to have the same keys. + for key in data_info.keys(): + if key not in ref_data_info: + ref_data_info[key] = data_info[key] + + ref_data_infos.append(ref_data_info) + + ref_data_infos = sorted( + ref_data_infos, key=lambda i: i['frame_id']) + return [data_info, *ref_data_infos] diff --git a/mmtrack/datasets/builder.py b/mmtrack/datasets/builder.py deleted file mode 100644 index c79137e5f..000000000 --- a/mmtrack/datasets/builder.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) OpenMMLab. 
All rights reserved. -import random -import warnings -from functools import partial - -import numpy as np -import torch -from mmcv.parallel import collate -from mmcv.runner import get_dist_info -from mmcv.utils import TORCH_VERSION, digit_version -from mmdet.datasets.samplers import (DistributedGroupSampler, - DistributedSampler, GroupSampler) -from torch.utils.data import DataLoader -from torch.utils.data.sampler import RandomSampler - -from mmtrack.datasets.samplers.quota_sampler import DistributedQuotaSampler -from .base_sot_dataset import BaseSOTDataset -from .samplers import DistributedVideoSampler, SOTVideoSampler - - -def build_dataloader(dataset, - samples_per_gpu, - workers_per_gpu, - num_gpus=1, - samples_per_epoch=None, - dist=True, - shuffle=True, - seed=None, - persistent_workers=False, - **kwargs): - """Build PyTorch DataLoader. - - In distributed training, each GPU/process has a dataloader. - In non-distributed training, there is only one dataloader for all GPUs. - - Args: - dataset (Dataset): A PyTorch dataset. - samples_per_gpu (int): Number of training samples on each GPU, i.e., - batch size of each GPU. - workers_per_gpu (int): How many subprocesses to use for data loading - for each GPU. - num_gpus (int): Number of GPUs. Only used in non-distributed training. - samples_per_epoch (int | None, Optional): The number of samples per - epoch. If equal to -1, using all samples in the datasets per epoch. - Otherwise, using the `samples_per_epoch` samples. Default: None. - dist (bool): Distributed training/test or not. Default: True. - shuffle (bool): Whether to shuffle the data at every epoch. - Default: True. - seed (int, Optional): Seed to be used. Default: None. - persistent_workers (bool): If True, the data loader will not shutdown - the worker processes after a dataset has been consumed once. - This allows to maintain the workers `Dataset` instances alive. - This argument is only valid when PyTorch>=1.7.0. Default: False. - kwargs: any keyword argument to be used to initialize DataLoader - - Returns: - DataLoader: A PyTorch dataloader. - """ - rank, world_size = get_dist_info() - - def is_base_sot_dataset(_dataset): - # handle the case: `_dataset` is a wrapper of normal dataset, such as - # 'RepeatDataset', 'ClassBalancedDataset' and so on. - if hasattr(_dataset, 'dataset'): - return is_base_sot_dataset(_dataset.dataset) - # handle the case: `_dataset` is a wrapper of concatenated dataset, - # such as `ConcatDataset`, `RandomSampleConcatDataset` and so on. - elif hasattr(_dataset, 'datasets'): - return is_base_sot_dataset(_dataset.datasets[0]) - else: - return isinstance(_dataset, BaseSOTDataset) - - # We set specific data sampler for SOT datasets. 
- is_sot_dataset = is_base_sot_dataset(dataset) - if dist: - # ----- distributed train mode ------ - if shuffle: - if is_sot_dataset: - if samples_per_epoch is None: - sampler = DistributedSampler( - dataset, world_size, rank, shuffle=True) - else: - # get fixed number of samples per epoch to train - # sampling with no-replacement mode - sampler = DistributedQuotaSampler( - dataset, - samples_per_epoch, - world_size, - rank, - replacement=False) - else: - sampler = DistributedGroupSampler(dataset, samples_per_gpu, - world_size, rank) - # ----- distributed test mode ------ - else: - if hasattr(dataset, 'load_as_video') and dataset.load_as_video: - # sample videos - sampler = DistributedVideoSampler( - dataset, world_size, rank, shuffle=False) - else: - sampler = DistributedSampler( - dataset, world_size, rank, shuffle=False) - - batch_size = samples_per_gpu - num_workers = workers_per_gpu - else: - # ----- non-distributed train mode ------ - if shuffle: - if is_sot_dataset: - if samples_per_epoch is None: - sampler = RandomSampler(dataset) - else: - # get fixed number of samples per epoch to train - # sampling with replacement mode - sampler = RandomSampler( - dataset, - replacement=True, - num_samples=samples_per_epoch) - else: - sampler = GroupSampler(dataset, samples_per_gpu) - # ----- non-distributed test mode ------ - else: - sampler = SOTVideoSampler(dataset) if is_sot_dataset else None - - batch_size = num_gpus * samples_per_gpu - num_workers = num_gpus * workers_per_gpu - - init_fn = partial( - worker_init_fn, num_workers=num_workers, rank=rank, - seed=seed) if seed is not None else None - - if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) >= digit_version('1.7.0')): - kwargs['persistent_workers'] = persistent_workers - elif persistent_workers is True: - warnings.warn('persistent_workers is invalid because your pytorch ' - 'version is lower than 1.7.0') - - data_loader = DataLoader( - dataset, - batch_size=batch_size, - sampler=sampler, - num_workers=num_workers, - collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), - pin_memory=False, - worker_init_fn=init_fn, - **kwargs) - - return data_loader - - -def worker_init_fn(worker_id, num_workers, rank, seed): - # The seed of each worker equals to - # num_worker * rank + worker_id + user_seed - worker_seed = num_workers * rank + worker_id + seed - np.random.seed(worker_seed) - random.seed(worker_seed) - torch.manual_seed(worker_seed) diff --git a/mmtrack/datasets/coco_video_dataset.py b/mmtrack/datasets/coco_video_dataset.py deleted file mode 100644 index 428a87e99..000000000 --- a/mmtrack/datasets/coco_video_dataset.py +++ /dev/null @@ -1,496 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import random - -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS, CocoDataset -from terminaltables import AsciiTable - -from mmtrack.core import eval_mot -from mmtrack.utils import get_root_logger -from .parsers import CocoVID - - -@DATASETS.register_module() -class CocoVideoDataset(CocoDataset): - """Base coco video dataset for VID, MOT and SOT tasks. - - Args: - load_as_video (bool): If True, using COCOVID class to load dataset, - otherwise, using COCO class. Default: True. - key_img_sampler (dict): Configuration of sampling key images. - ref_img_sampler (dict): Configuration of sampling ref images. - test_load_ann (bool): If True, loading annotations during testing, - otherwise, not loading. Default: False. 
- """ - - CLASSES = None - - def __init__(self, - load_as_video=True, - key_img_sampler=dict(interval=1), - ref_img_sampler=dict( - frame_range=10, - stride=1, - num_ref_imgs=1, - filter_key_img=True, - method='uniform', - return_key_img=True), - test_load_ann=False, - *args, - **kwargs): - self.load_as_video = load_as_video - self.key_img_sampler = key_img_sampler - self.ref_img_sampler = ref_img_sampler - self.test_load_ann = test_load_ann - super().__init__(*args, **kwargs) - self.logger = get_root_logger() - - def load_annotations(self, ann_file): - """Load annotations from COCO/COCOVID style annotation file. - - Args: - ann_file (str): Path of annotation file. - - Returns: - list[dict]: Annotation information from COCO/COCOVID api. - """ - if not self.load_as_video: - data_infos = super().load_annotations(ann_file) - else: - data_infos = self.load_video_anns(ann_file) - return data_infos - - def load_video_anns(self, ann_file): - """Load annotations from COCOVID style annotation file. - - Args: - ann_file (str): Path of annotation file. - - Returns: - list[dict]: Annotation information from COCOVID api. - """ - self.coco = CocoVID(ann_file) - self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) - self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} - - data_infos = [] - self.vid_ids = self.coco.get_vid_ids() - self.img_ids = [] - for vid_id in self.vid_ids: - img_ids = self.coco.get_img_ids_from_vid(vid_id) - if self.key_img_sampler is not None: - img_ids = self.key_img_sampling(img_ids, - **self.key_img_sampler) - self.img_ids.extend(img_ids) - for img_id in img_ids: - info = self.coco.load_imgs([img_id])[0] - info['filename'] = info['file_name'] - data_infos.append(info) - return data_infos - - def key_img_sampling(self, img_ids, interval=1): - """Sampling key images.""" - return img_ids[::interval] - - def ref_img_sampling(self, - img_info, - frame_range, - stride=1, - num_ref_imgs=1, - filter_key_img=True, - method='uniform', - return_key_img=True): - """Sampling reference frames in the same video for key frame. - - Args: - img_info (dict): The information of key frame. - frame_range (List(int) | int): The sampling range of reference - frames in the same video for key frame. - stride (int): The sampling frame stride when sampling reference - images. Default: 1. - num_ref_imgs (int): The number of sampled reference images. - Default: 1. - filter_key_img (bool): If False, the key image will be in the - sampling reference candidates, otherwise, it is exclude. - Default: True. - method (str): The sampling method. Options are 'uniform', - 'bilateral_uniform', 'test_with_adaptive_stride', - 'test_with_fix_stride'. 'uniform' denotes reference images are - randomly sampled from the nearby frames of key frame. - 'bilateral_uniform' denotes reference images are randomly - sampled from the two sides of the nearby frames of key frame. - 'test_with_adaptive_stride' is only used in testing, and - denotes the sampling frame stride is equal to (video length / - the number of reference images). test_with_fix_stride is only - used in testing with sampling frame stride equalling to - `stride`. Default: 'uniform'. - return_key_img (bool): If True, the information of key frame is - returned, otherwise, not returned. Default: True. - - Returns: - list(dict): `img_info` and the reference images information or - only the reference images information. 
- """ - assert isinstance(img_info, dict) - if isinstance(frame_range, int): - assert frame_range >= 0, 'frame_range can not be a negative value.' - frame_range = [-frame_range, frame_range] - elif isinstance(frame_range, list): - assert len(frame_range) == 2, 'The length must be 2.' - assert frame_range[0] <= 0 and frame_range[1] >= 0 - for i in frame_range: - assert isinstance(i, int), 'Each element must be int.' - else: - raise TypeError('The type of frame_range must be int or list.') - - if 'test' in method and \ - (frame_range[1] - frame_range[0]) != num_ref_imgs: - print_log( - 'Warning:' - "frame_range[1] - frame_range[0] isn't equal to num_ref_imgs." - 'Set num_ref_imgs to frame_range[1] - frame_range[0].', - logger=self.logger) - self.ref_img_sampler[ - 'num_ref_imgs'] = frame_range[1] - frame_range[0] - - if (not self.load_as_video) or img_info.get('frame_id', -1) < 0 \ - or (frame_range[0] == 0 and frame_range[1] == 0): - ref_img_infos = [] - for i in range(num_ref_imgs): - ref_img_infos.append(img_info.copy()) - else: - vid_id, img_id, frame_id = img_info['video_id'], img_info[ - 'id'], img_info['frame_id'] - img_ids = self.coco.get_img_ids_from_vid(vid_id) - left = max(0, frame_id + frame_range[0]) - right = min(frame_id + frame_range[1], len(img_ids) - 1) - - ref_img_ids = [] - if method == 'uniform': - valid_ids = img_ids[left:right + 1] - if filter_key_img and img_id in valid_ids: - valid_ids.remove(img_id) - num_samples = min(num_ref_imgs, len(valid_ids)) - ref_img_ids.extend(random.sample(valid_ids, num_samples)) - elif method == 'bilateral_uniform': - assert num_ref_imgs % 2 == 0, \ - 'only support load even number of ref_imgs.' - for mode in ['left', 'right']: - if mode == 'left': - valid_ids = img_ids[left:frame_id + 1] - else: - valid_ids = img_ids[frame_id:right + 1] - if filter_key_img and img_id in valid_ids: - valid_ids.remove(img_id) - num_samples = min(num_ref_imgs // 2, len(valid_ids)) - sampled_inds = random.sample(valid_ids, num_samples) - ref_img_ids.extend(sampled_inds) - elif method == 'test_with_adaptive_stride': - if frame_id == 0: - stride = float(len(img_ids) - 1) / (num_ref_imgs - 1) - for i in range(num_ref_imgs): - ref_id = round(i * stride) - ref_img_ids.append(img_ids[ref_id]) - elif method == 'test_with_fix_stride': - if frame_id == 0: - for i in range(frame_range[0], 1): - ref_img_ids.append(img_ids[0]) - for i in range(1, frame_range[1] + 1): - ref_id = min(round(i * stride), len(img_ids) - 1) - ref_img_ids.append(img_ids[ref_id]) - elif frame_id % stride == 0: - ref_id = min( - round(frame_id + frame_range[1] * stride), - len(img_ids) - 1) - ref_img_ids.append(img_ids[ref_id]) - img_info['num_left_ref_imgs'] = abs(frame_range[0]) \ - if isinstance(frame_range, list) else frame_range - img_info['frame_stride'] = stride - else: - raise NotImplementedError - - ref_img_infos = [] - for ref_img_id in ref_img_ids: - ref_img_info = self.coco.load_imgs([ref_img_id])[0] - ref_img_info['filename'] = ref_img_info['file_name'] - ref_img_infos.append(ref_img_info) - ref_img_infos = sorted(ref_img_infos, key=lambda i: i['frame_id']) - - if return_key_img: - return [img_info, *ref_img_infos] - else: - return ref_img_infos - - def get_ann_info(self, img_info): - """Get COCO annotations by the information of image. - - Args: - img_info (int): Information of image. - - Returns: - dict: Annotation information of `img_info`. 
- """ - img_id = img_info['id'] - ann_ids = self.coco.get_ann_ids(img_ids=[img_id], cat_ids=self.cat_ids) - ann_info = self.coco.load_anns(ann_ids) - return self._parse_ann_info(img_info, ann_info) - - def prepare_results(self, img_info): - """Prepare results for image (e.g. the annotation information, ...).""" - results = dict(img_info=img_info) - if not self.test_mode or self.test_load_ann: - results['ann_info'] = self.get_ann_info(img_info) - if self.proposals is not None: - idx = self.img_ids.index(img_info['id']) - results['proposals'] = self.proposals[idx] - - super().pre_pipeline(results) - results['is_video_data'] = self.load_as_video - return results - - def prepare_data(self, idx): - """Get data and annotations after pipeline. - - Args: - idx (int): Index of data. - - Returns: - dict: Data and annotations after pipeline with new keys introduced - by pipeline. - """ - img_info = self.data_infos[idx] - if self.ref_img_sampler is not None: - img_infos = self.ref_img_sampling(img_info, **self.ref_img_sampler) - results = [ - self.prepare_results(img_info) for img_info in img_infos - ] - else: - results = self.prepare_results(img_info) - return self.pipeline(results) - - def prepare_train_img(self, idx): - """Get training data and annotations after pipeline. - - Args: - idx (int): Index of data. - - Returns: - dict: Training data and annotations after pipeline with new keys - introduced by pipeline. - """ - return self.prepare_data(idx) - - def prepare_test_img(self, idx): - """Get testing data after pipeline. - - Args: - idx (int): Index of data. - - Returns: - dict: Testing data after pipeline with new keys intorduced by - pipeline. - """ - return self.prepare_data(idx) - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox and mask annotations. - - Args: - img_anfo (dict): Information of image. - ann_info (list[dict]): Annotation information of image. - - Returns: - dict: A dict containing the following keys: bboxes, bboxes_ignore, - labels, instance_ids, masks, seg_map. "masks" are raw - annotations and not decoded into binary masks. 
- """ - gt_bboxes = [] - gt_labels = [] - gt_bboxes_ignore = [] - gt_masks = [] - gt_instance_ids = [] - - for i, ann in enumerate(ann_info): - if ann.get('ignore', False): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('iscrowd', False): - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - if 'segmentation' in ann: - gt_masks.append(ann['segmentation']) - if 'instance_id' in ann: - gt_instance_ids.append(ann['instance_id']) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - seg_map = img_info['filename'].replace('jpg', 'png') - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - bboxes_ignore=gt_bboxes_ignore, - masks=gt_masks, - seg_map=seg_map) - - if self.load_as_video: - ann['instance_ids'] = np.array(gt_instance_ids).astype(np.int) - else: - ann['instance_ids'] = np.arange(len(gt_labels)) - - return ann - - def evaluate(self, - results, - metric=['bbox', 'track'], - logger=None, - bbox_kwargs=dict( - classwise=False, - proposal_nums=(100, 300, 1000), - iou_thrs=None, - metric_items=None), - track_kwargs=dict( - iou_thr=0.5, - ignore_iof_thr=0.5, - ignore_by_classes=False, - nproc=4)): - """Evaluation in COCO protocol and CLEAR MOT metric (e.g. MOTA, IDF1). - - Args: - results (dict): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. Options are - 'bbox', 'segm', 'track'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - bbox_kwargs (dict): Configuration for COCO styple evaluation. - track_kwargs (dict): Configuration for CLEAR MOT evaluation. - - Returns: - dict[str, float]: COCO style and CLEAR MOT evaluation metric. 
- """ - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['bbox', 'segm', 'track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - eval_results = dict() - if 'track' in metrics: - assert len(self.data_infos) == len(results['track_bboxes']) - inds = [ - i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0 - ] - num_vids = len(inds) - inds.append(len(self.data_infos)) - - track_bboxes = [ - results['track_bboxes'][inds[i]:inds[i + 1]] - for i in range(num_vids) - ] - ann_infos = [self.get_ann_info(_) for _ in self.data_infos] - ann_infos = [ - ann_infos[inds[i]:inds[i + 1]] for i in range(num_vids) - ] - track_eval_results = eval_mot( - results=track_bboxes, - annotations=ann_infos, - logger=logger, - classes=self.CLASSES, - **track_kwargs) - eval_results.update(track_eval_results) - - # evaluate for detectors without tracker - super_metrics = ['bbox', 'segm'] - super_metrics = [_ for _ in metrics if _ in super_metrics] - if super_metrics: - if isinstance(results, dict): - if 'bbox' in super_metrics and 'segm' in super_metrics: - super_results = [] - for bbox, mask in zip(results['det_bboxes'], - results['det_masks']): - super_results.append((bbox, mask)) - else: - super_results = results['det_bboxes'] - elif isinstance(results, list): - super_results = results - else: - raise TypeError('Results must be a dict or a list.') - super_eval_results = super().evaluate( - results=super_results, - metric=super_metrics, - logger=logger, - **bbox_kwargs) - eval_results.update(super_eval_results) - - return eval_results - - def __repr__(self): - """Print the number of instance number suit for video dataset.""" - dataset_type = 'Test' if self.test_mode else 'Train' - result = (f'\n{self.__class__.__name__} {dataset_type} dataset ' - f'with number of images {len(self)}, ' - f'and instance counts: \n') - if self.CLASSES is None: - result += 'Category names are not provided. \n' - return result - instance_count = np.zeros(len(self.CLASSES) + 1).astype(int) - # count the instance number in each image - for idx in range(len(self)): - img_info = self.data_infos[idx] - label = self.get_ann_info(img_info)['labels'] - unique, counts = np.unique(label, return_counts=True) - if len(unique) > 0: - # add the occurrence number to each class - instance_count[unique] += counts - else: - # background is the last index - instance_count[-1] += 1 - # create a table with category count - table_data = [['category', 'count'] * 5] - row_data = [] - for cls, count in enumerate(instance_count): - if cls < len(self.CLASSES): - row_data += [f'{cls} [{self.CLASSES[cls]}]', f'{count}'] - else: - # add the background number - row_data += ['-1 background', f'{count}'] - if len(row_data) == 10: - table_data.append(row_data) - row_data = [] - if len(row_data) >= 2: - if row_data[-1] == '0': - row_data = row_data[:-2] - if len(row_data) >= 2: - table_data.append([]) - table_data.append(row_data) - - table = AsciiTable(table_data) - result += table.table - return result diff --git a/mmtrack/datasets/dancetrack_dataset.py b/mmtrack/datasets/dancetrack_dataset.py index f281fab4d..2a411c704 100644 --- a/mmtrack/datasets/dancetrack_dataset.py +++ b/mmtrack/datasets/dancetrack_dataset.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmdet.datasets import DATASETS - +from mmtrack.registry import DATASETS from .mot_challenge_dataset import MOTChallengeDataset @@ -8,19 +7,8 @@ class DanceTrackDataset(MOTChallengeDataset): """Dataset for DanceTrack: https://github.com/DanceTrack/DanceTrack. - Most content is inherited from MOTChallengeDataset. + All content is inherited from MOTChallengeDataset. """ - def get_benchmark_and_eval_split(self): - """Get benchmark and dataset split to evaluate. - - Get benchmark from upeper/lower-case image prefix and the dataset - split to evaluate. - - Returns: - tuple(string): The first string denotes the type of dataset. - The second string denots the split of the dataset to eval. - """ - # As DanceTrack only has train/val and use 'val' for evaluation as - # default, we can directly output the desired split. - return 'DanceTrack', 'val' + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/mmtrack/datasets/dataset_wrappers.py b/mmtrack/datasets/dataset_wrappers.py index 97a2cd03b..d3b37eaed 100644 --- a/mmtrack/datasets/dataset_wrappers.py +++ b/mmtrack/datasets/dataset_wrappers.py @@ -1,27 +1,30 @@ # Copyright (c) OpenMMLab. All rights reserved. import random +from typing import List, Optional -from mmdet.datasets.builder import DATASETS, build_dataset -from torch.utils.data.dataset import ConcatDataset +from mmengine.dataset import ConcatDataset + +from mmtrack.registry import DATASETS @DATASETS.register_module() class RandomSampleConcatDataset(ConcatDataset): """A wrapper of concatenated dataset. Support randomly sampling one dataset from concatenated datasets and then getting samples from the sampled - dataset. + dataset. This class only supports training. Args: - dataset_cfgs (list[dict]): The list contains all configs of + datasets (list[dict]): The list contains all configs of concatenated datasets. - dataset_sampling_weights (list[float]): The list contains the sampling - weights of each dataset. + dataset_sampling_weights (Optional[List[float]], optional): The list + contains the sampling weights of each dataset. Defaults to None. """ - def __init__(self, dataset_cfgs, dataset_sampling_weights=None): + def __init__(self, + datasets: List[dict], + dataset_sampling_weights: Optional[List[float]] = None): if dataset_sampling_weights is None: - self.dataset_sampling_probs = [1. / len(dataset_cfgs) - ] * len(dataset_cfgs) + self.dataset_sampling_probs = [1. / len(datasets)] * len(datasets) else: for x in dataset_sampling_weights: assert x >= 0. @@ -31,16 +34,21 @@ def __init__(self, dataset_cfgs, dataset_sampling_weights=None): x / prob_total for x in dataset_sampling_weights ] - datasets = [build_dataset(cfg) for cfg in dataset_cfgs] + datasets = [DATASETS.build(cfg) for cfg in datasets] # add an attribute `CLASSES` for the calling in `tools/train.py` - self.CLASSES = datasets[0].CLASSES + self.CLASSES = datasets[0].META['CLASSES'] super().__init__(datasets) - def __getitem__(self, ind): - """Random sampling a dataset and get samples from this dataset. + def __getitem__(self, ind: int) -> dict: + """Randomly sample a dataset and get samples from this dataset. + + Args: + ind (int): The random index. Actually, in this class, + the input 'ind' is not used in 'dataset'. - Actually, the input 'ind' is not used in 'dataset'. + Returns: + dict: The results after the dataset pipeline.
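# Illustrative sketch (not part of this patch): the weighted random choice
# made in `__getitem__`, with plain lists standing in for the wrapped
# datasets. Weights are normalized to probabilities as in `__init__` above.
import random

datasets = [['a1', 'a2'], ['b1', 'b2', 'b3']]   # stand-ins for two datasets
weights = [2.0, 1.0]
probs = [w / sum(weights) for w in weights]

dataset = random.choices(datasets, weights=probs, k=1)[0]
print(dataset[random.randint(0, len(dataset) - 1)])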
""" while True: dataset = random.choices(self.datasets, diff --git a/mmtrack/datasets/got10k_dataset.py b/mmtrack/datasets/got10k_dataset.py index e8302ac50..44850bb2d 100644 --- a/mmtrack/datasets/got10k_dataset.py +++ b/mmtrack/datasets/got10k_dataset.py @@ -1,13 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp -import shutil import time +from typing import List import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -19,13 +18,11 @@ class GOT10kDataset(BaseSOTDataset): """ def __init__(self, *args, **kwargs): + """Initialization of SOT dataset class.""" super(GOT10kDataset, self).__init__(*args, **kwargs) - def load_data_infos(self, split='train'): - """Load dataset information. - - Args: - split (str, optional): the split of dataset. Defaults to 'train'. + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file``. Returns: list[dict]: the length of the list is the number of videos. The @@ -42,10 +39,9 @@ def load_data_infos(self, split='train'): """ print('Loading GOT10k dataset...') start_time = time.time() - assert split in ['train', 'val', 'test', 'val_vot', 'train_vot'] data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. @@ -60,115 +56,69 @@ def load_data_infos(self, split='train'): print(f'GOT10k dataset loaded! ({time.time()-start_time:.2f} s)') return data_infos - def get_visibility_from_video(self, video_ind): - """Get the visible information of instance in a video.""" + def get_visibility_from_video(self, video_idx: int) -> dict: + """Get the visible information of instance in a video. + + Args: + video_idx (int): The index of video. + + Returns: + dict: The visibilities of each object in the video. + """ if not self.test_mode: - absense_info_path = osp.join( - self.img_prefix, self.data_infos[video_ind]['video_path'], - 'absence.label') - cover_info_path = osp.join( - self.img_prefix, self.data_infos[video_ind]['video_path'], - 'cover.label') - absense_info = self.loadtxt(absense_info_path, dtype=bool) + video_path = self.get_data_info(video_idx)['video_path'] + absense_info_path = osp.join(self.data_prefix['img_path'], + video_path, 'absence.label') + cover_info_path = osp.join(self.data_prefix['img_path'], + video_path, 'cover.label') + absense_info = self._loadtxt(absense_info_path, dtype=bool) # The values of key 'cover' are # int numbers in range [0,8], which correspond to # ranges of object visible ratios: 0%, (0%, 15%], # (15%~30%], (30%, 45%], (45%, 60%],(60%, 75%], # (75%, 90%], (90%, 100%) and 100% respectively - cover_info = self.loadtxt(cover_info_path, dtype=int) + cover_info = self._loadtxt(cover_info_path, dtype=int) visible = ~absense_info & (cover_info > 0) visible_ratio = cover_info / 8. return dict(visible=visible, visible_ratio=visible_ratio) else: return super(GOT10kDataset, - self).get_visibility_from_video(video_ind) + self).get_visibility_from_video(video_idx) - def prepare_test_data(self, video_ind, frame_ind): + def prepare_test_data(self, video_idx: int, frame_idx: int) -> dict: """Get testing data of one frame. 
We parse one video, get one frame from it and pass the frame information to the pipeline. Args: - video_ind (int): video index - frame_ind (int): frame index + video_idx (int): The index of video. + frame_idx (int): The index of frame. Returns: - dict: testing data of one frame. + dict: Testing data of one frame. """ - if self.test_memo.get('video_ind', None) != video_ind: - self.test_memo.video_ind = video_ind - self.test_memo.img_infos = self.get_img_infos_from_video(video_ind) - assert 'video_ind' in self.test_memo and 'img_infos' in self.test_memo - - img_info = dict( - filename=self.test_memo.img_infos['filename'][frame_ind], - frame_id=frame_ind) - if frame_ind == 0: - ann_infos = self.get_ann_infos_from_video(video_ind) - ann_info = dict( - bboxes=ann_infos['bboxes'][frame_ind], visible=True) - else: - ann_info = dict( - bboxes=np.array([0] * 4, dtype=np.float32), visible=True) - - results = dict(img_info=img_info, ann_info=ann_info) - self.pre_pipeline(results) + if self.test_memo.get('video_idx', None) != video_idx: + self.test_memo.video_idx = video_idx + ann_infos = self.get_ann_infos_from_video(video_idx) + img_infos = self.get_img_infos_from_video(video_idx) + self.test_memo.video_infos = dict(**img_infos, **ann_infos) + assert 'video_idx' in self.test_memo and 'video_infos'\ + in self.test_memo + + results = {} + results['img_path'] = self.test_memo.video_infos['img_paths'][ + frame_idx] + results['frame_id'] = frame_idx + results['video_id'] = video_idx + results['video_length'] = self.test_memo.video_infos['video_length'] + + instance = {} + if frame_idx == 0: + ann_infos = self.get_ann_infos_from_video(video_idx) + instance['bbox'] = ann_infos['bboxes'][frame_idx] + + results['instances'] = [] + instance['visible'] = True + instance['bbox_label'] = np.array([0], dtype=np.int32) + results['instances'].append(instance) results = self.pipeline(results) return results - - def format_results(self, results, resfile_path=None, logger=None): - """Format the results to txts (standard format for GOT10k Challenge). - - Args: - results (dict(list[ndarray])): Testing results of the dataset. - resfile_path (str): Path to save the formatted results. - Defaults to None. - logger (logging.Logger | str | None, optional): defaults to None. - """ - # prepare saved dir - assert resfile_path is not None, 'Please give key-value pair \ - like resfile_path=xxx in argparse' - - if not osp.isdir(resfile_path): - os.makedirs(resfile_path, exist_ok=True) - - # transform tracking results format - # from [bbox_1, bbox_2, ...] 
to {'video_1':[bbox_1, bbox_2, ...], ...} - track_bboxes = results['track_bboxes'] - print_log( - f'-------- There are total {len(track_bboxes)} images --------', - logger=logger) - - start_ind = end_ind = 0 - for num, video_info in zip(self.num_frames_per_video, self.data_infos): - end_ind += num - video_name = video_info['video_path'].split(os.sep)[-1] - video_resfiles_path = osp.join(resfile_path, video_name) - if not osp.isdir(video_resfiles_path): - os.makedirs(video_resfiles_path, exist_ok=True) - video_bbox_txt = osp.join(video_resfiles_path, - '{}_001.txt'.format(video_name)) - video_time_txt = osp.join(video_resfiles_path, - '{}_time.txt'.format(video_name)) - with open(video_bbox_txt, - 'w') as f_bbox, open(video_time_txt, 'w') as f_time: - - for bbox in results['track_bboxes'][start_ind:end_ind]: - bbox = [ - str(f'{bbox[0]:.4f}'), - str(f'{bbox[1]:.4f}'), - str(f'{(bbox[2] - bbox[0]):.4f}'), - str(f'{(bbox[3] - bbox[1]):.4f}') - ] - line = ','.join(bbox) + '\n' - f_bbox.writelines(line) - # We don't record testing time, so we set a default - # time in order to test on the server. - f_time.writelines('0.0001\n') - start_ind += num - - shutil.make_archive(resfile_path, 'zip', resfile_path) - shutil.rmtree(resfile_path) - - print_log( - f'-------- The results are stored in {resfile_path}.zip --------', - logger=logger) diff --git a/mmtrack/datasets/imagenet_vid_dataset.py b/mmtrack/datasets/imagenet_vid_dataset.py index 681ebf3e8..d9e99c90c 100644 --- a/mmtrack/datasets/imagenet_vid_dataset.py +++ b/mmtrack/datasets/imagenet_vid_dataset.py @@ -1,91 +1,141 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmdet.datasets import DATASETS +import copy +from typing import List, Tuple + from mmdet.datasets.api_wrappers import COCO +from mmengine.fileio import FileClient -from .coco_video_dataset import CocoVideoDataset -from .parsers import CocoVID +from mmtrack.registry import DATASETS +from .api_wrappers import CocoVID +from .base_video_dataset import BaseVideoDataset @DATASETS.register_module() -class ImagenetVIDDataset(CocoVideoDataset): +class ImagenetVIDDataset(BaseVideoDataset): """ImageNet VID dataset for video object detection.""" - CLASSES = ('airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', - 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', - 'giant_panda', 'hamster', 'horse', 'lion', 'lizard', 'monkey', - 'motorcycle', 'rabbit', 'red_panda', 'sheep', 'snake', - 'squirrel', 'tiger', 'train', 'turtle', 'watercraft', 'whale', - 'zebra') + METAINFO = { + 'CLASSES': + ('airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', + 'rabbit', 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', + 'turtle', 'watercraft', 'whale', 'zebra') + } def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def load_annotations(self, ann_file): - """Load annotations from COCO/COCOVID style annotation file. - - Args: - ann_file (str): Path of annotation file. + def load_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file``. + Specifically, if self.load_as_video is True, it loads from the video + annotation file. Otherwise, from the image annotation file. Returns: - list[dict]: Annotation information from COCO/COCOVID api. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. 
""" if self.load_as_video: - data_infos = self.load_video_anns(ann_file) + data_list, valid_data_indices = self._load_video_data_list() else: - data_infos = self.load_image_anns(ann_file) - return data_infos + data_list, valid_data_indices = self._load_image_data_list() - def load_image_anns(self, ann_file): - """Load annotations from COCO style annotation file. + return data_list, valid_data_indices - Args: - ann_file (str): Path of annotation file. + def _load_video_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from a video annotation file named as + ``self.ann_file``. Returns: - list[dict]: Annotation information from COCO api. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. """ - self.coco = COCO(ann_file) - self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = CocoVID(local_path) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(coco.cat_img_map) - all_img_ids = self.coco.get_img_ids() - self.img_ids = [] - data_infos = [] - for img_id in all_img_ids: - info = self.coco.load_imgs([img_id])[0] - info['filename'] = info['file_name'] - if info['is_vid_train_frame']: - self.img_ids.append(img_id) - data_infos.append(info) - return data_infos + data_list = [] + valid_data_indices = [] + data_id = 0 + vid_ids = coco.get_vid_ids() + + for vid_id in vid_ids: + img_ids = coco.get_img_ids_from_vid(vid_id) + for img_id in img_ids: + # load img info + raw_img_info = coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + raw_img_info['video_length'] = len(img_ids) - def load_video_anns(self, ann_file): - """Load annotations from COCOVID style annotation file. + # load ann info + ann_ids = coco.get_ann_ids( + img_ids=[img_id], cat_ids=self.cat_ids) + raw_ann_info = coco.load_anns(ann_ids) + + # load frames for training + if self.test_mode: + assert not raw_img_info['is_vid_train_frame'], \ + 'is_vid_train_frame must be False in testing' + valid_data_indices.append(data_id) + elif raw_img_info['is_vid_train_frame']: + valid_data_indices.append(data_id) - Args: - ann_file (str): Path of annotation file. + # get data_info + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) + data_id += 1 + assert len( + valid_data_indices + ) != 0, f"There is no frame for training in '{self.ann_file}'!" + + return data_list, valid_data_indices + + def _load_image_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an image annotation file named as + ``self.ann_file``. Returns: - list[dict]: Annotation information from COCOVID api. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. 
""" - self.coco = CocoVID(ann_file) - self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = COCO(local_path) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} - - data_infos = [] - self.vid_ids = self.coco.get_vid_ids() - self.img_ids = [] - for vid_id in self.vid_ids: - img_ids = self.coco.get_img_ids_from_vid(vid_id) - for img_id in img_ids: - info = self.coco.load_imgs([img_id])[0] - info['filename'] = info['file_name'] - if self.test_mode: - assert not info['is_vid_train_frame'], \ - 'is_vid_train_frame must be False in testing' - self.img_ids.append(img_id) - data_infos.append(info) - elif info['is_vid_train_frame']: - self.img_ids.append(img_id) - data_infos.append(info) - return data_infos + self.cat_img_map = copy.deepcopy(coco.cat_img_map) + + img_ids = coco.get_img_ids() + data_id = 0 + valid_data_indices = [] + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + # load images for training + if raw_img_info['is_vid_train_frame']: + valid_data_indices.append(data_id) + + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) + data_id += 1 + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + return data_list, valid_data_indices diff --git a/mmtrack/datasets/lasot_dataset.py b/mmtrack/datasets/lasot_dataset.py index 7566acc80..00c8fbc08 100644 --- a/mmtrack/datasets/lasot_dataset.py +++ b/mmtrack/datasets/lasot_dataset.py @@ -2,9 +2,9 @@ import os import os.path as osp import time +from typing import List -from mmdet.datasets import DATASETS - +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -19,11 +19,8 @@ def __init__(self, *args, **kwargs): """Initialization of SOT dataset class.""" super(LaSOTDataset, self).__init__(*args, **kwargs) - def load_data_infos(self, split='test'): - """Load dataset information. - - Args: - split (str, optional): Dataset split. Defaults to 'test'. + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file``. Returns: list[dict]: The length of the list is the number of videos. The @@ -40,10 +37,9 @@ def load_data_infos(self, split='test'): """ print('Loading LaSOT dataset...') start_time = time.time() - assert split in ['train', 'test'] data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. @@ -58,15 +54,23 @@ def load_data_infos(self, split='test'): print(f'LaSOT dataset loaded! 
({time.time()-start_time:.2f} s)') return data_infos - def get_visibility_from_video(self, video_ind): - """Get the visible information of instance in a video.""" - video_path = osp.dirname(self.data_infos[video_ind]['video_path']) - full_occlusion_file = osp.join(self.img_prefix, video_path, - 'full_occlusion.txt') - out_of_view_file = osp.join(self.img_prefix, video_path, + def get_visibility_from_video(self, video_idx: int) -> dict: + """Get the visible information of instance in a video. + + Args: + video_idx (int): The index of video. + + Returns: + dict: The visibilities of each object in the video. + """ + video_path = osp.dirname(self.get_data_info(video_idx)['video_path']) + full_occlusion_file = osp.join(self.data_prefix['img_path'], + video_path, 'full_occlusion.txt') + out_of_view_file = osp.join(self.data_prefix['img_path'], video_path, 'out_of_view.txt') - full_occlusion = self.loadtxt( + full_occlusion = self._loadtxt( full_occlusion_file, dtype=bool, delimiter=',') - out_of_view = self.loadtxt(out_of_view_file, dtype=bool, delimiter=',') + out_of_view = self._loadtxt( + out_of_view_file, dtype=bool, delimiter=',') visible = ~(full_occlusion | out_of_view) return dict(visible=visible) diff --git a/mmtrack/datasets/mot_challenge_dataset.py b/mmtrack/datasets/mot_challenge_dataset.py index f446333e4..d22ef0a34 100644 --- a/mmtrack/datasets/mot_challenge_dataset.py +++ b/mmtrack/datasets/mot_challenge_dataset.py @@ -1,109 +1,68 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os import os.path as osp -import tempfile +from typing import List, Union -import mmcv -import motmetrics as mm -import numpy as np -from mmcv.utils import print_log -from mmdet.core import eval_map -from mmdet.datasets import DATASETS - -from mmtrack.core import interpolate_tracks, results2outs -from .coco_video_dataset import CocoVideoDataset - -try: - import trackeval -except ImportError: - trackeval = None +from mmtrack.registry import DATASETS +from .base_video_dataset import BaseVideoDataset @DATASETS.register_module() -class MOTChallengeDataset(CocoVideoDataset): +class MOTChallengeDataset(BaseVideoDataset): """Dataset for MOTChallenge. Args: visibility_thr (float, optional): The minimum visibility for the objects during training. Default to -1. - interpolate_tracks_cfg (dict, optional): If not None, Interpolate - tracks linearly to make tracks more complete. Defaults to None. - - min_num_frames (int, optional): The minimum length of a track - that will be interpolated. Defaults to 5. - - max_num_frames (int, optional): The maximum disconnected length - in a track. Defaults to 20. detection_file (str, optional): The path of the public detection file. Default to None. """ - CLASSES = ('pedestrian', ) + METAINFO = { + 'CLASSES': + ('pedestrian', 'person_on_vehicle', 'car', 'bicycle', 'motorbike', + 'non_mot_vehicle', 'static_person', 'distractor', 'occluder', + 'occluder_on_ground', 'occluder_full', 'reflection', 'crowd') + } def __init__(self, - visibility_thr=-1, - interpolate_tracks_cfg=None, - detection_file=None, + visibility_thr: float = -1, + detection_file: str = None, *args, **kwargs): - super().__init__(*args, **kwargs) self.visibility_thr = visibility_thr - self.interpolate_tracks_cfg = interpolate_tracks_cfg - self.detections = self.load_detections(detection_file) - - def load_detections(self, detection_file=None): - """Load public detections.""" - # support detections in three formats - # 1. MMDet: [img_1, img_2, ...] - # 2. MMTrack: dict(det_bboxes=[img_1, img_2, ...]) - # 3. 
Public: - # 1) dict(img1_name: [], img2_name: [], ...) - # 2) dict(det_bboxes=dict(img1_name: [], img2_name: [], ...)) - # return as a dict or a list - if detection_file is not None: - detections = mmcv.load(detection_file) - if isinstance(detections, dict): - # results from mmtrack - if 'det_bboxes' in detections: - detections = detections['det_bboxes'] - else: - # results from mmdet - if not isinstance(detections, list): - raise TypeError('detections must be a dict or a list.') - return detections - else: - return None - - def prepare_results(self, img_info): - """Prepare results for image (e.g. the annotation information, ...).""" - results = super().prepare_results(img_info) - if self.detections is not None: - if isinstance(self.detections, dict): - indice = img_info['file_name'] - elif isinstance(self.detections, list): - indice = self.img_ids.index(img_info['id']) - results['detections'] = self.detections[indice] - return results + self.detection_file = detection_file + super().__init__(*args, **kwargs) - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox and mask annotation. + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. Args: - ann_info (list[dict]): Annotation info of an image. - with_mask (bool): Whether to parse mask annotations. + raw_data_info (dict): Raw data information load from ``ann_file`` Returns: - dict: A dict containing the following keys: bboxes, bboxes_ignore, - labels, masks, seg_map. "masks" are raw annotations and not - decoded into binary masks. + Union[dict, List[dict]]: Parsed annotation. """ - gt_bboxes = [] - gt_labels = [] - gt_bboxes_ignore = [] - gt_instance_ids = [] + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + data_info = {} + + data_info.update(img_info) + if self.data_prefix.get('img_path', None) is not None: + img_path = osp.join(self.data_prefix['img_path'], + img_info['file_name']) + else: + img_path = img_info['file_name'] + data_info['img_path'] = img_path + instances = [] for i, ann in enumerate(ann_info): + instance = {} + if (not self.test_mode) and (ann['visibility'] < self.visibility_thr): continue + if ann.get('ignore', False): + continue x1, y1, w, h = ann['bbox'] inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) @@ -114,399 +73,18 @@ def _parse_ann_info(self, img_info, ann_info): if ann['category_id'] not in self.cat_ids: continue bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('ignore', False) or ann.get('iscrowd', False): - # note: normally no `iscrowd` for MOT17Dataset - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - gt_instance_ids.append(ann['instance_id']) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - gt_instance_ids = np.array(gt_instance_ids, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - gt_instance_ids = np.array([], dtype=np.int64) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - bboxes_ignore=gt_bboxes_ignore, - instance_ids=gt_instance_ids) - - return ann - - def format_results(self, results, resfile_path=None, metrics=['track']): - """Format the 
results to txts (standard format for MOT Challenge). - - Args: - results (dict(list[ndarray])): Testing results of the dataset. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. - metrics (list[str], optional): The results of the specific metrics - will be formatted.. Defaults to ['track']. - - Returns: - tuple: (resfile_path, resfiles, names, tmp_dir), resfile_path is - the path to save the formatted results, resfiles is a dict - containing the filepaths, names is a list containing the name of - the videos, tmp_dir is the temporal directory created for saving - files. - """ - assert isinstance(results, dict), 'results must be a dict.' - if resfile_path is None: - tmp_dir = tempfile.TemporaryDirectory() - resfile_path = tmp_dir.name - else: - tmp_dir = None - if osp.exists(resfile_path): - print_log('remove previous results.', self.logger) - import shutil - shutil.rmtree(resfile_path) - - resfiles = dict() - for metric in metrics: - resfiles[metric] = osp.join(resfile_path, metric) - os.makedirs(resfiles[metric], exist_ok=True) - - inds = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0] - num_vids = len(inds) - assert num_vids == len(self.vid_ids) - inds.append(len(self.data_infos)) - vid_infos = self.coco.load_vids(self.vid_ids) - names = [_['name'] for _ in vid_infos] - - for i in range(num_vids): - for metric in metrics: - formatter = getattr(self, f'format_{metric}_results') - formatter(results[f'{metric}_bboxes'][inds[i]:inds[i + 1]], - self.data_infos[inds[i]:inds[i + 1]], - f'{resfiles[metric]}/{names[i]}.txt') - - return resfile_path, resfiles, names, tmp_dir - - def format_track_results(self, results, infos, resfile): - """Format tracking results.""" - - results_per_video = [] - for frame_id, result in enumerate(results): - outs_track = results2outs(bbox_results=result) - track_ids, bboxes = outs_track['ids'], outs_track['bboxes'] - frame_ids = np.full_like(track_ids, frame_id) - results_per_frame = np.concatenate( - (frame_ids[:, None], track_ids[:, None], bboxes), axis=1) - results_per_video.append(results_per_frame) - # `results_per_video` is a ndarray with shape (N, 7). Each row denotes - # (frame_id, track_id, x1, y1, x2, y2, score) - results_per_video = np.concatenate(results_per_video) - - if self.interpolate_tracks_cfg is not None: - results_per_video = interpolate_tracks( - results_per_video, **self.interpolate_tracks_cfg) - - with open(resfile, 'wt') as f: - for frame_id, info in enumerate(infos): - # `mot_frame_id` is the actually frame id used for evaluation. - # It may not start from 0. 
- if 'mot_frame_id' in info: - mot_frame_id = info['mot_frame_id'] - else: - mot_frame_id = info['frame_id'] + 1 - - results_per_frame = \ - results_per_video[results_per_video[:, 0] == frame_id] - for i in range(len(results_per_frame)): - _, track_id, x1, y1, x2, y2, conf = results_per_frame[i] - f.writelines( - f'{mot_frame_id},{track_id},{x1:.3f},{y1:.3f},' + - f'{(x2-x1):.3f},{(y2-y1):.3f},{conf:.3f},-1,-1,-1\n') - - def format_bbox_results(self, results, infos, resfile): - """Format detection results.""" - with open(resfile, 'wt') as f: - for res, info in zip(results, infos): - if 'mot_frame_id' in info: - frame = info['mot_frame_id'] - else: - frame = info['frame_id'] + 1 - - outs_det = results2outs(bbox_results=res) - for bbox, label in zip(outs_det['bboxes'], outs_det['labels']): - x1, y1, x2, y2, conf = bbox - f.writelines( - f'{frame},-1,{x1:.3f},{y1:.3f},{(x2-x1):.3f},' + - f'{(y2-y1):.3f},{conf:.3f}\n') - f.close() - - def get_benchmark_and_eval_split(self): - """Get benchmark and dataset split to evaluate. - Get benchmark from upeper/lower-case image prefix and the dataset - split to evaluate. - - Returns: - tuple(string): The first string denotes the type of dataset. - The second string denotes the split of the dataset to eval. - """ - BENCHMARKS = ['MOT15', 'MOT16', 'MOT17', 'MOT20'] - for benchmark in BENCHMARKS: - if benchmark in self.img_prefix.upper(): - break - # We directly return 'train' for the dataset split to evaluate, since - # MOT challenge only provides annotations for train split. - return benchmark, 'train' - - def get_dataset_cfg_for_hota(self, gt_folder, tracker_folder, seqmap): - """Get default configs for trackeval.datasets.MotChallenge2DBox. - - Args: - gt_folder (str): the name of the GT folder - tracker_folder (str): the name of the tracker folder - seqmap (str): the file that contains the sequence of video names - - Returns: - Dataset Configs for MotChallenge2DBox. 
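# Illustrative sketch (not part of this patch): the MOTChallenge result-file
# line layout written by `format_track_results` earlier in this file
# (frame, id, x, y, w, h, conf, -1, -1, -1). Toy values only.
frame_id, track_id = 1, 7
x1, y1, x2, y2, conf = 10.0, 20.0, 60.0, 120.0, 0.9
line = (f'{frame_id},{track_id},{x1:.3f},{y1:.3f},'
        f'{x2 - x1:.3f},{y2 - y1:.3f},{conf:.3f},-1,-1,-1\n')
print(line, end='')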
- """ - benchmark, split_to_eval = self.get_benchmark_and_eval_split() - - dataset_config = dict( - # Location of GT data - GT_FOLDER=gt_folder, - # Trackers location - TRACKERS_FOLDER=tracker_folder, - # Where to save eval results - # (if None, same as TRACKERS_FOLDER) - OUTPUT_FOLDER=None, - # Use 'track' as the default tracker - TRACKERS_TO_EVAL=['track'], - # Option values: ['pedestrian'] - CLASSES_TO_EVAL=list(self.CLASSES), - # Option Values: 'MOT17', 'MOT16', 'MOT20', 'MOT15' - BENCHMARK=benchmark, - # Option Values: 'train', 'test' - SPLIT_TO_EVAL=split_to_eval, - # Whether tracker input files are zipped - INPUT_AS_ZIP=False, - # Whether to print current config - PRINT_CONFIG=True, - # Whether to perform preprocessing - # (never done for MOT15) - DO_PREPROC=False if 'MOT15' in self.img_prefix else True, - # Tracker files are in - # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - TRACKER_SUB_FOLDER='', - # Output files are saved in - # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - OUTPUT_SUB_FOLDER='', - # Names of trackers to display - # (if None: TRACKERS_TO_EVAL) - TRACKER_DISPLAY_NAMES=None, - # Where seqmaps are found - # (if None: GT_FOLDER/seqmaps) - SEQMAP_FOLDER=None, - # Directly specify seqmap file - # (if none use seqmap_folder/benchmark-split_to_eval) - SEQMAP_FILE=seqmap, - # If not None, specify sequences to eval - # and their number of timesteps - SEQ_INFO=None, - # '{gt_folder}/{seq}/gt/gt.txt' - GT_LOC_FORMAT='{gt_folder}/{seq}/gt/gt.txt', - # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in - # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/ - # If True, the middle 'benchmark-split' folder is skipped for both. - SKIP_SPLIT_FOL=True, - ) - - if 'half-train' in self.ann_file: - dataset_config[ - 'GT_LOC_FORMAT'] = '{gt_folder}/{seq}/gt/gt_half-train.txt' - elif 'half-val' in self.ann_file: - dataset_config[ - 'GT_LOC_FORMAT'] = '{gt_folder}/{seq}/gt/gt_half-val.txt' - - return dataset_config - - def evaluate(self, - results, - metric='track', - logger=None, - resfile_path=None, - bbox_iou_thr=0.5, - track_iou_thr=0.5): - """Evaluation in MOT Challenge. - - Args: - results (list[list | tuple]): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. Options are - 'bbox', 'track'. Defaults to 'track'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. - bbox_iou_thr (float, optional): IoU threshold for detection - evaluation. Defaults to 0.5. - track_iou_thr (float, optional): IoU threshold for tracking - evaluation.. Defaults to 0.5. - - Returns: - dict[str, float]: MOTChallenge style evaluation metric. 
- """ - eval_results = dict() - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['bbox', 'track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - if 'track' in metrics: - resfile_path, resfiles, names, tmp_dir = self.format_results( - results, resfile_path, metrics) - print_log('Evaluate CLEAR MOT results.', logger=logger) - distth = 1 - track_iou_thr - accs = [] - # support loading data from ceph - local_dir = tempfile.TemporaryDirectory() - - for name in names: - if 'half-train' in self.ann_file: - gt_file = osp.join(self.img_prefix, - f'{name}/gt/gt_half-train.txt') - elif 'half-val' in self.ann_file: - gt_file = osp.join(self.img_prefix, - f'{name}/gt/gt_half-val.txt') - else: - gt_file = osp.join(self.img_prefix, f'{name}/gt/gt.txt') - res_file = osp.join(resfiles['track'], f'{name}.txt') - # copy gt file from ceph to local temporary directory - gt_dir_path = osp.join(local_dir.name, name, 'gt') - os.makedirs(gt_dir_path) - copied_gt_file = osp.join( - local_dir.name, - gt_file.replace(gt_file.split(name)[0], '')) - - f = open(copied_gt_file, 'wb') - gt_content = self.file_client.get(gt_file) - if hasattr(gt_content, 'tobytes'): - gt_content = gt_content.tobytes() - f.write(gt_content) - f.close() - # copy sequence file from ceph to local temporary directory - copied_seqinfo_path = osp.join(local_dir.name, name, - 'seqinfo.ini') - f = open(copied_seqinfo_path, 'wb') - seq_content = self.file_client.get( - osp.join(self.img_prefix, name, 'seqinfo.ini')) - if hasattr(seq_content, 'tobytes'): - seq_content = seq_content.tobytes() - f.write(seq_content) - f.close() - - gt = mm.io.loadtxt(copied_gt_file) - res = mm.io.loadtxt(res_file) - if osp.exists(copied_seqinfo_path - ) and 'MOT15' not in self.img_prefix: - acc, ana = mm.utils.CLEAR_MOT_M( - gt, res, copied_seqinfo_path, distth=distth) - else: - acc = mm.utils.compare_to_groundtruth( - gt, res, distth=distth) - accs.append(acc) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, - names=names, - metrics=mm.metrics.motchallenge_metrics, - generate_overall=True) - - if trackeval is None: - raise ImportError( - 'Please run' - 'pip install git+https://github.com/JonathonLuiten/TrackEval.git' # noqa - 'to manually install trackeval') - - seqmap = osp.join(resfile_path, 'videoseq.txt') - with open(seqmap, 'w') as f: - f.write('name\n') - for name in names: - f.write(name + '\n') - f.close() - - eval_config = trackeval.Evaluator.get_default_eval_config() - - # tracker's name is set to 'track', - # so this word needs to be splited out - output_folder = resfiles['track'].rsplit(os.sep, 1)[0] - dataset_config = self.get_dataset_cfg_for_hota( - local_dir.name, output_folder, seqmap) - - evaluator = trackeval.Evaluator(eval_config) - dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)] - hota_metrics = [ - trackeval.metrics.HOTA(dict(METRICS=['HOTA'], THRESHOLD=0.5)) - ] - output_res, _ = evaluator.evaluate(dataset, hota_metrics) - - # modify HOTA results sequence according to summary list, - # indexes of summary are sequence names and 'OVERALL' - # while for hota they are sequence names and 'COMBINED_SEQ' - seq_list = list(summary.index) - seq_list.append('COMBINED_SEQ') - - hota = [ - np.average(output_res['MotChallenge2DBox']['track'][seq] - ['pedestrian']['HOTA']['HOTA']) for seq in seq_list - if 'OVERALL' not in 
seq - ] - - eval_results.update({ - mm.io.motchallenge_metric_names[k]: v['OVERALL'] - for k, v in summary.to_dict().items() - }) - eval_results['HOTA'] = hota[-1] - - summary['HOTA'] = hota - str_summary = mm.io.render_summary( - summary, - formatters=mh.formatters, - namemap=mm.io.motchallenge_metric_names) - print(str_summary) - local_dir.cleanup() - if tmp_dir is not None: - tmp_dir.cleanup() - - if 'bbox' in metrics: - if isinstance(results, dict): - bbox_results = results['det_bboxes'] - elif isinstance(results, list): - bbox_results = results + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 else: - raise TypeError('results must be a dict or a list.') - annotations = [self.get_ann_info(info) for info in self.data_infos] - mean_ap, _ = eval_map( - bbox_results, - annotations, - iou_thr=bbox_iou_thr, - dataset=self.CLASSES, - logger=logger) - eval_results['mAP'] = mean_ap - - for k, v in eval_results.items(): - if isinstance(v, float): - eval_results[k] = float(f'{(v):.3f}') - - return eval_results + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + instance['instance_id'] = ann['instance_id'] + instance['category_id'] = ann['category_id'] + instance['mot_conf'] = ann['mot_conf'] + instance['visibility'] = ann['visibility'] + if len(instance) > 0: + instances.append(instance) + data_info['instances'] = instances + return data_info diff --git a/mmtrack/datasets/otb_dataset.py b/mmtrack/datasets/otb_dataset.py index d00c09984..c3d3bec29 100644 --- a/mmtrack/datasets/otb_dataset.py +++ b/mmtrack/datasets/otb_dataset.py @@ -3,10 +3,11 @@ import os.path as osp import re import time +from typing import List import numpy as np -from mmdet.datasets import DATASETS +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -21,12 +22,9 @@ def __init__(self, *args, **kwargs): """Initialization of SOT dataset class.""" super().__init__(*args, **kwargs) - def load_data_infos(self, split='test'): + def load_data_list(self) -> List[dict]: """Load dataset information. - Args: - split (str, optional): Dataset split. Defaults to 'test'. - Returns: list[dict]: The length of the list is the number of videos. The inner dict is in the following format: @@ -45,8 +43,8 @@ def load_data_infos(self, split='test'): print('Loading OTB100 dataset...') start_time = time.time() data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. @@ -71,31 +69,33 @@ def load_data_infos(self, split='test'): print(f'OTB100 dataset loaded! ({time.time()-start_time:.2f} s)') return data_infos - def get_bboxes_from_video(self, video_ind): + def get_bboxes_from_video(self, video_ind: int) -> np.ndarray: """Get bboxes annotation about the instance in a video. Args: video_ind (int): video index Returns: - ndarray: in [N, 4] shape. The N is the bbox number and the bbox + np.ndarray: in [N, 4] shape. The N is the bbox number and the bbox is in (x, y, w, h) format. 
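# Illustrative sketch (not part of this patch): the regex-based parsing used
# by `get_bboxes_from_video` below; it tolerates comma-, space- and
# tab-separated (x, y, w, h) records in the OTB annotation files.
import re

import numpy as np

lines = ['198,214,34,81', '201\t212\t36\t84']
bboxes = np.array(
    [list(map(int, re.findall(r'-?\d+', line))) for line in lines],
    dtype=float)
print(bboxes.shape)   # (2, 4)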
""" - bboxes_file = osp.join(self.img_prefix, - self.data_infos[video_ind]['ann_path']) + meta_video_info = self.get_data_info(video_ind) + bbox_path = osp.join(self.data_prefix['img_path'], + meta_video_info['ann_path']) bboxes = [] - bboxes_info = self.loadtxt(bboxes_file, return_array=False).split('\n') + bboxes_info = self._loadtxt( + bbox_path, return_ndarray=False).split('\n') for bbox in bboxes_info: bbox = list(map(int, re.findall(r'-?\d+', bbox))) bboxes.append(bbox) bboxes = np.array(bboxes, dtype=float) - if 'init_skip_num' in self.data_infos[video_ind]: - init_skip_num = self.data_infos[video_ind]['init_skip_num'] + if 'init_skip_num' in meta_video_info: + init_skip_num = meta_video_info['init_skip_num'] bboxes = bboxes[init_skip_num:] - end_frame_id = self.data_infos[video_ind]['end_frame_id'] - start_frame_id = self.data_infos[video_ind]['start_frame_id'] + end_frame_id = meta_video_info['end_frame_id'] + start_frame_id = meta_video_info['start_frame_id'] assert len(bboxes) == ( end_frame_id - start_frame_id + 1 ), f'{len(bboxes)} is not equal to {end_frame_id}-{start_frame_id}+1' diff --git a/mmtrack/datasets/pipelines/__init__.py b/mmtrack/datasets/pipelines/__init__.py deleted file mode 100644 index 525dcbf36..000000000 --- a/mmtrack/datasets/pipelines/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.datasets.builder import PIPELINES - -from .formatting import (CheckPadMaskValidity, ConcatSameTypeFrames, - ConcatVideoReferences, ReIDFormatBundle, - SeqDefaultFormatBundle, ToList, VideoCollect) -from .loading import (LoadDetections, LoadMultiImagesFromFile, - SeqLoadAnnotations) -from .processing import MatchInstances, PairSampling, TridentSampling -from .transforms import (SeqBboxJitter, SeqBlurAug, SeqBrightnessAug, - SeqColorAug, SeqCropLikeSiamFC, SeqCropLikeStark, - SeqGrayAug, SeqNormalize, SeqPad, - SeqPhotoMetricDistortion, SeqRandomCrop, - SeqRandomFlip, SeqResize, SeqShiftScaleAug) - -__all__ = [ - 'PIPELINES', 'LoadMultiImagesFromFile', 'SeqLoadAnnotations', 'SeqResize', - 'SeqNormalize', 'SeqRandomFlip', 'SeqPad', 'SeqDefaultFormatBundle', - 'VideoCollect', 'CheckPadMaskValidity', 'ConcatVideoReferences', - 'LoadDetections', 'MatchInstances', 'SeqRandomCrop', - 'SeqPhotoMetricDistortion', 'SeqCropLikeSiamFC', 'SeqShiftScaleAug', - 'SeqBlurAug', 'SeqColorAug', 'ToList', 'ReIDFormatBundle', 'SeqGrayAug', - 'SeqBrightnessAug', 'SeqBboxJitter', 'SeqCropLikeStark', 'TridentSampling', - 'ConcatSameTypeFrames', 'PairSampling' -] diff --git a/mmtrack/datasets/pipelines/formatting.py b/mmtrack/datasets/pipelines/formatting.py deleted file mode 100644 index 3cd58397d..000000000 --- a/mmtrack/datasets/pipelines/formatting.py +++ /dev/null @@ -1,528 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import cv2 -import numpy as np -from mmcv.parallel import DataContainer as DC -from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import to_tensor - - -@PIPELINES.register_module() -class ConcatSameTypeFrames(object): - """Concat the frames of the same type. We divide all the frames into two - types: 'key' frames and 'reference' frames. - - The input list contains as least two dicts. We concat the first - `num_key_frames` dicts to one dict, and the rest of dicts are concated - to another dict. - - In SOT field, 'key' denotes template image and 'reference' denotes search - image. - - Args: - num_key_frames (int, optional): the number of key frames. - Defaults to 1. 
- """ - - def __init__(self, num_key_frames=1): - self.num_key_frames = num_key_frames - - def concat_one_mode_results(self, results): - """Concatenate the results of the same mode.""" - out = dict() - for i, result in enumerate(results): - if 'img' in result: - img = result['img'] - if len(img.shape) < 3: - img = np.expand_dims(img, -1) - if i == 0: - result['img'] = np.expand_dims(img, -1) - else: - out['img'] = np.concatenate( - (out['img'], np.expand_dims(img, -1)), axis=-1) - for key in ['img_metas', 'gt_masks']: - if key in result: - if i == 0: - result[key] = [result[key]] - else: - out[key].append(result[key]) - for key in [ - 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', - 'gt_instance_ids' - ]: - if key not in result: - continue - value = result[key] - if value.ndim == 1: - value = value[:, None] - N = value.shape[0] - value = np.concatenate((np.full( - (N, 1), i, dtype=np.float32), value), - axis=1) - if i == 0: - result[key] = value - else: - out[key] = np.concatenate((out[key], value), axis=0) - if 'gt_semantic_seg' in result: - if i == 0: - result['gt_semantic_seg'] = result['gt_semantic_seg'][..., - None, - None] - else: - out['gt_semantic_seg'] = np.concatenate( - (out['gt_semantic_seg'], - result['gt_semantic_seg'][..., None, None]), - axis=-1) - - if 'padding_mask' in result: - if i == 0: - result['padding_mask'] = np.expand_dims( - result['padding_mask'], 0) - else: - out['padding_mask'] = np.concatenate( - (out['padding_mask'], - np.expand_dims(result['padding_mask'], 0)), - axis=0) - - if i == 0: - out = result - return out - - def __call__(self, results): - """Call function. - - Args: - results (list[dict]): list of dict that contain keys such as 'img', - 'img_metas', 'gt_masks','proposals', 'gt_bboxes', - 'gt_bboxes_ignore', 'gt_labels','gt_semantic_seg', - 'gt_instance_ids', 'padding_mask'. - - Returns: - list[dict]: The first dict of outputs concats the dicts of 'key' - information. The second dict of outputs concats the dicts of - 'reference' information. - """ - assert (isinstance(results, list)), 'results must be list' - key_results = [] - reference_results = [] - for i, result in enumerate(results): - if i < self.num_key_frames: - key_results.append(result) - else: - reference_results.append(result) - outs = [] - if self.num_key_frames == 1: - # if single key, not expand the dim of variables - outs.append(key_results[0]) - else: - outs.append(self.concat_one_mode_results(key_results)) - outs.append(self.concat_one_mode_results(reference_results)) - - return outs - - -@PIPELINES.register_module() -class ConcatVideoReferences(ConcatSameTypeFrames): - """Concat video references. - - If the input list contains at least two dicts, concat the input list of - dict to one dict from 2-nd dict of the input list. - - Note: the 'ConcatVideoReferences' class will be deprecated in the - future, please use 'ConcatSameTypeFrames' instead. - """ - - def __init__(self): - warnings.warn( - "The 'ConcatVideoReferences' class will be deprecated in the " - "future, please use 'ConcatSameTypeFrames' instead") - super(ConcatVideoReferences, self).__init__(num_key_frames=1) - - -@PIPELINES.register_module() -class MultiImagesToTensor(object): - """Multi images to tensor. - - 1. Transpose and convert image/multi-images to Tensor. - 2. Add prefix to every key in the second dict of the inputs. Then, add - these keys and corresponding values into the outputs. - - Args: - ref_prefix (str): The prefix of key added to the second dict of inputs. - Defaults to 'ref'. 
- """ - - def __init__(self, ref_prefix='ref'): - self.ref_prefix = ref_prefix - - def __call__(self, results): - """Multi images to tensor. - - 1. Transpose and convert image/multi-images to Tensor. - 2. Add prefix to every key in the second dict of the inputs. Then, add - these keys and corresponding values into the output dict. - - Args: - results (list[dict]): List of two dicts. - - Returns: - dict: Each key in the first dict of `results` remains unchanged. - Each key in the second dict of `results` adds `self.ref_prefix` - as prefix. - """ - outs = [] - for _results in results: - _results = self.images_to_tensor(_results) - outs.append(_results) - - data = {} - data.update(outs[0]) - if len(outs) == 2: - for k, v in outs[1].items(): - data[f'{self.ref_prefix}_{k}'] = v - - return data - - def images_to_tensor(self, results): - """Transpose and convert images/multi-images to Tensor.""" - if 'img' in results: - img = results['img'] - if len(img.shape) == 3: - # (H, W, 3) to (3, H, W) - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - else: - # (H, W, 3, N) to (N, 3, H, W) - img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) - results['img'] = to_tensor(img) - if 'proposals' in results: - results['proposals'] = to_tensor(results['proposals']) - if 'img_metas' in results: - results['img_metas'] = DC(results['img_metas'], cpu_only=True) - return results - - -@PIPELINES.register_module() -class SeqDefaultFormatBundle(object): - """Sequence Default formatting bundle. - - It simplifies the pipeline of formatting common fields, including "img", - "img_metas", "proposals", "gt_bboxes", "gt_instance_ids", - "gt_match_indices", "gt_bboxes_ignore", "gt_labels", "gt_masks", - "gt_semantic_seg" and 'padding_mask'. - These fields are formatted as follows. - - - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - - img_metas: (1) to DataContainer (cpu_only=True) - - proposals: (1) to tensor, (2) to DataContainer - - gt_bboxes: (1) to tensor, (2) to DataContainer - - gt_instance_ids: (1) to tensor, (2) to DataContainer - - gt_match_indices: (1) to tensor, (2) to DataContainer - - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer - - gt_labels: (1) to tensor, (2) to DataContainer - - gt_masks: (1) to DataContainer (cpu_only=True) - - gt_semantic_seg: (1) unsqueeze dim-0 (2) to tensor, \ - (3) to DataContainer (stack=True) - - padding_mask: (1) to tensor, (2) to DataContainer - - Args: - ref_prefix (str): The prefix of key added to the second dict of input - list. Defaults to 'ref'. - """ - - def __init__(self, ref_prefix='ref'): - self.ref_prefix = ref_prefix - - def __call__(self, results): - """Sequence Default formatting bundle call function. - - Args: - results (list[dict]): List of two dicts. - - Returns: - dict: The result dict contains the data that is formatted with - default bundle. Each key in the second dict of the input list - adds `self.ref_prefix` as prefix. - """ - outs = [] - for _results in results: - _results = self.default_format_bundle(_results) - outs.append(_results) - - data = {} - data.update(outs[0]) - for k, v in outs[1].items(): - data[f'{self.ref_prefix}_{k}'] = v - - return data - - def default_format_bundle(self, results): - """Transform and format common fields in results. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: The result dict contains the data that is formatted with - default bundle. 
- """ - if 'img' in results: - img = results['img'] - if len(img.shape) == 3: - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - else: - img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) - results['img'] = DC(to_tensor(img), stack=True) - if 'padding_mask' in results: - results['padding_mask'] = DC( - to_tensor(results['padding_mask'].copy()), stack=True) - for key in [ - 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', - 'gt_instance_ids', 'gt_match_indices' - ]: - if key not in results: - continue - results[key] = DC(to_tensor(results[key])) - for key in ['img_metas', 'gt_masks']: - if key in results: - results[key] = DC(results[key], cpu_only=True) - if 'gt_semantic_seg' in results: - semantic_seg = results['gt_semantic_seg'] - if len(semantic_seg.shape) == 2: - semantic_seg = semantic_seg[None, ...] - else: - semantic_seg = np.ascontiguousarray( - semantic_seg.transpose(3, 2, 0, 1)) - results['gt_semantic_seg'] = DC( - to_tensor(results['gt_semantic_seg']), stack=True) - return results - - def __repr__(self): - return self.__class__.__name__ - - -@PIPELINES.register_module() -class VideoCollect(object): - """Collect data from the loader relevant to the specific task. - - Args: - keys (Sequence[str]): Keys of results to be collected in ``data``. - meta_keys (Sequence[str]): Meta keys to be converted to - ``mmcv.DataContainer`` and collected in ``data[img_metas]``. - Defaults to None. - default_meta_keys (tuple): Default meta keys. Defaults to ('filename', - 'ori_filename', 'ori_shape', 'img_shape', 'pad_shape', - 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg', - 'frame_id', 'is_video_data'). - """ - - def __init__(self, - keys, - meta_keys=None, - default_meta_keys=('filename', 'ori_filename', 'ori_shape', - 'img_shape', 'pad_shape', 'scale_factor', - 'flip', 'flip_direction', 'img_norm_cfg', - 'frame_id', 'is_video_data')): - self.keys = keys - self.meta_keys = default_meta_keys - if meta_keys is not None: - if isinstance(meta_keys, str): - meta_keys = (meta_keys, ) - else: - assert isinstance(meta_keys, tuple), \ - 'meta_keys must be str or tuple' - self.meta_keys += meta_keys - - def __call__(self, results): - """Call function to collect keys in results. - - The keys in ``meta_keys`` and ``default_meta_keys`` will be converted - to :obj:mmcv.DataContainer. - - Args: - results (list[dict] | dict): List of dict or dict which contains - the data to collect. - - Returns: - list[dict] | dict: List of dict or dict that contains the - following keys: - - - keys in ``self.keys`` - - ``img_metas`` - """ - results_is_dict = isinstance(results, dict) - if results_is_dict: - results = [results] - outs = [] - for _results in results: - _results = self._add_default_meta_keys(_results) - _results = self._collect_meta_keys(_results) - outs.append(_results) - - if results_is_dict: - outs[0]['img_metas'] = DC(outs[0]['img_metas'], cpu_only=True) - - return outs[0] if results_is_dict else outs - - def _collect_meta_keys(self, results): - """Collect `self.keys` and `self.meta_keys` from `results` (dict).""" - data = {} - img_meta = {} - for key in self.meta_keys: - if key in results: - img_meta[key] = results[key] - elif key in results['img_info']: - img_meta[key] = results['img_info'][key] - data['img_metas'] = img_meta - for key in self.keys: - data[key] = results[key] - return data - - def _add_default_meta_keys(self, results): - """Add default meta keys. 
- - We set default meta keys including `pad_shape`, `scale_factor` and - `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and - `Pad` are implemented during the whole pipeline. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - results (dict): Updated result dict contains the data to convert. - """ - img = results['img'] - results.setdefault('pad_shape', img.shape) - results.setdefault('scale_factor', 1.0) - num_channels = 1 if len(img.shape) < 3 else img.shape[2] - results.setdefault( - 'img_norm_cfg', - dict( - mean=np.zeros(num_channels, dtype=np.float32), - std=np.ones(num_channels, dtype=np.float32), - to_rgb=False)) - return results - - -@PIPELINES.register_module() -class CheckPadMaskValidity(object): - """Check the validity of data. Generally, it's used in such case: The image - padding masks generated in the image preprocess need to be downsampled, and - then passed into Transformer model, like DETR. The computation in the - subsequent Transformer model must make sure that the values of downsampled - mask are not all zeros. - - Args: - stride (int): the max stride of feature map. - """ - - def __init__(self, stride): - self.stride = stride - - def __call__(self, results): - """Call function. - - Args: - results (dict): Result dict contains the data to be checked. - - Returns: - dict | None: If invalid, return None; otherwise, return original - input. - """ - for _results in results: - assert 'padding_mask' in _results - mask = _results['padding_mask'].copy().astype(np.float32) - img_h, img_w = _results['img'].shape[:2] - feat_h, feat_w = img_h // self.stride, img_w // self.stride - downsample_mask = cv2.resize( - mask, dsize=(feat_h, feat_w)).astype(bool) - if (downsample_mask == 1).all(): - return None - return results - - -@PIPELINES.register_module() -class ToList(object): - """Use list to warp each value of the input dict. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: Updated result dict contains the data to convert. - """ - - def __call__(self, results): - out = {} - for k, v in results.items(): - out[k] = [v] - return out - - -@PIPELINES.register_module() -class ReIDFormatBundle(object): - """ReID formatting bundle. - - It first concatenates common fields, then simplifies the pipeline of - formatting common fields, including "img", and "gt_label". - These fields are formatted as follows. - - - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - - gt_labels: (1) to tensor, (2) to DataContainer - """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def __call__(self, results): - """ReID formatting bundle call function. - - Args: - results (list[dict] or dict): List of dicts or dict. - - Returns: - dict: The result dict contains the data that is formatted with - ReID bundle. - """ - inputs = dict() - if isinstance(results, list): - assert len(results) > 1, \ - 'the \'results\' only have one item, ' \ - 'please directly use normal pipeline not \'Seq\' pipeline.' - inputs['img'] = np.stack([_results['img'] for _results in results], - axis=3) - inputs['gt_label'] = np.stack( - [_results['gt_label'] for _results in results], axis=0) - elif isinstance(results, dict): - inputs['img'] = results['img'] - inputs['gt_label'] = results['gt_label'] - else: - raise TypeError('results must be a list or a dict.') - outs = self.reid_format_bundle(inputs) - - return outs - - def reid_format_bundle(self, results): - """Transform and format gt_label fields in results. 
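# Editor's note: rough standalone sketch of the check CheckPadMaskValidity performs above: the
# padding mask is downsampled by the largest feature-map stride, and the sample is dropped when
# the downsampled mask is padding everywhere. cv2.resize takes dsize as (width, height), so the
# sketch orders the arguments accordingly.
import cv2
import numpy as np

def pad_mask_is_valid(padding_mask, img_h, img_w, stride=32):
    feat_h, feat_w = img_h // stride, img_w // stride
    down = cv2.resize(padding_mask.astype(np.float32), dsize=(feat_w, feat_h)).astype(bool)
    return not down.all()     # invalid if every downsampled cell is padding

mask = np.ones((256, 256), dtype=np.uint8)   # 1 = padded pixel
mask[:128, :128] = 0                         # top-left quarter holds real image content
print(pad_mask_is_valid(mask, 256, 256))     # True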
- - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: The result dict contains the data that is formatted with - ReID bundle. - """ - for key in results: - if key == 'img': - img = results[key] - if img.ndim == 3: - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - else: - img = np.ascontiguousarray(img.transpose(3, 2, 0, 1)) - results['img'] = DC(to_tensor(img), stack=True) - elif key == 'gt_label': - results[key] = DC( - to_tensor(results[key]), stack=True, pad_dims=None) - else: - raise KeyError(f'key {key} is not supported') - return results diff --git a/mmtrack/datasets/pipelines/loading.py b/mmtrack/datasets/pipelines/loading.py deleted file mode 100644 index 90d4f3f7d..000000000 --- a/mmtrack/datasets/pipelines/loading.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile - -from mmtrack.core import results2outs - - -@PIPELINES.register_module() -class LoadMultiImagesFromFile(LoadImageFromFile): - """Load multi images from file. - - Please refer to `mmdet.datasets.pipelines.loading.py:LoadImageFromFile` - for detailed docstring. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __call__(self, results): - """Call function. - - For each dict in `results`, call the call function of - `LoadImageFromFile` to load image. - - Args: - results (list[dict]): List of dict from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains loaded image. - """ - outs = [] - for _results in results: - _results = super().__call__(_results) - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqLoadAnnotations(LoadAnnotations): - """Sequence load annotations. - - Please refer to `mmdet.datasets.pipelines.loading.py:LoadAnnotations` - for detailed docstring. - - Args: - with_track (bool): If True, load instance ids of bboxes. - """ - - def __init__(self, with_track=False, *args, **kwargs): - super().__init__(*args, **kwargs) - self.with_track = with_track - - def _load_track(self, results): - """Private function to load label annotations. - - Args: - results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`. - - Returns: - dict: The dict contains loaded label annotations. - """ - - results['gt_instance_ids'] = results['ann_info']['instance_ids'].copy() - - return results - - def __call__(self, results): - """Call function. - - For each dict in results, call the call function of `LoadAnnotations` - to load annotation. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains loaded annotations, such as - bounding boxes, labels, instance ids, masks and semantic - segmentation annotations. - """ - outs = [] - for _results in results: - _results = super().__call__(_results) - if self.with_track: - _results = self._load_track(_results) - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class LoadDetections(object): - """Load public detections from MOT benchmark. - - Args: - results (dict): Result dict from :obj:`mmtrack.CocoVideoDataset`. 
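# Editor's note: the Seq*/LoadMulti* classes above all follow the same wrapper pattern: a
# transform written for one result dict is applied to every dict of a sampled clip. A generic
# sketch of that pattern (function names here are illustrative, not mmtrack API):
def apply_to_each_frame(transform, results):
    """Run a per-image transform over a list of per-frame result dicts."""
    outs = []
    for _results in results:
        _results = transform(_results)
        if _results is None:        # a transform may reject the whole clip
            return None
        outs.append(_results)
    return outs

# e.g. apply_to_each_frame(lambda r: {**r, 'loaded': True},
#                          [{'filename': 'a.jpg'}, {'filename': 'b.jpg'}])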
- """ - - def __call__(self, results): - outs_det = results2outs(bbox_results=results['detections']) - bboxes = outs_det['bboxes'] - labels = outs_det['labels'] - - results['public_bboxes'] = bboxes[:, :4] - if bboxes.shape[1] > 4: - results['public_scores'] = bboxes[:, -1] - results['public_labels'] = labels - results['bbox_fields'].append('public_bboxes') - return results diff --git a/mmtrack/datasets/pipelines/processing.py b/mmtrack/datasets/pipelines/processing.py deleted file mode 100644 index 161496741..000000000 --- a/mmtrack/datasets/pipelines/processing.py +++ /dev/null @@ -1,432 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import random - -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets.builder import PIPELINES - - -@PIPELINES.register_module() -class TridentSampling(object): - """Multitemplate-style sampling in a trident manner. It's firstly used in - `STARK `_. - - Args: - num_search_frames (int, optional): the number of search frames - num_template_frames (int, optional): the number of template frames - max_frame_range (list[int], optional): the max frame range of sampling - a positive search image for the template image. Its length is equal - to the number of extra templates, i.e., `num_template_frames`-1. - Default length is 1. - cls_pos_prob (float, optional): the probility of sampling positive - samples in classification training. - train_cls_head (bool, optional): whether to train classification head. - min_num_frames (int, optional): the min number of frames to be sampled. - """ - - def __init__(self, - num_search_frames=1, - num_template_frames=2, - max_frame_range=[200], - cls_pos_prob=0.5, - train_cls_head=False, - min_num_frames=20): - assert num_template_frames >= 2 - assert len(max_frame_range) == num_template_frames - 1 - self.num_search_frames = num_search_frames - self.num_template_frames = num_template_frames - self.max_frame_range = max_frame_range - self.train_cls_head = train_cls_head - self.cls_pos_prob = cls_pos_prob - self.min_num_frames = min_num_frames - - def random_sample_inds(self, - video_visibility, - num_samples=1, - frame_range=None, - allow_invisible=False, - force_invisible=False): - """Random sampling a specific number of samples from the specified - frame range of the video. It also considers the visibility of each - frame. - - Args: - video_visibility (ndarray): the visibility of each frame in the - video. - num_samples (int, optional): the number of samples. Defaults to 1. - frame_range (list | None, optional): the frame range of sampling. - Defaults to None. - allow_invisible (bool, optional): whether to allow to get invisible - samples. Defaults to False. - force_invisible (bool, optional): whether to force to get invisible - samples. Defaults to False. - - Returns: - List: The sampled frame indexes. 
- """ - assert num_samples > 0 - if frame_range is None: - frame_range = [0, len(video_visibility)] - else: - assert isinstance(frame_range, list) and len(frame_range) == 2 - frame_range[0] = max(0, frame_range[0]) - frame_range[1] = min(len(video_visibility), frame_range[1]) - - video_visibility = np.asarray(video_visibility) - visibility_in_range = video_visibility[frame_range[0]:frame_range[1]] - # get indexes of valid samples - if force_invisible: - valid_inds = np.where(~visibility_in_range)[0] + frame_range[0] - else: - valid_inds = np.arange( - *frame_range) if allow_invisible else np.where( - visibility_in_range)[0] + frame_range[0] - - # No valid samples - if len(valid_inds) == 0: - return [None] * num_samples - - return random.choices(valid_inds, k=num_samples) - - def sampling_trident(self, video_visibility): - """Sampling multiple template images and one search images in one - video. - - Args: - video_visibility (ndarray): the visibility of each frame in the - video. - - Returns: - List: the indexes of template and search images. - """ - extra_template_inds = [None] - sampling_count = 0 - if self.is_video_data: - while None in extra_template_inds: - # first randomly sample two frames from a video - template_ind, search_ind = self.random_sample_inds( - video_visibility, num_samples=2) - - # then sample the extra templates - extra_template_inds = [] - for max_frame_range in self.max_frame_range: - # make the sampling range is near the template_ind - if template_ind >= search_ind: - min_ind, max_ind = search_ind, \ - search_ind + max_frame_range - else: - min_ind, max_ind = search_ind - max_frame_range, \ - search_ind - extra_template_index = self.random_sample_inds( - video_visibility, - num_samples=1, - frame_range=[min_ind, max_ind], - allow_invisible=False)[0] - - extra_template_inds.append(extra_template_index) - - sampling_count += 1 - if sampling_count > 100: - print_log('-------Not sampling extra valid templates' - 'successfully. Stop sampling and copy the' - 'first template as extra templates-------') - extra_template_inds = [template_ind] * len( - self.max_frame_range) - - sampled_inds = [template_ind] + extra_template_inds + [search_ind] - else: - sampled_inds = [0] * ( - self.num_template_frames + self.num_search_frames) - - return sampled_inds - - def prepare_data(self, video_info, sampled_inds, with_label=False): - """Prepare sampled training data according to the sampled index. - - Args: - video_info (dict): the video information. It contains the keys: - ['bboxes','bboxes_isvalid','filename','frame_ids', - 'video_id','visible']. - sampled_inds (list[int]): the sampled frame indexes. - with_label (bool, optional): whether to recode labels in ann infos. - Only set True in classification training. Defaults to False. - - Returns: - List[dict]: contains the information of sampled data. 
- """ - extra_infos = {} - for key, info in video_info.items(): - if key in [ - 'bbox_fields', 'mask_fields', 'seg_fields', 'img_prefix' - ]: - extra_infos[key] = info - - bboxes = video_info['bboxes'] - results = [] - for frame_ind in sampled_inds: - if with_label: - ann_info = dict( - bboxes=np.expand_dims(bboxes[frame_ind], axis=0), - labels=np.array([1.], dtype=np.float32)) - else: - ann_info = dict( - bboxes=np.expand_dims(bboxes[frame_ind], axis=0)) - img_info = dict( - filename=video_info['filename'][frame_ind], - frame_id=video_info['frame_ids'][frame_ind], - video_id=video_info['video_id']) - result = dict(img_info=img_info, ann_info=ann_info, **extra_infos) - results.append(result) - return results - - def prepare_cls_data(self, video_info, video_info_another, sampled_inds): - """Prepare the sampled classification training data according to the - sampled index. - - Args: - video_info (dict): the video information. It contains the keys: - ['bboxes','bboxes_isvalid','filename','frame_ids', - 'video_id','visible']. - video_info_another (dict): the another video information. It's only - used to get negative samples in classification train. It - contains the keys: ['bboxes','bboxes_isvalid','filename', - 'frame_ids','video_id','visible']. - sampled_inds (list[int]): the sampled frame indexes. - - Returns: - List[dict]: contains the information of sampled data. - """ - results = self.prepare_data( - video_info, - sampled_inds[:self.num_template_frames], - with_label=True) - - if random.random() < self.cls_pos_prob: - pos_search_samples = self.prepare_data( - video_info, sampled_inds[-self.num_search_frames:]) - for sample in pos_search_samples: - sample['ann_info']['labels'] = np.array([1], dtype=np.float32) - results.extend(pos_search_samples) - else: - if self.is_video_data: - neg_search_ind = self.random_sample_inds( - video_info_another['bboxes_isvalid'], num_samples=1) - # may not get valid negative sample in current video - if neg_search_ind[0] is None: - return None - neg_search_samples = self.prepare_data(video_info_another, - neg_search_ind) - else: - neg_search_samples = self.prepare_data(video_info_another, [0]) - - for sample in neg_search_samples: - sample['ann_info']['labels'] = np.array([0], dtype=np.float32) - results.extend(neg_search_samples) - return results - - def __call__(self, pair_video_infos): - """ - Args: - pair_video_infos (list[dict]): contains two video infos. Each video - info contains the keys: ['bboxes','bboxes_isvalid','filename', - 'frame_ids','video_id','visible']. - - Returns: - List[dict]: contains the information of sampled data. - """ - video_info, video_info_another = pair_video_infos - self.is_video_data = len(video_info['frame_ids']) > 1 and len( - video_info_another['frame_ids']) > 1 - enough_visible_frames = sum(video_info['visible']) > 2 * ( - self.num_search_frames + self.num_template_frames) and len( - video_info['visible']) >= self.min_num_frames - enough_visible_frames = enough_visible_frames or not \ - self.is_video_data - - if not enough_visible_frames: - return None - - sampled_inds = np.array(self.sampling_trident(video_info['visible'])) - # the sizes of some bboxes may be zero, because extral templates may - # get invalid bboxes. 
- if not video_info['bboxes_isvalid'][sampled_inds].all(): - return None - - if not self.train_cls_head: - results = self.prepare_data(video_info, sampled_inds) - else: - results = self.prepare_cls_data(video_info, video_info_another, - sampled_inds) - - return results - - -@PIPELINES.register_module() -class PairSampling(object): - """Pair-style sampling. It's used in `SiameseRPN++ - - `_. - - Args: - frame_range (List(int) | int): the sampling range of search - frames in the same video for template frame. Defaults to 5. - pos_prob (float, optional): the probility of sampling positive - sample pairs. Defaults to 0.8. - filter_template_img (bool, optional): if False, the template image will - be in the sampling search candidates, otherwise, it is exclude. - Defaults to False. - """ - - def __init__(self, frame_range=5, pos_prob=0.8, filter_template_img=False): - assert pos_prob >= 0.0 and pos_prob <= 1.0 - if isinstance(frame_range, int): - assert frame_range >= 0, 'frame_range can not be a negative value.' - frame_range = [-frame_range, frame_range] - elif isinstance(frame_range, list): - assert len(frame_range) == 2, 'The length must be 2.' - assert frame_range[0] <= 0 and frame_range[1] >= 0 - for i in frame_range: - assert isinstance(i, int), 'Each element must be int.' - else: - raise TypeError('The type of frame_range must be int or list.') - self.frame_range = frame_range - self.pos_prob = pos_prob - self.filter_template_img = filter_template_img - - def prepare_data(self, video_info, sampled_inds, is_positive_pairs=False): - """Prepare sampled training data according to the sampled index. - - Args: - video_info (dict): the video information. It contains the keys: - ['bboxes','bboxes_isvalid','filename','frame_ids', - 'video_id','visible']. - sampled_inds (list[int]): the sampled frame indexes. - is_positive_pairs (bool, optional): whether it's the positive - pairs. Defaults to False. - - Returns: - List[dict]: contains the information of sampled data. - """ - extra_infos = {} - for key, info in video_info.items(): - if key in [ - 'bbox_fields', 'mask_fields', 'seg_fields', 'img_prefix' - ]: - extra_infos[key] = info - - bboxes = video_info['bboxes'] - results = [] - for frame_ind in sampled_inds: - ann_info = dict(bboxes=np.expand_dims(bboxes[frame_ind], axis=0)) - img_info = dict( - filename=video_info['filename'][frame_ind], - frame_id=video_info['frame_ids'][frame_ind], - video_id=video_info['video_id']) - result = dict( - img_info=img_info, - ann_info=ann_info, - is_positive_pairs=is_positive_pairs, - **extra_infos) - results.append(result) - return results - - def __call__(self, pair_video_infos): - """ - Args: - pair_video_infos (list[dict]): contains two video infos. Each video - info contains the keys: ['bboxes','bboxes_isvalid','filename', - 'frame_ids','video_id','visible']. - - Returns: - List[dict]: contains the information of sampled data. 
- """ - video_info, video_info_another = pair_video_infos - if len(video_info['frame_ids']) > 1 and len( - video_info_another['frame_ids']) > 1: - template_frame_ind = np.random.choice(len(video_info['frame_ids'])) - if self.pos_prob > np.random.random(): - left_ind = max(template_frame_ind + self.frame_range[0], 0) - right_ind = min(template_frame_ind + self.frame_range[1], - len(video_info['frame_ids'])) - if self.filter_template_img: - ref_frames_inds = list( - range(left_ind, template_frame_ind)) + list( - range(template_frame_ind + 1, right_ind)) - else: - ref_frames_inds = list(range(left_ind, right_ind)) - search_frame_ind = np.random.choice(ref_frames_inds) - results = self.prepare_data( - video_info, [template_frame_ind, search_frame_ind], - is_positive_pairs=True) - else: - search_frame_ind = np.random.choice( - len(video_info_another['frame_ids'])) - results = self.prepare_data( - video_info, [template_frame_ind], is_positive_pairs=False) - results.extend( - self.prepare_data( - video_info_another, [search_frame_ind], - is_positive_pairs=False)) - else: - if self.pos_prob > np.random.random(): - results = self.prepare_data( - video_info, [0, 0], is_positive_pairs=True) - else: - results = self.prepare_data( - video_info, [0], is_positive_pairs=False) - results.extend( - self.prepare_data( - video_info_another, [0], is_positive_pairs=False)) - return results - - -@PIPELINES.register_module() -class MatchInstances(object): - """Matching objects on a pair of images. - - Args: - skip_nomatch (bool, optional): Whether skip the pair of image - during training when there are no matched objects. Default - to True. - """ - - def __init__(self, skip_nomatch=True): - self.skip_nomatch = skip_nomatch - - def _match_gts(self, instance_ids, ref_instance_ids): - """Matching objects according to ground truth `instance_ids`. - - Args: - instance_ids (ndarray): of shape (N1, ). - ref_instance_ids (ndarray): of shape (N2, ). - - Returns: - tuple: Matching results which contain the indices of the - matched target. - """ - ins_ids = list(instance_ids) - ref_ins_ids = list(ref_instance_ids) - match_indices = np.array([ - ref_ins_ids.index(i) if (i in ref_ins_ids and i > 0) else -1 - for i in ins_ids - ]) - ref_match_indices = np.array([ - ins_ids.index(i) if (i in ins_ids and i > 0) else -1 - for i in ref_ins_ids - ]) - return match_indices, ref_match_indices - - def __call__(self, results): - if len(results) != 2: - raise NotImplementedError('Only support match 2 images now.') - - match_indices, ref_match_indices = self._match_gts( - results[0]['gt_instance_ids'], results[1]['gt_instance_ids']) - nomatch = (match_indices == -1).all() - if self.skip_nomatch and nomatch: - return None - else: - results[0]['gt_match_indices'] = match_indices.copy() - results[1]['gt_match_indices'] = ref_match_indices.copy() - - return results diff --git a/mmtrack/datasets/pipelines/transforms.py b/mmtrack/datasets/pipelines/transforms.py deleted file mode 100644 index ac49e4f21..000000000 --- a/mmtrack/datasets/pipelines/transforms.py +++ /dev/null @@ -1,1095 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import cv2 -import mmcv -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize - -from mmtrack.core import crop_image - - -@PIPELINES.register_module() -class SeqCropLikeSiamFC(object): - """Crop images as SiamFC did. 
- - The way of cropping an image is proposed in - "Fully-Convolutional Siamese Networks for Object Tracking." - `SiamFC `_. - - Args: - context_amount (float): The context amount around a bounding box. - Defaults to 0.5. - exemplar_size (int): Exemplar size. Defaults to 127. - crop_size (int): Crop size. Defaults to 511. - """ - - def __init__(self, context_amount=0.5, exemplar_size=127, crop_size=511): - self.context_amount = context_amount - self.exemplar_size = exemplar_size - self.crop_size = crop_size - - def crop_like_SiamFC(self, - image, - bbox, - context_amount=0.5, - exemplar_size=127, - crop_size=511): - """Crop an image as SiamFC did. - - Args: - image (ndarray): of shape (H, W, 3). - bbox (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - context_amount (float): The context amount around a bounding box. - Defaults to 0.5. - exemplar_size (int): Exemplar size. Defaults to 127. - crop_size (int): Crop size. Defaults to 511. - - Returns: - ndarray: The cropped image of shape (crop_size, crop_size, 3). - """ - padding = np.mean(image, axis=(0, 1)).tolist() - - bbox = np.array([ - 0.5 * (bbox[2] + bbox[0]), 0.5 * (bbox[3] + bbox[1]), - bbox[2] - bbox[0], bbox[3] - bbox[1] - ]) - z_width = bbox[2] + context_amount * (bbox[2] + bbox[3]) - z_height = bbox[3] + context_amount * (bbox[2] + bbox[3]) - z_size = np.sqrt(z_width * z_height) - - z_scale = exemplar_size / z_size - d_search = (crop_size - exemplar_size) / 2. - pad = d_search / z_scale - x_size = z_size + 2 * pad - x_bbox = np.array([ - bbox[0] - 0.5 * x_size, bbox[1] - 0.5 * x_size, - bbox[0] + 0.5 * x_size, bbox[1] + 0.5 * x_size - ]) - - x_crop_img = crop_image(image, x_bbox, crop_size, padding) - return x_crop_img - - def generate_box(self, image, gt_bbox, context_amount, exemplar_size): - """Generate box based on cropped image. - - Args: - image (ndarray): The cropped image of shape - (self.crop_size, self.crop_size, 3). - gt_bbox (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - context_amount (float): The context amount around a bounding box. - exemplar_size (int): Exemplar size. Defaults to 127. - - Returns: - ndarray: Generated box of shape (4, ) in [x1, y1, x2, y2] format. - """ - img_h, img_w = image.shape[:2] - w, h = gt_bbox[2] - gt_bbox[0], gt_bbox[3] - gt_bbox[1] - - z_width = w + context_amount * (w + h) - z_height = h + context_amount * (w + h) - z_scale = np.sqrt(z_width * z_height) - z_scale_factor = exemplar_size / z_scale - w = w * z_scale_factor - h = h * z_scale_factor - cx, cy = img_w // 2, img_h // 2 - bbox = np.array( - [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], - dtype=np.float32) - - return bbox - - def __call__(self, results): - """Call function. - - For each dict in results, crop image like SiamFC did. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains cropped image and - corresponding ground truth box. 
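# Editor's note: numeric sketch of the SiamFC crop geometry used by crop_like_SiamFC above: the
# exemplar region grows the bbox by a context margin, and the search crop adds enough extra
# context so that, once the exemplar is scaled to `exemplar_size`, the full crop maps to
# `crop_size`. Values below are toy inputs.
import numpy as np

def siamfc_crop_sizes(bbox_xyxy, context_amount=0.5, exemplar_size=127, crop_size=511):
    x1, y1, x2, y2 = bbox_xyxy
    w, h = x2 - x1, y2 - y1
    z_w = w + context_amount * (w + h)
    z_h = h + context_amount * (w + h)
    z_size = np.sqrt(z_w * z_h)                  # side of the exemplar region
    z_scale = exemplar_size / z_size
    pad = (crop_size - exemplar_size) / 2. / z_scale
    x_size = z_size + 2 * pad                    # side of the search crop region
    return z_size, x_size

print(siamfc_crop_sizes([100., 100., 180., 160.]))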
- """ - outs = [] - for _results in results: - image = _results['img'] - gt_bbox = _results['gt_bboxes'][0] - - crop_img = self.crop_like_SiamFC(image, gt_bbox, - self.context_amount, - self.exemplar_size, - self.crop_size) - generated_bbox = self.generate_box(crop_img, gt_bbox, - self.context_amount, - self.exemplar_size) - generated_bbox = generated_bbox[None] - - _results['img'] = crop_img - if 'img_shape' in _results: - _results['img_shape'] = crop_img.shape - _results['gt_bboxes'] = generated_bbox - - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqCropLikeStark(object): - """Crop images as Stark did. - - The way of cropping an image is proposed in - "Learning Spatio-Temporal Transformer for Visual Tracking." - `Stark `_. - - Args: - crop_size_factor (list[int | float]): contains the ratio of crop size - to bbox size. - output_size (list[int | float]): contains the size of resized image - (always square). - """ - - def __init__(self, crop_size_factor, output_size): - self.crop_size_factor = crop_size_factor - self.output_size = output_size - - def crop_like_stark(self, img, bbox, crop_size_factor, output_size): - """Crop an image as Stark did. - - Args: - image (ndarray): of shape (H, W, 3). - bbox (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - crop_size_factor (float): the ratio of crop size to bbox size - output_size (int): the size of resized image (always square). - - Returns: - img_crop_padded (ndarray): the cropped image of shape - (crop_size, crop_size, 3). - resize_factor (float): the ratio of original image scale to cropped - image scale. - pdding_mask (ndarray): the padding mask caused by cropping. - """ - x1, y1, x2, y2 = np.split(bbox, 4, axis=-1) - bbox_w, bbox_h = x2 - x1, y2 - y1 - cx, cy = x1 + bbox_w / 2., y1 + bbox_h / 2. - - img_h, img_w, _ = img.shape - # 1. Crop image - # 1.1 calculate crop size and pad size - crop_size = math.ceil(math.sqrt(bbox_w * bbox_h) * crop_size_factor) - crop_size = max(crop_size, 1) - - x1 = int(np.round(cx - crop_size * 0.5)) - x2 = x1 + crop_size - y1 = int(np.round(cy - crop_size * 0.5)) - y2 = y1 + crop_size - - x1_pad = max(0, -x1) - x2_pad = max(x2 - img_w + 1, 0) - y1_pad = max(0, -y1) - y2_pad = max(y2 - img_h + 1, 0) - - # 1.2 crop image - img_crop = img[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] - - # 1.3 pad image - img_crop_padded = cv2.copyMakeBorder(img_crop, y1_pad, y2_pad, x1_pad, - x2_pad, cv2.BORDER_CONSTANT) - # 1.4 generate padding mask - img_h, img_w, _ = img_crop_padded.shape - pdding_mask = np.ones((img_h, img_w)) - end_x, end_y = -x2_pad, -y2_pad - if y2_pad == 0: - end_y = None - if x2_pad == 0: - end_x = None - pdding_mask[y1_pad:end_y, x1_pad:end_x] = 0 - - # 2. Resize image and padding mask - resize_factor = output_size / crop_size - img_crop_padded = cv2.resize(img_crop_padded, - (output_size, output_size)) - pdding_mask = cv2.resize(pdding_mask, - (output_size, output_size)).astype(np.bool_) - - return img_crop_padded, resize_factor, pdding_mask - - def generate_box(self, - bbox_gt, - bbox_cropped, - resize_factor, - output_size, - normalize=False): - """Transform the box coordinates from the original image coordinates to - the coordinates of the cropped image. - - Args: - bbox_gt (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - bbox_cropped (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - resize_factor (float): the ratio of original image scale to cropped - image scale. - output_size (float): the size of output image. 
- normalize (bool): whether to normalize the output box. - Default to True. - - Returns: - ndarray: generated box of shape (4, ) in [x1, y1, x2, y2] format. - """ - assert output_size > 0 - bbox_gt_center = (bbox_gt[0:2] + bbox_gt[2:4]) * 0.5 - bbox_cropped_center = (bbox_cropped[0:2] + bbox_cropped[2:4]) * 0.5 - - bbox_out_center = (output_size - 1) / 2. + ( - bbox_gt_center - bbox_cropped_center) * resize_factor - bbox_out_wh = (bbox_gt[2:4] - bbox_gt[0:2]) * resize_factor - bbox_out = np.concatenate((bbox_out_center - 0.5 * bbox_out_wh, - bbox_out_center + 0.5 * bbox_out_wh), - axis=-1) - - return bbox_out / output_size if normalize else bbox_out - - def __call__(self, results): - """Call function. For each dict in results, crop image like Stark did. - - Args: - results (list[dict]): list of dict from - :obj:`mmtrack.base_sot_dataset`. - - Returns: - List[dict]: list of dict that contains cropped image and - the corresponding groundtruth bbox. - """ - outs = [] - for i, _results in enumerate(results): - image = _results['img'] - gt_bbox = _results['gt_bboxes'][0] - jittered_bboxes = _results['jittered_bboxes'][0] - crop_img, resize_factor, padding_mask = self.crop_like_stark( - image, jittered_bboxes, self.crop_size_factor[i], - self.output_size[i]) - - generated_bbox = self.generate_box( - gt_bbox, - jittered_bboxes, - resize_factor, - self.output_size[i], - normalize=False) - - generated_bbox = generated_bbox[None] - - _results['img'] = crop_img - if 'img_shape' in _results: - _results['img_shape'] = crop_img.shape - _results['gt_bboxes'] = generated_bbox - _results['seg_fields'] = ['padding_mask'] - _results['padding_mask'] = padding_mask - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqBboxJitter(object): - """Bounding box jitter augmentation. The jittered bboxes are used for - subsequent image cropping, like `SeqCropLikeStark`. - - Args: - scale_jitter_factor (list[int | float]): contains the factor of scale - jitter. - center_jitter_factor (list[int | float]): contains the factor of center - jitter. - crop_size_factor (list[int | float]): contains the ratio of crop size - to bbox size. - """ - - def __init__(self, scale_jitter_factor, center_jitter_factor, - crop_size_factor): - self.scale_jitter_factor = scale_jitter_factor - self.center_jitter_factor = center_jitter_factor - self.crop_size_factor = crop_size_factor - - def __call__(self, results): - """Call function. - - Args: - results (list[dict]): list of dict from - :obj:`mmtrack.base_sot_dataset`. - - Returns: - list[dict]: list of dict that contains augmented images. - """ - outs = [] - for i, _results in enumerate(results): - gt_bbox = _results['gt_bboxes'][0] - x1, y1, x2, y2 = np.split(gt_bbox, 4, axis=-1) - bbox_w, bbox_h = x2 - x1, y2 - y1 - gt_bbox_cxcywh = np.concatenate( - [x1 + bbox_w / 2., y1 + bbox_h / 2., bbox_w, bbox_h], axis=-1) - - crop_img_size = -1 - # avoid croped image size too small. 
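# Editor's note: sketch of the coordinate transform in generate_box above: a ground-truth box
# given in original-image coordinates is re-expressed relative to the centre of the cropped,
# resized patch. The resize_factor and crop box below are toy values.
import numpy as np

def gt_box_in_crop(bbox_gt, bbox_crop, resize_factor, output_size):
    gt_c = (bbox_gt[0:2] + bbox_gt[2:4]) * 0.5
    crop_c = (bbox_crop[0:2] + bbox_crop[2:4]) * 0.5
    out_c = (output_size - 1) / 2. + (gt_c - crop_c) * resize_factor
    out_wh = (bbox_gt[2:4] - bbox_gt[0:2]) * resize_factor
    return np.concatenate((out_c - 0.5 * out_wh, out_c + 0.5 * out_wh), axis=-1)

gt = np.array([100., 100., 180., 160.])
crop = np.array([90., 95., 190., 165.])      # jittered crop reference box
print(gt_box_in_crop(gt, crop, resize_factor=2.0, output_size=320))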
- count = 0 - while crop_img_size < 1: - count += 1 - if count > 100: - print_log( - f'-------- bbox {gt_bbox_cxcywh} is invalid -------') - return None - jittered_wh = gt_bbox_cxcywh[2:4] * np.exp( - np.random.randn(2) * self.scale_jitter_factor[i]) - crop_img_size = np.ceil( - np.sqrt(jittered_wh.prod()) * self.crop_size_factor[i]) - - max_offset = np.sqrt( - jittered_wh.prod()) * self.center_jitter_factor[i] - jittered_center = gt_bbox_cxcywh[0:2] + max_offset * ( - np.random.rand(2) - 0.5) - - jittered_bboxes = np.concatenate( - (jittered_center - 0.5 * jittered_wh, - jittered_center + 0.5 * jittered_wh), - axis=-1) - - _results['jittered_bboxes'] = jittered_bboxes[None] - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqBrightnessAug(object): - """Brightness augmention for images. - - Args: - jitter_range (float): The range of brightness jitter. - Defaults to 0.. - """ - - def __init__(self, jitter_range=0): - self.jitter_range = jitter_range - - def __call__(self, results): - """Call function. - - For each dict in results, perform brightness augmention for image in - the dict. - - Args: - results (list[dict]): list of dict that from - :obj:`mmtrack.base_sot_dataset`. - Returns: - list[dict]: list of dict that contains augmented image. - """ - brightness_factor = np.random.uniform( - max(0, 1 - self.jitter_range), 1 + self.jitter_range) - outs = [] - for _results in results: - image = _results['img'] - image = np.dot(image, brightness_factor).clip(0, 255.0) - _results['img'] = image - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqGrayAug(object): - """Gray augmention for images. - - Args: - prob (float): The probability to perform gray augmention. - Defaults to 0.. - """ - - def __init__(self, prob=0.): - self.prob = prob - - def __call__(self, results): - """Call function. - - For each dict in results, perform gray augmention for image in the - dict. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains augmented gray image. - """ - outs = [] - gray_prob = np.random.random() - for _results in results: - if self.prob > gray_prob: - grayed = cv2.cvtColor(_results['img'], cv2.COLOR_BGR2GRAY) - image = cv2.cvtColor(grayed, cv2.COLOR_GRAY2BGR) - _results['img'] = image - - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqShiftScaleAug(object): - """Shift and rescale images and bounding boxes. - - Args: - target_size (list[int]): list of int denoting exemplar size and search - size, respectively. Defaults to [127, 255]. - shift (list[int]): list of int denoting the max shift offset. Defaults - to [4, 64]. - scale (list[float]): list of float denoting the max rescale factor. - Defaults to [0.05, 0.18]. - """ - - def __init__(self, - target_size=[127, 255], - shift=[4, 64], - scale=[0.05, 0.18]): - self.target_size = target_size - self.shift = shift - self.scale = scale - - def _shift_scale_aug(self, image, bbox, target_size, shift, scale): - """Shift and rescale an image and corresponding bounding box. - - Args: - image (ndarray): of shape (H, W, 3). Typically H and W equal to - 511. - bbox (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. - target_size (int): Exemplar size or search size. - shift (int): The max shift offset. - scale (float): The max rescale factor. 
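# Editor's note: toy sketch of the jitter scheme described for SeqBboxJitter above:
# width/height are scaled by a log-normal factor and the centre is shifted by an offset
# proportional to the (jittered) box size. The retry loop of the real class is omitted.
import numpy as np

def jitter_bbox_cxcywh(cx, cy, w, h, scale_jitter=0.25, center_jitter=3.0):
    jittered_wh = np.array([w, h]) * np.exp(np.random.randn(2) * scale_jitter)
    max_offset = np.sqrt(jittered_wh.prod()) * center_jitter
    jittered_center = np.array([cx, cy]) + max_offset * (np.random.rand(2) - 0.5)
    return np.concatenate((jittered_center - 0.5 * jittered_wh,
                           jittered_center + 0.5 * jittered_wh))

np.random.seed(0)
print(jitter_bbox_cxcywh(140., 130., 80., 60.))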
- - Returns: - tuple(crop_img, bbox): crop_img is a ndarray of shape - (target_size, target_size, 3), bbox is the corresponding ground - truth box in [x1, y1, x2, y2] format. - """ - img_h, img_w = image.shape[:2] - - scale_x = (2 * np.random.random() - 1) * scale + 1 - scale_y = (2 * np.random.random() - 1) * scale + 1 - scale_x = min(scale_x, float(img_w) / target_size) - scale_y = min(scale_y, float(img_h) / target_size) - crop_region = np.array([ - img_w // 2 - 0.5 * scale_x * target_size, - img_h // 2 - 0.5 * scale_y * target_size, - img_w // 2 + 0.5 * scale_x * target_size, - img_h // 2 + 0.5 * scale_y * target_size - ]) - - shift_x = (2 * np.random.random() - 1) * shift - shift_y = (2 * np.random.random() - 1) * shift - shift_x = max(-crop_region[0], min(img_w - crop_region[2], shift_x)) - shift_y = max(-crop_region[1], min(img_h - crop_region[3], shift_y)) - shift = np.array([shift_x, shift_y, shift_x, shift_y]) - crop_region += shift - - crop_img = crop_image(image, crop_region, target_size) - bbox -= np.array( - [crop_region[0], crop_region[1], crop_region[0], crop_region[1]]) - bbox /= np.array([scale_x, scale_y, scale_x, scale_y], - dtype=np.float32) - return crop_img, bbox - - def __call__(self, results): - """Call function. - - For each dict in results, shift and rescale the image and the bounding - box in the dict. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains cropped image and - corresponding ground truth box. - """ - outs = [] - for i, _results in enumerate(results): - image = _results['img'] - gt_bbox = _results['gt_bboxes'][0] - - crop_img, crop_bbox = self._shift_scale_aug( - image, gt_bbox, self.target_size[i], self.shift[i], - self.scale[i]) - crop_bbox = crop_bbox[None] - - _results['img'] = crop_img - if 'img_shape' in _results: - _results['img_shape'] = crop_img.shape - _results['gt_bboxes'] = crop_bbox - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqColorAug(object): - """Color augmention for images. - - Args: - prob (list[float]): The probability to perform color augmention for - each image. Defaults to [1.0, 1.0]. - rgb_var (list[list]]): The values of color augmentaion. Defaults to - [[-0.55919361, 0.98062831, -0.41940627], - [1.72091413, 0.19879334, -1.82968581], - [4.64467907, 4.73710203, 4.88324118]]. - """ - - def __init__(self, - prob=[1.0, 1.0], - rgb_var=[[-0.55919361, 0.98062831, -0.41940627], - [1.72091413, 0.19879334, -1.82968581], - [4.64467907, 4.73710203, 4.88324118]]): - self.prob = prob - self.rgb_var = np.array(rgb_var, dtype=np.float32) - - def __call__(self, results): - """Call function. - - For each dict in results, perform color augmention for image in the - dict. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains augmented color image. - """ - outs = [] - for i, _results in enumerate(results): - image = _results['img'] - - if self.prob[i] > np.random.random(): - offset = np.dot(self.rgb_var, np.random.randn(3, 1)) - # bgr to rgb - offset = offset[::-1] - offset = offset.reshape(3) - image = (image - offset).astype(np.float32) - - _results['img'] = image - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqBlurAug(object): - """Blur augmention for images. - - Args: - prob (list[float]): The probability to perform blur augmention for - each image. Defaults to [0.0, 0.2]. 
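# Editor's note: minimal sketch of the colour augmentation used by SeqColorAug above: a random
# offset is drawn through the `rgb_var` matrix and subtracted from the image (the deleted code
# also reverses the offset ordering, per its "bgr to rgb" comment).
import numpy as np

rgb_var = np.array([[-0.55919361, 0.98062831, -0.41940627],
                    [1.72091413, 0.19879334, -1.82968581],
                    [4.64467907, 4.73710203, 4.88324118]], dtype=np.float32)

def color_aug(image):
    offset = np.dot(rgb_var, np.random.randn(3, 1))[::-1].reshape(3)
    return (image - offset).astype(np.float32)

np.random.seed(0)
print(color_aug(np.full((2, 2, 3), 128.0, dtype=np.float32))[0, 0])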
- """ - - def __init__(self, prob=[0.0, 0.2]): - self.prob = prob - - def __call__(self, results): - """Call function. - - For each dict in results, perform blur augmention for image in the - dict. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains augmented blur image. - """ - outs = [] - for i, _results in enumerate(results): - image = _results['img'] - - if self.prob[i] > np.random.random(): - sizes = np.arange(5, 46, 2) - size = np.random.choice(sizes) - kernel = np.zeros((size, size)) - c = int(size / 2) - wx = np.random.random() - kernel[:, c] += 1. / size * wx - kernel[c, :] += 1. / size * (1 - wx) - image = cv2.filter2D(image, -1, kernel) - - _results['img'] = image - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqResize(Resize): - """Resize images. - - Please refer to `mmdet.datasets.pipelines.transforms.py:Resize` for - detailed docstring. - - Args: - share_params (bool): If True, share the resize parameters for all - images. Defaults to True. - """ - - def __init__(self, share_params=True, *args, **kwargs): - super().__init__(*args, **kwargs) - self.share_params = share_params - - def __call__(self, results): - """Call function. - - For each dict in results, call the call function of `Resize` to resize - image and corresponding annotations. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains resized results, - 'img_shape', 'pad_shape', 'scale_factor', 'keep_ratio' keys - are added into result dict. - """ - outs, scale = [], None - for i, _results in enumerate(results): - if self.share_params and i > 0: - _results['scale'] = scale - _results = super().__call__(_results) - if self.share_params and i == 0: - scale = _results['scale'] - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqNormalize(Normalize): - """Normalize images. - - Please refer to `mmdet.datasets.pipelines.transforms.py:Normalize` for - detailed docstring. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __call__(self, results): - """Call function. - - For each dict in results, call the call function of `Normalize` to - normalize image. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains normalized results, - 'img_norm_cfg' key is added into result dict. - """ - outs = [] - for _results in results: - _results = super().__call__(_results) - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqRandomFlip(RandomFlip): - """Randomly flip for images. - - Please refer to `mmdet.datasets.pipelines.transforms.py:RandomFlip` for - detailed docstring. - - Args: - share_params (bool): If True, share the flip parameters for all images. - Defaults to True. - """ - - def __init__(self, share_params, *args, **kwargs): - super().__init__(*args, **kwargs) - self.share_params = share_params - - def __call__(self, results): - """Call function. - - For each dict in results, call `RandomFlip` to randomly flip image. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains flipped results, 'flip', - 'flip_direction' keys are added into the dict. 
- """ - if self.share_params: - if isinstance(self.direction, list): - # None means non-flip - direction_list = self.direction + [None] - else: - # None means non-flip - direction_list = [self.direction, None] - - if isinstance(self.flip_ratio, list): - non_flip_ratio = 1 - sum(self.flip_ratio) - flip_ratio_list = self.flip_ratio + [non_flip_ratio] - else: - non_flip_ratio = 1 - self.flip_ratio - # exclude non-flip - single_ratio = self.flip_ratio / (len(direction_list) - 1) - flip_ratio_list = [single_ratio] * (len(direction_list) - - 1) + [non_flip_ratio] - - cur_dir = np.random.choice(direction_list, p=flip_ratio_list) - flip = cur_dir is not None - flip_direction = cur_dir - - for _results in results: - _results['flip'] = flip - _results['flip_direction'] = flip_direction - - outs = [] - for _results in results: - _results = super().__call__(_results) - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqPad(Pad): - """Pad images. - - Please refer to `mmdet.datasets.pipelines.transforms.py:Pad` for detailed - docstring. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __call__(self, results): - """Call function. - - For each dict in results, call the call function of `Pad` to pad image. - - Args: - results (list[dict]): List of dict that from - :obj:`mmtrack.CocoVideoDataset`. - - Returns: - list[dict]: List of dict that contains padding results, - 'pad_shape', 'pad_fixed_size' and 'pad_size_divisor' keys are - added into the dict. - """ - outs = [] - for _results in results: - _results = super().__call__(_results) - outs.append(_results) - return outs - - -@PIPELINES.register_module() -class SeqRandomCrop(object): - """Sequentially random crop the images & bboxes & masks. - - The absolute `crop_size` is sampled based on `crop_type` and `image_size`, - then the cropped results are generated. - - Args: - crop_size (tuple): The relative ratio or absolute pixels of - height and width. - allow_negative_crop (bool, optional): Whether to allow a crop that does - not contain any bbox area. Default False. - share_params (bool, optional): Whether share the cropping parameters - for the images. - bbox_clip_border (bool, optional): Whether clip the objects outside - the border of the image. Defaults to True. - - Note: - - If the image is smaller than the absolute crop size, return the - original image. - - The keys for bboxes, labels and masks must be aligned. That is, - `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and - `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and - `gt_masks_ignore`. - - If the crop does not contain any gt-bbox region and - `allow_negative_crop` is set to False, skip this image. - """ - - def __init__(self, - crop_size, - allow_negative_crop=False, - share_params=False, - bbox_clip_border=False): - assert crop_size[0] > 0 and crop_size[1] > 0 - self.crop_size = crop_size - self.allow_negative_crop = allow_negative_crop - self.share_params = share_params - self.bbox_clip_border = bbox_clip_border - # The key correspondence from bboxes to labels and masks. 
- self.bbox2label = { - 'gt_bboxes': ['gt_labels', 'gt_instance_ids'], - 'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore'] - } - self.bbox2mask = { - 'gt_bboxes': 'gt_masks', - 'gt_bboxes_ignore': 'gt_masks_ignore' - } - - def get_offsets(self, img): - """Random generate the offsets for cropping.""" - margin_h = max(img.shape[0] - self.crop_size[0], 0) - margin_w = max(img.shape[1] - self.crop_size[1], 0) - offset_h = np.random.randint(0, margin_h + 1) - offset_w = np.random.randint(0, margin_w + 1) - return offset_h, offset_w - - def random_crop(self, results, offsets=None): - """Call function to randomly crop images, bounding boxes, masks, - semantic segmentation maps. - - Args: - results (dict): Result dict from loading pipeline. - offsets (tuple, optional): Pre-defined offsets for cropping. - Default to None. - - Returns: - dict: Randomly cropped results, 'img_shape' key in result dict is - updated according to crop size. - """ - - for key in results.get('img_fields', ['img']): - img = results[key] - if offsets is not None: - offset_h, offset_w = offsets - else: - offset_h, offset_w = self.get_offsets(img) - results['img_info']['crop_offsets'] = (offset_h, offset_w) - crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] - crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] - - # crop the image - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - img_shape = img.shape - results[key] = img - results['img_shape'] = img_shape - - # crop bboxes accordingly and clip to the image boundary - for key in results.get('bbox_fields', []): - # e.g. gt_bboxes and gt_bboxes_ignore - bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], - dtype=np.float32) - bboxes = results[key] - bbox_offset - if self.bbox_clip_border: - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( - bboxes[:, 3] > bboxes[:, 1]) - # If the crop does not contain any gt-bbox area and - # self.allow_negative_crop is False, skip this image. - if (key == 'gt_bboxes' and not valid_inds.any() - and not self.allow_negative_crop): - return None - results[key] = bboxes[valid_inds, :] - # label fields. e.g. gt_labels and gt_labels_ignore - label_keys = self.bbox2label.get(key) - for label_key in label_keys: - if label_key in results: - results[label_key] = results[label_key][valid_inds] - - # mask fields, e.g. gt_masks and gt_masks_ignore - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][ - valid_inds.nonzero()[0]].crop( - np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) - - # crop semantic seg - for key in results.get('seg_fields', []): - results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] - return results - - def __call__(self, results): - """Call function to sequentially randomly crop images, bounding boxes, - masks, semantic segmentation maps. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Randomly cropped results, 'img_shape' key in result dict is - updated according to crop size. 
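# Editor's note: compact sketch of the crop-offset and bbox-shift logic in SeqRandomCrop above:
# offsets are drawn from the spare margin, the image is sliced, and boxes are translated, with
# empty boxes filtered out (bbox clipping and the label/mask bookkeeping of the real class are
# omitted here).
import numpy as np

def random_crop(img, bboxes, crop_size):
    margin_h = max(img.shape[0] - crop_size[0], 0)
    margin_w = max(img.shape[1] - crop_size[1], 0)
    off_h = np.random.randint(0, margin_h + 1)
    off_w = np.random.randint(0, margin_w + 1)
    img = img[off_h:off_h + crop_size[0], off_w:off_w + crop_size[1]]
    bboxes = bboxes - np.array([off_w, off_h, off_w, off_h], dtype=np.float32)
    valid = (bboxes[:, 2] > bboxes[:, 0]) & (bboxes[:, 3] > bboxes[:, 1])
    return img, bboxes[valid]

np.random.seed(0)
img = np.zeros((100, 100, 3), dtype=np.uint8)
boxes = np.array([[10., 10., 40., 40.]], dtype=np.float32)
print(random_crop(img, boxes, (64, 64))[1])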
- """ - if self.share_params: - offsets = self.get_offsets(results[0]['img']) - else: - offsets = None - - outs = [] - for _results in results: - _results = self.random_crop(_results, offsets) - if _results is None: - return None - outs.append(_results) - - return outs - - -@PIPELINES.register_module() -class SeqPhotoMetricDistortion(object): - """Apply photometric distortion to image sequentially, every transformation - is applied with a probability of 0.5. The position of random contrast is in - second or second to last. - - 1. random brightness - 2. random contrast (mode 0) - 3. convert color from BGR to HSV - 4. random saturation - 5. random hue - 6. convert color from HSV to BGR - 7. random contrast (mode 1) - 8. randomly swap channels - - Args: - brightness_delta (int): delta of brightness. - contrast_range (tuple): range of contrast. - saturation_range (tuple): range of saturation. - hue_delta (int): delta of hue. - """ - - def __init__(self, - share_params=True, - brightness_delta=32, - contrast_range=(0.5, 1.5), - saturation_range=(0.5, 1.5), - hue_delta=18): - self.share_params = share_params - self.brightness_delta = brightness_delta - self.contrast_lower, self.contrast_upper = contrast_range - self.saturation_lower, self.saturation_upper = saturation_range - self.hue_delta = hue_delta - - def get_params(self): - """Generate parameters.""" - params = dict() - # delta - if np.random.randint(2): - params['delta'] = np.random.uniform(-self.brightness_delta, - self.brightness_delta) - else: - params['delta'] = None - # mode - mode = np.random.randint(2) - params['contrast_first'] = True if mode == 1 else 0 - # alpha - if np.random.randint(2): - params['alpha'] = np.random.uniform(self.contrast_lower, - self.contrast_upper) - else: - params['alpha'] = None - # saturation - if np.random.randint(2): - params['saturation'] = np.random.uniform(self.saturation_lower, - self.saturation_upper) - else: - params['saturation'] = None - # hue - if np.random.randint(2): - params['hue'] = np.random.uniform(-self.hue_delta, self.hue_delta) - else: - params['hue'] = None - # swap - if np.random.randint(2): - params['permutation'] = np.random.permutation(3) - else: - params['permutation'] = None - return params - - def photo_metric_distortion(self, results, params=None): - """Call function to perform photometric distortion on images. - - Args: - results (dict): Result dict from loading pipeline. - params (dict, optional): Pre-defined parameters. Default to None. - - Returns: - dict: Result dict with images distorted. 
- """ - if params is None: - params = self.get_params() - results['img_info']['color_jitter'] = params - - if 'img_fields' in results: - assert results['img_fields'] == ['img'], \ - 'Only single img_fields is allowed' - img = results['img'] - assert img.dtype == np.float32, \ - 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ - ' please set "to_float32=True" in "LoadImageFromFile" pipeline' - # random brightness - if params['delta'] is not None: - img += params['delta'] - - # mode == 0 --> do random contrast first - # mode == 1 --> do random contrast last - if params['contrast_first']: - if params['alpha'] is not None: - img *= params['alpha'] - - # convert color from BGR to HSV - img = mmcv.bgr2hsv(img) - - # random saturation - if params['saturation'] is not None: - img[..., 1] *= params['saturation'] - - # random hue - if params['hue'] is not None: - img[..., 0] += params['hue'] - img[..., 0][img[..., 0] > 360] -= 360 - img[..., 0][img[..., 0] < 0] += 360 - - # convert color from HSV to BGR - img = mmcv.hsv2bgr(img) - - # random contrast - if not params['contrast_first']: - if params['alpha'] is not None: - img *= params['alpha'] - - # randomly swap channels - if params['permutation'] is not None: - img = img[..., params['permutation']] - - results['img'] = img - return results - - def __call__(self, results): - """Call function to perform photometric distortion on images. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images distorted. - """ - if self.share_params: - params = self.get_params() - else: - params = None - - outs = [] - for _results in results: - _results = self.photo_metric_distortion(_results, params) - outs.append(_results) - - return outs - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' - repr_str += 'contrast_range=' - repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' - repr_str += 'saturation_range=' - repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' - repr_str += f'hue_delta={self.hue_delta})' - return repr_str diff --git a/mmtrack/datasets/reid_dataset.py b/mmtrack/datasets/reid_dataset.py index 5ab20fb73..7be9c3f68 100644 --- a/mmtrack/datasets/reid_dataset.py +++ b/mmtrack/datasets/reid_dataset.py @@ -1,204 +1,127 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import os.path as osp from collections import defaultdict +from typing import Any, Dict, List import numpy as np -import torch -from mmcls.datasets import BaseDataset -from mmdet.datasets import DATASETS -from mmdet.datasets.pipelines import Compose +from mmengine.dataset import BaseDataset +from mmengine.utils import check_file_exist + +from mmtrack.registry import DATASETS @DATASETS.register_module() class ReIDDataset(BaseDataset): - """Dataset for ReID Dataset. + """Dataset for ReID. Args: - pipeline (list): a list of dict, where each element represents - a operation defined in `mmtrack.datasets.pipelines` - triplet_sampler (dict): The sampler for hard mining triplet loss. + triplet_sampler (dict, optional): The sampler for hard mining + triplet loss. Defaults to None. + keys: num_ids (int): The number of person ids. + ins_per_id (int): The number of image for each person. 
""" - def __init__(self, pipeline, triplet_sampler=None, *args, **kwargs): - super().__init__(pipeline=[], *args, **kwargs) + def __init__(self, triplet_sampler: dict = None, *args, **kwargs): self.triplet_sampler = triplet_sampler - self.pipeline = Compose(pipeline) - # for DistributedGroupSampler and GroupSampler - self.flag = np.zeros(len(self), dtype=np.uint8) + super().__init__(*args, **kwargs) - def load_annotations(self): - """Load annotations from ImageNet style annotation file. + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ''self.ann_file''. Returns: - list[dict]: Annotation information from ReID api. + list[dict]: A list of annotation. """ assert isinstance(self.ann_file, str) - - data_infos = [] + check_file_exist(self.ann_file) + data_list = [] with open(self.ann_file) as f: samples = [x.strip().split(' ') for x in f.readlines()] for filename, gt_label in samples: info = dict(img_prefix=self.data_prefix) - info['img_info'] = dict(filename=filename) + if self.data_prefix['img_path'] is not None: + info['img_path'] = osp.join(self.data_prefix['img_path'], + filename) + else: + info['img_path'] = filename info['gt_label'] = np.array(gt_label, dtype=np.int64) - data_infos.append(info) - self._parse_ann_info(data_infos) - return data_infos + data_list.append(info) + self._parse_ann_info(data_list) + return data_list - def _parse_ann_info(self, data_infos): + def _parse_ann_info(self, data_list: List[dict]): """Parse person id annotations.""" - index_tmp_dic = defaultdict(list) - self.index_dic = dict() - for idx, info in enumerate(data_infos): + index_tmp_dic = defaultdict(list) # pid->[idx1,...,idxN] + self.index_dic = dict() # pid->array([idx1,...,idxN]) + for idx, info in enumerate(data_list): pid = info['gt_label'] index_tmp_dic[int(pid)].append(idx) for pid, idxs in index_tmp_dic.items(): self.index_dic[pid] = np.asarray(idxs, dtype=np.int64) - self.pids = np.asarray(list(self.index_dic.keys()), dtype=np.int64) - def triplet_sampling(self, pos_pid, num_ids=8, ins_per_id=4): + def prepare_data(self, idx: int) -> Any: + """Get data processed by ''self.pipeline''. + + Args: + idx (int): The index of ''data_info'' + + Returns: + Any: Depends on ''self.pipeline'' + """ + data_info = self.get_data_info(idx) + if self.triplet_sampler is not None: + img_info = self.triplet_sampling(data_info['gt_label'], + **self.triplet_sampler) + data_info = copy.deepcopy(img_info) # triplet -> list + else: + data_info = copy.deepcopy(data_info) # no triplet -> dict + return self.pipeline(data_info) + + def triplet_sampling(self, + pos_pid, + num_ids: int = 8, + ins_per_id: int = 4) -> Dict: """Triplet sampler for hard mining triplet loss. First, for one pos_pid, random sample ins_per_id images with same person id. - Then, random sample num_ids - 1 negative ids. + Then, random sample num_ids - 1 images for each negative id. Finally, random sample ins_per_id images for each negative id. Args: pos_pid (ndarray): The person id of the anchor. num_ids (int): The number of person ids. - ins_per_id (int): The number of image for each person. + ins_per_id (int): The number of images for each person. Returns: - List: Annotation information of num_ids X ins_per_id images. + Dict: Annotation information of num_ids X ins_per_id images. """ assert len(self.pids) >= num_ids, \ 'The number of person ids in the training set must ' \ 'be greater than the number of person ids in the sample.' 
- pos_idxs = self.index_dic[int(pos_pid)] + pos_idxs = self.index_dic[int( + pos_pid)] # all positive idxs for pos_pid idxs_list = [] # select positive samplers idxs_list.extend(pos_idxs[np.random.choice( pos_idxs.shape[0], ins_per_id, replace=True)]) - # select negative ids neg_pids = np.random.choice( [i for i, _ in enumerate(self.pids) if i != pos_pid], num_ids - 1, replace=False) - # select negative samplers for each negative id for neg_pid in neg_pids: neg_idxs = self.index_dic[neg_pid] idxs_list.extend(neg_idxs[np.random.choice( neg_idxs.shape[0], ins_per_id, replace=True)]) - + # return the final triplet batch triplet_img_infos = [] for idx in idxs_list: - triplet_img_infos.append(copy.deepcopy(self.data_infos[idx])) - - return triplet_img_infos - - def prepare_data(self, idx): - """Prepare results for image (e.g. the annotation information, ...).""" - data_info = self.data_infos[idx] - if self.triplet_sampler is not None: - img_infos = self.triplet_sampling(data_info['gt_label'], - **self.triplet_sampler) - results = copy.deepcopy(img_infos) - else: - results = copy.deepcopy(data_info) - return self.pipeline(results) - - def evaluate(self, - results, - metric='mAP', - metric_options=None, - logger=None): - """Evaluate the ReID dataset. - - Args: - results (list): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. - Default value is `mAP`. - metric_options: (dict, optional): Options for calculating metrics. - Allowed keys are 'rank_list' and 'max_rank'. Defaults to None. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Defaults to None. - - Returns: - dict: evaluation results - """ - if metric_options is None: - metric_options = dict(rank_list=[1, 5, 10, 20], max_rank=20) - for rank in metric_options['rank_list']: - assert rank >= 1 and rank <= metric_options['max_rank'] - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['mAP', 'CMC'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - # distance - results = [result.data.cpu() for result in results] - features = torch.stack(results) - - n, c = features.size() - mat = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n) - distmat = mat + mat.t() - distmat.addmm_(features, features.t(), beta=1, alpha=-2) - distmat = distmat.numpy() - - pids = self.get_gt_labels() - indices = np.argsort(distmat, axis=1) - matches = (pids[indices] == pids[:, np.newaxis]).astype(np.int32) - - all_cmc = [] - all_AP = [] - num_valid_q = 0. - for q_idx in range(n): - # remove self - raw_cmc = matches[q_idx][1:] - if not np.any(raw_cmc): - # this condition is true when query identity - # does not appear in gallery - continue - - cmc = raw_cmc.cumsum() - cmc[cmc > 1] = 1 - - all_cmc.append(cmc[:metric_options['max_rank']]) - num_valid_q += 1. - - # compute average precision - # reference: - # https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision - num_rel = raw_cmc.sum() - tmp_cmc = raw_cmc.cumsum() - tmp_cmc = [x / (i + 1.) 
for i, x in enumerate(tmp_cmc)] - tmp_cmc = np.asarray(tmp_cmc) * raw_cmc - AP = tmp_cmc.sum() / num_rel - all_AP.append(AP) - - assert num_valid_q > 0, \ - 'Error: all query identities do not appear in gallery' - - all_cmc = np.asarray(all_cmc).astype(np.float32) - all_cmc = all_cmc.sum(0) / num_valid_q - mAP = np.mean(all_AP) - - eval_results = dict() - if 'mAP' in metrics: - eval_results['mAP'] = np.around(mAP, decimals=3) - if 'CMC' in metrics: - for rank in metric_options['rank_list']: - eval_results[f'R{rank}'] = np.around( - all_cmc[rank - 1], decimals=3) - - return eval_results + triplet_img_infos.append(copy.deepcopy(self.get_data_info(idx))) + # Collect data_list scatters (list of dict -> dict of list) + out = dict() + for key in triplet_img_infos[0].keys(): + out[key] = [_info[key] for _info in triplet_img_infos] + return out diff --git a/mmtrack/datasets/samplers/__init__.py b/mmtrack/datasets/samplers/__init__.py index d0affd1ab..12021834f 100644 --- a/mmtrack/datasets/samplers/__init__.py +++ b/mmtrack/datasets/samplers/__init__.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .quota_sampler import DistributedQuotaSampler -from .video_sampler import DistributedVideoSampler, SOTVideoSampler +from .entire_video_batch_sampler import EntireVideoBatchSampler +from .quota_sampler import QuotaSampler +from .video_sampler import VideoSampler -__all__ = [ - 'DistributedVideoSampler', 'SOTVideoSampler', 'DistributedQuotaSampler' -] +__all__ = ['VideoSampler', 'QuotaSampler', 'EntireVideoBatchSampler'] diff --git a/mmtrack/datasets/samplers/entire_video_batch_sampler.py b/mmtrack/datasets/samplers/entire_video_batch_sampler.py new file mode 100644 index 000000000..90a7cab92 --- /dev/null +++ b/mmtrack/datasets/samplers/entire_video_batch_sampler.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from torch.utils.data import BatchSampler, Sampler + +from mmtrack.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class EntireVideoBatchSampler(BatchSampler): + """A sampler wrapper for grouping images from one video into a same batch. + + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. Here, we take a video as a batch. + Defaults to 1. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. Defaults to False. + """ + + def __init__(self, + sampler: Sampler, + batch_size: int = 1, + drop_last: bool = False) -> None: + assert sampler.dataset.test_mode + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size != 1: + raise ValueError('batch_size should be 1, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.batch_size = batch_size + self.drop_last = drop_last + + def __iter__(self) -> Sequence[int]: + batch = [] + for idx in self.sampler: + data_info = self.sampler.dataset.get_data_info(idx) + video_length = data_info['video_length'] + batch.append(idx) + if len(batch) == video_length: + yield batch + batch = [] + + def __len__(self) -> int: + return self.sampler.num_videos diff --git a/mmtrack/datasets/samplers/quota_sampler.py b/mmtrack/datasets/samplers/quota_sampler.py index 5ef3d5a13..fc7c510d9 100644 --- a/mmtrack/datasets/samplers/quota_sampler.py +++ b/mmtrack/datasets/samplers/quota_sampler.py @@ -1,12 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
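For intuition about the new `EntireVideoBatchSampler`, here is a self-contained sketch of its grouping rule using a dummy dataset and torch's plain `SequentialSampler` (the real class wraps an mmengine sampler and additionally checks `test_mode`): indices are accumulated until the batch length reaches the `video_length` of the current sample, so each yielded batch is exactly one video.

```python
from torch.utils.data import Dataset, SequentialSampler


class DummyVideoDataset(Dataset):
    """Two videos (3 frames and 2 frames) flattened into 5 samples."""

    _video_lengths = [3, 3, 3, 2, 2]

    def __len__(self):
        return len(self._video_lengths)

    def __getitem__(self, idx):
        return idx

    def get_data_info(self, idx):
        return {'video_length': self._video_lengths[idx]}


def iter_video_batches(sampler):
    """Yield one batch per video, mirroring EntireVideoBatchSampler.__iter__."""
    batch = []
    for idx in sampler:
        video_length = sampler.data_source.get_data_info(idx)['video_length']
        batch.append(idx)
        if len(batch) == video_length:
            yield batch
            batch = []


sampler = SequentialSampler(DummyVideoDataset())
print(list(iter_video_batches(sampler)))  # [[0, 1, 2], [3, 4]]
```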
import math +from typing import Iterator, Sized import torch -from mmcv.runner import get_dist_info +from mmengine.dist import get_dist_info from torch.utils.data import Sampler +from mmtrack.registry import DATA_SAMPLERS -class DistributedQuotaSampler(Sampler): + +@DATA_SAMPLERS.register_module() +class QuotaSampler(Sampler): """Sampler that gets fixed number of samples per epoch. It is especially useful in conjunction with @@ -18,11 +22,8 @@ class DistributedQuotaSampler(Sampler): Dataset is assumed to be of constant size. Args: - dataset: Dataset used for sampling. + dataset (Sized): Dataset used for sampling. samples_per_epoch (int): The number of samples per epoch. - num_replicas (optional): Number of processes participating in - distributed training. - rank (optional): Rank of the current process within num_replicas. replacement (bool): samples are drawn with replacement if ``True``, Default: False. seed (int, optional): random seed used to shuffle the sampler if @@ -31,30 +32,25 @@ class DistributedQuotaSampler(Sampler): """ def __init__(self, - dataset, - samples_per_epoch, - num_replicas=None, - rank=None, - replacement=False, - seed=0): - _rank, _num_replicas = get_dist_info() - if num_replicas is None: - num_replicas = _num_replicas - if rank is None: - rank = _rank + dataset: Sized, + samples_per_epoch: int, + replacement: bool = False, + seed: int = 0) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + self.dataset = dataset self.samples_per_epoch = samples_per_epoch - self.num_replicas = num_replicas - self.rank = rank self.epoch = 0 self.seed = seed if seed is not None else 0 self.replacement = replacement self.num_samples = int( - math.ceil(samples_per_epoch * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas + math.ceil(samples_per_epoch * 1.0 / self.world_size)) + self.total_size = self.num_samples * self.world_size - def __iter__(self): + def __iter__(self) -> Iterator: # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch + self.seed) @@ -77,7 +73,7 @@ def __iter__(self): assert len(indices) == self.total_size # subsample - indices = indices[self.rank:self.total_size:self.num_replicas] + indices = indices[self.rank:self.total_size:self.world_size] assert len(indices) == self.num_samples return iter(indices) diff --git a/mmtrack/datasets/samplers/video_sampler.py b/mmtrack/datasets/samplers/video_sampler.py index 245c2728b..2f27aa397 100644 --- a/mmtrack/datasets/samplers/video_sampler.py +++ b/mmtrack/datasets/samplers/video_sampler.py @@ -1,96 +1,84 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Iterator, Sized + import numpy as np -from torch.utils.data import DistributedSampler as _DistributedSampler +from mmengine.dist import get_dist_info from torch.utils.data import Sampler -from mmtrack.datasets.base_sot_dataset import BaseSOTDataset +from mmtrack.datasets import BaseSOTDataset, BaseVideoDataset +from mmtrack.registry import DATA_SAMPLERS -class SOTVideoSampler(Sampler): - """Only used for sot testing on single gpu. +@DATA_SAMPLERS.register_module() +class VideoSampler(Sampler): + """The video data sampler is for both distributed and non-distributed + environment. It is only used in testing. Args: - dataset (Dataset): Test dataset must have `num_frames_per_video` - attribute. It records the frame number of each video. + dataset (Sized): The dataset. 
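The refactored `QuotaSampler` keeps the same per-rank bookkeeping as before: every rank performs the same seeded draw of `total_size = num_samples * world_size` indices and then keeps the stride `indices[rank::world_size]`. A small sketch of that logic follows; the function name is illustrative and only the with-replacement branch is modeled.

```python
import math

import torch


def quota_indices(dataset_len, samples_per_epoch, rank, world_size,
                  epoch=0, seed=0):
    """Deterministically draw a fixed quota per epoch and stride it by rank."""
    num_samples = math.ceil(samples_per_epoch / world_size)
    total_size = num_samples * world_size

    g = torch.Generator()
    g.manual_seed(epoch + seed)
    # draw with replacement for simplicity; the real sampler also supports
    # shuffling without replacement and padding up to total_size
    indices = torch.randint(dataset_len, (total_size,), generator=g).tolist()
    return indices[rank:total_size:world_size]


# Every rank sees the same draw but keeps a disjoint stride of it.
for rank in range(2):
    print(rank, quota_indices(dataset_len=100, samples_per_epoch=6,
                              rank=rank, world_size=2))
```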
""" - def __init__(self, dataset): - super().__init__(dataset) - # The input of '__getitem__' function in SOT dataset class must be - # a tuple when testing. The tuple is in (video_index, frame_index) - # format. + def __init__(self, dataset: Sized, seed: int = 0) -> None: self.dataset = dataset - self.indices = [] - for video_ind, num_frames in enumerate( - self.dataset.num_frames_per_video): - self.indices.extend([(video_ind, frame_ind) - for frame_ind in range(num_frames)]) - - def __iter__(self): - return iter(self.indices) - - -class DistributedVideoSampler(_DistributedSampler): - """Put videos to multi gpus during testing. - - Args: - dataset (Dataset): Test dataset must have `data_infos` attribute. - Each data_info in `data_infos` records information of one frame or - one video (in SOT Dataset). If not SOT Dataset, each video must - have one data_info that includes `data_info['frame_id'] == 0`. - num_replicas (int): The number of gpus. Defaults to None. - rank (int): Gpu rank id. Defaults to None. - shuffle (bool): If True, shuffle the dataset. Defaults to False. - """ + assert self.dataset.test_mode - def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False): - super().__init__(dataset, num_replicas=num_replicas, rank=rank) - self.shuffle = shuffle - assert not self.shuffle, 'Specific for video sequential testing.' - self.num_samples = len(dataset) + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size - if isinstance(dataset, BaseSOTDataset): + if isinstance(self.dataset, BaseSOTDataset): # The input of '__getitem__' function in SOT dataset class must be # a tuple when testing. The tuple is in (video_index, frame_index) # format. - self.num_videos = len(self.dataset.data_infos) - self.num_frames_per_video = self.dataset.num_frames_per_video - if self.num_videos < num_replicas: + self.num_videos = self.dataset.num_videos + if self.num_videos < self.world_size: raise ValueError(f'only {self.num_videos} videos loaded,' - f'but {self.num_replicas} gpus were given.') + f'but {self.world_size} gpus were given.') chunks = np.array_split( - list(range(self.num_videos)), self.num_replicas) + list(range(self.num_videos)), self.world_size) self.indices = [] for videos in chunks: indices_chunk = [] for video_ind in videos: indices_chunk.extend([ (video_ind, frame_ind) for frame_ind in range( - self.num_frames_per_video[video_ind]) + self.dataset.get_len_per_video(video_ind)) ]) self.indices.append(indices_chunk) else: + assert isinstance(self.dataset, BaseVideoDataset) first_frame_indices = [] - for i, img_info in enumerate(self.dataset.data_infos): - if img_info['frame_id'] == 0: + for i in range(len(self.dataset)): + data_info = self.dataset.get_data_info(i) + if data_info['frame_id'] == 0: first_frame_indices.append(i) - if len(first_frame_indices) < num_replicas: - raise ValueError( - f'only {len(first_frame_indices)} videos loaded,' - f'but {self.num_replicas} gpus were given.') + self.num_videos = len(first_frame_indices) + if self.num_videos < self.world_size: + raise ValueError(f'only {self.num_videos} videos loaded,' + f'but {self.world_size} gpus were given.') - chunks = np.array_split(first_frame_indices, self.num_replicas) + chunks = np.array_split(first_frame_indices, self.world_size) split_flags = [c[0] for c in chunks] - split_flags.append(self.num_samples) + split_flags.append(len(self.dataset)) self.indices = [ list(range(split_flags[i], split_flags[i + 1])) - for i in range(self.num_replicas) + for i in range(self.world_size) ] 
- def __iter__(self): - """Put videos to specify gpu.""" + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" indices = self.indices[self.rank] return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return len(self.indices[self.rank]) + + def set_epoch(self, epoch: int) -> None: + """Not supported in iteration-based runner.""" + raise NotImplementedError( + 'The `VideoSampler` is only used in testing, ' + "and doesn't need `set_epoch`") diff --git a/mmtrack/datasets/sot_coco_dataset.py b/mmtrack/datasets/sot_coco_dataset.py index 7b642175b..b55e140ac 100644 --- a/mmtrack/datasets/sot_coco_dataset.py +++ b/mmtrack/datasets/sot_coco_dataset.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. -import time +import os.path as osp +from typing import List -import mmcv import numpy as np -from mmdet.datasets import DATASETS +from mmengine.dataset import force_full_init +from mmengine.fileio.file_client import FileClient from pycocotools.coco import COCO +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -16,71 +18,81 @@ class SOTCocoDataset(BaseSOTDataset): The dataset only support training mode. """ - def __init__(self, ann_file, *args, **kwargs): - """Initialization of SOT dataset class. - - Args: - ann_file (str): The official coco annotation file. It will be - loaded and parsed in the `self.load_data_infos` function. - """ - file_client_args = kwargs.get('file_client_args', dict(backend='disk')) - self.file_client = mmcv.FileClient(**file_client_args) - with self.file_client.get_local_path(ann_file) as local_path: - self.coco = COCO(local_path) + def __init__(self, *args, **kwargs): + """Initialization of SOT dataset class.""" super().__init__(*args, **kwargs) - def load_data_infos(self, split='train'): - """Load dataset information. Each instance is viewed as a video. - - Args: - split (str, optional): The split of dataset. Defaults to 'train'. + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file``. + Each instance is viewed as a video. Returns: - list[int]: The length of the list is the number of valid object - annotations. The elemment in the list is annotation ID in coco + list[dict]: The length of the list is the number of valid object + annotations. The inner dict contains annotation ID in coco API. """ - print('Loading Coco dataset...') - start_time = time.time() + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_file: + self.coco = COCO(local_file) ann_list = list(self.coco.anns.keys()) - videos_list = [ - ann for ann in ann_list + data_infos = [ + dict(ann_id=ann) for ann in ann_list if self.coco.anns[ann].get('iscrowd', 0) == 0 ] - print(f'Coco dataset loaded! ({time.time()-start_time:.2f} s)') - return videos_list + return data_infos - def get_bboxes_from_video(self, video_ind): - """Get bbox annotation about the instance in an image. + def get_bboxes_from_video(self, video_idx: int) -> np.ndarray: + """Get bbox annotation about one instance in an image. Args: - video_ind (int): video index. Each video_ind denotes an instance. + video_idx (int): The index of video. Returns: - ndarray: in [1, 4] shape. The bbox is in (x, y, w, h) format. + ndarray: In [1, 4] shape. The bbox is in (x, y, w, h) format. 
""" - ann_id = self.data_infos[video_ind] + ann_id = self.get_data_info(video_idx)['ann_id'] anno = self.coco.anns[ann_id] - bboxes = np.array(anno['bbox']).reshape(-1, 4) + bboxes = np.array(anno['bbox'], dtype=np.float32).reshape(-1, 4) return bboxes - def get_img_infos_from_video(self, video_ind): - """Get all frame paths in a video. + def get_img_infos_from_video(self, video_idx: int) -> dict: + """Get the image information about one instance in a image. Args: - video_ind (int): video index. Each video_ind denotes an instance. + video_idx (int): The index of video. Returns: - list[str]: all image paths + dict: { + 'video_id': int, + 'frame_ids': np.ndarray, + 'img_paths': list[str], + 'video_length': int + } """ - ann_id = self.data_infos[video_ind] + ann_id = self.get_data_info(video_idx)['ann_id'] imgs = self.coco.loadImgs([self.coco.anns[ann_id]['image_id']]) - img_names = [img['file_name'] for img in imgs] - frame_ids = np.arange(self.get_len_per_video(video_ind)) + img_names = [ + osp.join(self.data_prefix['img_path'], img['file_name']) + for img in imgs + ] + frame_ids = np.arange(self.get_len_per_video(video_idx)) img_infos = dict( - filename=img_names, frame_ids=frame_ids, video_id=video_ind) + video_id=video_idx, + frame_ids=frame_ids, + img_paths=img_names, + video_length=1) return img_infos - def get_len_per_video(self, video_ind): - """Get the number of frames in a video.""" + @force_full_init + def get_len_per_video(self, video_idx: int) -> int: + """Get the number of frames in a video. Here, it returns 1 since Coco + is a image dataset. + + Args: + video_idx (int): The index of video. Each video_idx denotes an + instance. + + Returns: + int: The length of video. + """ return 1 diff --git a/mmtrack/datasets/sot_imagenet_vid_dataset.py b/mmtrack/datasets/sot_imagenet_vid_dataset.py index 93a7a1764..235a2a58e 100644 --- a/mmtrack/datasets/sot_imagenet_vid_dataset.py +++ b/mmtrack/datasets/sot_imagenet_vid_dataset.py @@ -1,9 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv +import os.path as osp +from typing import List + import numpy as np -from mmdet.datasets import DATASETS +from mmengine.dataset import force_full_init +from mmengine.fileio.file_client import FileClient -from mmtrack.datasets.parsers import CocoVID +from mmtrack.registry import DATASETS +from .api_wrappers import CocoVID from .base_sot_dataset import BaseSOTDataset @@ -14,89 +18,97 @@ class SOTImageNetVIDDataset(BaseSOTDataset): The dataset only support training mode. """ - def __init__(self, ann_file, *args, **kwargs): - """Initialization of SOT dataset class. - - Args: - ann_file (str): The coco-format annotation file of ImageNet VID - Dataset. It will be loaded and parsed in the - `self.load_data_infos` function. - """ - file_client_args = kwargs.get('file_client_args', dict(backend='disk')) - self.file_client = mmcv.FileClient(**file_client_args) - with self.file_client.get_local_path(ann_file) as local_path: - self.coco = CocoVID(local_path) + def __init__(self, *args, **kwargs): + """Initialization of SOT dataset class.""" super().__init__(*args, **kwargs) - def load_data_infos(self, split='train'): - """Load dataset information. - - Args: - split (str, optional): The split of dataset. Defaults to 'train'. + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file``. Returns: - list[int]: The length of the list is the number of instances. The - elemment in the list is instance ID in coco API. 
+ list[dict]: The length of the list is the number of instances. The + inner dict contains instance ID in CocoVID API. """ - data_infos = list(self.coco.instancesToImgs.keys()) + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_file: + self.coco = CocoVID(local_file) + data_infos = [ + dict(ins_id=ins_id) for ins_id in self.coco.instancesToImgs.keys() + ] return data_infos - def get_bboxes_from_video(self, video_ind): - """Get bbox annotation about the instance in a video. Considering + def get_bboxes_from_video(self, video_idx: int) -> np.ndarray: + """Get bbox annotation about one instance in a video. Considering `get_bboxes_from_video` in `SOTBaseDataset` is not compatible with `SOTImageNetVIDDataset`, we oveload this function though it's not called by `self.get_ann_infos_from_video`. Args: - video_ind (int): video index. Each video_ind denotes an instance. + video_idx (int): The index of video. Here, each video_idx denotes + an instance. Returns: - ndarray: in [N, 4] shape. The bbox is in (x, y, w, h) format. + ndarray: In [N, 4] shape. The bbox is in (x, y, w, h) format. """ - instance_id = self.data_infos[video_ind] + instance_id = self.get_data_info(video_idx)['ins_id'] img_ids = self.coco.instancesToImgs[instance_id] bboxes = [] for img_id in img_ids: for ann in self.coco.imgToAnns[img_id]: if ann['instance_id'] == instance_id: bboxes.append(ann['bbox']) - bboxes = np.array(bboxes).reshape(-1, 4) + bboxes = np.array(bboxes, dtype=np.float32).reshape(-1, 4) return bboxes - def get_img_infos_from_video(self, video_ind): - """Get image information in a video. + def get_img_infos_from_video(self, video_idx: int) -> dict: + """Get image information about one instance in a video. Args: - video_ind (int): video index + video_idx (int): The index of video. Returns: - dict: {'filename': list[str], 'frame_ids':ndarray, 'video_id':int} + dict: { + 'video_id': int, + 'frame_ids': np.ndarray, + 'img_paths': list[str], + 'video_length': int + } """ - instance_id = self.data_infos[video_ind] + instance_id = self.get_data_info(video_idx)['ins_id'] img_ids = self.coco.instancesToImgs[instance_id] frame_ids = [] img_names = [] # In ImageNetVID dataset, frame_ids may not be continuous. for img_id in img_ids: frame_ids.append(self.coco.imgs[img_id]['frame_id']) - img_names.append(self.coco.imgs[img_id]['file_name']) + img_names.append( + osp.join(self.data_prefix['img_path'], + self.coco.imgs[img_id]['file_name'])) img_infos = dict( - filename=img_names, frame_ids=frame_ids, video_id=video_ind) + video_id=video_idx, + frame_ids=frame_ids, + img_paths=img_names, + video_length=len(frame_ids)) return img_infos - def get_ann_infos_from_video(self, video_ind): - """Get annotation information in a video. + def get_ann_infos_from_video(self, video_idx: int) -> dict: + """Get annotation information about one instance in a video. Note: We overload this function for speed up loading video information. Args: - video_ind (int): video index. Each video_ind denotes an instance. + video_idx (int): The index of video. Here, each video_idx denotes + an instance. Returns: - dict: {'bboxes': ndarray in (N, 4) shape, 'bboxes_isvalid': - ndarray, 'visible':ndarray}. The bbox is in - (x1, y1, x2, y2) format. + dict: { + 'bboxes': np.ndarray in (N, 4) shape, + 'bboxes_isvalid': np.ndarray, + 'visible': np.ndarray + }. + The annotation information in some datasets may contain + 'visible_ratio'. The bbox is in (x1, y1, x2, y2) format. 
""" - instance_id = self.data_infos[video_ind] + instance_id = self.get_data_info(video_idx)['ins_id'] img_ids = self.coco.instancesToImgs[instance_id] bboxes = [] visible = [] @@ -105,7 +117,7 @@ def get_ann_infos_from_video(self, video_ind): if ann['instance_id'] == instance_id: bboxes.append(ann['bbox']) visible.append(not ann.get('occluded', False)) - bboxes = np.array(bboxes).reshape(-1, 4) + bboxes = np.array(bboxes, dtype=np.float32).reshape(-1, 4) bboxes_isvalid = (bboxes[:, 2] > self.bbox_min_size) & ( bboxes[:, 3] > self.bbox_min_size) bboxes[:, 2:] += bboxes[:, :2] @@ -114,14 +126,19 @@ def get_ann_infos_from_video(self, video_ind): bboxes=bboxes, bboxes_isvalid=bboxes_isvalid, visible=visible) return ann_infos - def get_visibility_from_video(self, video_ind): - """Get the visible information in a video. - + def get_visibility_from_video(self, video_idx: int) -> dict: + """Get the visible information about one instance in a video. Considering `get_visibility_from_video` in `SOTBaseDataset` is not compatible with `SOTImageNetVIDDataset`, we oveload this function though it's not called by `self.get_ann_infos_from_video`. + + Args: + video_idx (int): The index of video. + + Returns: + dict: The visibilities of each object in the video. """ - instance_id = self.data_infos[video_ind] + instance_id = self.get_data_info(video_idx)['ins_id'] img_ids = self.coco.instancesToImgs[instance_id] visible = [] for img_id in img_ids: @@ -131,7 +148,13 @@ def get_visibility_from_video(self, video_ind): visible_info = dict(visible=np.array(visible, dtype=np.bool_)) return visible_info - def get_len_per_video(self, video_ind): - """Get the number of frames in a video.""" - instance_id = self.data_infos[video_ind] + @force_full_init + def get_len_per_video(self, video_idx: int) -> int: + """Get the length of filtered dataset and automatically call + ``full_init`` if the dataset has not been fully init. + + Returns: + int: The length of filtered dataset. + """ + instance_id = self.get_data_info(video_idx)['ins_id'] return len(self.coco.instancesToImgs[instance_id]) diff --git a/mmtrack/datasets/sot_test_dataset.py b/mmtrack/datasets/sot_test_dataset.py deleted file mode 100644 index 5208cfb18..000000000 --- a/mmtrack/datasets/sot_test_dataset.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS - -from mmtrack.core.evaluation import eval_sot_ope -from .coco_video_dataset import CocoVideoDataset - - -@DATASETS.register_module() -class SOTTestDataset(CocoVideoDataset): - """Dataset for the testing of single object tracking. - - The dataset doesn't support training mode. - """ - - CLASSES = (0, ) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox annotations. - - Args: - img_info (dict): image information. - ann_info (list[dict]): Annotation information of an image. Each - image only has one bbox annotation. - - Returns: - dict: A dict containing the following keys: bboxes, labels. - labels are not useful in SOT. 
- """ - gt_bboxes = np.array(ann_info[0]['bbox'], dtype=np.float32) - # convert [x1, y1, w, h] to [x1, y1, x2, y2] - gt_bboxes[2] += gt_bboxes[0] - gt_bboxes[3] += gt_bboxes[1] - gt_labels = np.array(self.cat2label[ann_info[0]['category_id']]) - if 'ignore' in ann_info[0]: - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - ignore=ann_info[0]['ignore']) - else: - ann = dict(bboxes=gt_bboxes, labels=gt_labels) - return ann - - def evaluate(self, results, metric=['track'], logger=None): - """Evaluation in OPE protocol. - - Args: - results (dict): Testing results of the dataset. - metric (str | list[str]): Metrics to be evaluated. Options are - 'track'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - - Returns: - dict[str, float]: OPE style evaluation metric (i.e. success, - norm precision and precision). - """ - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - eval_results = dict() - if 'track' in metrics: - assert len(self.data_infos) == len(results['track_bboxes']) - print_log('Evaluate OPE Benchmark...', logger=logger) - inds = [ - i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0 - ] - num_vids = len(inds) - inds.append(len(self.data_infos)) - - track_bboxes = [ - list( - map(lambda x: x[:4], - results['track_bboxes'][inds[i]:inds[i + 1]])) - for i in range(num_vids) - ] - - ann_infos = [self.get_ann_info(_) for _ in self.data_infos] - ann_infos = [ - ann_infos[inds[i]:inds[i + 1]] for i in range(num_vids) - ] - track_eval_results = eval_sot_ope( - results=track_bboxes, annotations=ann_infos) - eval_results.update(track_eval_results) - - for k, v in eval_results.items(): - if isinstance(v, float): - eval_results[k] = float(f'{(v):.3f}') - print_log(eval_results, logger=logger) - - return eval_results diff --git a/mmtrack/datasets/sot_train_dataset.py b/mmtrack/datasets/sot_train_dataset.py deleted file mode 100644 index 32dc7b9fa..000000000 --- a/mmtrack/datasets/sot_train_dataset.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmdet.datasets import DATASETS - -from .coco_video_dataset import CocoVideoDataset -from .parsers import CocoVID - - -@DATASETS.register_module() -class SOTTrainDataset(CocoVideoDataset): - """Dataset for the training of single object tracking. - - The dataset doesn't support testing mode. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.load_as_video and not self.test_mode - - def load_video_anns(self, ann_file): - """Load annotations from COCOVID style annotation file. - - Args: - ann_file (str): Path of annotation file. - - Returns: - list[dict]: Annotation information from COCOVID api. 
- """ - self.coco = CocoVID(ann_file, self.load_as_video) - - data_infos = [] - self.vid_ids = self.coco.get_vid_ids() - for vid_id in self.vid_ids: - info = self.coco.load_vids([vid_id])[0] - data_infos.append(info) - return data_infos - - def _filter_imgs(self): - """Filter videos without ground truths.""" - valid_inds = [] - # obtain videos that contain annotation - ids_with_ann = set(_['video_id'] for _ in self.coco.anns.values()) - - valid_vid_ids = [] - for i, vid_info in enumerate(self.data_infos): - vid_id = self.vid_ids[i] - if self.filter_empty_gt and vid_id not in ids_with_ann: - continue - valid_inds.append(i) - valid_vid_ids.append(vid_id) - self.vid_ids = valid_vid_ids - return valid_inds - - def _set_group_flag(self): - """Set flag according to video aspect ratio. - - It is not useful since all flags are set as 0. - """ - self.flag = np.zeros(len(self), dtype=np.uint8) - - def get_snippet_of_instance(self, idx): - """Get a snippet of an instance in a video. - - Args: - idx (int): Index of data. - - Returns: - tuple: (snippet, image_id, instance_id), snippet is a list - containing the successive image ids where the instance - appears, image_id is a random sampled image id from the - snippet. - """ - vid_id = self.vid_ids[idx] - instance_ids = self.coco.get_ins_ids_from_vid(vid_id) - instance_id = np.random.choice(instance_ids) - image_ids = self.coco.get_img_ids_from_ins_id(instance_id) - if len(image_ids) > 1: - snippets = np.split( - image_ids, - np.array(np.where(np.diff(image_ids) > 1)[0]) + 1) - # remove isolated frame - snippets = [s for s in snippets if len(s) > 1] - # TODO: use random rather than -1 - snippet = snippets[-1].tolist() - else: - snippet = image_ids - - image_id = np.random.choice(snippet) - return snippet, image_id, instance_id - - def ref_img_sampling(self, - snippet, - image_id, - instance_id, - frame_range=5, - pos_prob=0.8, - filter_key_img=False, - return_key_img=True, - **kwargs): - """Get a search image for an instance in an exemplar image. - - If sampling a positive search image, the positive search image is - randomly sampled from the exemplar image, where the sampled range is - decided by `frame_range`. - If sampling a negative search image, the negative search image and - negative instance are randomly sampled from the entire dataset. - - Args: - snippet (list[int]): The successive image ids where the instance - appears. - image_id (int): The id of exemplar image where the instance - appears. - instance_id (int): The id of the instance. - frame_range (List(int) | int): The frame range of sampling a - positive search image for the exemplar image. Default: 5. - pos_prob (float): The probability of sampling a positive search - image. Default: 0.8. - filter_key_img (bool): If False, the exemplar image will be in the - sampling candidates, otherwise, it is exclude. Default: False. - return_key_img (bool): If True, the `image_id` and `instance_id` - are returned, otherwise, not returned. Default: True. - - Returns: - tuple: (image_ids, instance_ids, is_positive_pair), image_ids is - a list that must contain search image id and may contain - `image_id`, instance_ids is a list that must contain search - instance id and may contain `instance_id`, is_positive_pair is - a bool denoting positive or negative sample pair. - """ - assert pos_prob >= 0.0 and pos_prob <= 1.0 - if isinstance(frame_range, int): - assert frame_range >= 0, 'frame_range can not be a negative value.' 
- frame_range = [-frame_range, frame_range] - elif isinstance(frame_range, list): - assert len(frame_range) == 2, 'The length must be 2.' - assert frame_range[0] <= 0 and frame_range[1] >= 0 - for i in frame_range: - assert isinstance(i, int), 'Each element must be int.' - else: - raise TypeError('The type of frame_range must be int or list.') - - ref_image_ids = [] - ref_instance_ids = [] - if pos_prob > np.random.random(): - index = snippet.index(image_id) - left = max(index + frame_range[0], 0) - right = index + frame_range[1] + 1 - valid_ids = snippet[left:right] - if filter_key_img and image_id in valid_ids: - valid_ids.remove(image_id) - ref_image_id = np.random.choice(valid_ids) - ref_instance_id = instance_id - is_positive_pair = True - else: - (ref_snippet, ref_image_id, - ref_instance_id) = self.get_snippet_of_instance( - np.random.choice(range(len(self)))) - is_positive_pair = False - - ref_image_ids.append(ref_image_id) - ref_instance_ids.append(ref_instance_id) - - if return_key_img: - return [image_id, *ref_image_ids], \ - [instance_id, *ref_instance_ids], is_positive_pair - else: - return ref_image_ids, ref_instance_ids, is_positive_pair - - def prepare_results(self, img_id, instance_id, is_positive_pair): - """Get training data and annotations. - - Args: - img_id (int): The id of image. - instance_id (int): The id of instance. - is_positive_pair (bool): denoting positive or negative sample pair. - - Returns: - dict: The information of training image and annotation. - """ - img_info = self.coco.load_imgs([img_id])[0] - img_info['filename'] = img_info['file_name'] - ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) - ann_infos = self.coco.load_anns(ann_ids) - ann = self._parse_ann_info(instance_id, ann_infos) - - result = dict(img_info=img_info, ann_info=ann) - self.pre_pipeline(result) - result['is_positive_pairs'] = is_positive_pair - return result - - def prepare_train_img(self, idx): - """Get training data and annotations after pipeline. - - Args: - idx (int): Index of data. - - Returns: - dict: Training data and annotation after pipeline with new keys - introduced by pipeline. - """ - snippet, image_id, instance_id = self.get_snippet_of_instance(idx) - image_ids, instance_ids, is_positive_pair = self.ref_img_sampling( - snippet, image_id, instance_id, **self.ref_img_sampler) - results = [ - self.prepare_results(img_id, instance_id, is_positive_pair) - for img_id, instance_id in zip(image_ids, instance_ids) - ] - results = self.pipeline(results) - return results - - def _parse_ann_info(self, instance_id, ann_infos): - """Parse bbox annotation. - - Parse a given instance annotation from annotation infos of an image. - - Args: - instance_id (int): The instance_id of an image need be parsed. - ann_info (list[dict]): Annotation information of an image. - - Returns: - dict: A dict containing the following keys: bboxes, labels. labels - is set to `np.array([0])`. 
- """ - has_instance_id = 0 - for ann_info in ann_infos: - if ann_info['instance_id'] == instance_id: - has_instance_id = 1 - break - assert has_instance_id - - bbox = [[ - ann_info['bbox'][0], ann_info['bbox'][1], - ann_info['bbox'][0] + ann_info['bbox'][2], - ann_info['bbox'][1] + ann_info['bbox'][3] - ]] - ann = dict( - bboxes=np.array(bbox, dtype=np.float32), labels=np.array([0])) - return ann diff --git a/mmtrack/datasets/tao_dataset.py b/mmtrack/datasets/tao_dataset.py index 326df96f8..156fc3d04 100644 --- a/mmtrack/datasets/tao_dataset.py +++ b/mmtrack/datasets/tao_dataset.py @@ -1,269 +1,126 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os -import tempfile +import copy +from typing import List, Tuple -import mmcv -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS from mmdet.datasets.api_wrappers import COCO +from mmengine.fileio import FileClient -from .coco_video_dataset import CocoVideoDataset -from .parsers import CocoVID - -try: - import tao - from tao.toolkit.tao import TaoEval -except ImportError: - tao = None - -try: - import lvis - from lvis import LVIS, LVISEval, LVISResults -except ImportError: - lvis = None +from mmtrack.registry import DATASETS +from .api_wrappers import CocoVID +from .base_video_dataset import BaseVideoDataset @DATASETS.register_module() -class TaoDataset(CocoVideoDataset): +class TaoDataset(BaseVideoDataset): """Dataset for TAO.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def load_annotations(self, ann_file): - """Load annotation from annotation file.""" - if not self.load_as_video: - data_infos = self.load_lvis_anns(ann_file) - else: - data_infos = self.load_tao_anns(ann_file) - return data_infos - - def load_lvis_anns(self, ann_file): - """Load annotation from COCO style annotation file. - - Args: - ann_file (str): Path of annotation file. + def load_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file`` Returns: - list[dict]: Annotation info from COCO api. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. """ + if self.load_as_video: + data_list, valid_data_indices = self._load_tao_data_list() + else: + data_list, valid_data_indices = self._load_lvis_data_list() - self.coco = COCO(ann_file) - self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) - self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} - self.img_ids = self.coco.get_img_ids() - data_infos = [] - for i in self.img_ids: - info = self.coco.load_imgs([i])[0] - info['filename'] = info['file_name'] - if info['file_name'].startswith('COCO'): - # Convert form the COCO 2014 file naming convention of - # COCO_[train/val/test]2014_000000000000.jpg to the 2017 - # naming convention of 000000000000.jpg - # (LVIS v1 will fix this naming issue) - info['filename'] = info['file_name'][-16:] - else: - info['filename'] = info['file_name'] - data_infos.append(info) - return data_infos - - def load_tao_anns(self, ann_file): - """Load annotation from COCOVID style annotation file. + return data_list, valid_data_indices - Args: - ann_file (str): Path of annotation file. + def _load_tao_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file`` Returns: - list[dict]: Annotation info from COCOVID api. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. 
""" - self.coco = CocoVID(ann_file) - self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = CocoVID(local_path) + self._metainfo['categories'] = coco.cats + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(coco.cat_img_map) - data_infos = [] - self.vid_ids = self.coco.get_vid_ids() - self.img_ids = [] - for vid_id in self.vid_ids: - img_ids = self.coco.get_img_ids_from_vid(vid_id) - if self.key_img_sampler is not None: - img_ids = self.key_img_sampling(img_ids, - **self.key_img_sampler) - self.img_ids.extend(img_ids) + data_list = [] + vid_ids = coco.get_vid_ids() + for vid_id in vid_ids: + img_ids = coco.get_img_ids_from_vid(vid_id) for img_id in img_ids: - info = self.coco.load_imgs([img_id])[0] - if info['file_name'].startswith('COCO'): + # load img info + raw_img_info = coco.load_imgs([img_id])[0] + if raw_img_info['file_name'].startswith('COCO'): # Convert form the COCO 2014 file naming convention of # COCO_[train/val/test]2014_000000000000.jpg to the 2017 # naming convention of 000000000000.jpg # (LVIS v1 will fix this naming issue) - info['filename'] = info['file_name'][-16:] + raw_img_info['filename'] = raw_img_info['file_name'][-16:] else: - info['filename'] = info['file_name'] - data_infos.append(info) - return data_infos + raw_img_info['filename'] = raw_img_info['file_name'] + raw_img_info['img_id'] = img_id + raw_img_info['video_length'] = len(img_ids) - def _track2json(self, results): - """Convert tracking results to TAO json style.""" - ids = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0] - num_vids = len(ids) - ids.append(len(self.data_infos)) - results = [results[ids[i]:ids[i + 1]] for i in range(num_vids)] - img_infos = [ - self.data_infos[ids[i]:ids[i + 1]] for i in range(num_vids) - ] + # load ann info + ann_ids = coco.get_ann_ids( + img_ids=[img_id], cat_ids=self.cat_ids) + raw_ann_info = coco.load_anns(ann_ids) - json_results = [] - max_track_id = 0 - for _img_infos, _results in zip(img_infos, results): - track_ids = [] - for img_info, result in zip(_img_infos, _results): - img_id = img_info['id'] - for label in range(len(result)): - bboxes = result[label] - for i in range(bboxes.shape[0]): - data = dict( - image_id=img_id, - bbox=self.xyxy2xywh(bboxes[i, 1:]), - score=float(bboxes[i][-1]), - video_id=img_info['video_id'], - category_id=label - # 1230 is the number of categories in LVIS dataset - if len(results) == 1230 else self.cat_ids[label], - track_id=max_track_id + int(bboxes[i][0])) - track_ids.append(int(bboxes[i][0])) - json_results.append(data) - track_ids = list(set(track_ids)) - max_track_id += max(track_ids) + 1 + # get data_info + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) - return json_results + valid_data_indices = list(range(len(data_list))) + return data_list, valid_data_indices - def _det2json(self, results): - """Convert detection results to COCO json style.""" - json_results = [] - for idx in range(len(self)): - img_id = self.img_ids[idx] - result = results[idx] - for label in range(len(result)): - bboxes = result[label] - for i in range(bboxes.shape[0]): - data = dict( - image_id=img_id, - 
bbox=self.xyxy2xywh(bboxes[i]), - score=float(bboxes[i][4]), - category_id=label - # 1230 is the number of categories in LVIS dataset - if len(result) == 1230 else self.cat_ids[label]) - json_results.append(data) - return json_results - - def format_results(self, results, resfile_path=None): - """Format the results to json (standard format for TAO evaluation). - - Args: - results (list[ndarray]): Testing results of the dataset. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. + def _load_lvis_data_list(self): + """Load annotations from an annotation file named as ``self.ann_file`` Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing \ - the json filepaths, tmp_dir is the temporal directory created \ - for saving json files when resfile_path is not specified. + tuple(list[dict], list): A list of annotation and a list of + valid data indices. """ - assert isinstance(results, dict), 'results must be a list' - assert 'track_bboxes' in results - assert 'det_bboxes' in results - - if resfile_path is None: - tmp_dir = tempfile.TemporaryDirectory() - resfile_path = tmp_dir.name - else: - tmp_dir = None - os.makedirs(resfile_path, exist_ok=True) - result_files = dict() - - bbox_results = self._det2json(results['det_bboxes']) - result_files['bbox'] = f'{resfile_path}/tao_bbox.json' - mmcv.dump(bbox_results, result_files['bbox']) - - track_results = self._track2json(results['track_bboxes']) - result_files['track'] = f'{resfile_path}/tao_track.json' - mmcv.dump(track_results, result_files['track']) - - return result_files, tmp_dir - - def evaluate(self, - results, - metric=['bbox', 'track'], - logger=None, - resfile_path=None): - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['bbox', 'track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - result_files, tmp_dir = self.format_results(results, resfile_path) - - eval_results = dict() - - if 'track' in metrics: - if tao is None: - raise ImportError( - 'Please run' - ' pip install git+https://github.com/TAO-Dataset/tao.git ' - 'to manually install tao') - - print_log('Evaluating tracking results...', logger) - tao_eval = TaoEval(self.ann_file, result_files['track']) - tao_eval.params.img_ids = self.img_ids - tao_eval.params.cat_ids = self.cat_ids - tao_eval.params.iou_thrs = np.array([0.5, 0.75]) - tao_eval.run() - - tao_eval.print_results() - tao_results = tao_eval.get_results() - for k, v in tao_results.items(): - if isinstance(k, str) and k.startswith('AP'): - key = 'track_{}'.format(k) - val = float('{:.3f}'.format(float(v))) - eval_results[key] = val + file_client = FileClient.infer_client(uri=self.ann_file) + with file_client.get_local_path(self.ann_file) as local_path: + coco = COCO(local_path) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = coco.get_cat_ids(cat_names=self.metainfo['CLASSES']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(coco.cat_img_map) + + img_ids = coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + if raw_img_info['file_name'].startswith('COCO'): + # Convert form the COCO 2014 file naming convention of + # 
COCO_[train/val/test]2014_000000000000.jpg to the 2017 + # naming convention of 000000000000.jpg + # (LVIS v1 will fix this naming issue) + raw_img_info['filename'] = raw_img_info['file_name'][-16:] + else: + raw_img_info['filename'] = raw_img_info['file_name'] - if 'bbox' in metrics: - if lvis is None: - raise ImportError( - 'Please run' - ' pip install git+https://github.com/lvis-dataset/lvis-api.git ' # noqa - 'to manually install lvis') + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) - print_log('Evaluating detection results...', logger) - lvis_gt = LVIS(self.ann_file) - lvis_dt = LVISResults(lvis_gt, result_files['bbox']) - lvis_eval = LVISEval(lvis_gt, lvis_dt, 'bbox') - lvis_eval.params.imgIds = self.img_ids - lvis_eval.params.catIds = self.cat_ids - lvis_eval.evaluate() - lvis_eval.accumulate() - lvis_eval.summarize() - lvis_eval.print_results() - lvis_results = lvis_eval.get_results() - for k, v in lvis_results.items(): - if k.startswith('AP'): - key = '{}_{}'.format('bbox', k) - val = float('{:.3f}'.format(float(v))) - eval_results[key] = val - ap_summary = ' '.join([ - '{}:{:.3f}'.format(k, float(v)) - for k, v in lvis_results.items() if k.startswith('AP') - ]) - eval_results['bbox_mAP_copypaste'] = ap_summary + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + data_list.append(parsed_data_info) - if tmp_dir is not None: - tmp_dir.cleanup() + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" - return eval_results + valid_data_indices = list(range(len(data_list))) + return data_list, valid_data_indices diff --git a/mmtrack/datasets/trackingnet_dataset.py b/mmtrack/datasets/trackingnet_dataset.py index 36b7990df..9df925b4c 100644 --- a/mmtrack/datasets/trackingnet_dataset.py +++ b/mmtrack/datasets/trackingnet_dataset.py @@ -1,13 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -import os.path as osp -import shutil import time +from typing import List, Union import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -18,23 +16,29 @@ class TrackingNetDataset(BaseSOTDataset): The dataset can both support training and testing mode. """ - def __init__(self, chunks_list=['all'], *args, **kwargs): + def __init__(self, + chunks_list: List[Union[int, str]] = ['all'], + *args, + **kwargs): """Initialization of SOT dataset class. Args: - chunks_list (list, optional): the training chunks. The optional - values in this list are: 0, 1, 2, ..., 10, 11 and 'all'. Some - methods may only use part of the dataset. Default to all - chunks, namely ['all']. + chunks_list (list[Union[int, str]], optional): The training chunks. + The optional values in this list are: 0, 1, 2, ..., 10, 11 and + 'all'. Some methods may only use part of the dataset. Default + to all chunks, namely ['all']. """ if isinstance(chunks_list, (str, int)): chunks_list = [chunks_list] assert set(chunks_list).issubset(set(range(12)) | {'all'}) - self.chunks_list = chunks_list + if 'all' in chunks_list: + self.chunks_list = [f'TRAIN_{i}' for i in range(12)] + else: + self.chunks_list = [f'TRAIN_{chunk}' for chunk in chunks_list] super(TrackingNetDataset, self).__init__(*args, **kwargs) - def load_data_infos(self, split='train'): - """Load dataset information. 
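A small sketch of the `chunks_list` normalization now done in `TrackingNetDataset.__init__`: the user-facing values 0-11 or `'all'` are mapped to the on-disk training chunk names.

```python
def normalize_chunks(chunks_list=('all',)):
    """Map 0..11 / 'all' to TrackingNet training chunk names."""
    if isinstance(chunks_list, (str, int)):
        chunks_list = [chunks_list]
    assert set(chunks_list).issubset(set(range(12)) | {'all'})
    if 'all' in chunks_list:
        return [f'TRAIN_{i}' for i in range(12)]
    return [f'TRAIN_{chunk}' for chunk in chunks_list]


print(normalize_chunks([0, 3]))  # ['TRAIN_0', 'TRAIN_3']
print(normalize_chunks('all'))   # ['TRAIN_0', ..., 'TRAIN_11']
```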
+ def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file``. Args: split (str, optional): the split of dataset. Defaults to 'train'. @@ -54,27 +58,16 @@ def load_data_infos(self, split='train'): """ print('Loading TrackingNet dataset...') start_time = time.time() - if split == 'test': - chunks = ['TEST'] - elif split == 'train': - if 'all' in self.chunks_list: - chunks = [f'TRAIN_{i}' for i in range(12)] - else: - chunks = [f'TRAIN_{chunk}' for chunk in self.chunks_list] - else: - raise NotImplementedError - - assert len(chunks) > 0 - chunks = set(chunks) + chunks = set(self.chunks_list) data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. line = line.strip().replace('/', os.sep).split(',') chunk = line[0].split(os.sep)[0] - if chunk in chunks: + if chunk == 'TEST' or chunk in chunks: data_info = dict( video_path=line[0], ann_path=line[1], @@ -85,82 +78,40 @@ def load_data_infos(self, split='train'): print(f'TrackingNet dataset loaded! ({time.time()-start_time:.2f} s)') return data_infos - def prepare_test_data(self, video_ind, frame_ind): + def prepare_test_data(self, video_idx: int, frame_idx: int) -> dict: """Get testing data of one frame. We parse one video, get one frame from it and pass the frame information to the pipeline. Args: - video_ind (int): video index - frame_ind (int): frame index + video_idx (int): The index of video. + frame_idx (int): The index of frame. Returns: - dict: testing data of one frame. + dict: Testing data of one frame. """ - if self.test_memo.get('video_ind', None) != video_ind: - self.test_memo.video_ind = video_ind - self.test_memo.img_infos = self.get_img_infos_from_video(video_ind) - assert 'video_ind' in self.test_memo and 'img_infos' in self.test_memo - - img_info = dict( - filename=self.test_memo.img_infos['filename'][frame_ind], - frame_id=frame_ind) - if frame_ind == 0: - ann_infos = self.get_ann_infos_from_video(video_ind) - ann_info = dict( - bboxes=ann_infos['bboxes'][frame_ind], visible=True) - else: - ann_info = dict( - bboxes=np.array([0] * 4, dtype=np.float32), visible=True) - - results = dict(img_info=img_info, ann_info=ann_info) - self.pre_pipeline(results) + if self.test_memo.get('video_idx', None) != video_idx: + self.test_memo.video_idx = video_idx + ann_infos = self.get_ann_infos_from_video(video_idx) + img_infos = self.get_img_infos_from_video(video_idx) + self.test_memo.video_infos = dict(**img_infos, **ann_infos) + assert 'video_idx' in self.test_memo and 'video_infos'\ + in self.test_memo + + results = {} + results['img_path'] = self.test_memo.video_infos['img_paths'][ + frame_idx] + results['frame_id'] = frame_idx + results['video_id'] = video_idx + results['video_length'] = self.test_memo.video_infos['video_length'] + + instance = {} + if frame_idx == 0: + ann_infos = self.get_ann_infos_from_video(video_idx) + instance['bbox'] = ann_infos['bboxes'][frame_idx] + + results['instances'] = [] + instance['visible'] = True + instance['bbox_label'] = np.array([0], dtype=np.int32) + results['instances'].append(instance) results = self.pipeline(results) return results - - def format_results(self, results, resfile_path=None, logger=None): - """Format the results to txts (standard format for TrackingNet - Challenge). 
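For reference, the per-frame dict that `prepare_test_data` now hands to the test pipeline looks roughly like the sketch below (the path and numbers are made up): every frame gets one instance with `visible` and `bbox_label`, but only the first frame of a video carries a ground-truth `bbox`.

```python
import numpy as np


def build_test_sample(img_path, frame_idx, video_idx, video_length,
                      gt_bbox=None):
    """Mimic the per-frame dict built by prepare_test_data (simplified)."""
    instance = dict(visible=True, bbox_label=np.array([0], dtype=np.int32))
    if frame_idx == 0:
        instance['bbox'] = np.asarray(gt_bbox, dtype=np.float32)
    return dict(
        img_path=img_path,
        frame_id=frame_idx,
        video_id=video_idx,
        video_length=video_length,
        instances=[instance])


print(build_test_sample('TEST/frames/video_0/0.jpg', 0, 0, 300,
                        gt_bbox=[100, 120, 260, 280]))
```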
- - Args: - results (dict(list[ndarray])): Testing results of the dataset. - resfile_path (str): Path to save the formatted results. - Defaults to None. - logger (logging.Logger | str | None, optional): defaults to None. - """ - # prepare saved dir - assert resfile_path is not None, 'Please give key-value pair \ - like resfile_path=xxx in argparse' - - if not osp.isdir(resfile_path): - os.makedirs(resfile_path, exist_ok=True) - - print_log( - f"-------- There are total {len(results['track_bboxes'])} images " - '--------', - logger=logger) - - # transform tracking results format - # from [bbox_1, bbox_2, ...] to {'video_1':[bbox_1, bbox_2, ...], ...} - start_ind = end_ind = 0 - for num, video_info in zip(self.num_frames_per_video, self.data_infos): - end_ind += num - video_name = video_info['video_path'].split(os.sep)[-1] - video_txt = osp.join(resfile_path, '{}.txt'.format(video_name)) - with open(video_txt, 'w') as f: - for bbox in results['track_bboxes'][start_ind:end_ind]: - bbox = [ - str(f'{bbox[0]:.4f}'), - str(f'{bbox[1]:.4f}'), - str(f'{(bbox[2] - bbox[0]):.4f}'), - str(f'{(bbox[3] - bbox[1]):.4f}') - ] - line = ','.join(bbox) + '\n' - f.writelines(line) - start_ind += num - - shutil.make_archive(resfile_path, 'zip', resfile_path) - shutil.rmtree(resfile_path) - - print_log( - f'-------- The results are stored in {resfile_path}.zip --------', - logger=logger) diff --git a/mmtrack/datasets/transforms/__init__.py b/mmtrack/datasets/transforms/__init__.py new file mode 100644 index 000000000..bf40ceba4 --- /dev/null +++ b/mmtrack/datasets/transforms/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .formatting import CheckPadMaskValidity, PackReIDInputs, PackTrackInputs +from .loading import LoadTrackAnnotations +from .processing import PairSampling, TridentSampling +from .transforms import (BrightnessAug, CropLikeDiMP, CropLikeSiamFC, GrayAug, + SeqBboxJitter, SeqBlurAug, SeqColorAug, + SeqCropLikeStark, SeqShiftScaleAug) + +__all__ = [ + 'LoadTrackAnnotations', 'PackTrackInputs', 'PackReIDInputs', + 'PairSampling', 'CropLikeSiamFC', 'SeqShiftScaleAug', 'SeqColorAug', + 'SeqBlurAug', 'TridentSampling', 'GrayAug', 'BrightnessAug', + 'SeqBboxJitter', 'SeqCropLikeStark', 'CheckPadMaskValidity', 'CropLikeDiMP' +] diff --git a/mmtrack/datasets/transforms/formatting.py b/mmtrack/datasets/transforms/formatting.py new file mode 100644 index 000000000..25f67f1e9 --- /dev/null +++ b/mmtrack/datasets/transforms/formatting.py @@ -0,0 +1,463 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import cv2 +import numpy as np +from mmcv.transforms import BaseTransform, to_tensor +from mmdet.structures.mask import BitmapMasks +from mmengine.structures import InstanceData + +from mmtrack.registry import TRANSFORMS +from mmtrack.structures import ReIDDataSample, TrackDataSample + + +@TRANSFORMS.register_module() +class PackTrackInputs(BaseTransform): + """Pack the inputs data for the video object detection / multi object + tracking / single object tracking / video instance segmentation. + + For each value (``List`` type) in the input dict, we concat the first + `num_key_frames` elements to the first dict with a new key, and the rest + of elements are concated to the second dict with a new key. + All the information of images are packed to ``inputs``. + All the information except images are packed to ``data_samples``. + + Args: + ref_prefix (str): The prefix of key added to the 'reference' frames. 
+ Defaults to 'ref'. + num_key_frames (int): The number of key frames. Defaults to 1. + num_template_frames (optional, int): The number of template frames. It + is only used for training in SOT. + pack_single_img (bool, optional): Whether to only pack single image. If + True, pack the data as a list additionally. Defaults to False. + meta_keys (Sequence[str]): Meta keys to be collected in + ``data_sample.metainfo``. Defaults to None. + default_meta_keys (tuple): Default meta keys. Defaults to ('img_id', + 'img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'flip', 'flip_direction', 'frame_id', 'is_video_data', + 'video_id', 'video_length', 'instances'). + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks', + 'gt_instances_id': 'instances_id' + } + + def __init__(self, + ref_prefix: str = 'ref', + num_key_frames: int = 1, + num_template_frames: Optional[int] = None, + pack_single_img: Optional[bool] = False, + meta_keys: Optional[dict] = None, + default_meta_keys: tuple = ('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', + 'flip', 'flip_direction', + 'frame_id', 'is_video_data', + 'video_id', 'video_length', + 'instances', 'num_left_ref_imgs', + 'frame_stride')): + self.ref_prefix = ref_prefix + # If ``num_template_frames`` is not None, this class is used in SOT. + # In this case, we assign the value of ``num_template_frames`` to + # ``self.num_key_frames`` for the consistency in the processing. + self.num_key_frames = num_key_frames if num_template_frames is None \ + else num_template_frames + self.meta_keys = default_meta_keys + if meta_keys is not None: + if isinstance(meta_keys, str): + meta_keys = (meta_keys, ) + else: + assert isinstance(meta_keys, tuple), \ + 'meta_keys must be str or tuple' + self.meta_keys += meta_keys + + self.pack_single_img = pack_single_img + + def _cat_same_type_data( + self, + data: Union[List, int], + return_ndarray: bool = True, + axis: int = 0, + stack: bool = False) -> Tuple[np.ndarray, np.ndarray]: + """Concatenate data with the same type. + + Args: + data (Union[List, int]): Input data. + return_ndarray (bool, optional): Whether to return ``np.ndarray``. + Defaults to True. + axis (int, optional): The axis that concatenating along. Defaults + to 0. + stack (bool, optional): Whether to stack all the data. If not, + using the concatenating operation. Defaults to False. + + Returns: + Tuple[np.ndarray, np.ndarray]: The first element is the + concatenated data of key frames, and the second element is the + concatenated data of reference frames. + """ + if self.pack_single_img: + data = [data] + key_data = data[:self.num_key_frames] + ref_data = data[self.num_key_frames:] if len( + data) > self.num_key_frames else None + + if return_ndarray: + if stack: + key_data = np.stack(key_data, axis=axis) + if ref_data is not None: + ref_data = np.stack(ref_data, axis=axis) + else: + key_data = np.concatenate(key_data, axis=axis) + if ref_data is not None: + ref_data = np.concatenate(ref_data, axis=axis) + + return key_data, ref_data + + def _get_img_idx_map(self, anns: List) -> Tuple[np.ndarray, np.ndarray]: + """Get the index of images for the annotations. The multiple instances + in one image need to be denoted the image index when concatenating + multiple images. + + Args: + anns (List): Input annotations. + + Returns: + Tuple[np.ndarray, np.ndarray]: The first element is the + concatenated indexes of key frames, and the second element is + the concatenated indexes of reference frames. 
+ """ + if self.pack_single_img: + anns = [anns] + key_img_idx_map = [] + for img_idx, ann in enumerate(anns[:self.num_key_frames]): + key_img_idx_map.extend([img_idx] * len(ann)) + key_img_idx_map = np.array(key_img_idx_map, dtype=np.int32) + if len(anns) > self.num_key_frames: + ref_img_idx_map = [] + for img_idx, ann in enumerate(anns[self.num_key_frames:]): + ref_img_idx_map.extend([img_idx] * len(ann)) + ref_img_idx_map = np.array(ref_img_idx_map, dtype=np.int32) + else: + ref_img_idx_map = None + return key_img_idx_map, ref_img_idx_map + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + - 'inputs' (dict[Tensor]): The forward data of models. + - 'data_samples' (obj:`TrackDataSample`): The annotation info of + the samples. + """ + packed_results = dict() + packed_results['inputs'] = dict() + + # 1. Pack images + if 'img' in results: + imgs = results['img'] + key_imgs, ref_imgs = self._cat_same_type_data(imgs, stack=True) + key_imgs = key_imgs.transpose(0, 3, 1, 2) + packed_results['inputs']['img'] = to_tensor(key_imgs) + + if ref_imgs is not None: + ref_imgs = ref_imgs.transpose(0, 3, 1, 2) + packed_results['inputs'][f'{self.ref_prefix}_img'] = to_tensor( + ref_imgs) + + data_sample = TrackDataSample() + + # 2. Pack InstanceData + if 'gt_ignore_flags' in results: + gt_ignore_flags = results['gt_ignore_flags'] + (key_gt_ignore_flags, + ref_gt_ignore_flags) = self._cat_same_type_data(gt_ignore_flags) + key_valid_idx = key_gt_ignore_flags == 0 + if ref_gt_ignore_flags is not None: + ref_valid_idx = ref_gt_ignore_flags == 0 + + instance_data = InstanceData() + ignore_instance_data = InstanceData() + ref_instance_data = InstanceData() + ref_ignore_instance_data = InstanceData() + + # Flag that whether have recorded the image index + img_idx_map_flag = False + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks': + gt_masks = results[key] + gt_masks_ndarray = [ + mask.to_ndarray() for mask in gt_masks + ] if isinstance(gt_masks, list) else gt_masks.to_ndarray() + key_gt_masks, ref_gt_masks = self._cat_same_type_data( + gt_masks_ndarray) + + mapped_key = self.mapping_table[key] + if 'gt_ignore_flags' in results: + instance_data[mapped_key] = BitmapMasks( + key_gt_masks[key_valid_idx], *key_gt_masks.shape[-2:]) + ignore_instance_data[mapped_key] = BitmapMasks( + key_gt_masks[~key_valid_idx], *key_gt_masks.shape[-2:]) + + if ref_gt_masks is not None: + ref_instance_data[mapped_key] = BitmapMasks( + ref_gt_masks[ref_valid_idx], + *key_gt_masks.shape[-2:]) + ref_ignore_instance_data[mapped_key] = BitmapMasks( + ref_gt_masks[~ref_valid_idx], + *key_gt_masks.shape[-2:]) + else: + instance_data[mapped_key] = BitmapMasks( + key_gt_masks, *key_gt_masks.shape[-2:]) + if ref_gt_masks is not None: + ref_instance_data[mapped_key] = BitmapMasks( + ref_gt_masks, *ref_gt_masks.shape[-2:]) + + else: + anns = results[key] + key_anns, ref_anns = self._cat_same_type_data(anns) + + if not img_idx_map_flag: + # The multiple instances in one image need to be + # denoted the image index when concatenating multiple + # images. 
+ key_img_idx_map, ref_img_idx_map = self._get_img_idx_map( + anns) + img_idx_map_flag = True + + mapped_key = self.mapping_table[key] + if 'gt_ignore_flags' in results: + instance_data[mapped_key] = to_tensor( + key_anns[key_valid_idx]) + ignore_instance_data[mapped_key] = to_tensor( + key_anns[~key_valid_idx]) + instance_data['map_instances_to_img_idx'] = to_tensor( + key_img_idx_map[key_valid_idx]) + ignore_instance_data[ + 'map_instances_to_img_idx'] = to_tensor( + key_img_idx_map[~key_valid_idx]) + + if ref_anns is not None: + ref_instance_data[mapped_key] = to_tensor( + ref_anns[ref_valid_idx]) + ref_ignore_instance_data[mapped_key] = to_tensor( + ref_anns[~ref_valid_idx]) + ref_instance_data[ + 'map_instances_to_img_idx'] = to_tensor( + ref_img_idx_map[ref_valid_idx]) + ref_ignore_instance_data[ + 'map_instances_to_img_idx'] = to_tensor( + ref_img_idx_map[~ref_valid_idx]) + else: + instance_data[mapped_key] = to_tensor(key_anns) + instance_data['map_instances_to_img_idx'] = to_tensor( + key_img_idx_map) + if ref_anns is not None: + ref_instance_data[mapped_key] = to_tensor(ref_anns) + ref_instance_data[ + 'map_instances_to_img_idx'] = to_tensor( + ref_img_idx_map) + + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + setattr(data_sample, f'{self.ref_prefix}_gt_instances', + ref_instance_data) + setattr(data_sample, f'{self.ref_prefix}_ignored_instances', + ref_ignore_instance_data) + + # 3. Pack metainfo + new_img_metas = {} + for key in self.meta_keys: + if key not in results: + continue + img_metas = results[key] + key_img_metas, ref_img_metas = self._cat_same_type_data( + img_metas, return_ndarray=False) + # To compatible the interface of ``MMDet``, we don't use + # the fotmat of list when the length of meta information is + # equal to 1. + if len(key_img_metas) > 1: + new_img_metas[key] = key_img_metas + else: + new_img_metas[key] = key_img_metas[0] + if ref_img_metas is not None: + if len(ref_img_metas) > 1: + new_img_metas[f'{self.ref_prefix}_{key}'] = ref_img_metas + else: + new_img_metas[f'{self.ref_prefix}_{key}'] = ref_img_metas[ + 0] + + data_sample.set_metainfo(new_img_metas) + + # 4. Pack some additional properties. + if 'padding_mask' in results: + # This property is used in ``STARK`` method in SOT. + padding_mask = results['padding_mask'] + key_padding_mask, ref_padding_mask = self._cat_same_type_data( + padding_mask, stack=True) + data_sample.padding_mask = to_tensor(key_padding_mask) + if ref_padding_mask is not None: + setattr(data_sample, f'{self.ref_prefix}_padding_mask', + to_tensor(ref_padding_mask)) + + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(ref_prefix={self.ref_prefix}, ' + repr_str += f'meta_keys={self.meta_keys}, ' + repr_str += f'default_meta_keys={self.default_meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class CheckPadMaskValidity(BaseTransform): + """Check the validity of data. Generally, it's used in such case: The image + padding masks generated in the image preprocess need to be downsampled, and + then passed into Transformer model, like DETR. The computation in the + subsequent Transformer model must make sure that the values of downsampled + mask are not all zeros. 
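# Illustrative sketch, not from the patch: the key/reference split that
# PackTrackInputs._cat_same_type_data applies to a per-frame list, shown with
# num_key_frames=1 and toy image shapes.
import numpy as np

num_key_frames = 1
imgs = [np.zeros((8, 8, 3)) for _ in range(3)]  # 1 key frame + 2 ref frames

key_imgs = np.stack(imgs[:num_key_frames], axis=0)  # (1, 8, 8, 3)
ref_imgs = np.stack(imgs[num_key_frames:], axis=0)  # (2, 8, 8, 3)

# images are packed as (N, C, H, W) before the to_tensor conversion
assert key_imgs.transpose(0, 3, 1, 2).shape == (1, 3, 8, 8)
assert ref_imgs.transpose(0, 3, 1, 2).shape == (2, 3, 8, 8)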
+ + Required Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_id (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + - img + - img_shape (optional) + - jittered_bboxes (optional) + - padding_mask (np.bool) + + Args: + stride (int): the max stride of feature map. + """ + + def __init__(self, stride: int): + self.stride = stride + + def transform(self, results: dict) -> Optional[dict]: + """The transform function. + + Args: + results (dict): Result dict contains the data to be checked. + + Returns: + Optional[dict]: If invalid, return None; otherwise, return original + input. + """ + assert 'padding_mask' in results + masks = results['padding_mask'] + imgs = results['img'] + for mask, img in zip(masks, imgs): + mask = mask.copy().astype(np.float32) + img_h, img_w = img.shape[:2] + feat_h, feat_w = img_h // self.stride, img_w // self.stride + downsample_mask = cv2.resize( + mask, dsize=(feat_h, feat_w)).astype(bool) + if (downsample_mask == 1).all(): + return None + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'stride={self.stride})' + return repr_str + + +@TRANSFORMS.register_module() +class PackReIDInputs(BaseTransform): + """Pack the inputs data for the ReID. + + The ``meta_info`` item is always populated. The contents of the + ``meta_info`` dictionary depends on ``meta_keys``. By default + this includes: + + - ``img_path``: path to the image file. + + - ``ori_shape``: original shape of the image as a tuple (H, W). + + - ``img_shape``: shape of the image input to the network as a tuple + (H, W). Note that images may be zero padded on the bottom/right + if the batch tensor is larger than this shape. + + - ``scale``: scale of the image as a tuple (W, H). + + - ``scale_factor``: a float indicating the pre-processing scale. + + - ``flip``: a boolean indicating if image flip transform was used. + + - ``flip_direction``: the flipping direction. + + Args: + meta_keys (Sequence[str], optional): The meta keys to saved in the + ``metainfo`` of the packed ``data_sample``. + """ + default_meta_keys = ('img_path', 'ori_shape', 'img_shape', 'scale', + 'scale_factor') + + def __init__(self, meta_keys: Sequence[str] = ()) -> None: + self.meta_keys = self.default_meta_keys + if meta_keys is not None: + if isinstance(meta_keys, str): + meta_keys = (meta_keys, ) + else: + assert isinstance(meta_keys, tuple), \ + 'meta_keys must be str or tuple.' + self.meta_keys += meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + - 'inputs' (dict[Tensor]): The forward data of models. + - 'data_samples' (obj:`ReIDDataSample`): The meta info of the + sample. + """ + packed_results = dict(inputs=dict(), data_samples=None) + assert 'img' in results, 'Missing the key ``img``.' + _type = type(results['img']) + label = results['gt_label'] + + if _type == list: + img = results['img'] + label = np.stack(label, axis=0) # (N,) + assert all([type(v) == _type for v in results.values()]), \ + 'All items in the results must have the same type.' 
+ else: + img = [results['img']] + + img = np.stack(img, axis=3) # (H, W, C, N) + img = img.transpose(3, 2, 0, 1) # (N, C, H, W) + img = np.ascontiguousarray(img) + + packed_results['inputs'] = to_tensor(img) + + data_sample = ReIDDataSample() + data_sample.set_gt_label(label) + + meta_info = dict() + for key in self.meta_keys: + meta_info[key] = results[key] + data_sample.set_metainfo(meta_info) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/mmtrack/datasets/transforms/loading.py b/mmtrack/datasets/transforms/loading.py new file mode 100644 index 000000000..59d3772b3 --- /dev/null +++ b/mmtrack/datasets/transforms/loading.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmdet.datasets.transforms import LoadAnnotations as MMDet_LoadAnnotations + +from mmtrack.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class LoadTrackAnnotations(MMDet_LoadAnnotations): + """Load and process the ``instances`` and ``seg_map`` annotation provided + by dataset. + + The annotation format is as the following: + + .. code-block:: python + + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + + # Label of image classification. + 'bbox_label': 1, + + # Used in tracking. + # Id of instances. + 'instance_id': 100, + + # Used in instance/panoptic segmentation. The segmentation mask + # of the instance or the information of segments. + # 1. If list[list[float]], it represents a list of polygons, + # one for each connected component of the object. Each + # list[float] is one simple polygon in the format of + # [x1, y1, ..., xn, yn] (n≥3). The Xs and Ys are absolute + # coordinates in unit of pixels. + # 2. If dict, it represents the per-pixel segmentation mask in + # COCO’s compressed RLE format. The dict should have keys + # “size” and “counts”. Can be loaded by pycocotools + 'mask': list[list[float]] or dict, + + } + ] + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': np.ndarray(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height (optional) + - width (optional) + - instances + + - bbox (optional) + - bbox_label + - instance_id (optional) + - mask (optional) + - ignore_flag (optional) + + - seg_map_path (optional) + + Added Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_id (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + + Args: + with_instance_id (bool): Whether to parse and load the instance id + annotation. Defaults to False. + """ + + def __init__(self, with_instance_id: bool = False, **kwargs) -> None: + super().__init__(**kwargs) + self.with_instance_id = with_instance_id + + def _load_bboxes(self, results: dict) -> None: + """Private function to load bounding box annotations. 
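# Illustrative sketch with hypothetical values, not from the patch: the format
# conversion documented above - per-instance dicts under results['instances']
# are gathered into flat gt_* arrays by LoadTrackAnnotations.
import numpy as np

results = {
    'instances': [
        dict(bbox=[10., 20., 50., 60.], bbox_label=1, instance_id=100),
        dict(bbox=[30., 30., 80., 90.], bbox_label=0, instance_id=101),
    ]
}
gt_bboxes = np.array([ins['bbox'] for ins in results['instances']],
                     dtype=np.float32).reshape(-1, 4)
gt_bboxes_labels = np.array(
    [ins['bbox_label'] for ins in results['instances']], dtype=np.int32)
gt_instances_id = np.array(
    [ins['instance_id'] for ins in results['instances']], dtype=np.int32)
assert gt_bboxes.shape == (2, 4)
assert gt_instances_id.tolist() == [100, 101]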
+ + The only difference is that we record the type of `gt_ignore_flags` + as np.int32. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + Returns: + dict: The dict contains loaded bounding box annotations. + """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results['instances']: + # The datasets which are only format in evaluation don't have + # groundtruth boxes. + if 'bbox' in instance: + gt_bboxes.append(instance['bbox']) + if 'ignore_flag' in instance: + gt_ignore_flags.append(instance['ignore_flag']) + + if len(gt_bboxes) != len(gt_ignore_flags): + # There may be no ``gt_ignore_flags`` in some cases, we treat them + # as all False in order to keep the length of ``gt_bboxes`` and + # ``gt_ignore_flags`` the same + gt_ignore_flags = [False] * len(gt_bboxes) + + if len(gt_bboxes) > 0 and len(gt_bboxes[0]) == 8: + # The bbox of VOT2018 has (N, 8) shape and it's not possible to be + # empty. + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape(-1, 8) + else: + # Some tasks, such as VID, may have empty bboxes and their bboxes + # need to be reshaped to (0, 4) format forcely in order to be + # compatible with ``TransformBroadcaster``. + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape(-1, 4) + + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=np.bool) + + def _load_instances_id(self, results: dict) -> None: + """Private function to load instances id annotations. + + Args: + results (dict): Result dict from :obj :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded instances id annotations. + """ + gt_instances_id = [] + for instance in results['instances']: + gt_instances_id.append(instance['instance_id']) + results['gt_instances_id'] = np.array(gt_instances_id, dtype=np.int32) + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label, instances id + and semantic segmentation and keypoints annotations. + """ + results = super().transform(results) + if self.with_instance_id: + self._load_instances_id(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_instance_id={self.with_instance_id}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'file_client_args={self.file_client_args})' + return repr_str diff --git a/mmtrack/datasets/transforms/processing.py b/mmtrack/datasets/transforms/processing.py new file mode 100644 index 000000000..2aee09071 --- /dev/null +++ b/mmtrack/datasets/transforms/processing.py @@ -0,0 +1,596 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from collections import defaultdict +from typing import Dict, List, Optional + +import numpy as np +from mmcv.transforms import BaseTransform +from mmengine.logging import print_log + +from mmtrack.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class DiMPSampling(BaseTransform): + """DiMP-style sampling. It's firstly used in `DiMP. + + `_. 
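# Editor's note, illustration only: the forced reshape in _load_bboxes above
# keeps frames without annotations well-formed - an empty list still becomes a
# (0, 4) float32 array rather than a (0,) one, which is the shape the
# downstream transforms expect.
import numpy as np

empty = np.array([], dtype=np.float32).reshape(-1, 4)
assert empty.shape == (0, 4)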
+ + Required Keys: + + - img_paths + - frame_ids + - video_id + - video_length + - bboxes + - instance_id (optional) + - mask (optional) + + - seg_map_path (optional) + + Added Keys: + + - instances + + - bbox (np.float32) + - bbox_label (np.int32) + - frame_id (np.int32) + - ignore_flag (np.bool) + - img_path (str) + + Args: + num_search_frames (int, optional): the number of search frames + num_template_frames (int, optional): the number of template frames + max_frame_range (list[int], optional): the max frame range of sampling + a positive search image for the template image. Its length is equal + to the number of extra templates, i.e., `num_template_frames`-1. + Default length is 1. + min_num_frames (int, optional): the min number of frames to be sampled. + """ + + def __init__(self, + num_search_frames: int = 3, + num_template_frames: int = 3, + max_frame_range: int = 200, + min_num_frames: int = 20): + self.num_search_frames = num_search_frames + self.num_template_frames = num_template_frames + self.max_frame_range = max_frame_range + self.min_num_frames = min_num_frames + + def random_sample_inds(self, + video_visibility: np.ndarray, + num_samples: int = 1, + frame_range: Optional[List] = None, + allow_invisible: bool = False, + force_invisible: bool = False) -> List[int]: + """Random sampling a specific number of samples from the specified + frame range of the video. It also considers the visibility of each + frame. + + Args: + video_visibility (np.ndarray): the visibility of each frame in the + video. + num_samples (int, optional): the number of samples. Defaults to 1. + frame_range (list | None, optional): the frame range of sampling. + Defaults to None. + allow_invisible (bool, optional): whether to allow to get invisible + samples. Defaults to False. + force_invisible (bool, optional): whether to force to get invisible + samples. Defaults to False. + + Returns: + List[int]: The sampled frame indexes. + """ + assert num_samples > 0 + if frame_range is None: + frame_range = [0, len(video_visibility)] + else: + assert isinstance(frame_range, list) and len(frame_range) == 2 + frame_range[0] = max(0, frame_range[0]) + frame_range[1] = min(len(video_visibility), frame_range[1]) + + video_visibility = np.asarray(video_visibility) + visibility_in_range = video_visibility[frame_range[0]:frame_range[1]] + # get indexes of valid samples + if force_invisible: + valid_inds = np.where(~visibility_in_range)[0] + frame_range[0] + else: + valid_inds = np.arange( + *frame_range) if allow_invisible else np.where( + visibility_in_range)[0] + frame_range[0] + + # No valid samples + if len(valid_inds) == 0: + return [None] * num_samples + + return random.choices(valid_inds, k=num_samples) + + def sampling_frames(self, video_visibility: np.ndarray) -> List: + """Sampling multiple template images and one search images in one + video. + + Args: + video_visibility (np.ndarray): the visibility of each frame in the + video. + + Returns: + List: the indexes of template and search images. 
+ """ + search_frame_inds = [None] + gap_increase = 0 + if self.is_video_data: + while search_frame_inds[0] is None: + # first randomly sample two frames from a video + base_frame_ind = self.random_sample_inds( + video_visibility, + num_samples=1, + frame_range=[ + self.num_template_frames - 1, + len(video_visibility) - self.num_search_frames + ]) + + prev_frame_inds = self.random_sample_inds( + video_visibility, + num_samples=self.num_template_frames - 1, + frame_range=[ + base_frame_ind[0] - self.max_frame_range - + gap_increase, base_frame_ind[0] + ]) + + if prev_frame_inds[0] is None: + gap_increase += 5 + continue + + temp_frame_inds = base_frame_ind + prev_frame_inds + search_frame_inds = self.random_sample_inds( + video_visibility, + num_samples=self.num_search_frames, + frame_range=[ + temp_frame_inds[0] + 1, temp_frame_inds[0] + + self.max_frame_range + gap_increase + ]) + + gap_increase += 5 + + sampled_inds = temp_frame_inds + search_frame_inds + else: + sampled_inds = [0] * ( + self.num_template_frames + self.num_search_frames) + + return sampled_inds + + def prepare_data(self, + video_info: dict, + sampled_inds: List[int], + is_positive_pairs: bool = True, + results: Optional[dict] = None) -> Dict[str, List]: + """Prepare sampled training data according to the sampled index. + + Args: + video_info (dict): the video information. It contains the keys: + ['bboxes', 'bboxes_isvalid', 'img_paths', 'frame_ids', + 'video_id', 'visible', 'video_length]. + sampled_inds (list[int]): the sampled frame indexes. + is_positive_pairs (bool, optional): whether it's the positive + pairs. Defaults to True. + results (dict[list], optional): The prepared results which need to + be updated. Defaults to None. + + Returns: + Dict[str, List]: contains the information of sampled data. + """ + if results is None: + results = defaultdict(list) + assert isinstance(results, dict) + for frame_ind in sampled_inds: + results['img_path'].append(video_info['img_paths'][frame_ind]) + results['frame_id'].append(video_info['frame_ids'][frame_ind]) + results['video_id'].append(video_info['video_id']) + results['video_length'].append(video_info['video_length']) + instance = [ + dict( + bbox=video_info['bboxes'][frame_ind], + bbox_label=np.array(is_positive_pairs, dtype=np.int32)) + ] + results['instances'].append(instance) + return results + + def transform(self, + pair_video_infos: List[dict]) -> Optional[Dict[str, List]]: + """ + Args: + pair_video_infos (list[dict]): contains two video infos. Each video + info contains the keys: ['bboxes','bboxes_isvalid','filename', + 'frame_ids','video_id','visible']. + + Returns: + Optional[Dict[str, List]]: contains the information of sampled + data. + """ + video_info, video_info_another = pair_video_infos + self.is_video_data = len(video_info['frame_ids']) > 1 and len( + video_info_another['frame_ids']) > 1 + enough_visible_frames = sum(video_info['visible']) > 2 * ( + self.num_search_frames + self.num_template_frames) and len( + video_info['visible']) >= self.min_num_frames + enough_visible_frames = enough_visible_frames or not \ + self.is_video_data + + if not enough_visible_frames: + return None + + sampled_inds = np.array(self.sampling_frames(video_info['visible'])) + # the sizes of some bboxes may be zero, because extral templates may + # get invalid bboxes. 
+ if not video_info['bboxes_isvalid'][sampled_inds].all(): + return None + + results = self.prepare_data(video_info, sampled_inds) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'num_search_frames={self.num_search_frames}, ' + repr_str += f'num_template_frames={self.num_template_frames}, ' + repr_str += f'max_frame_range={self.max_frame_range})' + repr_str += f'min_num_frames={self.min_num_frames})' + return repr_str + + +@TRANSFORMS.register_module() +class TridentSampling(DiMPSampling): + """Multitemplate-style sampling in a trident manner. It's firstly used in + `STARK `_. + + The input in this transform is a list of dict. In each dict: + + Required Keys: + + - img_paths + - frame_ids + - video_id + - video_length + - bboxes + - instance_id (optional) + - mask (optional) + + - seg_map_path (optional) + + Added Keys: + + - instances + + - bbox (np.float32) + - bbox_label (np.int32) + - frame_id (np.int32) + - ignore_flag (np.bool) + - img_path (str) + + Args: + num_search_frames (int, optional): the number of search frames + num_template_frames (int, optional): the number of template frames + max_frame_range (list[int], optional): the max frame range of sampling + a positive search image for the template image. Its length is equal + to the number of extra templates, i.e., `num_template_frames`-1. + Default length is 1. + cls_pos_prob (float, optional): the probility of sampling positive + samples in classification training. + train_cls_head (bool, optional): whether to train classification head. + min_num_frames (int, optional): the min number of frames to be sampled. + """ + + def __init__(self, + num_search_frames: int = 1, + num_template_frames: int = 2, + max_frame_range: List[int] = [200], + min_num_frames: int = 20, + cls_pos_prob: float = 0.5, + train_cls_head: bool = False): + assert num_template_frames >= 2 + assert len(max_frame_range) == num_template_frames - 1 + super().__init__(num_search_frames, num_template_frames, + max_frame_range, min_num_frames) + self.train_cls_head = train_cls_head + self.cls_pos_prob = cls_pos_prob + + def sampling_frames(self, video_visibility: np.ndarray) -> List[int]: + """Sampling multiple template images and one search images in one + video. + + Args: + video_visibility (np.ndarray): the visibility of each frame in the + video. + + Returns: + List[int]: the indexes of template and search images. + """ + extra_template_inds = [None] + sampling_count = 0 + if self.is_video_data: + while None in extra_template_inds: + # first randomly sample two frames from a video + template_ind, search_ind = self.random_sample_inds( + video_visibility, num_samples=2) + + # then sample the extra templates + extra_template_inds = [] + for max_frame_range in self.max_frame_range: + # make the sampling range is near the template_ind + if template_ind >= search_ind: + min_ind, max_ind = search_ind, \ + search_ind + max_frame_range + else: + min_ind, max_ind = search_ind - max_frame_range, \ + search_ind + extra_template_index = self.random_sample_inds( + video_visibility, + num_samples=1, + frame_range=[min_ind, max_ind], + allow_invisible=False)[0] + + extra_template_inds.append(extra_template_index) + + sampling_count += 1 + if sampling_count > 100: + print_log('-------Not sampling extra valid templates' + 'successfully. 
Stop sampling and copy the' + 'first template as extra templates-------') + extra_template_inds = [template_ind] * len( + self.max_frame_range) + + sampled_inds = [template_ind] + extra_template_inds + [search_ind] + else: + sampled_inds = [0] * ( + self.num_template_frames + self.num_search_frames) + + return sampled_inds + + def prepare_cls_data(self, video_info: dict, video_info_another: dict, + sampled_inds: List[int]) -> Dict[str, List]: + """Prepare the sampled classification training data according to the + sampled index. + + Args: + video_info (dict): the video information. It contains the keys: + ['bboxes', 'bboxes_isvalid', 'filename', 'frame_ids', + 'video_id', 'visible', 'video_length]. + video_info_another (dict): the another video information. It's only + used to get negative samples in classification train. It + contains the keys: ['bboxes','bboxes_isvalid','filename', + 'frame_ids','video_id','visible','video_length]]. + sampled_inds (list[int]): the sampled frame indexes. + + Returns: + Dict[str, List]: contains the information of sampled data. + """ + if random.random() < self.cls_pos_prob: + results = self.prepare_data( + video_info, sampled_inds, is_positive_pairs=True) + else: + results = self.prepare_data( + video_info, + sampled_inds[:self.num_template_frames], + is_positive_pairs=False) + + if self.is_video_data: + neg_search_ind = self.random_sample_inds( + video_info_another['bboxes_isvalid'], num_samples=1) + # may not get valid negative sample in current video + if neg_search_ind[0] is None: + return None + else: + neg_search_ind = [0] + + results = self.prepare_data( + video_info_another, + neg_search_ind, + is_positive_pairs=False, + results=results) + + return results + + def transform(self, + pair_video_infos: List[dict]) -> Optional[Dict[str, List]]: + """ + Args: + pair_video_infos (list[dict]): contains two video infos. Each video + info contains the keys: ['bboxes','bboxes_isvalid','filename', + 'frame_ids','video_id','visible','video_length']. + + Returns: + Optional[Dict[str, List]]: contains the information of sampled + data. If not enough visible frames, return None. + """ + video_info, video_info_another = pair_video_infos + self.is_video_data = len(video_info['frame_ids']) > 1 and len( + video_info_another['frame_ids']) > 1 + enough_visible_frames = sum(video_info['visible']) > 2 * ( + self.num_search_frames + self.num_template_frames) and len( + video_info['visible']) >= self.min_num_frames + enough_visible_frames = enough_visible_frames or not \ + self.is_video_data + + if not enough_visible_frames: + return None + + sampled_inds = np.array(self.sampling_frames(video_info['visible'])) + # the sizes of some bboxes may be zero, because extral templates may + # get invalid bboxes. 
+ if not video_info['bboxes_isvalid'][sampled_inds].all(): + return None + + if not self.train_cls_head: + results = self.prepare_data(video_info, sampled_inds) + else: + results = self.prepare_cls_data(video_info, video_info_another, + sampled_inds) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'num_search_frames={self.num_search_frames}, ' + repr_str += f'num_template_frames={self.num_template_frames}, ' + repr_str += f'max_frame_range={self.max_frame_range}, ' + repr_str += f'cls_pos_prob={self.cls_pos_prob}, ' + repr_str += f'train_cls_head={self.train_cls_head}, ' + repr_str += f'min_num_frames={self.min_num_frames})' + return repr_str + + +@TRANSFORMS.register_module() +class PairSampling(BaseTransform): + """Pair-style sampling. It's used in `SiameseRPN++ + + `_. + + The input in this transform is a list of dict. In each dict: + + Required Keys: + + - img_paths + - frame_ids + - video_id + - video_length + - bboxes + - instance_id (optional) + - mask (optional) + + - seg_map_path (optional) + + Added Keys: + + - instances + + - bbox (np.float32) + - bbox_label (np.int32) + - frame_id (np.int32) + - ignore_flag (np.bool) + - img_path (str) + + Args: + frame_range (List(int) | int): The sampling range of search + frames in the same video for template frame. Defaults to 5. + pos_prob (float, optional): The probility of sampling positive + sample pairs. Defaults to 0.8. + filter_template_img (bool, optional): If False, the template image will + be in the sampling search candidates, otherwise, it is exclude. + Defaults to False. + """ + + def __init__(self, + frame_range: int = 5, + pos_prob: float = 0.8, + filter_template_img: bool = False): + assert pos_prob >= 0.0 and pos_prob <= 1.0 + if isinstance(frame_range, int): + assert frame_range >= 0, 'frame_range can not be a negative value.' + frame_range = [-frame_range, frame_range] + elif isinstance(frame_range, list): + assert len(frame_range) == 2, 'The length must be 2.' + assert frame_range[0] <= 0 and frame_range[1] >= 0 + for i in frame_range: + assert isinstance(i, int), 'Each element must be int.' + else: + raise TypeError('The type of frame_range must be int or list.') + self.frame_range = frame_range + self.pos_prob = pos_prob + self.filter_template_img = filter_template_img + + def prepare_data(self, + video_info: dict, + sampled_inds: List[int], + is_positive_pairs: bool = False, + results: Optional[dict] = None) -> Dict[str, List]: + """Prepare sampled training data according to the sampled index. + + Args: + video_info (dict): the video information. It contains the keys: + ['bboxes', 'bboxes_isvalid', 'img_paths', 'frame_ids', + 'video_id', 'visible', 'video_length]. + sampled_inds (list[int]): the sampled frame indexes. + is_positive_pairs (bool, optional): whether it's the positive + pairs. Defaults to False. + results (dict[list], optional): The prepared results which need to + be updated. Defaults to None. + + Returns: + Dict[str, List]: contains the information of sampled data. 
+ """ + if results is None: + results = defaultdict(list) + assert isinstance(results, dict) + for frame_ind in sampled_inds: + results['img_path'].append(video_info['img_paths'][frame_ind]) + results['frame_id'].append(video_info['frame_ids'][frame_ind]) + results['video_id'].append(video_info['video_id']) + results['video_length'].append(video_info['video_length']) + instance = [ + dict( + bbox=video_info['bboxes'][frame_ind], + bbox_label=np.array(is_positive_pairs, dtype=np.int32)) + ] + results['instances'].append(instance) + return results + + def transform(self, pair_video_infos: List[dict]) -> dict: + """ + Args: + pair_video_infos (list[dict]): Containing the information of two + videos. Each video information contains the keys: + ['bboxes','bboxes_isvalid', 'img_paths', 'frame_ids', + 'video_id', 'visible', 'video_length']. + + Returns: + dict: contains the information of sampled data. + """ + video_info, video_info_another = pair_video_infos + if len(video_info['frame_ids']) > 1 and len( + video_info_another['frame_ids']) > 1: + template_frame_ind = np.random.choice(len(video_info['frame_ids'])) + if self.pos_prob > np.random.random(): + left_ind = max(template_frame_ind + self.frame_range[0], 0) + right_ind = min(template_frame_ind + self.frame_range[1], + len(video_info['frame_ids'])) + if self.filter_template_img: + ref_frames_inds = list( + range(left_ind, template_frame_ind)) + list( + range(template_frame_ind + 1, right_ind)) + else: + ref_frames_inds = list(range(left_ind, right_ind)) + search_frame_ind = np.random.choice(ref_frames_inds) + results = self.prepare_data( + video_info, [template_frame_ind, search_frame_ind], + is_positive_pairs=True) + else: + search_frame_ind = np.random.choice( + len(video_info_another['frame_ids'])) + results = self.prepare_data( + video_info, [template_frame_ind], is_positive_pairs=False) + results = self.prepare_data( + video_info_another, [search_frame_ind], + is_positive_pairs=False, + results=results) + + else: + if self.pos_prob > np.random.random(): + results = self.prepare_data( + video_info, [0, 0], is_positive_pairs=True) + else: + results = self.prepare_data( + video_info, [0], is_positive_pairs=False) + results = self.prepare_data( + video_info_another, [0], + is_positive_pairs=False, + results=results) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'frame_range={self.frame_range}, ' + repr_str += f'pos_prob={self.pos_prob}, ' + repr_str += f'filter_template_img={self.filter_template_img})' + return repr_str diff --git a/mmtrack/datasets/transforms/transforms.py b/mmtrack/datasets/transforms/transforms.py new file mode 100644 index 000000000..c0ae2111d --- /dev/null +++ b/mmtrack/datasets/transforms/transforms.py @@ -0,0 +1,984 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Dict, List, Optional, Tuple, Union + +import cv2 +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness +from mmengine.logging import print_log + +from mmtrack.registry import TRANSFORMS +from mmtrack.utils import crop_image + + +@TRANSFORMS.register_module() +class CropLikeSiamFC(BaseTransform): + """Crop images as SiamFC did. + + The way of cropping an image is proposed in + "Fully-Convolutional Siamese Networks for Object Tracking." + `SiamFC `_. 
+ + Required Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_id (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + - img + - img_shape (optional) + + Modified Keys: + + - gt_bboxes + - img + - img_shape (optional) + + Args: + context_amount (float): The context amount around a bounding box. + Defaults to 0.5. + exemplar_size (int): Exemplar size. Defaults to 127. + crop_size (int): Crop size. Defaults to 511. + """ + + def __init__(self, + context_amount: float = 0.5, + exemplar_size: int = 127, + crop_size: int = 511): + self.context_amount = context_amount + self.exemplar_size = exemplar_size + self.crop_size = crop_size + + def crop_like_SiamFC(self, + image: np.ndarray, + bbox: np.ndarray, + context_amount: float = 0.5, + exemplar_size: int = 127, + crop_size: int = 511) -> np.ndarray: + """Crop an image as SiamFC did. + + Args: + image (np.ndarray): of shape (H, W, 3). + bbox (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + context_amount (float): The context amount around a bounding box. + Defaults to 0.5. + exemplar_size (int): Exemplar size. Defaults to 127. + crop_size (int): Crop size. Defaults to 511. + + Returns: + np.ndarray: The cropped image of shape (crop_size, crop_size, 3). + """ + padding = np.mean(image, axis=(0, 1)).tolist() + + bbox = np.array([ + 0.5 * (bbox[2] + bbox[0]), 0.5 * (bbox[3] + bbox[1]), + bbox[2] - bbox[0], bbox[3] - bbox[1] + ]) + z_width = bbox[2] + context_amount * (bbox[2] + bbox[3]) + z_height = bbox[3] + context_amount * (bbox[2] + bbox[3]) + z_size = np.sqrt(z_width * z_height) + + z_scale = exemplar_size / z_size + d_search = (crop_size - exemplar_size) / 2. + pad = d_search / z_scale + x_size = z_size + 2 * pad + x_bbox = np.array([ + bbox[0] - 0.5 * x_size, bbox[1] - 0.5 * x_size, + bbox[0] + 0.5 * x_size, bbox[1] + 0.5 * x_size + ]) + + x_crop_img = crop_image(image, x_bbox, crop_size, padding) + return x_crop_img + + def generate_box(self, image: np.ndarray, gt_bbox: np.ndarray, + context_amount: float, exemplar_size: int) -> np.ndarray: + """Generate box based on cropped image. + + Args: + image (np.ndarray): The cropped image of shape + (self.crop_size, self.crop_size, 3). + gt_bbox (np.ndarray): In shape (4, ), in [x1, y1, x2, y2] format. + context_amount (float): The context amount around a bounding box. + exemplar_size (int): Exemplar size. Defaults to 127. + + Returns: + np.ndarray: Generated box of shape (4, ) in [x1, y1, x2, y2] + format. + """ + img_h, img_w = image.shape[:2] + w, h = gt_bbox[2] - gt_bbox[0], gt_bbox[3] - gt_bbox[1] + + z_width = w + context_amount * (w + h) + z_height = h + context_amount * (w + h) + z_scale = np.sqrt(z_width * z_height) + z_scale_factor = exemplar_size / z_scale + w = w * z_scale_factor + h = h * z_scale_factor + cx, cy = img_w // 2, img_h // 2 + bbox = np.array( + [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], + dtype=np.float32) + + return bbox + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, crop image like SiamFC did. + + Args: + results (dict): Dict from :obj:`mmtrack.dataset.BaseSOTDataset`. + + Returns: + dict: Dict that contains cropped images and + corresponding ground truth boxes. 
+ """ + crop_img = self.crop_like_SiamFC(results['img'], + results['gt_bboxes'].squeeze(), + self.context_amount, + self.exemplar_size, self.crop_size) + generated_bbox = self.generate_box(crop_img, + results['gt_bboxes'].squeeze(), + self.context_amount, + self.exemplar_size) + if 'img_shape' in results: + results['img_shape'] = crop_img.shape + + results['img'] = crop_img + results['gt_bboxes'] = generated_bbox[None] + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(context_amount={self.context_amount}, ' + repr_str += f'exemplar_size={self.exemplar_size}, ' + repr_str += f'crop_size={self.crop_size})' + return repr_str + + +@TRANSFORMS.register_module() +class SeqCropLikeStark(BaseTransform): + """Crop images as Stark did. + + The way of cropping an image is proposed in + "Learning Spatio-Temporal Transformer for Visual Tracking." + `Stark `_. + + Required Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_id (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + - img + - img_shape (optional) + - jittered_bboxes (np.float32) + + Modified Keys: + + - gt_bboxes + - img + - img_shape (optional) + + Added keys: + + - padding_mask + + Args: + crop_size_factor (list[int | float]): contains the ratio of crop size + to bbox size. + output_size (list[int | float]): contains the size of resized image + (always square). + """ + + def __init__(self, crop_size_factor: List[Union[int, float]], + output_size: List[Union[int, float]]): + self.crop_size_factor = crop_size_factor + self.output_size = output_size + + def crop_like_stark( + self, img: np.ndarray, bbox: np.ndarray, + crop_size_factor: np.ndarray, + output_size: int) -> Union[np.ndarray, float, np.ndarray]: + """Crop an image as Stark did. + + Args: + img (np.ndarray): of shape (H, W, 3). + bbox (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + crop_size_factor (float): the ratio of crop size to bbox size + output_size (int): the size of resized image (always square). + + Returns: + img_crop_padded (np.ndarray): the cropped image of shape + (crop_size, crop_size, 3). + resize_factor (float): the ratio of original image scale to cropped + image scale. + pdding_mask (np.ndarray): the padding mask caused by cropping. + """ + x1, y1, x2, y2 = np.split(bbox, 4, axis=-1) + bbox_w, bbox_h = x2 - x1, y2 - y1 + cx, cy = x1 + bbox_w / 2., y1 + bbox_h / 2. + + img_h, img_w, _ = img.shape + # 1. Crop image + # 1.1 calculate crop size and pad size + crop_size = math.ceil(math.sqrt(bbox_w * bbox_h) * crop_size_factor) + crop_size = max(crop_size, 1) + + x1 = int(np.round(cx - crop_size * 0.5)) + x2 = x1 + crop_size + y1 = int(np.round(cy - crop_size * 0.5)) + y2 = y1 + crop_size + + x1_pad = max(0, -x1) + x2_pad = max(x2 - img_w + 1, 0) + y1_pad = max(0, -y1) + y2_pad = max(y2 - img_h + 1, 0) + + # 1.2 crop image + img_crop = img[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] + + # 1.3 pad image + img_crop_padded = cv2.copyMakeBorder(img_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv2.BORDER_CONSTANT) + # 1.4 generate padding mask + img_h, img_w, _ = img_crop_padded.shape + pdding_mask = np.ones((img_h, img_w)) + end_x, end_y = -x2_pad, -y2_pad + if y2_pad == 0: + end_y = None + if x2_pad == 0: + end_x = None + pdding_mask[y1_pad:end_y, x1_pad:end_x] = 0 + + # 2. 
Resize image and padding mask + resize_factor = output_size / crop_size + img_crop_padded = cv2.resize(img_crop_padded, + (output_size, output_size)) + pdding_mask = cv2.resize(pdding_mask, + (output_size, output_size)).astype(np.bool_) + + return img_crop_padded, resize_factor, pdding_mask + + def generate_box(self, + bbox_gt: np.ndarray, + bbox_cropped: np.ndarray, + resize_factor: float, + output_size: float, + normalize: bool = False) -> np.ndarray: + """Transform the box coordinates from the original image coordinates to + the coordinates of the cropped image. + + Args: + bbox_gt (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + bbox_cropped (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] + format. + resize_factor (float): the ratio of original image scale to cropped + image scale. + output_size (float): the size of output image. + normalize (bool): whether to normalize the output box. + Defaults to False. + + Returns: + np.ndarray: generated box of shape (4, ) in [x1, y1, x2, y2] + format. + """ + assert output_size > 0 + bbox_gt_center = (bbox_gt[0:2] + bbox_gt[2:4]) * 0.5 + bbox_cropped_center = (bbox_cropped[0:2] + bbox_cropped[2:4]) * 0.5 + + bbox_out_center = (output_size - 1) / 2. + ( + bbox_gt_center - bbox_cropped_center) * resize_factor + bbox_out_wh = (bbox_gt[2:4] - bbox_gt[0:2]) * resize_factor + bbox_out = np.concatenate((bbox_out_center - 0.5 * bbox_out_wh, + bbox_out_center + 0.5 * bbox_out_wh), + axis=-1) + + return bbox_out / output_size if normalize else bbox_out + + def transform(self, results: dict) -> dict: + """The transform function. For each dict in results, crop image like + Stark did. + + Args: + results (dict): Dict of list from + :obj:`mmtrack.datasets.SeqBboxJitter`. + + Returns: + dict: Dict of list that contains cropped image and + the corresponding groundtruth bbox. + """ + imgs = results['img'] + gt_bboxes = results['gt_bboxes'] + jittered_bboxes = results['jittered_bboxes'] + new_imgs = [] + results['padding_mask'] = [] + for i, (img, gt_bbox, jittered_bbox) in enumerate( + zip(imgs, gt_bboxes, jittered_bboxes)): + gt_bbox, jittered_bbox = gt_bbox[0], jittered_bbox[0] + crop_img, resize_factor, padding_mask = self.crop_like_stark( + img, jittered_bbox, self.crop_size_factor[i], + self.output_size[i]) + + generated_bbox = self.generate_box( + gt_bbox, + jittered_bbox, + resize_factor, + self.output_size[i], + normalize=False) + + new_imgs.append(crop_img) + if 'img_shape' in results: + results['img_shape'][i] = crop_img.shape + results['gt_bboxes'][i] = generated_bbox[None] + results['padding_mask'].append(padding_mask) + + results['img'] = new_imgs + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'crop_size_factor={self.crop_size_factor}, ' + repr_str += f'output_size={self.output_size})' + return repr_str + + +@TRANSFORMS.register_module() +class CropLikeDiMP(BaseTransform): + """Crop images as PrDiMP did. + + The way of cropping an image is proposed in + "Learning Discriminative Model Prediction for Tracking." + `DiMP `_. + + Args: + crop_size_factor (float): contains the ratio of crop size + to bbox size. + output_size (float): contains the size of resized image + (always square). 
+ """ + + def __init__(self, crop_size_factor: float, output_size: float): + self.crop_size_factor = crop_size_factor + self.output_size = output_size + + def crop_like_dimp( + self, img: np.ndarray, bbox: np.ndarray, crop_size_factor: float, + output_size: int) -> Tuple[np.ndarray, np.ndarray, float]: + """Crop an image as DiMP did. + + Note: The difference between dimp and stark is the operation of moving + box inside image in dimp. This may cause the cropped image is not + centered on the `bbox`. + + Args: + image (np.ndarray): of shape (H, W, 3). + bbox (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + crop_size_factor (float): the ratio of crop size to bbox size + output_size (int): the size of resized image (always square). + + Returns: + img_crop_padded (np.ndarray): the cropped image of shape + (crop_size, crop_size, 3). + resize_factor (float): the ratio of original image scale to cropped + image scale. + pdding_mask (np.ndarray): the padding mask caused by cropping. + """ + x1, y1, x2, y2 = np.split(bbox, 4, axis=-1) + bbox_w, bbox_h = x2 - x1, y2 - y1 + cx, cy = x1 + bbox_w / 2., y1 + bbox_h / 2. + + img_h, img_w, _ = img.shape + # 1. Crop image + # 1.1 calculate crop size + crop_size = math.ceil(math.sqrt(bbox_w * bbox_h) * crop_size_factor) + crop_size = max(crop_size, 1) + + x1 = int(np.round(cx - crop_size * 0.5)) + x2 = x1 + crop_size + y1 = int(np.round(cy - crop_size * 0.5)) + y2 = y1 + crop_size + + # 1.2 Move box inside image + shift_x = max(0, -x1) + min(0, img_w - x2) + x1 += shift_x + x2 += shift_x + + shift_y = max(0, -y1) + min(0, img_h - y2) + y1 += shift_y + y2 += shift_y + + # keep the balance of left and right spacing if crop area exceeds the + # image + out_x = (max(0, -x1) + max(0, x2 - img_w)) // 2 + out_y = (max(0, -y1) + max(0, y2 - img_h)) // 2 + shift_x = (-x1 - out_x) * (out_x > 0) + shift_y = (-y1 - out_y) * (out_y > 0) + + x1 += shift_x + x2 += shift_x + y1 += shift_y + y2 += shift_y + + # 1.3 pad size + x1_pad = max(0, -x1) + x2_pad = max(x2 - img_w + 1, 0) + y1_pad = max(0, -y1) + y2_pad = max(y2 - img_h + 1, 0) + + # 1.4 crop image + img_crop = img[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] + + # 1.5 pad image + img_crop_padded = cv2.copyMakeBorder(img_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv2.BORDER_REPLICATE) + + # 2. Resize image and padding mask + assert y2 - y1 == crop_size + assert x2 - x1 == crop_size + resize_factor = output_size / crop_size + img_crop_padded = cv2.resize(img_crop_padded, + (output_size, output_size)) + + # the new box of cropped area + crop_area_bbox = np.array([x1, y1, x2 - x1, y2 - y1], dtype=float) + + return img_crop_padded, crop_area_bbox, resize_factor + + def generate_box(self, bbox_gt: np.ndarray, crop_area_bbox: np.ndarray, + resize_factor: np.ndarray) -> np.ndarray: + """Transform the box coordinates from the original image coordinates to + the coordinates of the resized cropped image. The center of cropped + image may be not jittered bbox since the operation of moving box inside + image. + + Args: + bbox_gt (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + crop_area_bbox (np.ndarray): of shape (4, ) in [x1, y1, w, h] + format. + resize_factor (float): the ratio of original image scale to cropped + image scale. + output_size (float): the size of output image. + normalize (bool): whether to normalize the output box. + Default to True. + + Returns: + np.ndarray: generated box of shape (4, ) in [x1, y1, x2, y2] + format. 
+ """ + bbox_out = bbox_gt.copy() + # The coordinate origin of `bbox_out` is the top left corner of + # `crop_area_bbox`. + bbox_out[0:4:2] -= crop_area_bbox[0] + bbox_out[1:4:2] -= crop_area_bbox[1] + + bbox_out *= resize_factor + + return bbox_out + + def transform(self, results: dict) -> dict: + """Call function. Crop image like DiMP did. + + Args: + results (dict): Dict from :obj:`mmtrack.dataset.BaseSOTDataset`. + + Returns: + dict: Dict that contains cropped images and + corresponding ground truth boxes. + """ + gt_bbox = results['gt_bboxes'][0] + jittered_bboxes = results['jittered_bboxes'][0] + crop_img, crop_area_bbox, resize_factor = self.crop_like_dimp( + results['img'], jittered_bboxes, self.crop_size_factor, + self.output_size) + + generated_bbox = self.generate_box(gt_bbox, crop_area_bbox, + resize_factor) + + results['img'] = crop_img + if 'img_shape' in results: + results['img_shape'] = crop_img.shape + results['gt_bboxes'] = generated_bbox[None] + + return results + + +@TRANSFORMS.register_module() +class SeqBboxJitter(BaseTransform): + """Bounding box jitter augmentation. The jittered bboxes are used for + subsequent image cropping, like `SeqCropLikeStark`. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Added Keys: + + - jittered_bboxes + + Args: + scale_jitter_factor (list[int | float]): contains the factor of scale + jitter. + center_jitter_factor (list[int | float]): contains the factor of center + jitter. + crop_size_factor (list[int | float]): contains the ratio of crop size + to bbox size. + """ + + def __init__(self, scale_jitter_factor: List[Union[int, float]], + center_jitter_factor: List[Union[int, float]], + crop_size_factor: List[Union[int, float]]): + self.scale_jitter_factor = scale_jitter_factor + self.center_jitter_factor = center_jitter_factor + self.crop_size_factor = crop_size_factor + + def transform(self, results: Dict[str, List]) -> Optional[dict]: + """The transform function. + + Args: + results (Dict[str, List]): Dict of list from + :obj:`mmtrack.datasets.BaseSOTDataset`. + + Returns: + Optional[dict]: Dict of list that contains augmented images. If + getting invalid cropped image, return None. + """ + gt_bboxes = results['gt_bboxes'] + jittered_bboxes = [] + for i, gt_bbox in enumerate(gt_bboxes): + x1, y1, x2, y2 = np.split(gt_bbox.squeeze(), 4, axis=-1) + bbox_w, bbox_h = x2 - x1, y2 - y1 + gt_bbox_cxcywh = np.concatenate( + [x1 + bbox_w / 2., y1 + bbox_h / 2., bbox_w, bbox_h], axis=-1) + + crop_img_size = -1 + # avoid croped image size too small. 
+ count = 0 + while crop_img_size < 1: + count += 1 + if count > 100: + print_log( + f'-------- bbox {gt_bbox_cxcywh} is invalid -------') + return None + jittered_wh = gt_bbox_cxcywh[2:4] * np.exp( + np.random.randn(2) * self.scale_jitter_factor[i]) + crop_img_size = np.ceil( + np.sqrt(jittered_wh.prod()) * self.crop_size_factor[i]) + + max_offset = np.sqrt( + jittered_wh.prod()) * self.center_jitter_factor[i] + jittered_center = gt_bbox_cxcywh[0:2] + max_offset * ( + np.random.rand(2) - 0.5) + + jittered_bbox = np.concatenate( + (jittered_center - 0.5 * jittered_wh, + jittered_center + 0.5 * jittered_wh), + axis=-1) + jittered_bboxes.append(jittered_bbox[None]) + + results['jittered_bboxes'] = jittered_bboxes + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'scale_jitter_factor={self.scale_jitter_factor}, ' + repr_str += f'center_jitter_factor={self.center_jitter_factor}, ' + repr_str += f'crop_size_factor={self.crop_size_factor})' + return repr_str + + +@TRANSFORMS.register_module() +class BrightnessAug(BaseTransform): + """Brightness augmention for images. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Modified Keys: + + - img + + Args: + jitter_range (float): The range of brightness jitter. + Defaults to 0.. + """ + + def __init__(self, jitter_range: float = 0.): + self.jitter_range = jitter_range + + @cache_randomness + def _random_brightness_factor(self) -> float: + """Generate the factor of brightness randomly. + + Returns: + float: The factor of brightness. + """ + + brightness_factor = np.random.uniform( + max(0, 1 - self.jitter_range), 1 + self.jitter_range) + return brightness_factor + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, perform brightness augmention for image in + the dict. + + Args: + results (dict): list of dict from :obj:`mmengine.BaseDataset`. + Returns: + dict: Dict that contains augmented image. + """ + brightness_factor = self._random_brightness_factor() + image = np.dot(results['img'], brightness_factor).clip(0, 255.0) + results['img'] = image + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'jitter_range={self.jitter_range})' + return repr_str + + +@TRANSFORMS.register_module() +class GrayAug(BaseTransform): + """Gray augmention for images. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Modified Keys: + + - img + + Args: + prob (float): The probability to perform gray augmention. + Defaults to 0.. + """ + + def __init__(self, prob: float = 0.): + self.prob = prob + + @cache_randomness + def _random_gray(self) -> bool: + """Whether to convert the original image to gray image. + + Returns: + bool: Whether to convert the original image to gray image. + """ + if self.prob > np.random.random(): + convert2gray = True + else: + convert2gray = False + return convert2gray + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, perform gray augmention for image in the + dict. + + Args: + results (list[dict]): List of dict from + :obj:`mmengine.BaseDataset`. + + Returns: + dict: Dict that contains augmented gray image. 
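A standalone sketch of the sampling performed in the loop above: the width/height are jittered log-normally through `scale_jitter_factor`, the center is shifted uniformly by up to `center_jitter_factor` times the jittered box size, and the crop size is re-derived from the jittered box. The factor values and the box are illustrative only, not taken from any config.

import numpy as np

# Illustrative ground-truth box in [cx, cy, w, h] format and jitter strengths.
gt_bbox_cxcywh = np.array([140., 170., 80., 100.])
scale_jitter_factor, center_jitter_factor, crop_size_factor = 0.25, 3.0, 5.0

rng = np.random.default_rng(0)
# Log-normal jitter of the box size.
jittered_wh = gt_bbox_cxcywh[2:4] * np.exp(
    rng.standard_normal(2) * scale_jitter_factor)
# Uniform jitter of the box center, proportional to the jittered size.
max_offset = np.sqrt(jittered_wh.prod()) * center_jitter_factor
jittered_center = gt_bbox_cxcywh[0:2] + max_offset * (rng.random(2) - 0.5)

jittered_bbox = np.concatenate((jittered_center - 0.5 * jittered_wh,
                                jittered_center + 0.5 * jittered_wh))
# The crop that would be taken around this box; the loop above resamples
# until this value is at least 1.
crop_img_size = np.ceil(np.sqrt(jittered_wh.prod()) * crop_size_factor)
print(jittered_bbox, crop_img_size)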
+ """ + if self._random_gray(): + grayed = cv2.cvtColor(results['img'], cv2.COLOR_BGR2GRAY) + image = cv2.cvtColor(grayed, cv2.COLOR_GRAY2BGR) + results['img'] = image + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class SeqShiftScaleAug(BaseTransform): + """Shift and rescale images and bounding boxes. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Modified Keys: + + - gt_bboxes + - img + - img_shape (optional) + + Args: + target_size (list[int]): list of int denoting exemplar size and search + size, respectively. Defaults to [127, 255]. + shift (list[int]): list of int denoting the max shift offset. Defaults + to [4, 64]. + scale (list[float]): list of float denoting the max rescale factor. + Defaults to [0.05, 0.18]. + """ + + def __init__(self, + target_size: List[int] = [127, 255], + shift: List[int] = [4, 64], + scale: List[float] = [0.05, 0.18]): + self.target_size = target_size + self.shift = shift + self.scale = scale + + def _shift_scale_aug(self, image: np.ndarray, bbox: np.ndarray, + target_size: int, shift: int, + scale: float) -> Tuple[np.ndarray, np.ndarray]: + """Shift and rescale an image and corresponding bounding box. + + Args: + image (np.ndarray): of shape (H, W, 3). Typically H and W equal to + 511. + bbox (np.ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + target_size (int): Exemplar size or search size. + shift (int): The max shift offset. + scale (float): The max rescale factor. + + Returns: + tuple(np.ndarray, np.ndarray): The first element is of shape + (target_size, target_size, 3), and the second element is the + corresponding ground truth box in [x1, y1, x2, y2] format. + """ + img_h, img_w = image.shape[:2] + + scale_x = (2 * np.random.random() - 1) * scale + 1 + scale_y = (2 * np.random.random() - 1) * scale + 1 + scale_x = min(scale_x, float(img_w) / target_size) + scale_y = min(scale_y, float(img_h) / target_size) + crop_region = np.array([ + img_w // 2 - 0.5 * scale_x * target_size, + img_h // 2 - 0.5 * scale_y * target_size, + img_w // 2 + 0.5 * scale_x * target_size, + img_h // 2 + 0.5 * scale_y * target_size + ]) + + shift_x = (2 * np.random.random() - 1) * shift + shift_y = (2 * np.random.random() - 1) * shift + shift_x = max(-crop_region[0], min(img_w - crop_region[2], shift_x)) + shift_y = max(-crop_region[1], min(img_h - crop_region[3], shift_y)) + shift = np.array([shift_x, shift_y, shift_x, shift_y]) + crop_region += shift + + crop_img = crop_image(image, crop_region, target_size) + bbox -= np.array( + [crop_region[0], crop_region[1], crop_region[0], crop_region[1]]) + bbox /= np.array([scale_x, scale_y, scale_x, scale_y], + dtype=np.float32) + return crop_img, bbox + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, shift and rescale the image and the + bounding box in the dict. + + Args: + results (dict(list)): Dict of list that from + :obj:`mmtrack.dataset.BaseSOTDataset`. + + Returns: + dict(list): List of dict that contains cropped image and + corresponding ground truth box. 
+ """ + imgs = results['img'] + gt_bboxes = results['gt_bboxes'] + new_imgs = [] + new_gt_bboxes = [] + for i, (img, gt_bbox) in enumerate(zip(imgs, gt_bboxes)): + crop_img, crop_bbox = self._shift_scale_aug( + img, gt_bbox.squeeze(), self.target_size[i], self.shift[i], + self.scale[i]) + crop_bbox = crop_bbox[None] + new_gt_bboxes.append(crop_bbox) + new_imgs.append(crop_img) + if 'img_shape' in results: + results['img_shape'][i] = crop_img.shape + results['img'] = new_imgs + results['gt_bboxes'] = new_gt_bboxes + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(target_size={self.target_size}, ' + repr_str += f'shift={self.shift}, ' + repr_str += f'scale={self.scale})' + return repr_str + + +@TRANSFORMS.register_module() +class SeqColorAug(BaseTransform): + """Color augmention for images. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Modified Keys: + + - img + + Args: + prob (list[float]): The probability to perform color augmention for + each image. Defaults to [1.0, 1.0]. + rgb_var (list[list]]): The values of color augmentaion. Defaults to + [[-0.55919361, 0.98062831, -0.41940627], + [1.72091413, 0.19879334, -1.82968581], + [4.64467907, 4.73710203, 4.88324118]]. + """ + + def __init__(self, + prob: List[float] = [1.0, 1.0], + rgb_var: List[List] = [[-0.55919361, 0.98062831, -0.41940627], + [1.72091413, 0.19879334, -1.82968581], + [4.64467907, 4.73710203, 4.88324118]]): + self.prob = prob + self.rgb_var = np.array(rgb_var, dtype=np.float32) + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, perform color augmention for image in the + dict. + + Args: + results (dict[list]): Dict of list that from + :obj:`mmengine.BaseDataset`. + + Returns: + dict[list]: Dict of list that contains augmented color image. + """ + imgs = results['img'] + new_imgs = [] + for i, img in enumerate(imgs): + if self.prob[i] > np.random.random(): + offset = np.dot(self.rgb_var, np.random.randn(3, 1)) + # bgr to rgb + offset = offset[::-1] + offset = offset.reshape(3) + img = (img - offset).astype(np.float32) + new_imgs.append(img) + + results['img'] = new_imgs + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'rgb_var={self.rgb_var})' + return repr_str + + +@TRANSFORMS.register_module() +class SeqBlurAug(BaseTransform): + """Blur augmention for images. + + Required Keys: + + - gt_bboxes + - gt_bboxes_labels (optional) + - gt_instances_id (optional) + - gt_masks (optional) + - gt_seg_map (optional) + - gt_ignore_flags (optional) + - img + - img_shape (optional) + + Modified Keys: + + - img + + Args: + prob (list[float]): The probability to perform blur augmention for + each image. Defaults to [0.0, 0.2]. + """ + + def __init__(self, prob: List[float] = [0.0, 0.2]): + self.prob = prob + + def transform(self, results: dict) -> dict: + """The transform function. + + For each dict in results, perform blur augmention for image in the + dict. + + Args: + results (dict[list]): Dict of list that from + :obj:`mmtrack.CocoVideoDataset`. + + Returns: + dict[list]: Dict of list that contains augmented blur image. 
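A hedged sketch of how these sequence-level augmentations are usually chained for a Siamese tracker: exemplar and search frames go through the same list, and each transform indexes its per-frame parameters by position. Only the three transform entries restate defaults from this diff; the loading and packing steps a real pipeline needs are omitted on purpose.

# Sketch of the augmentation part of a SiamRPN-like train pipeline; the
# argument values are the class defaults shown above, not a tuned config.
train_pipeline = [
    dict(
        type='SeqShiftScaleAug',
        target_size=[127, 255],  # exemplar size, search size
        shift=[4, 64],
        scale=[0.05, 0.18]),
    dict(type='SeqColorAug', prob=[1.0, 1.0]),
    dict(type='SeqBlurAug', prob=[0.0, 0.2]),
]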
+ """ + imgs = results['img'] + new_imgs = [] + for i, img in enumerate(imgs): + if self.prob[i] > np.random.random(): + sizes = np.arange(5, 46, 2) + size = np.random.choice(sizes) + kernel = np.zeros((size, size)) + c = int(size / 2) + wx = np.random.random() + kernel[:, c] += 1. / size * wx + kernel[c, :] += 1. / size * (1 - wx) + img = cv2.filter2D(img, -1, kernel) + new_imgs.append(img) + + results['img'] = new_imgs + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'prob={self.prob})' + return repr_str diff --git a/mmtrack/datasets/uav123_dataset.py b/mmtrack/datasets/uav123_dataset.py index ea4ac8881..7cc041c87 100644 --- a/mmtrack/datasets/uav123_dataset.py +++ b/mmtrack/datasets/uav123_dataset.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import time +from typing import List -from mmdet.datasets import DATASETS - +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @@ -18,12 +18,9 @@ def __init__(self, *args, **kwargs): """Initialization of SOT dataset class.""" super().__init__(*args, **kwargs) - def load_data_infos(self, split='test'): + def load_data_list(self) -> List[dict]: """Load dataset information. - Args: - split (str, optional): Dataset split. Defaults to 'test'. - Returns: list[dict]: The length of the list is the number of videos. The inner dict is in the following format: @@ -40,8 +37,8 @@ def load_data_infos(self, split='test'): print('Loading UAV123 dataset...') start_time = time.time() data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. diff --git a/mmtrack/datasets/vot_dataset.py b/mmtrack/datasets/vot_dataset.py index 6ec84e227..6fd4f26c7 100644 --- a/mmtrack/datasets/vot_dataset.py +++ b/mmtrack/datasets/vot_dataset.py @@ -1,52 +1,35 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -import os.path as osp import time +from typing import List -import mmcv import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS -from mmtrack.core.evaluation import eval_sot_accuracy_robustness, eval_sot_eao +from mmtrack.registry import DATASETS from .base_sot_dataset import BaseSOTDataset @DATASETS.register_module() class VOTDataset(BaseSOTDataset): - """VOT dataset of single object tracking. + """VOT dataset of single object tracking. The dataset is only used to test. - The dataset is only used to test. + Args: + dataset_type (str, optional): The type of VOT challenge. The + optional values are in ['vot2018', 'vot2018_lt', + 'vot2019', 'vot2019_lt'] """ - def __init__(self, dataset_type='vot2018', *args, **kwargs): - """Initialization of SOT dataset class. - - Args: - dataset_type (str, optional): The type of VOT challenge. 
The - optional values are in ['vot2018', 'vot2018_lt', - 'vot2019', 'vot2019_lt', 'vot2020', 'vot2021'] - """ + def __init__(self, dataset_type: str = 'vot2018', *args, **kwargs): + """Initialization of SOT dataset class.""" assert dataset_type in [ - 'vot2018', 'vot2018_lt', 'vot2019', 'vot2019_lt', 'vot2020', - 'vot2021' - ] + 'vot2018', 'vot2018_lt', 'vot2019', 'vot2019_lt' + ], 'We only support VOT-[2018~2019] chanllenges' self.dataset_type = dataset_type super().__init__(*args, **kwargs) - # parameter, used for EAO evaluation, may vary by different vot - # challenges. - self.INTERVAL = dict( - vot2018=[100, 356], - vot2019=[46, 291], - vot2020=[115, 755], - vot2021=[115, 755]) - def load_data_infos(self, split='test'): + def load_data_list(self) -> List[dict]: """Load dataset information. - Args: - split (str, optional): Dataset split. Defaults to 'test'. - Returns: list[dict]: The length of the list is the number of videos. The inner dict is in the following format: @@ -63,8 +46,8 @@ def load_data_infos(self, split='test'): print('Loading VOT dataset...') start_time = time.time() data_infos = [] - data_infos_str = self.loadtxt( - self.ann_file, return_array=False).split('\n') + data_infos_str = self._loadtxt( + self.ann_file, return_ndarray=False).split('\n') # the first line of annotation file is a dataset comment. for line in data_infos_str[1:]: # compatible with different OS. @@ -79,14 +62,14 @@ def load_data_infos(self, split='test'): print(f'VOT dataset loaded! ({time.time()-start_time:.2f} s)') return data_infos - def get_ann_infos_from_video(self, video_ind): + def get_ann_infos_from_video(self, video_ind: int) -> np.ndarray: """Get bboxes annotation about the instance in a video. Args: video_ind (int): video index Returns: - ndarray: in [N, 8] shape. The N is the bbox number and the bbox + np.ndarray: in [N, 8] shape. The N is the bbox number and the bbox is in (x1, y1, x2, y2, x3, y3, x4, y4) format. """ bboxes = self.get_bboxes_from_video(video_ind) @@ -103,97 +86,3 @@ def get_ann_infos_from_video(self, video_ind): ann_infos = dict( bboxes=bboxes, bboxes_isvalid=bboxes_isvalid, **visible_info) return ann_infos - - # TODO support multirun test - def evaluate(self, results, metric=['track'], logger=None, interval=None): - """Evaluation in VOT protocol. - - Args: - results (dict): Testing results of the dataset. The tracking bboxes - are in (tl_x, tl_y, br_x, br_y) format. - metric (str | list[str]): Metrics to be evaluated. Options are - 'track'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - interval (list): an specified interval in EAO curve used to - calculate the EAO score. There are different settings in - different VOT challenges. 
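For orientation, a rough sketch of the work `load_data_list` does with the annotation file after the rename (the 0.x `loadtxt`/`load_data_infos(split=...)` pair becomes `_loadtxt`/`load_data_list()`). The exact per-line layout is elided in this diff, so the comma-separated fields and the `ann_path` key below are assumptions; only `video_path`, `start_frame_id`, `end_frame_id` and `framename_template` appear elsewhere in the removed VOT code.

import os


def parse_data_info(line: str) -> dict:
    """Hypothetical parser for one line of the *_infos.txt annotation file."""
    # 'compatible with different OS' in the original presumably means
    # normalising the path separators.
    fields = line.strip().replace('\\', '/').split(',')
    return dict(
        video_path=os.path.normpath(fields[0]),
        ann_path=os.path.normpath(fields[1]),  # assumed field
        start_frame_id=int(fields[2]),
        end_frame_id=int(fields[3]),
        framename_template='%08d.jpg')  # assumed template


# The first line of the file is a dataset comment, hence the [1:] slice.
lines = 'comment\nbird1/img,anno/bird1.txt,1,99'.split('\n')
data_list = [parse_data_info(line) for line in lines[1:] if line]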
- Returns: - dict[str, float]: - """ - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] - else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['track'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - # get all test annotations - # annotations are in list[ndarray] format - annotations = [] - for video_ind in range(len(self.data_infos)): - bboxes = self.get_ann_infos_from_video(video_ind)['bboxes'] - annotations.append(bboxes) - - # tracking_bboxes converting code - eval_results = dict() - if 'track' in metrics: - assert len(self) == len( - results['track_bboxes'] - ), f"{len(self)} == {len(results['track_bboxes'])}" - print_log('Evaluate VOT Benchmark...', logger=logger) - track_bboxes = [] - start_ind = end_ind = 0 - videos_wh = [] - for data_info in self.data_infos: - num = data_info['end_frame_id'] - data_info[ - 'start_frame_id'] + 1 - end_ind += num - - bboxes_per_video = [] - # results are in dict(track_bboxes=list[ndarray]) format - # track_bboxes are in list[list[ndarray]] format - for bbox in results['track_bboxes'][start_ind:end_ind]: - # the last element of `bbox` is score. - if len(bbox) != 2: - # convert bbox format from (tl_x, tl_y, br_x, br_y) to - # (x1, y1, w, h) - bbox[2] -= bbox[0] - bbox[3] -= bbox[1] - - bboxes_per_video.append(bbox[:-1]) - - track_bboxes.append(bboxes_per_video) - start_ind += num - - # read one image in the video to get video width and height - filename = osp.join(self.img_prefix, data_info['video_path'], - data_info['framename_template'] % 1) - img = mmcv.imread( - filename, file_client_args=self.file_client_args) - videos_wh.append((img.shape[1], img.shape[0])) - - interval = self.INTERVAL[self.dataset_type] if interval is None \ - else interval - - eao_score = eval_sot_eao( - results=track_bboxes, - annotations=annotations, - videos_wh=videos_wh, - interval=interval) - eval_results.update(eao_score) - - accuracy_robustness = eval_sot_accuracy_robustness( - results=track_bboxes, - annotations=annotations, - videos_wh=videos_wh) - eval_results.update(accuracy_robustness) - for k, v in eval_results.items(): - if isinstance(v, float): - eval_results[k] = float(f'{(v):.4f}') - print_log(eval_results, logger=logger) - return eval_results diff --git a/mmtrack/datasets/youtube_vis_dataset.py b/mmtrack/datasets/youtube_vis_dataset.py index 4bd7a0bf9..3c230f080 100644 --- a/mmtrack/datasets/youtube_vis_dataset.py +++ b/mmtrack/datasets/youtube_vis_dataset.py @@ -1,241 +1,52 @@ # Copyright (c) OpenMMLab. All rights reserved. 
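The deleted `evaluate` method is part of the broader 1.x migration: datasets no longer score themselves, and VOT-style EAO/accuracy/robustness evaluation moves into an mmengine metric (`SOTMetric` is exported from `mmtrack.evaluation.metrics` later in this diff). A hedged sketch of the new wiring; the constructor options of `SOTMetric` are not shown here, so none are passed.

# Evaluation is now configured on the val/test loop instead of the dataset.
# 0.x (removed above):  dataset.evaluate(results, metric=['track'])
val_evaluator = dict(type='SOTMetric')  # options omitted; not shown in diff
test_evaluator = val_evaluator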
-import copy -import os.path as osp -import tempfile -import zipfile -from collections import defaultdict - -import mmcv -import numpy as np -from mmcv.utils import print_log -from mmdet.datasets import DATASETS - -from mmtrack.core import eval_vis, results2outs -from .coco_video_dataset import CocoVideoDataset +from mmtrack.registry import DATASETS +from .base_video_dataset import BaseVideoDataset @DATASETS.register_module() -class YouTubeVISDataset(CocoVideoDataset): - """YouTube VIS dataset for video instance segmentation.""" - - CLASSES_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', - 'skateboard', 'sedan', 'ape', 'dog', 'snake', - 'monkey', 'hand', 'rabbit', 'duck', 'cat', 'cow', - 'fish', 'train', 'horse', 'turtle', 'bear', - 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', - 'owl', 'surfboard', 'airplane', 'truck', 'zebra', - 'tiger', 'elephant', 'snowboard', 'boat', 'shark', - 'mouse', 'frog', 'eagle', 'earless_seal', - 'tennis_racket') +class YouTubeVISDataset(BaseVideoDataset): + """YouTube VIS dataset for video instance segmentation. - CLASSES_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', 'cat', - 'cow', 'deer', 'dog', 'duck', 'earless_seal', - 'elephant', 'fish', 'flying_disc', 'fox', 'frog', - 'giant_panda', 'giraffe', 'horse', 'leopard', - 'lizard', 'monkey', 'motorbike', 'mouse', 'parrot', - 'person', 'rabbit', 'shark', 'skateboard', 'snake', - 'snowboard', 'squirrel', 'surfboard', - 'tennis_racket', 'tiger', 'train', 'truck', - 'turtle', 'whale', 'zebra') + Args: + dataset_version (str): Select dataset year version. + """ - def __init__(self, dataset_version, *args, **kwargs): + def __init__(self, dataset_version: str, *args, **kwargs): self.set_dataset_classes(dataset_version) super().__init__(*args, **kwargs) @classmethod - def set_dataset_classes(cls, dataset_version): - if dataset_version == '2019': - cls.CLASSES = cls.CLASSES_2019_version - elif dataset_version == '2021': - cls.CLASSES = cls.CLASSES_2021_version - else: - raise NotImplementedError('Not supported YouTubeVIS dataset' - f'version: {dataset_version}') - - def format_results(self, - results, - resfile_path=None, - metrics=['track_segm'], - save_as_json=True): - """Format the results to a zip file (standard format for YouTube-VIS - Challenge). + def set_dataset_classes(cls, dataset_version: str) -> None: + """Pass the category of the corresponding year to metainfo. Args: - results (dict(list[ndarray])): Testing results of the dataset. - resfile_path (str, optional): Path to save the formatted results. - Defaults to None. - metrics (list[str], optional): The results of the specific metrics - will be formatted. Defaults to ['track_segm']. - save_as_json (bool, optional): Whether to save the - json results file. Defaults to True. - - Returns: - tuple: (resfiles, tmp_dir), resfiles is the path of the result - json file, tmp_dir is the temporal directory created for saving - files. + dataset_version (str): Select dataset year version. """ - assert isinstance(results, dict), 'results must be a dict.' 
- if isinstance(metrics, str): - metrics = [metrics] - assert 'track_segm' in metrics - if resfile_path is None: - tmp_dir = tempfile.TemporaryDirectory() - resfile_path = tmp_dir.name - else: - tmp_dir = None - resfiles = osp.join(resfile_path, 'results.json') - - inds = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0] - num_vids = len(inds) - assert num_vids == len(self.vid_ids) - inds.append(len(self.data_infos)) - vid_infos = self.coco.load_vids(self.vid_ids) - - json_results = [] - for i in range(num_vids): - video_id = vid_infos[i]['id'] - # collect data for each instances in a video. - collect_data = dict() - for frame_id, (bbox_res, mask_res) in enumerate( - zip(results['track_bboxes'][inds[i]:inds[i + 1]], - results['track_masks'][inds[i]:inds[i + 1]])): - outs_track = results2outs(bbox_results=bbox_res) - bboxes = outs_track['bboxes'] - labels = outs_track['labels'] - ids = outs_track['ids'] - masks = mmcv.concat_list(mask_res) - assert len(masks) == len(bboxes) - for j, id in enumerate(ids): - if id not in collect_data: - collect_data[id] = dict( - category_ids=[], scores=[], segmentations=dict()) - collect_data[id]['category_ids'].append(labels[j]) - collect_data[id]['scores'].append(bboxes[j][4]) - if isinstance(masks[j]['counts'], bytes): - masks[j]['counts'] = masks[j]['counts'].decode() - collect_data[id]['segmentations'][frame_id] = masks[j] - - # transform the collected data into official format - for id, id_data in collect_data.items(): - output = dict() - output['video_id'] = video_id - output['score'] = np.array(id_data['scores']).mean().item() - # majority voting for sequence category - output['category_id'] = np.bincount( - np.array(id_data['category_ids'])).argmax().item() + 1 - output['segmentations'] = [] - for frame_id in range(inds[i + 1] - inds[i]): - if frame_id in id_data['segmentations']: - output['segmentations'].append( - id_data['segmentations'][frame_id]) - else: - output['segmentations'].append(None) - json_results.append(output) - - if not save_as_json: - return json_results - mmcv.dump(json_results, resfiles) - - # zip the json file in order to submit to the test server. - zip_file_name = osp.join(resfile_path, 'submission_file.zip') - zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) - print_log(f"zip the 'results.json' into '{zip_file_name}', " - 'please submmit the zip file to the test server') - zf.write(resfiles, 'results.json') - zf.close() - - return resfiles, tmp_dir - - def evaluate(self, results, metric=['track_segm'], logger=None): - """Evaluation in COCO protocol. + CLASSES_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', + 'skateboard', 'sedan', 'ape', 'dog', 'snake', + 'monkey', 'hand', 'rabbit', 'duck', 'cat', + 'cow', 'fish', 'train', 'horse', 'turtle', + 'bear', 'motorbike', 'giraffe', 'leopard', + 'fox', 'deer', 'owl', 'surfboard', 'airplane', + 'truck', 'zebra', 'tiger', 'elephant', + 'snowboard', 'boat', 'shark', 'mouse', 'frog', + 'eagle', 'earless_seal', 'tennis_racket') + + CLASSES_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', + 'cat', 'cow', 'deer', 'dog', 'duck', + 'earless_seal', 'elephant', 'fish', + 'flying_disc', 'fox', 'frog', 'giant_panda', + 'giraffe', 'horse', 'leopard', 'lizard', + 'monkey', 'motorbike', 'mouse', 'parrot', + 'person', 'rabbit', 'shark', 'skateboard', + 'snake', 'snowboard', 'squirrel', 'surfboard', + 'tennis_racket', 'tiger', 'train', 'truck', + 'turtle', 'whale', 'zebra') - Args: - results (dict): Testing results of the dataset. 
- metric (str | list[str]): Metrics to be evaluated. Options are - 'track_segm'. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - - Returns: - dict[str, float]: COCO style evaluation metric. - """ - if isinstance(metric, list): - metrics = metric - elif isinstance(metric, str): - metrics = [metric] + if dataset_version == '2019': + cls.METAINFO = dict(CLASSES=CLASSES_2019_version) + elif dataset_version == '2021': + cls.METAINFO = dict(CLASSES=CLASSES_2021_version) else: - raise TypeError('metric must be a list or a str.') - allowed_metrics = ['track_segm'] - for metric in metrics: - if metric not in allowed_metrics: - raise KeyError(f'metric {metric} is not supported.') - - eval_results = dict() - test_results = self.format_results(results, save_as_json=False) - vis_results = self.convert_back_to_vis_format() - track_segm_results = eval_vis(test_results, vis_results, logger) - eval_results.update(track_segm_results) - - return eval_results - - def convert_back_to_vis_format(self): - """Convert the annotation back to the format of YouTube-VIS. The main - difference between the two is the format of 'annotation'. Before - modification, it is recorded in the unit of images, and after - modification, it is recorded in the unit of instances.This operation is - to make it easier to use the official eval API. - - Returns: - dict: A dict with 3 keys, ``categories``, ``annotations`` - and ``videos``. - - | ``categories`` (list[dict]): Each dict has 2 keys, - ``id`` and ``name``. - - | ``videos`` (list[dict]): Each dict has 4 keys of video info, - ``id``, ``name``, ``width`` and ``height``. - - | ``annotations`` (list[dict]): Each dict has 7 keys of video - info, ``category_id``, ``segmentations``, ``bboxes``, - ``video_id``, ``areas``, ``id`` and ``iscrowd``. - """ - - vis_anns = defaultdict(list) - - vis_anns['categories'] = copy.deepcopy(self.coco.dataset['categories']) - vis_anns['videos'] = copy.deepcopy(self.coco.dataset['videos']) - - len_videos = dict() # mapping from video_id to video_length - for video_id, video_infos in self.coco.vidToImgs.items(): - len_videos[video_id] = len(video_infos) - - for video_id, ins_ids in self.coco.vidToInstances.items(): - cur_video_len = len_videos[video_id] - for ins_id in ins_ids: - # In the official format, no instances are represented by - # 'None', however, only images with instances are recorded - # in the current annotations, so we need to use 'None' to - # initialize these lists. 
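With the rewrite above, the category names are attached through `cls.METAINFO` instead of a `CLASSES` class attribute. A small usage sketch; it assumes `YouTubeVISDataset` is still re-exported from `mmtrack.datasets` as in 0.x.

from mmtrack.datasets import YouTubeVISDataset

# Selecting the 2021 version registers its 40 category names as metainfo.
YouTubeVISDataset.set_dataset_classes('2021')
classes = YouTubeVISDataset.METAINFO['CLASSES']
assert len(classes) == 40 and 'flying_disc' in classes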
- segm = [None] * cur_video_len - bbox = [None] * cur_video_len - area = [None] * cur_video_len - category_id = None - iscrowd = None - for img_id in self.coco.instancesToImgs.get(ins_id): - frame_id = self.coco.imgs[img_id]['frame_id'] - for ann in self.coco.imgToAnns[img_id]: - if ann['instance_id'] == ins_id: - segm[frame_id] = ann['segmentation'] - bbox[frame_id] = ann['bbox'] - area[frame_id] = ann['area'] - category_id = ann['category_id'] - iscrowd = ann['iscrowd'] - assert category_id is not None - instance = dict( - category_id=category_id, - segmentations=segm, - bboxes=bbox, - video_id=video_id, - areas=area, - id=ins_id, - iscrowd=iscrowd) - vis_anns['annotations'].append(instance) - - return dict(vis_anns) + raise NotImplementedError('Not supported YouTubeVIS dataset' + f'version: {dataset_version}') diff --git a/mmtrack/engine/__init__.py b/mmtrack/engine/__init__.py new file mode 100644 index 000000000..d5ba554af --- /dev/null +++ b/mmtrack/engine/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401, F403 +from .schedulers import * # noqa: F401, F403 diff --git a/mmtrack/engine/hooks/__init__.py b/mmtrack/engine/hooks/__init__.py new file mode 100644 index 000000000..272749d98 --- /dev/null +++ b/mmtrack/engine/hooks/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .siamrpn_backbone_unfreeze_hook import SiamRPNBackboneUnfreezeHook +from .visualization_hook import TrackVisualizationHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOXModeSwitchHook', 'TrackVisualizationHook', + 'SiamRPNBackboneUnfreezeHook' +] diff --git a/mmtrack/engine/hooks/siamrpn_backbone_unfreeze_hook.py b/mmtrack/engine/hooks/siamrpn_backbone_unfreeze_hook.py new file mode 100644 index 000000000..cd6d04315 --- /dev/null +++ b/mmtrack/engine/hooks/siamrpn_backbone_unfreeze_hook.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from torch.nn.modules.batchnorm import BatchNorm2d + +from mmtrack.registry import HOOKS + + +@HOOKS.register_module() +class SiamRPNBackboneUnfreezeHook(Hook): + """Start to train the backbone of SiamRPN++ from a certrain epoch. + + Args: + backbone_start_train_epoch (int): Start to train the backbone at + `backbone_start_train_epoch`-th epoch. Note the epoch in this + class counts from 0, while the epoch in the log file counts from 1. + backbone_train_layers (list(str)): List of str denoting the stages + needed be trained in backbone. 
+ """ + + def __init__(self, + backbone_start_train_epoch: int = 10, + backbone_train_layers: List = ['layer2', 'layer3', 'layer4']): + self.backbone_start_train_epoch = backbone_start_train_epoch + self.backbone_train_layers = backbone_train_layers + + def before_train_epoch(self, runner): + """If `runner.epoch >= self.backbone_start_train_epoch`, start to train + the backbone.""" + if runner.epoch >= self.backbone_start_train_epoch: + for layer in self.backbone_train_layers: + model = runner.model.module if is_model_wrapper( + runner.model) else runner.model + for param in getattr(model.backbone, layer).parameters(): + param.requires_grad = True + for m in getattr(model.backbone, layer).modules(): + if isinstance(m, BatchNorm2d): + m.train() diff --git a/mmtrack/engine/hooks/visualization_hook.py b/mmtrack/engine/hooks/visualization_hook.py new file mode 100644 index 000000000..de13b6d96 --- /dev/null +++ b/mmtrack/engine/hooks/visualization_hook.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Optional, Sequence + +import mmcv +from mmengine.fileio import FileClient +from mmengine.hooks import Hook +from mmengine.runner import Runner +from mmengine.utils import mkdir_or_exist +from mmengine.visualization import Visualizer + +from mmtrack.registry import HOOKS +from mmtrack.structures import TrackDataSample + + +@HOOKS.register_module() +class TrackVisualizationHook(Hook): + """Tracking Visualization Hook. Used to visualize validation and testing + process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + 2. If ``test_out_dir`` is specified, it means that the prediction results + need to be saved to ``test_out_dir``. In order to avoid vis_backends + also storing data, so ``vis_backends`` needs to be excluded. + 3. ``vis_backends`` takes effect if the user does not specify ``show`` + and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or + TensorboardVisBackend to store the prediction result in Wandb or + Tensorboard. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + interval (int): The interval of visualization. Defaults to 30. + score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + test_out_dir (str, optional): directory where painted images + will be saved in testing process. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + draw: bool = False, + interval: int = 30, + score_thr: float = 0.3, + show: bool = False, + wait_time: float = 0., + test_out_dir: Optional[str] = None, + file_client_args: dict = dict(backend='disk')): + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.interval = interval + self.score_thr = score_thr + self.show = show + if self.show: + # No need to think about vis backends. 
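A short config sketch for enabling the `SiamRPNBackboneUnfreezeHook` defined above; `custom_hooks` is the standard mmengine entry point for this kind of hook, and the values simply restate the constructor defaults.

custom_hooks = [
    dict(
        type='SiamRPNBackboneUnfreezeHook',
        backbone_start_train_epoch=10,  # epochs count from 0 here
        backbone_train_layers=['layer2', 'layer3', 'layer4'])
]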
+ self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.file_client = FileClient(**file_client_args) + self.draw = draw + self.test_out_dir = test_out_dir + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every ``self.interval`` validation iteration. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + assert len(outputs) == 1,\ + 'only batch_size=1 is supported while validating.' + + total_curr_iter = runner.iter + batch_idx + + if self.every_n_inner_iters(batch_idx, self.interval): + data_sample = outputs[0] + img_path = data_sample.img_path + img_bytes = self.file_client.get(img_path) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'val_img', + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + step=total_curr_iter) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every testing iteration. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + assert len(outputs) == 1, \ + 'only batch_size=1 is supported while testing.' + + if self.every_n_inner_iters(batch_idx, self.interval): + data_sample = outputs[0] + img_path = data_sample.img_path + img_bytes = self.file_client.get(img_path) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + out_file = osp.basename(img_path) + out_file = osp.join(self.test_out_dir, out_file) + + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=batch_idx) diff --git a/mmtrack/core/hook/yolox_mode_switch_hook.py b/mmtrack/engine/hooks/yolox_mode_switch_hook.py similarity index 88% rename from mmtrack/core/hook/yolox_mode_switch_hook.py rename to mmtrack/engine/hooks/yolox_mode_switch_hook.py index 71eb7e298..577855af5 100644 --- a/mmtrack/core/hook/yolox_mode_switch_hook.py +++ b/mmtrack/engine/hooks/yolox_mode_switch_hook.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
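A hedged sketch of turning on the `TrackVisualizationHook` above during testing. In other OpenMMLab 2.0 codebases this hook sits in the `visualization` slot of `default_hooks`, and the same layout is assumed here; `'vis'` is a made-up output directory.

default_hooks = dict(
    visualization=dict(
        type='TrackVisualizationHook',
        draw=True,  # drawing is disabled by default
        interval=30,  # draw every 30th validation/test sample
        score_thr=0.3,
        test_out_dir='vis'))  # saved under work_dir/<timestamp>/vis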
-from mmcv.parallel import is_module_wrapper -from mmcv.runner.hooks import HOOKS -from mmdet.core import YOLOXModeSwitchHook as _YOLOXModeSwitchHook +from mmdet.engine import YOLOXModeSwitchHook as _YOLOXModeSwitchHook +from mmengine.model import is_model_wrapper +from mmtrack.registry import HOOKS -@HOOKS.register_module(force=True) + +@HOOKS.register_module() class YOLOXModeSwitchHook(_YOLOXModeSwitchHook): """Switch the mode of YOLOX during training. @@ -21,9 +22,9 @@ class in mmdet use `model.bbox_head.use_l1=True` to switch mode, while def before_train_epoch(self, runner): """Close mosaic and mixup augmentation and switches to use L1 loss.""" epoch = runner.epoch - train_loader = runner.data_loader + train_loader = runner.train_dataloader model = runner.model - if is_module_wrapper(model): + if is_model_wrapper(model): model = model.module if (epoch + 1) == runner.max_epochs - self.num_last_epochs: runner.logger.info('No mosaic and mixup aug now!') diff --git a/mmtrack/engine/schedulers/__init__.py b/mmtrack/engine/schedulers/__init__.py new file mode 100644 index 000000000..84920b14a --- /dev/null +++ b/mmtrack/engine/schedulers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .siamrpn_exp_scheduler import SiamRPNExpLR, SiamRPNExpParamScheduler + +__all__ = ['SiamRPNExpParamScheduler', 'SiamRPNExpLR'] diff --git a/mmtrack/engine/schedulers/siamrpn_exp_scheduler.py b/mmtrack/engine/schedulers/siamrpn_exp_scheduler.py new file mode 100644 index 000000000..9f000bace --- /dev/null +++ b/mmtrack/engine/schedulers/siamrpn_exp_scheduler.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin +from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler +from torch.optim import Optimizer + +from mmtrack.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class SiamRPNExpParamScheduler(_ParamScheduler): + """Decays the parameter value of each parameter group by exponentially + changing small multiplicative factor until the number of epoch reaches a + pre-defined milestone: ``end``. + + Notice that such decay can happen simultaneously with other changes to the + parameter value from outside this scheduler. + + .. math:: + + X_{t} = X_{t-1} \times (\frac{end}{begin})^{\frac{1}{epochs}} + + + Args: + optimizer (Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + start_factor (float): The number we multiply parameter value in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0.1. + end_factor (float): The number we multiply parameter value at the end + of linear changing process. Defaults to 1.0. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + endpoint (bool): If true, `end_factor`` is included in the ``end``. + Otherwise, it is not included. Default is True. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. 
+ """ + + def __init__(self, + optimizer: Optimizer, + param_name: str, + start_factor: float = 0.1, + end_factor: float = 1.0, + begin: int = 0, + end: int = INF, + endpoint: bool = True, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + if end >= INF: + raise ValueError('``end`` must be less than infinity,' + 'Please set ``end`` parameter of ' + '``QuadraticWarmupScheduler`` as the ' + 'number of warmup end.') + + if start_factor > 1.0 or start_factor < 0: + raise ValueError( + 'Starting multiplicative factor should between 0 and 1.') + + if end_factor > 1.0 or end_factor < 0: + raise ValueError( + 'Ending multiplicative factor should between 0 and 1.') + + self.start_factor = start_factor + self.end_factor = end_factor + self.endpoint = endpoint + self.total_iters = end - begin - 1 if self.endpoint else end - begin + super().__init__( + optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin: int = 0, + end: int = INF, + by_epoch: bool = True, + epoch_length: Optional[int] = None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config. + + Args: + begin (int, optional): Step at which to start updating the + parameters. Defaults to 0. + end (int, optional): Step at which to stop updating the parameters. + Defaults to INF. + by_epoch (bool, optional): Whether the scheduled parameters are + updated by epochs. Defaults to True. + epoch_length (Optional[int], optional): The length of each epoch. + Defaults to None. + + Returns: + Object: The instantiated object of ``SiamRPNExpParamScheduler``. + """ + + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = begin * epoch_length + if end != INF: + end = end * epoch_length + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + group[self.param_name] * self.start_factor + for group in self.optimizer.param_groups + ] + + return [ + group[self.param_name] * + math.pow(self.end_factor / self.start_factor, 1 / + (self.total_iters)) + for group in self.optimizer.param_groups + ] + + +@PARAM_SCHEDULERS.register_module() +class SiamRPNExpLR(LRSchedulerMixin, SiamRPNExpParamScheduler): + """Decays the parameter value of each parameter group by exponentially + changing small multiplicative factor until the number of epoch reaches a + pre-defined milestone: ``end``. + + Notice that such decay can happen simultaneously with other changes to the + parameter value from outside this scheduler. + + .. math:: + + X_{t} = X_{t-1} \times (\frac{end}{begin})^{\frac{1}{epochs}} + + + Args: + optimizer (Optimizer): Wrapped optimizer. + start_factor (float): The number we multiply parameter value in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0.1. + end_factor (float): The number we multiply parameter value at the end + of linear changing process. Defaults to 1.0. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. 
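A tiny numeric check of the exponential schedule implemented by `_get_value` above: the value starts at `base_lr * start_factor` and is multiplied each step by `(end_factor / start_factor) ** (1 / total_iters)`, so after `total_iters` steps it lands exactly on `base_lr * end_factor`. The concrete numbers are arbitrary.

import math

base_lr, start_factor, end_factor = 0.005, 0.2, 1.0
begin, end, endpoint = 0, 5, True
total_iters = end - begin - 1 if endpoint else end - begin

lr = base_lr * start_factor  # value at the first step
step_factor = math.pow(end_factor / start_factor, 1 / total_iters)
for _ in range(total_iters):
    lr *= step_factor
assert abs(lr - base_lr * end_factor) < 1e-12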
+ endpoint (bool): If true, `end_factor`` is included in the ``end``. + Otherwise, it is not included. Default is True. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ diff --git a/mmtrack/evaluation/__init__.py b/mmtrack/evaluation/__init__.py new file mode 100644 index 000000000..f70dc226d --- /dev/null +++ b/mmtrack/evaluation/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .functional import * # noqa: F401,F403 +from .metrics import * # noqa: F401,F403 diff --git a/mmtrack/evaluation/functional/__init__.py b/mmtrack/evaluation/functional/__init__.py new file mode 100644 index 000000000..0161a7569 --- /dev/null +++ b/mmtrack/evaluation/functional/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .eval_sot_ope import eval_sot_ope +from .eval_sot_vot import (bbox2region, eval_sot_accuracy_robustness, + eval_sot_eao) +from .ytvis import YTVIS +from .ytviseval import YTVISeval + +__all__ = [ + 'eval_sot_ope', 'bbox2region', 'eval_sot_eao', + 'eval_sot_accuracy_robustness', 'YTVIS', 'YTVISeval' +] diff --git a/mmtrack/core/evaluation/eval_sot_ope.py b/mmtrack/evaluation/functional/eval_sot_ope.py similarity index 65% rename from mmtrack/core/evaluation/eval_sot_ope.py rename to mmtrack/evaluation/functional/eval_sot_ope.py index b379daa36..41abd6e3b 100644 --- a/mmtrack/core/evaluation/eval_sot_ope.py +++ b/mmtrack/evaluation/functional/eval_sot_ope.py @@ -1,22 +1,25 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + import numpy as np -from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from mmdet.evaluation.functional import bbox_overlaps -def success_overlap(gt_bboxes, pred_bboxes, iou_th, video_length): +def success_overlap(gt_bboxes: np.ndarray, pred_bboxes: np.ndarray, + iou_th: np.ndarray, video_length: int) -> np.ndarray: """Evaluation based on iou. Args: - gt_bboxes (ndarray): of shape (video_length, 4) in + gt_bboxes (np.ndarray): of shape (video_length, 4) in [tl_x, tl_y, br_x, br_y] format. - pred_bboxes (ndarray): of shape (video_length, 4) in + pred_bboxes (np.ndarray): of shape (video_length, 4) in [tl_x, tl_y, br_x, br_y] format. - iou_th (ndarray): Different threshold of iou. Typically is set to + iou_th (np.ndarray): Different threshold of iou. Typically is set to `np.arange(0, 1.05, 0.05)`. video_length (int): Video length. Returns: - ndarray: The evaluation results at different threshold of iou. + np.ndarray: The evaluation results at different threshold of iou. """ success = np.zeros(len(iou_th)) iou = np.ones(len(gt_bboxes)) * (-1) @@ -31,18 +34,21 @@ def success_overlap(gt_bboxes, pred_bboxes, iou_th, video_length): return success -def success_error(gt_bboxes_center, pred_bboxes_center, pixel_offset_th, - video_length): +def success_error(gt_bboxes_center: np.ndarray, pred_bboxes_center: np.ndarray, + pixel_offset_th: np.ndarray, + video_length: int) -> np.ndarray: """Evaluation based on pixel offset. Args: - gt_bboxes (ndarray): of shape (video_length, 2) in [cx, cy] format. - pred_bboxes (ndarray): of shape (video_length, 2) in [cx, cy] format. - pixel_offset_th (ndarray): Different threshold of pixel offset. + gt_bboxes (np.ndarray): of shape (video_length, 2) in [cx, cy] format. 
+ pred_bboxes (np.ndarray): of shape (video_length, 2) in [cx, cy] + format. + pixel_offset_th (np.ndarray): Different threshold of pixel offset. video_length (int): Video length. Returns: - ndarray: The evaluation results at different threshold of pixel offset. + np.ndarray: The evaluation results at different threshold of pixel + offset. """ success = np.zeros(len(pixel_offset_th)) dist = np.ones(len(gt_bboxes_center)) * (-1) @@ -55,24 +61,27 @@ def success_error(gt_bboxes_center, pred_bboxes_center, pixel_offset_th, return success -def eval_sot_ope(results, annotations, visible_infos=None): +def eval_sot_ope( + results, + annotations: List[List[np.ndarray]], + visible_infos: Optional[List[np.ndarray]] = None) -> Dict[str, float]: """Evaluation in OPE protocol. Args: - results (list[list[ndarray]]): The first list contains the tracking + results (List[List[np.ndarray]]): The first list contains the tracking results of each video. The second list contains the tracking results of each frame in one video. The ndarray denotes the tracking box in [tl_x, tl_y, br_x, br_y] format. - annotations (list[ndarray]): The list contains the bbox + annotations (List[np.ndarray]): The list contains the bbox annotations of each video. The ndarray is gt_bboxes of one video. It's in (N, 4) shape. Each bbox is in (x1, y1, x2, y2) format. - visible_infos (list[ndarray] | None): If not None, the list - contains the visible information of each video. The ndarray is + visible_infos (Optional[List[np.ndarray]], optional): If not None, the + list contains the visible information of each video. The ndarray is visibility (with bool type) of object in one video. It's in (N,) shape. Default to None. Returns: - dict[str, float]: OPE style evaluation metric (i.e. success, + Dict[str, float]: OPE style evaluation metric (i.e. success, norm precision and precision). """ success_results = [] @@ -116,9 +125,17 @@ def eval_sot_ope(results, annotations, visible_infos=None): success_error(norm_gt_bboxes_center, norm_pred_bboxes_center, norm_pixel_offset_th, video_length)) - success = np.mean(success_results) * 100 - precision = np.mean(precision_results, axis=0)[20] * 100 - norm_precision = np.mean(norm_precision_results, axis=0)[20] * 100 + success_results = np.stack(success_results) * 100 + precision_results = np.stack(precision_results) * 100 + norm_precision_results = np.stack(norm_precision_results) * 100 + success = np.mean(success_results) + precision = np.mean(precision_results, axis=0)[20] + norm_precision = np.mean(norm_precision_results, axis=0)[20] eval_results = dict( - success=success, norm_precision=norm_precision, precision=precision) + success=success, + norm_precision=norm_precision, + precision=precision, + ori_success=success_results, + ori_precision=precision_results, + ori_norm_precision=norm_precision_results) return eval_results diff --git a/mmtrack/core/evaluation/eval_sot_vot.py b/mmtrack/evaluation/functional/eval_sot_vot.py similarity index 69% rename from mmtrack/core/evaluation/eval_sot_vot.py rename to mmtrack/evaluation/functional/eval_sot_vot.py index 85f8d0757..06084f149 100644 --- a/mmtrack/core/evaluation/eval_sot_vot.py +++ b/mmtrack/evaluation/functional/eval_sot_vot.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
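A toy invocation of `eval_sot_ope` consistent with the updated signature: one short video whose predictions equal the ground truth. Besides the scalar `success`, `precision` and `norm_precision`, the returned dict now also carries the per-video `ori_*` curves added in this hunk (the leading dimension is 1 here because there is a single video). The box values are arbitrary.

import numpy as np

from mmtrack.evaluation import eval_sot_ope

# One 3-frame video; the predictions are copied from the ground truth.
gt_bboxes = np.array([[10., 20., 110., 220.],
                      [12., 22., 112., 222.],
                      [14., 24., 114., 224.]])
results = [[bbox for bbox in gt_bboxes]]  # list[video] -> list[frame] -> (4, )
annotations = [gt_bboxes]

metrics = eval_sot_ope(results, annotations)
print(sorted(metrics.keys()))
# ['norm_precision', 'ori_norm_precision', 'ori_precision', 'ori_success',
#  'precision', 'success']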
# The codes are modified from https://github.com/votchallenge/toolkit/blob/master/vot/analysis/supervised.py # noqa: E501 +from typing import Dict, List, Optional, Tuple, Union + import numpy as np try: @@ -11,15 +13,16 @@ vot = None -def bbox2region(bbox): +def bbox2region(bbox: np.ndarray) -> 'Union[Rectangle, Polygon]': """Convert bbox to Rectangle or Polygon Class object. Args: - bbox (ndarray): the format of rectangle bbox is (x1, y1, w, h); - the format of polygon is (x1, y1, x2, y2, ...). + bbox (np.ndarray): the format of rectangle bbox is + [tl_x, tl_y, br_x, br_y]; + the format of polygon is [x1, y1, x2, y2, ...]. Returns: - Rectangle or Polygon Class object. + :obj:`Rectangle` or :obj:`Polygon`. """ if vot is None: raise ImportError( @@ -30,7 +33,8 @@ def bbox2region(bbox): if len(bbox) == 1: return Special(bbox[0]) elif len(bbox) == 4: - return Rectangle(bbox[0], bbox[1], bbox[2], bbox[3]) + return Rectangle(bbox[0], bbox[1], bbox[2] - bbox[0], + bbox[3] - bbox[1]) elif len(bbox) % 2 == 0 and len(bbox) > 4: return Polygon([(x_, y_) for x_, y_ in zip(bbox[::2], bbox[1::2])]) else: @@ -38,11 +42,11 @@ def bbox2region(bbox): f'The length of bbox is {len(bbox)}, which is not supported') -def trajectory2region(trajectory): +def trajectory2region(trajectory: List[np.ndarray]) -> List: """Convert bbox trajectory to Rectangle or Polygon Class object trajectory. Args: - trajectory (list[ndarray]): The outer list contains bbox of + trajectory (List[np.ndarray]): The outer list contains bbox of each frame in a video. The bbox is a ndarray. Returns: @@ -55,15 +59,16 @@ def trajectory2region(trajectory): return traj_region -def locate_failures_inits(trajectory): +def locate_failures_inits(trajectory: List[np.ndarray]) -> Tuple[List, List]: """locate the failure frame and initialized frame in a trajectory. Args: - trajectory (list[ndarray]): list of tracking results. + trajectory (List[np.ndarray]): list of tracking results. Returns: - fail_inds (list): index of failed frame in a trajectory. - init_inds (list): index of initialized frame in a trajectory. + Tuple[List, List]: + - fail_inds (List): index of failed frame in a trajectory. + - init_inds (List): index of initialized frame in a trajectory. """ fail_inds = [] init_inds = [] @@ -76,11 +81,11 @@ def locate_failures_inits(trajectory): return fail_inds, init_inds -def count_failures(trajectory): +def count_failures(trajectory: List[np.ndarray]) -> List: """count the number of failed frame in a trajectory. Args: - trajectory (list[ndarray]): list of tracking results. + trajectory (List[np.ndarray]): list of tracking results. Returns: List: the number of failed frame in a trajectory. @@ -92,28 +97,29 @@ def count_failures(trajectory): return num_fails -def calc_accuracy(gt_trajectory, - pred_trajectory, - burnin=10, - ignore_unknown=True, - video_wh=None): +def calc_accuracy(gt_trajectory: List[List], + pred_trajectory: List[List], + burnin: int = 10, + ignore_unknown: bool = True, + video_wh: Optional[Tuple[int, int]] = None) -> float: """Calculate accuracy over the sequence. Args: - gt_trajectory (list[list]): list of bboxes - pred_trajectory (list[ndarray]): The outer list contains the + gt_trajectory (List[List]): list of bboxes + pred_trajectory (List[np.ndarray]): The outer list contains the tracking results of each frame in one video. The ndarray has two cases: - - bbox: denotes the normal tracking box in [x1, y1, w, h] - format. + - bbox: denotes the normal tracking box in + [tl_x, tl_y, br_x, br_y] format. 
- special tracking state: [0] denotes the unknown state, namely the skipping frame after failure, [1] denotes the initialized state, and [2] denotes the failed state. - burnin: number of frames that have to be ignored after the - re-initialization when calculating accuracy. Default is 10. - ignore_unknown (bool): whether ignore the skipping frames after - failures when calculating accuracy. Default is True. - video_wh: bounding region (width, height) + burnin (int, optional): number of frames that have to be ignored after + the re-initialization when calculating accuracy. Default is 10. + ignore_unknown (bool, optional): whether ignore the skipping frames + after failures when calculating accuracy. Default is True. + video_wh (Optional[Tuple[int, int]], optional): bounding region + (width, height) Return: Float: accuracy over the sequence. @@ -132,38 +138,39 @@ def calc_accuracy(gt_trajectory, mask[j] = False elif is_special(region, Special.FAILURE): mask[i] = False - return np.mean(overlaps[mask]) if any(mask) else 0. + return np.mean(overlaps[mask]).item() if any(mask) else 0. -def eval_sot_accuracy_robustness(results, - annotations, - burnin=10, - ignore_unknown=True, - videos_wh=None): +def eval_sot_accuracy_robustness( + results: List[List[np.ndarray]], + annotations: List[np.ndarray], + burnin: int = 10, + ignore_unknown: bool = True, + videos_wh: Optional[Tuple[int, int]] = None) -> Dict[str, float]: """Calculate accuracy and robustness over all tracking sequences. Args: - results (list[list[ndarray]]): The first list contains the + results (List[List[np.ndarray]]): The first list contains the tracking results of each video. The second list contains the tracking results of each frame in one video. The ndarray have two cases: - - bbox: denotes the normal tracking box in [x1, y1, w, h] - format. + - bbox: denotes the normal tracking box in + [tl_x, tl_y, br_x, br_y] format. - special tracking state: [0] denotes the unknown state, namely the skipping frame after failure, [1] denotes the initialized state, and [2] denotes the failed state. - annotations (list[ndarray]): The list contains the gt_bboxes of each + annotations (List[np.ndarray]): The list contains the gt_bboxes of each video. The ndarray is gt_bboxes of one video. It's in (N, 4) shape. Each bbox is in (x1, y1, w, h) format. - burnin: number of frames that have to be ignored after the - re-initialization when calculating accuracy. Default is 10. - ignore_unknown (bool): whether ignore the skipping frames after - failures when calculating accuracy. Default is True. - videos_wh (list[tuple(width, height), ...]): The list contains the - width and height of each video. Default is None. + burnin (int, optional): number of frames that have to be ignored after + the re-initialization when calculating accuracy. Default is 10. + ignore_unknown (bool, optional): whether ignore the skipping frames + after failures when calculating accuracy. Default is True. + videos_wh (Optional[Tuple[int, int]], optional): bounding region + (width, height) Return: - dict{str: float}: accuracy and robustness in EAO evaluation metric. + Dict[str: float]: accuracy and robustness in EAO evaluation metric. """ if vot is None: raise ImportError( @@ -191,19 +198,19 @@ def eval_sot_accuracy_robustness(results, return dict(accuracy=accuracy, robustness=robustness, num_fails=num_fails) -def calc_eao_curve(overlaps, successes): +def calc_eao_curve(overlaps: List[List], successes: List) -> np.ndarray: """Calculate EAO curve over all tracking sequences. 
Args: - overlaps (list[list]): The outer list contains the overlaps of each + overlaps (List[List]): The outer list contains the overlaps of each video. The inner list contains the overlap of each frame in one video. - successes (list): The list contains the tracking states of last frame + successes (List): The list contains the tracking states of last frame in each fragment. Return: - ndarray: The N-th element in ndarray denotes the average overlaps from - 1 to N in all fragments. + np.ndarray: The N-th element in ndarray denotes the average overlaps + from 1 to N in all fragments. """ max_length = max([len(_) for _ in overlaps]) total_runs = len(overlaps) @@ -234,20 +241,24 @@ def calc_eao_curve(overlaps, successes): return np.sum(overlaps_array_sum * mask, axis=0) / np.sum(mask, axis=0) -def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None): +def eval_sot_eao( + results: List[List[np.ndarray]], + annotations: List[np.ndarray], + interval: Tuple[int, int] = [100, 356], + videos_wh: Optional[Tuple[int, int]] = None) -> Dict[str, float]: """Calculate EAO socre over all tracking sequences. Args: - results (list[list[ndarray]]): The first list contains the + results (List[List[np.ndarray]]): The first list contains the tracking results of each video. The second list contains the tracking results of each frame in one video. The ndarray have two cases: - - bbox: denotes the normal tracking box in [x1, y1, w, h] - format. + - bbox: denotes the normal tracking box in + [tl_x, tl_y, br_x, br_y] format. - special tracking state: [0] denotes the unknown state, namely the skipping frame after failure, [1] denotes the initialized state, and [2] denotes the failed state. - annotations (list[ndarray]): The list contains the gt_bboxes of each + annotations (list[np.ndarray]): The list contains the gt_bboxes of each video. The ndarray is gt_bboxes of one video. It's in (N, 4) shape. Each bbox is in (x1, y1, w, h) format. interval: an specified interval in EAO curve used to calculate the EAO @@ -257,7 +268,7 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None): width and height of each video. Default is None. Return: - dict[str, float]: EAO score in EAO evaluation metric. + Dict[str, float]: EAO score in EAO evaluation metric. 
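A minimal usage sketch of the VOT-style helpers above, assuming the VOT toolkit is installed (`pip install git+https://github.com/votchallenge/toolkit.git`); the toy trajectory only illustrates the expected input layout, and all numbers are made up.

```python
import numpy as np
from mmtrack.evaluation.functional import eval_sot_accuracy_robustness

# One 6-frame video. Predictions are corner-format boxes [tl_x, tl_y, br_x, br_y]
# (width/height are recovered as br_x - tl_x and br_y - tl_y), or special
# states: [1] initialized, [2] failed, [0] unknown (skipped frame).
pred_traj = [
    np.array([1.]),                   # frame 0: tracker initialized
    np.array([10., 10., 30., 40.]),
    np.array([11., 12., 31., 42.]),
    np.array([2.]),                   # tracking failure
    np.array([0.]),                   # frame skipped after the failure
    np.array([12., 11., 33., 42.]),
]
# Ground truth per video: an (N, 4) array in (x1, y1, w, h) format.
gt_bboxes = np.array([[10., 10., 20., 30.]] * 6, dtype=np.float32)
videos_wh = [(640, 480)]              # bounding region (width, height) per video

print(eval_sot_accuracy_robustness([pred_traj], [gt_bboxes],
                                    burnin=1, videos_wh=videos_wh))
# eval_sot_eao takes the same inputs plus an `interval` over the EAO curve,
# e.g. [100, 356] for VOT2018-length sequences.
```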
""" if vot is None: raise ImportError( @@ -278,7 +289,6 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None): assert len(pred_traj[0]) == 1 and pred_traj[0][ 0] == 1, f'{len(pred_traj[0])} == 1 and {pred_traj[0][0]} == 1' fail_inds, init_inds = locate_failures_inits(pred_traj) - pred_traj = trajectory2region(pred_traj) gt_traj = trajectory2region(gt_traj) overlaps = calculate_region_overlaps(pred_traj, gt_traj, videos_wh[i]) @@ -299,6 +309,6 @@ def eval_sot_eao(results, annotations, interval=[100, 356], videos_wh=None): all_successes.append(True) eao_curve = calc_eao_curve(all_overlaps, all_successes) - eao_score = np.mean(eao_curve[interval[0]:interval[1] + 1]) + eao_score = np.mean(eao_curve[interval[0]:interval[1] + 1]).item() eao = dict(eao=eao_score) return eao diff --git a/mmtrack/core/evaluation/ytvis.py b/mmtrack/evaluation/functional/ytvis.py similarity index 100% rename from mmtrack/core/evaluation/ytvis.py rename to mmtrack/evaluation/functional/ytvis.py diff --git a/mmtrack/core/evaluation/ytviseval.py b/mmtrack/evaluation/functional/ytviseval.py similarity index 100% rename from mmtrack/core/evaluation/ytviseval.py rename to mmtrack/evaluation/functional/ytviseval.py diff --git a/mmtrack/evaluation/metrics/__init__.py b/mmtrack/evaluation/metrics/__init__.py new file mode 100644 index 000000000..7fe15f017 --- /dev/null +++ b/mmtrack/evaluation/metrics/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_video_metrics import BaseVideoMetric +from .coco_video_metric import CocoVideoMetric +from .mot_challenge_metrics import MOTChallengeMetrics +from .reid_metrics import ReIDMetrics +from .sot_metrics import SOTMetric +from .tao_metrics import TAOMetric +from .youtube_vis_metrics import YouTubeVISMetric + +__all__ = [ + 'ReIDMetrics', 'BaseVideoMetric', 'CocoVideoMetric', 'YouTubeVISMetric', + 'MOTChallengeMetrics', 'SOTMetric', 'TAOMetric' +] diff --git a/mmtrack/evaluation/metrics/base_video_metrics.py b/mmtrack/evaluation/metrics/base_video_metrics.py new file mode 100644 index 000000000..d9a7f3d8f --- /dev/null +++ b/mmtrack/evaluation/metrics/base_video_metrics.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile +import warnings +from typing import Optional + +import torch +from mmengine.dist import (barrier, broadcast, broadcast_object_list, + get_dist_info, is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.utils import mkdir_or_exist + + +class BaseVideoMetric(BaseMetric): + """Base class for a metric in video task. + + The metric first processes each batch of data_samples and predictions, + and appends the processed results to the results list. Then it + collects all results together from all ranks if distributed training + is used. Finally, it computes the metrics of the entire dataset. + + A subclass of class:`BaseVideoMetric` should assign a meaningful value + to the class attribute `default_prefix`. See the argument `prefix` for + details. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. 
+ """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + +def collect_tracking_results(results: list, + device: str = 'cpu', + tmpdir: Optional[str] = None) -> Optional[list]: + """Collected results in distributed environments. + + Args: + results (list): Result list containing result parts to be + collected. Each item of ``result_part`` should be a picklable + object. + device (str): Device name. Optional values are 'cpu' and 'gpu'. + tmpdir (str | None): Temporal directory for collected results to + store. If set to None, it will create a temporal directory for it. + ``tmpdir`` should be None when device is 'gpu'. Defaults to None. + + Returns: + list or None: The collected results. + """ + if device not in ['gpu', 'cpu']: + raise NotImplementedError( + f"device must be 'cpu' or 'gpu', but got {device}") + + if device == 'gpu': + assert tmpdir is None, 'tmpdir should be None when device is "gpu"' + raise NotImplementedError('GPU collecting has not been supported yet') + else: + return collect_tracking_results_cpu(results, tmpdir) + + +def collect_tracking_results_cpu(result_part: list, + tmpdir: Optional[str] = None + ) -> Optional[list]: + """Collect results on cpu mode. + + Saves the results on different gpus to 'tmpdir' and collects them by the + rank 0 worker. + + Args: + result_part (list): The part of prediction results. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. If is None, use `tempfile.mkdtemp()` + to make a temporary path. Defaults to None. + + Returns: + list or None: The collected results. + """ + rank, world_size = get_dist_info() + if world_size == 1: + return result_part + + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.numpy().tobytes().decode().rstrip() + else: + mkdir_or_exist(tmpdir) + + # dump the part result to the dir + with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: # type: ignore + pickle.dump(result_part, f, protocol=2) + + barrier() + + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore + with open(path, 'rb') as f: + part_list.extend(pickle.load(f)) + shutil.rmtree(tmpdir) + return part_list diff --git a/mmtrack/evaluation/metrics/coco_video_metric.py b/mmtrack/evaluation/metrics/coco_video_metric.py new file mode 100644 index 000000000..158917cbd --- /dev/null +++ b/mmtrack/evaluation/metrics/coco_video_metric.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import warnings +from typing import Optional, Sequence + +from mmdet.datasets.api_wrappers import COCO +from mmdet.evaluation import CocoMetric +from mmdet.structures.mask import encode_mask_results +from mmengine.dist import broadcast_object_list, is_main_process +from mmengine.fileio import FileClient + +from mmtrack.registry import METRICS +from .base_video_metrics import collect_tracking_results + + +@METRICS.register_module() +class CocoVideoMetric(CocoMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + """ + + def __init__(self, ann_file: Optional[str] = None, **kwargs) -> None: + super().__init__(**kwargs) + # if ann_file is not specified, + # initialize coco api with the converted dataset + if ann_file: + file_client = FileClient.infer_client(uri=ann_file) + with file_client.get_local_path(ann_file) as local_path: + self._coco_api = COCO(local_path) + else: + self._coco_api = None + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Note that we only modify ``pred['pred_instances']`` in ``CocoMetric`` + to ``pred['pred_det_instances']`` here. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_det_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._coco_api is None: + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. 
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/mmtrack/evaluation/metrics/mot_challenge_metrics.py b/mmtrack/evaluation/metrics/mot_challenge_metrics.py new file mode 100644 index 000000000..d223baa92 --- /dev/null +++ b/mmtrack/evaluation/metrics/mot_challenge_metrics.py @@ -0,0 +1,421 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import defaultdict +from glob import glob +from typing import List, Optional, Sequence, Union + +import numpy as np +import torch +import trackeval +from mmengine.dist import (all_gather_object, barrier, broadcast, + broadcast_object_list, get_dist_info, + is_main_process) +from mmengine.logging import MMLogger + +from mmtrack.registry import METRICS, TASK_UTILS +from .base_video_metrics import BaseVideoMetric + + +def get_tmpdir() -> str: + """return the same tmpdir for all processes.""" + rank, world_size = get_dist_info() + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + return tmpdir + + +@METRICS.register_module() +class MOTChallengeMetrics(BaseVideoMetric): + """Evaluation metrics for MOT Challenge. + + Args: + metric (str | list[str]): Metrics to be evaluated. Options are + 'HOTA', 'CLEAR', 'Identity'. + Defaults to ['HOTA', 'CLEAR', 'Identity']. + outfile_prefix (str, optional): Path to save the formatted results. + Defaults to None. + track_iou_thr (float): IoU threshold for tracking evaluation. + Defaults to 0.5. + benchmark (str): Benchmark to be evaluated. Defaults to 'MOT17'. + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. + postprocess_tracklet_cfg (List[dict], optional): configs for tracklets + postprocessing methods. `AppearanceFreeLink` and + `InterpolateTracklets` are supported. Defaults to []. + - AppearanceFreeLink: + - checkpoint (str): Checkpoint path. + - temporal_threshold (tuple, optional): The temporal constraint + for tracklets association. Defaults to (0, 30). + - spatial_threshold (int, optional): The spatial constraint for + tracklets association. Defaults to 75. + - confidence_threshold (float, optional): The minimum + confidence threshold for tracklets association. + Defaults to 0.95. + - InterpolateTracklets: + - min_num_frames (int, optional): The minimum length of a + track that will be interpolated. Defaults to 5. + - max_num_frames (int, optional): The maximum disconnected + length in a track. Defaults to 20. + - use_gsi (bool, optional): Whether to use the GSI (Gaussian- + smoothed interpolation) method. Defaults to False. + - smooth_tau (int, optional): smoothing parameter in GSI. + Defaults to 10. 
+ collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + Returns: + """ + TRACKER = 'default-tracker' + allowed_metrics = ['HOTA', 'CLEAR', 'Identity'] + allowed_benchmarks = ['MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack'] + default_prefix: Optional[str] = 'motchallenge-metric' + + def __init__(self, + metric: Union[str, List[str]] = ['HOTA', 'CLEAR', 'Identity'], + outfile_prefix: Optional[str] = None, + track_iou_thr: float = 0.5, + benchmark: str = 'MOT17', + format_only: bool = False, + postprocess_tracklet_cfg: Optional[List[dict]] = [], + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + self.format_only = format_only + self.postprocess_tracklet_cfg = postprocess_tracklet_cfg.copy() + self.postprocess_tracklet_methods = [ + TASK_UTILS.build(cfg) for cfg in self.postprocess_tracklet_cfg + ] + assert benchmark in self.allowed_benchmarks + self.benchmark = benchmark + self.track_iou_thr = track_iou_thr + self.tmp_dir = tempfile.TemporaryDirectory() + self.tmp_dir.name = get_tmpdir() + self.seq_info = defaultdict( + lambda: dict(seq_length=-1, gt_tracks=[], pred_tracks=[])) + self.gt_dir = self._get_gt_dir() + self.pred_dir = self._get_pred_dir(outfile_prefix) + self.seqmap = osp.join(self.pred_dir, 'videoseq.txt') + with open(self.seqmap, 'w') as f: + f.write('name\n') + + def __del__(self): + # To avoid tmpdir being cleaned up too early, because in multiple + # consecutive ValLoops, the value of `self.tmp_dir.name` is unchanged, + # and calling `tmp_dir.cleanup()` in compute_metrics will cause errors. + self.tmp_dir.cleanup() + + def _get_pred_dir(self, outfile_prefix): + """Get directory to save the prediction results.""" + logger: MMLogger = MMLogger.get_current_instance() + + if outfile_prefix is None: + outfile_prefix = self.tmp_dir.name + else: + if osp.exists(outfile_prefix) and is_main_process(): + logger.info('remove previous results.') + shutil.rmtree(outfile_prefix) + pred_dir = osp.join(outfile_prefix, self.TRACKER) + os.makedirs(pred_dir, exist_ok=True) + return pred_dir + + def _get_gt_dir(self): + """Get directory to save the gt files.""" + output_dir = osp.join(self.tmp_dir.name, 'gt') + os.makedirs(output_dir, exist_ok=True) + return output_dir + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
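As a usage sketch, the metric is normally wired in through the evaluator config; all keys below are taken from the argument list above, while the specific values are placeholders.

```python
# Illustrative evaluator config for MOTChallengeMetrics.
val_evaluator = dict(
    type='MOTChallengeMetrics',
    metric=['HOTA', 'CLEAR', 'Identity'],
    benchmark='MOT17',
    track_iou_thr=0.5,
    # Optional tracklet post-processing before scoring.
    postprocess_tracklet_cfg=[
        dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20)
    ],
    # Set format_only=True (with an outfile_prefix) to only dump the
    # MOT-format txt files and skip TrackEval.
    format_only=False)
test_evaluator = val_evaluator
```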
+ """ + for data_sample in data_samples: + # load basic info + frame_id = data_sample['frame_id'] + video_length = data_sample['video_length'] + video = data_sample['img_path'].split(os.sep)[-3] + if self.seq_info[video]['seq_length'] == -1: + self.seq_info[video]['seq_length'] = video_length + + # load gts + if 'instances' in data_sample: + gt_instances = data_sample['instances'] + gt_tracks = [ + np.array([ + frame_id + 1, gt_instances[i]['instance_id'], + gt_instances[i]['bbox'][0], gt_instances[i]['bbox'][1], + gt_instances[i]['bbox'][2] - + gt_instances[i]['bbox'][0], + gt_instances[i]['bbox'][3] - + gt_instances[i]['bbox'][1], + gt_instances[i]['mot_conf'], + gt_instances[i]['category_id'], + gt_instances[i]['visibility'] + ]) for i in range(len(gt_instances)) + ] + self.seq_info[video]['gt_tracks'].extend(gt_tracks) + + # load predictions + assert 'pred_track_instances' in data_sample + pred_instances = data_sample['pred_track_instances'] + pred_tracks = [ + np.array([ + frame_id + 1, pred_instances['instances_id'][i].cpu(), + pred_instances['bboxes'][i][0].cpu(), + pred_instances['bboxes'][i][1].cpu(), + (pred_instances['bboxes'][i][2] - + pred_instances['bboxes'][i][0]).cpu(), + (pred_instances['bboxes'][i][3] - + pred_instances['bboxes'][i][1]).cpu(), + pred_instances['scores'][i].cpu() + ]) for i in range(len(pred_instances['instances_id'])) + ] + self.seq_info[video]['pred_tracks'].extend(pred_tracks) + + if frame_id == video_length - 1: + # postprocessing + if self.postprocess_tracklet_cfg: + info = self.seq_info[video] + pred_tracks = np.array(info['pred_tracks']) + for postprocess_tracklet_methods in \ + self.postprocess_tracklet_methods: + pred_tracks = postprocess_tracklet_methods\ + .forward(pred_tracks) + info['pred_tracks'] = pred_tracks + self._save_one_video_gts_preds(video) + break + + def _save_one_video_gts_preds(self, seq: str) -> None: + """Save the gt and prediction results.""" + info = self.seq_info[seq] + # save predictions + pred_file = osp.join(self.pred_dir, seq + '.txt') + + pred_tracks = np.array(info['pred_tracks']) + + with open(pred_file, 'wt') as f: + for tracks in pred_tracks: + line = '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,-1,-1,-1\n' % ( + tracks[0], tracks[1], tracks[2], tracks[3], tracks[4], + tracks[5], tracks[6]) + f.writelines(line) + + info['pred_tracks'] = [] + # save gts + if info['gt_tracks']: + gt_file = osp.join(self.gt_dir, seq + '.txt') + with open(gt_file, 'wt') as f: + for tracks in info['gt_tracks']: + line = '%d,%d,%d,%d,%d,%d,%d,%d,%.5f\n' % ( + tracks[0], tracks[1], tracks[2], tracks[3], tracks[4], + tracks[5], tracks[6], tracks[7], tracks[8]) + f.writelines(line) + info['gt_tracks'].clear() + # save seq info + with open(self.seqmap, 'a') as f: + f.write(seq + '\n') + f.close() + + def compute_metrics(self, results: list = None) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Defaults to None. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # NOTICE: don't access `self.results` from the method. 
+ eval_results = dict() + + if self.format_only: + return eval_results + + eval_config = trackeval.Evaluator.get_default_eval_config() + + # need to split out the tracker name + # caused by the implementation of TrackEval + pred_dir_tmp = self.pred_dir.rsplit(osp.sep, 1)[0] + dataset_config = self.get_dataset_cfg(self.gt_dir, pred_dir_tmp) + + evaluator = trackeval.Evaluator(eval_config) + dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)] + metrics = [ + getattr(trackeval.metrics, + metric)(dict(METRICS=[metric], THRESHOLD=0.5)) + for metric in self.metrics + ] + output_res, _ = evaluator.evaluate(dataset, metrics) + output_res = output_res['MotChallenge2DBox'][ + self.TRACKER]['COMBINED_SEQ']['pedestrian'] + + if 'HOTA' in self.metrics: + logger.info('Evaluating HOTA Metrics...') + eval_results['HOTA'] = np.average(output_res['HOTA']['HOTA']) + eval_results['AssA'] = np.average(output_res['HOTA']['AssA']) + eval_results['DetA'] = np.average(output_res['HOTA']['DetA']) + + if 'CLEAR' in self.metrics: + logger.info('Evaluating CLEAR Metrics...') + eval_results['MOTA'] = np.average(output_res['CLEAR']['MOTA']) + eval_results['MOTP'] = np.average(output_res['CLEAR']['MOTP']) + eval_results['IDSW'] = np.average(output_res['CLEAR']['IDSW']) + eval_results['TP'] = np.average(output_res['CLEAR']['CLR_TP']) + eval_results['FP'] = np.average(output_res['CLEAR']['CLR_FP']) + eval_results['FN'] = np.average(output_res['CLEAR']['CLR_FN']) + eval_results['Frag'] = np.average(output_res['CLEAR']['Frag']) + eval_results['MT'] = np.average(output_res['CLEAR']['MT']) + eval_results['ML'] = np.average(output_res['CLEAR']['ML']) + + if 'Identity' in self.metrics: + logger.info('Evaluating Identity Metrics...') + eval_results['IDF1'] = np.average(output_res['Identity']['IDF1']) + eval_results['IDTP'] = np.average(output_res['Identity']['IDTP']) + eval_results['IDFN'] = np.average(output_res['Identity']['IDFN']) + eval_results['IDFP'] = np.average(output_res['Identity']['IDFP']) + eval_results['IDP'] = np.average(output_res['Identity']['IDP']) + eval_results['IDR'] = np.average(output_res['Identity']['IDR']) + + # clean all txt file + for txt_name in glob(osp.join(self.tmp_dir.name, '*.txt')): + os.remove(txt_name) + return eval_results + + def evaluate(self, size: int = None) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + Defaults to None. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + # wait for all processes to complete prediction. + barrier() + + # gather seq_info and convert the list of dict to a dict. + # convert self.seq_info to dict first to make it picklable. + gathered_seq_info = all_gather_object(dict(self.seq_info)) + all_seq_info = dict() + for _seq_info in gathered_seq_info: + all_seq_info.update(_seq_info) + self.seq_info = all_seq_info + + if is_main_process(): + _metrics = self.compute_metrics() # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + def get_dataset_cfg(self, gt_folder: str, tracker_folder: str): + """Get default configs for trackeval.datasets.MotChallenge2DBox. 
+ + Args: + gt_folder (str): the name of the GT folder + tracker_folder (str): the name of the tracker folder + + Returns: + Dataset Configs for MotChallenge2DBox. + """ + dataset_config = dict( + # Location of GT data + GT_FOLDER=gt_folder, + # Trackers location + TRACKERS_FOLDER=tracker_folder, + # Where to save eval results + # (if None, same as TRACKERS_FOLDER) + OUTPUT_FOLDER=None, + # Use self.TRACKER as the default tracker + TRACKERS_TO_EVAL=[self.TRACKER], + # Option values: ['pedestrian'] + CLASSES_TO_EVAL=['pedestrian'], + # Option Values: 'MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack' + BENCHMARK=self.benchmark, + # Option Values: 'train', 'test' + SPLIT_TO_EVAL='val' if self.benchmark == 'DanceTrack' else 'train', + # Whether tracker input files are zipped + INPUT_AS_ZIP=False, + # Whether to print current config + PRINT_CONFIG=True, + # Whether to perform preprocessing + # (never done for MOT15) + DO_PREPROC=False if self.benchmark == 'MOT15' else True, + # Tracker files are in + # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER + TRACKER_SUB_FOLDER='', + # Output files are saved in + # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER + OUTPUT_SUB_FOLDER='', + # Names of trackers to display + # (if None: TRACKERS_TO_EVAL) + TRACKER_DISPLAY_NAMES=None, + # Where seqmaps are found + # (if None: GT_FOLDER/seqmaps) + SEQMAP_FOLDER=None, + # Directly specify seqmap file + # (if none use seqmap_folder/benchmark-split_to_eval) + SEQMAP_FILE=self.seqmap, + # If not None, specify sequences to eval + # and their number of timesteps + SEQ_INFO={ + seq: info['seq_length'] + for seq, info in self.seq_info.items() + }, + # '{gt_folder}/{seq}.txt' + GT_LOC_FORMAT='{gt_folder}/{seq}.txt', + # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in + # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/ + # If True, the middle 'benchmark-split' folder is skipped for both. + SKIP_SPLIT_FOL=True, + ) + + return dataset_config diff --git a/mmtrack/evaluation/metrics/reid_metrics.py b/mmtrack/evaluation/metrics/reid_metrics.py new file mode 100644 index 000000000..fa7733189 --- /dev/null +++ b/mmtrack/evaluation/metrics/reid_metrics.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric + +from mmtrack.registry import METRICS + + +@METRICS.register_module() +class ReIDMetrics(BaseMetric): + """mAP and CMC evaluation metrics for the ReID task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Default value is `mAP`. + metric_options: (dict, optional): Options for calculating metrics. + Allowed keys are 'rank_list' and 'max_rank'. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. 
Default: None + """ + allowed_metrics = ['mAP', 'CMC'] + default_prefix: Optional[str] = 'reid-metric' + + def __init__(self, + metric: Union[str, Sequence[str]] = 'mAP', + metric_options: Optional[dict] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + + self.metric_options = metric_options or dict( + rank_list=[1, 5, 10, 20], max_rank=20) + for rank in self.metric_options['rank_list']: + assert 1 <= rank <= self.metric_options['max_rank'] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + pred_feature = data_sample['pred_feature'] + assert isinstance(pred_feature, torch.Tensor) + gt_label = data_sample.get('gt_label', data_sample['gt_label']) + assert isinstance(gt_label['label'], torch.Tensor) + result = dict( + pred_feature=pred_feature.data.cpu(), + gt_label=gt_label['label'].cpu()) + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + # NOTICE: don't access `self.results` from the method. + metrics = {} + + pids = torch.cat([result['gt_label'] for result in results]).numpy() + features = torch.stack([result['pred_feature'] for result in results]) + + n, c = features.size() + mat = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n) + distmat = mat + mat.t() + distmat.addmm_(features, features.t(), beta=1, alpha=-2) + distmat = distmat.numpy() + + indices = np.argsort(distmat, axis=1) + matches = (pids[indices] == pids[:, np.newaxis]).astype(np.int32) + + all_cmc = [] + all_AP = [] + num_valid_q = 0. + for q_idx in range(n): + # remove self + raw_cmc = matches[q_idx][1:] + if not np.any(raw_cmc): + # this condition is true when query identity + # does not appear in gallery + continue + + cmc = raw_cmc.cumsum() + cmc[cmc > 1] = 1 + + all_cmc.append(cmc[:self.metric_options['max_rank']]) + num_valid_q += 1. + + # compute average precision + num_rel = raw_cmc.sum() + tmp_cmc = raw_cmc.cumsum() + tmp_cmc = [x / (i + 1.) 
for i, x in enumerate(tmp_cmc)] + tmp_cmc = np.asarray(tmp_cmc) * raw_cmc + AP = tmp_cmc.sum() / num_rel + all_AP.append(AP) + + assert num_valid_q > 0, \ + 'Error: all query identities do not appear in gallery' + + all_cmc = np.asarray(all_cmc) + all_cmc = all_cmc.sum(0) / num_valid_q + mAP = np.mean(all_AP) + + if 'mAP' in self.metrics: + metrics['mAP'] = np.around(mAP, decimals=3) + if 'CMC' in self.metrics: + for rank in self.metric_options['rank_list']: + metrics[f'R{rank}'] = np.around(all_cmc[rank - 1], decimals=3) + + return metrics diff --git a/mmtrack/evaluation/metrics/sot_metrics.py b/mmtrack/evaluation/metrics/sot_metrics.py new file mode 100644 index 000000000..aedff6921 --- /dev/null +++ b/mmtrack/evaluation/metrics/sot_metrics.py @@ -0,0 +1,388 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import OrderedDict +from copy import deepcopy +from typing import Dict, List, Optional, Sequence, Union + +import mmengine +import numpy as np +from mmengine.logging import MMLogger +from mmengine.utils import mkdir_or_exist +from tabulate import tabulate + +from mmtrack.registry import METRICS +from mmtrack.utils import format_video_level_show +from ..functional import (eval_sot_accuracy_robustness, eval_sot_eao, + eval_sot_ope) +from .base_video_metrics import BaseVideoMetric + + +@METRICS.register_module() +class SOTMetric(BaseVideoMetric): + """SOT evaluation metrics. + + Args: + metric (Union[str, Sequence[str]], optional): Metrics to be evaluated. + Valid metrics are included in ``self.allowed_metrics``. + Defaults to 'OPE'. + metric_options (Optional[dict], optional): Options for calculating + metrics. Defaults to dict(dataset_type='vot2018', + only_eval_visible=False). + format_only (bool, optional): If True, only formatting the results to + the official format and not performing evaluation. + Defaults to False. + outfile_prefix (Optional[str], optional): The prefix of json files. It + includes the file path and the prefix of filename, + e.g., "a/b/prefix". If not specified, a temp file will be created. + Defaults to None. + collect_device (str, optional): Device name used for collecting results + from different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (Optional[str], optional): The prefix that will be added in the + metric names to disambiguate homonymous metrics of different + evaluators. If prefix is not provided in the argument, + self.default_prefix will be used instead. Defaults to None. + saved_track_res_path (Optional[str], optional): The saved path of + tracked results. Defaults to None. + options_after_eval (Optional[dict], optional): The options after + evaluation. Defaults to {}. The usage is the following: + ``` + options_after_eval = dict( + saved_eval_res_file = './results/sot_res.json', + tracker_name = 'siamrpn++', + eval_show_video_indices = 10) + ``` + Here, ``eval_show_video_indices`` is used to index a numpy.ndarray. + It can be int (positive or negative) or list. + ``saved_eval_res_file`` must be a json/yaml/pickle file. 
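A hedged configuration sketch for the two evaluation modes described above; the tracker name and file paths are placeholders.

```python
# OPE evaluation, additionally dumping per-threshold curves and a
# per-video table (see options_after_eval above).
val_evaluator = dict(
    type='SOTMetric',
    metric='OPE',
    options_after_eval=dict(
        saved_eval_res_file='./results/sot_res.json',
        tracker_name='siamrpn++',
        eval_show_video_indices=10))

# VOT-style evaluation (EAO, accuracy and robustness).
vot_evaluator = dict(
    type='SOTMetric',
    metric='VOT',
    metric_options=dict(dataset_type='vot2018', only_eval_visible=False))
```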
+ """ + default_prefix: Optional[str] = 'sot' + allowed_metrics = ['OPE', 'VOT'] + allowed_metric_options = ['dataset_type', 'only_eval_visible'] + VOT_INTERVAL = dict(vot2018=[100, 356], vot2019=[46, 291]) + + def __init__(self, + metric: Union[str, Sequence[str]] = 'OPE', + metric_options: Optional[dict] = dict( + dataset_type='vot2018', only_eval_visible=False), + format_only: bool = False, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + saved_track_res_path: Optional[str] = None, + options_after_eval: dict = {}) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.metrics = metric if isinstance(metric, list) else [metric] + assert not ( + 'OPE' in self.metrics and 'VOT' in self.metrics + ), 'We can not evaluate one tracking result on both OPE and ' + 'VOT metrics since the track result on VOT mode ' + 'may be not the true bbox coordinates.' + self.metric_options = metric_options + for metric in self.metrics: + if metric not in self.allowed_metrics: + raise KeyError( + f'metric should be in {str(self.allowed_metrics)}, ' + f'but got {metric}.') + for metric_option in self.metric_options: + if metric_option not in self.allowed_metric_options: + raise KeyError( + f'metric option should be in {str(self.allowed_metric_options)}, ' # noqa: E501 + f'but got {metric_option}.') + self.outfile_prefix = outfile_prefix + self.format_only = format_only + self.saved_track_res_path = saved_track_res_path + self.options_after_eval = options_after_eval + self.preds_per_video, self.gts_per_video = [], [] + self.frame_ids, self.visible_per_video = [], [] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + data_instance = data_sample['instances'][0] + + self.preds_per_video.append( + data_sample['pred_track_instances']['bboxes'][0].cpu().numpy()) + if 'bbox' in data_instance: + self.gts_per_video.append(data_instance['bbox']) + else: + assert self.format_only, 'If there is no ground truth ' + "bounding bbox, 'format_only' must be True" + self.visible_per_video.append(data_instance['visible']) + self.frame_ids.append(data_sample['frame_id']) + + if data_sample['frame_id'] == data_sample['video_length'] - 1: + img_path_split = data_sample['img_path'].split(os.sep) + # The ``img_path`` in LaSOT, OTB100 and VOT2018 have an extra + # common directory outside the *.jpg file. + video_name = img_path_split[-2] if img_path_split[-2] not in [ + 'img', 'color' + ] else img_path_split[-3] + result = dict( + video_name=video_name, + video_id=data_sample['video_id'], + video_size=(data_sample['ori_shape'][1], + data_sample['ori_shape'][0]), + frame_ids=deepcopy(self.frame_ids), + # Collect the annotations and predictions of this video. + # We don't convert the ``preds_per_video`` to + # ``np.ndarray`` since the track results in SOT may not the + # tracking box in EAO metrics. 
+ pred_bboxes=deepcopy(self.preds_per_video), + gt_bboxes=np.array(self.gts_per_video, dtype=np.float32), + visible=np.array(self.visible_per_video, dtype=bool)) + + self.frame_ids.clear() + self.preds_per_video.clear() + self.gts_per_video.clear() + self.visible_per_video.clear() + + self.results.append(result) + break + + def compute_metrics(self, results: List) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List): The processed results of all data. The elements of + the list are the processed results of one video. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # 1. Convert the format of reuslts for evaluation. + all_pred_bboxes, all_gt_bboxes = [], [] + all_video_names, all_video_sizes, all_visible = [], [], [] + for result in results: + all_video_names.append(result['video_name']) + all_video_sizes.append(result['video_size']) + all_pred_bboxes.append(result['pred_bboxes']) + all_gt_bboxes.append(result['gt_bboxes']) + all_visible.append(result['visible']) + + if self.saved_track_res_path is not None: + self.save_tracked_results(all_pred_bboxes, all_video_names, + self.saved_track_res_path) + + # 2. Fromat-only (Optional) + if self.format_only: + self.save_formatted_results(all_pred_bboxes, all_video_names) + return dict() + + # 3. Evaluation (Optional) + eval_results = OrderedDict() + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + if metric == 'OPE': + if self.metric_options.get('only_eval_visible', False): + results_ope = eval_sot_ope(all_pred_bboxes, all_gt_bboxes, + all_visible) + else: + results_ope = eval_sot_ope(all_pred_bboxes, all_gt_bboxes) + + ori_success = results_ope.pop('ori_success') + ori_norm_precision = results_ope.pop('ori_norm_precision') + ori_precision = results_ope.pop('ori_precision') + eval_results.update(results_ope) + + saved_file_path = self.options_after_eval.get( + 'saved_eval_res_file', None) + if saved_file_path is not None: + if not saved_file_path.endswith( + ('.json', '.yml', '.yaml', '.pkl')): # noqa: E125 + raise TypeError( + f'Unsupported file format: {saved_file_path}. ' + 'Please specify a json, yaml or pickle file.') + if 'tracker_name' not in self.options_after_eval: + logger.warning( + 'Not specify tracker name in the ' + 'argument options_after_eval and use the default ' + "tracker name: 'anonymous_tracker'") + tracker_name = 'anonymous_tracker' + else: + tracker_name = self.options_after_eval['tracker_name'] + mkdir_or_exist(osp.dirname(saved_file_path)) + ori_eval_res = { + tracker_name: + dict( + success=np.mean(ori_success, axis=0), + norm_precision=np.mean(ori_norm_precision, axis=0), + precision=np.mean(ori_precision, axis=0)) + } + mmengine.dump(ori_eval_res, saved_file_path) + logger.info( + 'save evaluation results with different thresholds in ' + f"'{saved_file_path}'") + + if self.options_after_eval.get('eval_show_video_indices', + None) is not None: + success_per_video = np.mean(ori_success, axis=1) + norm_precision_per_video = ori_norm_precision[:, 20] + precision_per_video = ori_precision[:, 20] + + eval_show_results = format_video_level_show( + all_video_names, [ + success_per_video, norm_precision_per_video, + precision_per_video + ], + sort_by_first_metric=True, + show_indices=self. 
+ options_after_eval['eval_show_video_indices']) + + logger.info('\n' + tabulate( + eval_show_results, + headers=[ + 'video_name', 'success', 'norm_precision', + 'precision' + ])) + + elif metric == 'VOT': + if 'interval' in self.metric_options: + interval = self.metric_options['interval'] + else: + interval = self.VOT_INTERVAL.get( + self.metric_options['dataset_type'], None) + eao_scores = eval_sot_eao( + all_pred_bboxes, + all_gt_bboxes, + videos_wh=all_video_sizes, + interval=interval) + eval_results.update(eao_scores) + accuracy_robustness = eval_sot_accuracy_robustness( + all_pred_bboxes, all_gt_bboxes, videos_wh=all_video_sizes) + eval_results.update(accuracy_robustness) + + else: + raise KeyError( + f"metric '{metric}' is not supported. Please use the " + f'metric in {str(self.allowed_metrics)}') + + return eval_results + + def save_formatted_results_got10k(self, results: List[List[np.ndarray]], + video_names: List[str], + outfile_prefix: str): + """Save the formatted results in TrackingNet dataset for evaluation on + the test server. + + Args: + results (List[List[np.ndarray]]): The formatted results. + video_names (List[str]): The video names. + outfile_prefix (str): The prefix of output files. + """ + for result, video_name in zip(results, video_names): + video_outfile_dir = osp.join(outfile_prefix, video_name) + if not osp.isdir(video_outfile_dir): + os.makedirs(video_outfile_dir, exist_ok=True) + video_bbox_txt = osp.join(video_outfile_dir, + '{}_001.txt'.format(video_name)) + video_time_txt = osp.join(video_outfile_dir, + '{}_time.txt'.format(video_name)) + with open(video_bbox_txt, + 'w') as f_bbox, open(video_time_txt, 'w') as f_time: + + for bbox in result: + bbox = [ + str(f'{bbox[0]:.4f}'), + str(f'{bbox[1]:.4f}'), + str(f'{(bbox[2] - bbox[0]):.4f}'), + str(f'{(bbox[3] - bbox[1]):.4f}') + ] + line = ','.join(bbox) + '\n' + f_bbox.writelines(line) + # We don't record testing time, so we set a default + # time in order to test on the server. + f_time.writelines('0.0001\n') + + def save_formatted_results_trackingnet(self, + results: List[List[np.ndarray]], + video_names: List[str], + outfile_prefix: str): + """Save the formatted results in TrackingNet dataset for evaluation on + the test server. + + Args: + results (List[List[np.ndarray]]): The formatted results. + video_names (List[str]): The video names. + outfile_prefix (str): The prefix of output files. + """ + for result, video_name in zip(results, video_names): + video_txt = osp.join(outfile_prefix, f'{video_name}.txt') + with open(video_txt, 'w') as f: + for bbox in result: + bbox = [ + str(f'{bbox[0]:.4f}'), + str(f'{bbox[1]:.4f}'), + str(f'{(bbox[2] - bbox[0]):.4f}'), + str(f'{(bbox[3] - bbox[1]):.4f}') + ] + line = ','.join(bbox) + '\n' + f.writelines(line) + + def save_formatted_results(self, results: List[List[np.ndarray]], + video_names: List[str]): + """Save the formatted results for evaluation on the test server. + + Args: + results (List[List[np.ndarray]]): The formatted results. + video_names (List[str]): The video names. 
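Both writers above emit plain comma-separated (x, y, w, h) lines converted from corner-format predictions; a tiny standalone sketch of that formatting, with made-up numbers:

```python
# One formatted submission line, converted from a corner-format prediction.
bbox = [10.0, 20.0, 60.0, 100.0]  # [tl_x, tl_y, br_x, br_y]
line = ','.join([
    f'{bbox[0]:.4f}', f'{bbox[1]:.4f}',
    f'{bbox[2] - bbox[0]:.4f}', f'{bbox[3] - bbox[1]:.4f}'
]) + '\n'
print(line, end='')  # 10.0000,20.0000,50.0000,80.0000
```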
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # prepare saved dir + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if not osp.isdir(outfile_prefix): + os.makedirs(outfile_prefix, exist_ok=True) + + dataset_type = self.metric_options.get('dataset_type', 'got10k') + if dataset_type == 'got10k': + self.save_formatted_results_got10k(results, video_names, + outfile_prefix) + elif dataset_type == 'trackingnet': + self.save_formatted_results_trackingnet(results, video_names, + outfile_prefix) + shutil.make_archive(outfile_prefix, 'zip', outfile_prefix) + shutil.rmtree(outfile_prefix) + logger.info( + f'-------- The formatted results are stored in {outfile_prefix}.zip --------' # noqa: E501 + ) + + def save_tracked_results(self, results: List[List[np.ndarray]], + video_names: List[str], saved_path: str): + """Save the tracked results. + + Args: + results (List[List[np.ndarray]]): The tracked results. + video_names (List[str]): The video names. + saved_path (str): The saved path of tracked results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + if not osp.isdir(saved_path): + os.makedirs(saved_path, exist_ok=True) + + self.save_formatted_results_trackingnet(results, video_names, + saved_path) + logger.info( + f'-------- The whole tracked results are stored in the fold {saved_path}--------' # noqa: E501 + ) diff --git a/mmtrack/evaluation/metrics/tao_metrics.py b/mmtrack/evaluation/metrics/tao_metrics.py new file mode 100644 index 000000000..031139069 --- /dev/null +++ b/mmtrack/evaluation/metrics/tao_metrics.py @@ -0,0 +1,450 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +from collections import defaultdict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +from mmengine.logging import MMLogger + +from mmtrack.registry import METRICS +from .base_video_metrics import BaseVideoMetric + +try: + import tao + from tao.toolkit.tao import Tao, TaoEval +except ImportError: + tao = None + +try: + import lvis + from lvis import LVIS, LVISEval, LVISResults +except ImportError: + lvis = None + + +@METRICS.register_module() +class TAOMetric(BaseVideoMetric): + """mAP evaluation metrics for the TAO task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Defaults to 'tao_track_ap'. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + outfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonyms metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. 
+ """ + + default_prefix: Optional[str] = 'tao' + + def __init__(self, + metric: Union[str, List[str]] = 'tao_track_ap', + metric_items: Optional[Sequence[str]] = None, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + format_only: bool = False) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # tao evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + self.format_only = format_only + allowed_metrics = ['tao_track_ap', 'bbox'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError(f"metric should be 'tao_track_ap' or 'bbox'," + f' but got {metric}.') + + self.metric_items = metric_items + self.outfile_prefix = outfile_prefix + self.per_video_res = [] + self.img_ids = [] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred_track = data_sample['pred_track_instances'] + pred_det = data_sample['pred_det_instances'] + frame_id = data_sample['frame_id'] + video_length = data_sample['video_length'] + + result['img_id'] = data_sample['img_id'] + result['track_bboxes'] = pred_track['bboxes'].cpu().numpy() + result['track_scores'] = pred_track['scores'].cpu().numpy() + result['track_labels'] = pred_track['labels'].cpu().numpy() + result['track_instances_id'] = pred_track['instances_id'].cpu( + ).numpy() + + result['det_bboxes'] = pred_det['bboxes'].cpu().numpy() + result['det_scores'] = pred_det['scores'].cpu().numpy() + result['det_labels'] = pred_det['labels'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + keys = [ + 'frame_id', 'frame_index', 'neg_category_ids', + 'not_exhaustive_category_ids', 'img_id', 'video_id', + 'video_length' + ] + for key in keys: + if key not in data_sample: + raise KeyError( + f'The key {key} is not found in track_data_sample,' + f' please pass it into the meta_keys' + f' of the PackTrackInputs') + gt[key] = data_sample[key] + + # When the ground truth exists, get annotation from `instances`. + # In general, it contains `bbox`, `bbox_label` and `instance_id`. + if 'instances' in data_sample: + gt['anns'] = data_sample['instances'] + else: + gt['anns'] = dict() + self.per_video_res.append((result, gt)) + + if frame_id == video_length - 1: + preds, gts = zip(*self.per_video_res) + # format the results + gt_results, tao_meta_info = self._format_one_video_gts(gts) + pred_track_results, pred_det_results = \ + self._format_one_video_preds(preds, tao_meta_info) + self.per_video_res.clear() + # add converted result to the results list + self.results.append((pred_track_results, pred_det_results, + gt_results, tao_meta_info)) + + def compute_metrics(self, results: List) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + # split gt and prediction list + tmp_pred_track_results, tmp_pred_det_results, \ + tmp_gt_results, tmp_meta_info = zip(*results) + tao_meta_info = self.format_meta(tmp_meta_info) + gt_results = self.format_gts(tmp_gt_results, tao_meta_info) + pred_track_results = self.format_preds(tmp_pred_track_results) + pred_det_results = self.format_preds(tmp_pred_det_results) + + if 'bbox' in self.metrics: + # LVIS api only supports reading from files, hence, + # save the json result to tmp dir + tmp_dir = tempfile.TemporaryDirectory() + pred_det_results_path = f'{tmp_dir.name}/tao_bbox.json' + gt_results_path = f'{tmp_dir.name}/tao_gt.json' + mmengine.dump(pred_det_results, pred_det_results_path) + mmengine.dump(gt_results, gt_results_path) + + if self.format_only: + self.save_pred_results(pred_track_results, 'track') + self.save_pred_results(pred_det_results, 'det') + return dict() + + eval_results = dict() + + if 'tao_track_ap' in self.metrics: + if tao is None: + raise ImportError( + 'Please run' + ' pip install git+https://github.com/TAO-Dataset/tao.git ' + 'to manually install tao') + + logger.info('Evaluating tracking results...') + tao_gt = Tao(gt_results) + tao_eval = TaoEval(tao_gt, pred_track_results) + tao_eval.params.img_ids = self.img_ids + tao_eval.params.cat_ids = list( + self.dataset_meta['categories'].keys()) + tao_eval.params.iou_thrs = np.array([0.5, 0.75]) + tao_eval.run() + + tao_eval.print_results() + tao_results = tao_eval.get_results() + for k, v in tao_results.items(): + if isinstance(k, str) and k.startswith('AP'): + key = 'track_{}'.format(k) + val = float('{:.3f}'.format(float(v))) + eval_results[key] = val + + if 'bbox' in self.metrics: + if lvis is None: + raise ImportError( + 'Please run' + ' pip install git+https://github.com/lvis-dataset/lvis-api.git ' # noqa + 'to manually install lvis') + + logger.info('Evaluating detection results...') + lvis_gt = LVIS(gt_results_path) + lvis_dt = LVISResults(lvis_gt, pred_det_results_path) + lvis_eval = LVISEval(lvis_gt, lvis_dt, 'bbox') + lvis_eval.params.imgIds = self.img_ids + lvis_eval.params.catIds = list( + self.dataset_meta['categories'].keys()) + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + lvis_eval.print_results() + + lvis_results = lvis_eval.get_results() + for k, v in lvis_results.items(): + if k.startswith('AP'): + key = '{}_{}'.format('bbox', k) + val = float('{:.3f}'.format(float(v))) + eval_results[key] = val + tmp_dir.cleanup() + return eval_results + + def format_meta(self, parts_meta: Tuple[dict]) -> dict: + """Gather all meta info from self.results.""" + all_seq_vids_info = [] + all_seq_imgs_info = [] + all_seq_tracks_info = [] + for _seq_info in parts_meta: + all_seq_vids_info.extend(_seq_info['videos']) + all_seq_imgs_info.extend(_seq_info['images']) + all_seq_tracks_info.extend(_seq_info['tracks']) + + # update tao_meta_info + tao_meta_info = dict( + videos=all_seq_vids_info, + images=all_seq_imgs_info, + tracks=all_seq_tracks_info) + + return tao_meta_info + + def format_gts(self, gts: Tuple[List], tao_meta_info: dict) -> dict: + """Gather all ground-truth from self.results.""" + categories = list(self.dataset_meta['categories'].values()) + for img_info in tao_meta_info['images']: + self.img_ids.append(img_info['id']) + gt_results = dict( + info=dict(), + images=tao_meta_info['images'], + categories=categories, + videos=tao_meta_info['videos'], + annotations=[], + tracks=tao_meta_info['tracks']) + + ann_id = 1 + for 
gt_result in gts: + for ann in gt_result: + ann['id'] = ann_id + gt_results['annotations'].append(ann) + ann_id += 1 + + return gt_results + + def format_preds(self, preds: Tuple[List]) -> List: + """Gather all predictions from self.results.""" + pred_results = [] + max_track_id = 0 + for pred_result in preds: + # update track id + if 'track_id' in pred_result[0]: + track_ids = [] + for ins_info in pred_result: + track_ids.append(ins_info['track_id']) + ins_info['track_id'] += max_track_id + track_ids = list(set(track_ids)) + max_track_id += max(track_ids) + 1 + + pred_results.extend(pred_result) + return pred_results + + def _format_one_video_preds(self, pred_dicts: Tuple[dict], + tao_meta_info: Dict) -> Tuple[List, List]: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + pred_dicts (Tuple[dict]): Prediction of the dataset. + tao_meta_info (dict): A dict containing videos and images + information of TAO. + + Returns: + List: The formatted predictions. + """ + # Collate preds scatters (tuple of dict to dict of list) + preds = defaultdict(list) + cat_ids = list(self.dataset_meta['categories'].keys()) + for pred in pred_dicts: + for key in pred.keys(): + preds[key].append(pred[key]) + + vid_infos = tao_meta_info['videos'] + track_json_results = [] + det_json_results = [] + video_id = vid_infos[-1]['id'] + + for img_id, bboxes, scores, labels, ins_ids in zip( + preds['img_id'], preds['track_bboxes'], preds['track_scores'], + preds['track_labels'], preds['track_instances_id']): + for bbox, score, label, ins_id in zip(bboxes, scores, labels, + ins_ids): + data = dict( + image_id=img_id, + bbox=[ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ], + score=score, + category_id=cat_ids[label], + track_id=ins_id, + video_id=video_id) + track_json_results.append(data) + + for img_id, bboxes, scores, labels in zip(preds['img_id'], + preds['det_bboxes'], + preds['det_scores'], + preds['det_labels']): + for bbox, score, label in zip(bboxes, scores, labels): + data = dict( + image_id=img_id, + bbox=[ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ], + score=score, + category_id=cat_ids[label], + video_id=video_id) + det_json_results.append(data) + + return track_json_results, det_json_results + + def _format_one_video_gts(self, + gt_dicts: Tuple[dict]) -> Tuple[list, dict]: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + gt_dicts (Tuple[dict]): Ground truth of the dataset. + + Returns: + Tuple[list, dict]: The formatted gts and a dict containing videos + and images information of TAO. 
+ """ + video_infos = [] + image_infos = [] + track_infos = [] + annotations = [] + instance_flag = dict() # flag the ins_id is used or not + tao_meta_info = defaultdict(list) + cat_ids = list(self.dataset_meta['categories'].keys()) + + # get video infos + for gt_dict in gt_dicts: + frame_id = gt_dict['frame_id'] + video_id = gt_dict['video_id'] + img_id = gt_dict['img_id'] + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + video_id=video_id, + frame_id=frame_id, + frame_index=gt_dict['frame_index'], + neg_category_ids=gt_dict['neg_category_ids'], + not_exhaustive_category_ids=gt_dict[ + 'not_exhaustive_category_ids'], + file_name='') + image_infos.append(image_info) + if frame_id == 0: + video_info = dict( + id=video_id, + width=gt_dict['width'], + height=gt_dict['height'], + neg_category_ids=gt_dict['neg_category_ids'], + not_exhaustive_category_ids=gt_dict[ + 'not_exhaustive_category_ids'], + file_name='') + video_infos.append(video_info) + + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + instance_id = ann['instance_id'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + id=-1, # need update when all results have been collected + video_id=video_id, + image_id=img_id, + frame_id=frame_id, + bbox=coco_bbox, + track_id=instance_id, + instance_id=instance_id, + iscrowd=ann.get('ignore_flag', 0), + category_id=cat_ids[label], + area=coco_bbox[2] * coco_bbox[3]) + if not instance_flag.get(instance_id, False): + track_info = dict( + id=instance_id, + category_id=cat_ids[label], + video_id=video_id) + track_infos.append(track_info) + instance_flag[instance_id] = True + annotations.append(annotation) + + # update tao meta info + tao_meta_info['images'].extend(image_infos) + tao_meta_info['videos'].extend(video_infos) + tao_meta_info['tracks'].extend(track_infos) + + return annotations, tao_meta_info + + def save_pred_results(self, pred_results: List, res_type: str) -> None: + """Save the results to a zip file. + + Args: + pred_results (list): Testing results of the + dataset. + res_type (str): The type of testing results, track or detection. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + mmengine.dump(pred_results, f'{outfile_prefix}_{res_type}.json') + + logger.info(f'save the results to {outfile_prefix}_{res_type}.json') diff --git a/mmtrack/evaluation/metrics/youtube_vis_metrics.py b/mmtrack/evaluation/metrics/youtube_vis_metrics.py new file mode 100644 index 000000000..fafec4a6c --- /dev/null +++ b/mmtrack/evaluation/metrics/youtube_vis_metrics.py @@ -0,0 +1,436 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
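# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the per-video track-id offsetting
# performed by the ``format_preds`` method of the TAO metric above, reduced to
# a standalone function. The toy records below are hypothetical; only the
# offsetting logic mirrors the diff.
def _offset_track_ids(per_video_preds):
    """Shift each video's local track ids so they are globally unique."""
    merged, max_track_id = [], 0
    for video_preds in per_video_preds:
        local_ids = {p['track_id'] for p in video_preds}
        for p in video_preds:
            p['track_id'] += max_track_id
        max_track_id += max(local_ids) + 1
        merged.extend(video_preds)
    return merged

if __name__ == '__main__':
    videos = [
        [dict(track_id=0), dict(track_id=1)],  # video A uses ids 0 and 1
        [dict(track_id=0)],                    # video B also starts at 0
    ]
    # after merging, video B's id 0 becomes 2, so ids never collide
    print([p['track_id'] for p in _offset_track_ids(videos)])  # [0, 1, 2]
# ---------------------------------------------------------------------------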
+import os.path as osp +import tempfile +import warnings +import zipfile +from collections import OrderedDict, defaultdict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +from mmdet.structures.mask import encode_mask_results +from mmengine.dist import (all_gather_object, barrier, broadcast_object_list, + is_main_process) +from mmengine.logging import MMLogger + +from mmtrack.registry import METRICS +from ..functional import YTVIS, YTVISeval +from .base_video_metrics import BaseVideoMetric, collect_tracking_results + + +@METRICS.register_module() +class YouTubeVISMetric(BaseVideoMetric): + """mAP evaluation metrics for the VIS task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Default value is `youtube_vis_ap`. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + outfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonyms metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. + """ + + default_prefix: Optional[str] = 'youtube_vis' + + def __init__(self, + metric: Union[str, List[str]] = 'youtube_vis_ap', + metric_items: Optional[Sequence[str]] = None, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + format_only: bool = False) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # vis evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + self.format_only = format_only + allowed_metrics = ['youtube_vis_ap'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + f"metric should be 'youtube_vis_ap', but got {metric}.") + + self.metric_items = metric_items + self.outfile_prefix = outfile_prefix + self.per_video_res = [] + self.categories = [] + self._vis_meta_info = defaultdict(list) # record video and image infos + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + predictions (Sequence[dict]): A batch of outputs from + the model. 
+ """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_track_instances'] + frame_id = data_sample['frame_id'] + video_length = data_sample['video_length'] + video_id = data_sample['video_id'] + + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + result['instances_id'] = pred['instances_id'].cpu().numpy() + # encode mask to RLE + assert 'masks' in pred, \ + 'masks must exist in YouTube-VIS metric' + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + gt['frame_id'] = frame_id + gt['video_id'] = video_id + gt['video_length'] = video_length + + # When the ground truth exists, get annotation from `instances`. + # In general, it contains `bbox`, `bbox_label`, `mask` and + # `instance_id`. + if 'instances' in data_sample: + gt['anns'] = data_sample['instances'] + else: + gt['anns'] = dict() + self.per_video_res.append((result, gt)) + + if frame_id == video_length - 1: + preds, gts = zip(*self.per_video_res) + # format the results + # we must format gts first to update self._vis_meta_info + gt_results = self._format_one_video_gts(gts) + pred_results = self._format_one_video_preds(preds) + self.per_video_res.clear() + # add converted result to the results list + self.results.append((pred_results, gt_results)) + + def compute_metrics(self, results: List) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + # split gt and prediction list + tmp_pred_results, tmp_gt_results = zip(*results) + gt_results = self.format_gts(tmp_gt_results) + pred_results = self.format_preds(tmp_pred_results) + + if self.format_only: + self.save_pred_results(pred_results) + return dict() + + ytvis = YTVIS(gt_results) + + ytvis_dets = ytvis.loadRes(pred_results) + vid_ids = ytvis.getVidIds() + + iou_type = metric = 'segm' + eval_results = OrderedDict() + ytvisEval = YTVISeval(ytvis, ytvis_dets, iou_type) + ytvisEval.params.vidIds = vid_ids + ytvisEval.evaluate() + ytvisEval.accumulate() + ytvisEval.summarize() + + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@1': 6, + 'AR@10': 7, + 'AR@100': 8, + 'AR_s@100': 9, + 'AR_m@100': 10, + 'AR_l@100': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = float( + f'{ytvisEval.stats[coco_metric_names[metric_item]]:.3f}') + eval_results[key] = val + + return eval_results + + def format_gts(self, gts: Tuple[List]) -> dict: + """Gather all ground-truth from self.results.""" + self.categories = [ + dict(id=id + 1, name=name) + for id, name in enumerate(self.dataset_meta['CLASSES']) + ] + gt_results = dict( + categories=self.categories, + videos=self._vis_meta_info['videos'], + annotations=[]) + for gt_result in gts: + gt_results['annotations'].extend(gt_result) + return gt_results + + def format_preds(self, preds: Tuple[List]) -> List: + """Gather all predictions from self.results.""" + pred_results = [] + for pred_result in preds: + pred_results.extend(pred_result) + return pred_results + + def _format_one_video_preds(self, pred_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + pred_dicts (Tuple[dict]): Prediction of the dataset. + + Returns: + List: The formatted predictions. + """ + # Collate preds scatters (tuple of dict to dict of list) + preds = defaultdict(list) + for pred in pred_dicts: + for key in pred.keys(): + preds[key].append(pred[key]) + + img_infos = self._vis_meta_info['images'] + vid_infos = self._vis_meta_info['videos'] + inds = [i for i, _ in enumerate(img_infos) if _['frame_id'] == 0] + inds.append(len(img_infos)) + json_results = [] + video_id = vid_infos[-1]['id'] + # collect data for each instances in a video. 
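+        # The dict built below is, schematically,
+        #   collect_data[instance_id] = dict(
+        #       category_ids=[label in each frame the instance appears in],
+        #       scores=[score in each such frame],
+        #       segmentations={frame_id: RLE mask})
+        # The final score is the mean over frames and the category is chosen
+        # by majority voting a few lines further down.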
+ collect_data = dict() + for frame_id, (masks, scores, labels, ids) in enumerate( + zip(preds['masks'], preds['scores'], preds['labels'], + preds['instances_id'])): + + assert len(masks) == len(labels) + for j, id in enumerate(ids): + if id not in collect_data: + collect_data[id] = dict( + category_ids=[], scores=[], segmentations=dict()) + collect_data[id]['category_ids'].append(labels[j]) + collect_data[id]['scores'].append(scores[j]) + if isinstance(masks[j]['counts'], bytes): + masks[j]['counts'] = masks[j]['counts'].decode() + collect_data[id]['segmentations'][frame_id] = masks[j] + + # transform the collected data into official format + for id, id_data in collect_data.items(): + output = dict() + output['video_id'] = video_id + output['score'] = np.array(id_data['scores']).mean().item() + # majority voting for sequence category + output['category_id'] = np.bincount( + np.array(id_data['category_ids'])).argmax().item() + 1 + output['segmentations'] = [] + for frame_id in range(inds[-1] - inds[-2]): + if frame_id in id_data['segmentations']: + output['segmentations'].append( + id_data['segmentations'][frame_id]) + else: + output['segmentations'].append(None) + json_results.append(output) + + return json_results + + def _format_one_video_gts(self, gt_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + gt_dicts (Tuple[dict]): Ground truth of the dataset. + + Returns: + list: The formatted gts. + """ + video_infos = [] + image_infos = [] + instance_infos = defaultdict(list) + len_videos = dict() # mapping from instance_id to video_length + vis_anns = [] + + # get video infos + for gt_dict in gt_dicts: + frame_id = gt_dict['frame_id'] + video_id = gt_dict['video_id'] + img_id = gt_dict['img_id'] + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + frame_id=frame_id, + file_name='') + image_infos.append(image_info) + if frame_id == 0: + video_info = dict( + id=video_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + video_infos.append(video_info) + + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + instance_id = ann['instance_id'] + # update video length + len_videos[instance_id] = gt_dict['video_length'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + video_id=video_id, + frame_id=frame_id, + bbox=coco_bbox, + instance_id=instance_id, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label) + 1, + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + + instance_infos[instance_id].append(annotation) + + # update vis meta info + self._vis_meta_info['images'].extend(image_infos) + self._vis_meta_info['videos'].extend(video_infos) + + for instance_id, ann_infos in instance_infos.items(): + cur_video_len = len_videos[instance_id] + segm = [None] * cur_video_len + bbox = [None] * cur_video_len + area = [None] * cur_video_len + # In the official format, no instances are represented by + # 'None', however, only images with instances are recorded + # in the current annotations, so we need to use 'None' to + # initialize these lists. 
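+            # For example, for a 4-frame video in which an instance is only
+            # annotated on frames 1 and 3, the lists end up as
+            #   segmentations = [None, seg_1, None, seg_3]
+            #   bboxes        = [None, box_1, None, box_3]
+            # so that every frame of the video has an entry.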
+ for ann_info in ann_infos: + frame_id = ann_info['frame_id'] + segm[frame_id] = ann_info['segmentation'] + bbox[frame_id] = ann_info['bbox'] + area[frame_id] = ann_info['area'] + instance = dict( + category_id=ann_infos[0]['category_id'], + segmentations=segm, + bboxes=bbox, + video_id=ann_infos[0]['video_id'], + areas=area, + id=instance_id, + iscrowd=ann_infos[0]['iscrowd']) + vis_anns.append(instance) + return vis_anns + + def save_pred_results(self, pred_results: List) -> None: + """Save the results to a zip file (standard format for YouTube-VIS + Challenge). + + Args: + pred_results (list): Testing results of the + dataset. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + mmengine.dump(pred_results, f'{outfile_prefix}.json') + # zip the json file in order to submit to the test server. + zip_file_name = f'{outfile_prefix}.submission_file.zip' + zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) + logger.info(f"zip the 'results.json' into '{zip_file_name}', " + 'please submmit the zip file to the test server') + zf.write(f'{outfile_prefix}.json', 'results.json') + zf.close() + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + # wait for all processes to complete prediction. + barrier() + + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + # gather seq_info + gathered_seq_info = all_gather_object(self._vis_meta_info['videos']) + all_seq_info = [] + for _seq_info in gathered_seq_info: + all_seq_info.extend(_seq_info) + # update self._vis_meta_info + self._vis_meta_info = dict(videos=all_seq_info) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/mmtrack/models/__init__.py b/mmtrack/models/__init__.py index 859942a21..8839dceb4 100644 --- a/mmtrack/models/__init__.py +++ b/mmtrack/models/__init__.py @@ -1,21 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. 
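# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the submission packaging performed
# by ``YouTubeVISMetric.save_pred_results`` above, shown standalone. The paths
# and the toy prediction are hypothetical; as in the method above, the json
# dump is archived under the member name ``results.json``.
import json
import tempfile
import zipfile
from pathlib import Path


def pack_submission(pred_results, outfile_prefix):
    json_path = Path(f'{outfile_prefix}.json')
    json_path.write_text(json.dumps(pred_results))
    zip_path = f'{outfile_prefix}.submission_file.zip'
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.write(json_path, 'results.json')  # archive member name
    return zip_path


if __name__ == '__main__':
    with tempfile.TemporaryDirectory() as tmp:
        toy_pred = [dict(video_id=1, score=0.9, category_id=1,
                         segmentations=[None])]
        print(pack_submission(toy_pred, f'{tmp}/results'))
# ---------------------------------------------------------------------------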
from .aggregators import * # noqa: F401,F403 from .backbones import * # noqa: F401,F403 -from .builder import (AGGREGATORS, MODELS, MOTION, REID, TRACKERS, - build_aggregator, build_model, build_motion, build_reid, - build_tracker) +from .data_preprocessors import * # noqa: F401,F403 +from .filter import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 from .losses import * # noqa: F401,F403 from .mot import * # noqa: F401,F403 from .motion import * # noqa: F401,F403 from .reid import * # noqa: F401,F403 from .roi_heads import * # noqa: F401,F403 from .sot import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 from .track_heads import * # noqa: F401,F403 from .trackers import * # noqa: F401,F403 from .vid import * # noqa: F401,F403 from .vis import * # noqa: F401,F403 - -__all__ = [ - 'AGGREGATORS', 'MODELS', 'TRACKERS', 'MOTION', 'REID', 'build_model', - 'build_tracker', 'build_motion', 'build_aggregator', 'build_reid' -] diff --git a/mmtrack/models/aggregators/embed_aggregator.py b/mmtrack/models/aggregators/embed_aggregator.py index 1400019c4..487ef4bdd 100644 --- a/mmtrack/models/aggregators/embed_aggregator.py +++ b/mmtrack/models/aggregators/embed_aggregator.py @@ -1,13 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + import torch import torch.nn as nn from mmcv.cnn.bricks import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from ..builder import AGGREGATORS +from mmtrack.registry import MODELS -@AGGREGATORS.register_module() +@MODELS.register_module() class EmbedAggregator(BaseModule): """Embedding convs to aggregate multi feature maps. @@ -27,12 +29,12 @@ class EmbedAggregator(BaseModule): """ def __init__(self, - num_convs=1, - channels=256, - kernel_size=3, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - init_cfg=None): + num_convs: int = 1, + channels: int = 256, + kernel_size: int = 3, + norm_cfg: Optional[dict] = None, + act_cfg: dict = dict(type='ReLU'), + init_cfg: Optional[Union[dict, List[dict]]] = None): super(EmbedAggregator, self).__init__(init_cfg) assert num_convs > 0, 'The number of convs must be bigger than 1.' self.embed_convs = nn.ModuleList() @@ -52,7 +54,7 @@ def __init__(self, norm_cfg=new_norm_cfg, act_cfg=new_act_cfg)) - def forward(self, x, ref_x): + def forward(self, x: torch.Tensor, ref_x: torch.Tensor) -> torch.Tensor: """Aggregate reference feature maps `ref_x`. The aggregation mainly contains two steps: @@ -62,7 +64,7 @@ def forward(self, x, ref_x): Args: x (Tensor): of shape [1, C, H, W] - ref_x (Tensor): of shape [N, C, H, W]. N is the number of reference + ref_x (Tensor): of shape [T, C, H, W]. T is the number of reference feature maps. Returns: diff --git a/mmtrack/models/aggregators/selsa_aggregator.py b/mmtrack/models/aggregators/selsa_aggregator.py index b8ca29ce5..18ae5bd38 100644 --- a/mmtrack/models/aggregators/selsa_aggregator.py +++ b/mmtrack/models/aggregators/selsa_aggregator.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn -from mmcv.runner import BaseModule +from mmengine.model import BaseModule +from torch import Tensor -from ..builder import AGGREGATORS +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType -@AGGREGATORS.register_module() +@MODELS.register_module() class SelsaAggregator(BaseModule): """Selsa aggregator module. @@ -14,15 +16,18 @@ class SelsaAggregator(BaseModule): Object Detection". `SELSA `_. 
Args: - in_channels (int): The number of channels of the features of + in_channels (int, optional): The number of channels of the features of proposal. - num_attention_blocks (int): The number of attention blocks used in - selsa aggregator module. Defaults to 16. - init_cfg (dict or list[dict], optional): Initialization config dict. + num_attention_blocks (int, optional): The number of attention blocks + used in selsa aggregator module. Defaults to 16. + init_cfg (OptConfigType, optional): Initialization config dict. Defaults to None. """ - def __init__(self, in_channels, num_attention_blocks=16, init_cfg=None): + def __init__(self, + in_channels: int, + num_attention_blocks: int = 16, + init_cfg: OptConfigType = None): super(SelsaAggregator, self).__init__(init_cfg) self.fc_embed = nn.Linear(in_channels, in_channels) self.ref_fc_embed = nn.Linear(in_channels, in_channels) @@ -30,7 +35,7 @@ def __init__(self, in_channels, num_attention_blocks=16, init_cfg=None): self.ref_fc = nn.Linear(in_channels, in_channels) self.num_attention_blocks = num_attention_blocks - def forward(self, x, ref_x): + def forward(self, x: Tensor, ref_x: Tensor) -> Tensor: """Aggregate the features `ref_x` of reference proposals. The aggregation mainly contains two steps: diff --git a/mmtrack/models/backbones/sot_resnet.py b/mmtrack/models/backbones/sot_resnet.py index c667cfe7d..c891124b7 100644 --- a/mmtrack/models/backbones/sot_resnet.py +++ b/mmtrack/models/backbones/sot_resnet.py @@ -1,26 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + import torch.nn as nn from mmcv.cnn import build_conv_layer, build_norm_layer from mmdet.models.backbones.resnet import Bottleneck, ResNet -from mmdet.models.builder import BACKBONES + +from mmtrack.registry import MODELS class SOTBottleneck(Bottleneck): expansion = 4 def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - dcn=None, - plugins=None, - init_cfg=None): + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False, + conv_cfg: Optional[dict] = None, + norm_cfg: dict = dict(type='BN'), + dcn: Optional[dict] = None, + plugins: Optional[dict] = None, + init_cfg: Optional[dict] = None): """Bottleneck block for ResNet. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if @@ -147,7 +150,7 @@ def __init__(self, planes * self.expansion, self.after_conv3_plugins) -@BACKBONES.register_module() +@MODELS.register_module() class SOTResNet(ResNet): """ResNet backbone for SOT. @@ -157,11 +160,14 @@ class SOTResNet(ResNet): Args: depth (int): Depth of resnet, from {50, }. + unfreeze_backbone (bool): Whether to unfreeze the parameters of + backbone. Defaults to True. It's used for building all the + parameters into buckets in the instililzation of DDP. """ arch_settings = {50: (SOTBottleneck, (3, 4, 6, 3))} - def __init__(self, depth, unfreeze_backbone=True, **kwargs): + def __init__(self, depth: int, unfreeze_backbone: bool = True, **kwargs): assert depth == 50, 'Only support r50 backbone for sot.' 
super(SOTResNet, self).__init__(depth, **kwargs) # unfreeze the backbone parameters so that DDP can build all parameters @@ -192,7 +198,7 @@ def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" return SOTResLayer(**kwargs) - def _make_stem_layer(self, in_channels, stem_channels): + def _make_stem_layer(self, in_channels: int, stem_channels: int): if self.deep_stem: self.stem = nn.Sequential( build_conv_layer( @@ -250,6 +256,7 @@ class SOTResLayer(nn.Sequential): planes (int): Planes of block. num_blocks (int): Number of blocks. stride (int): Stride of the first block. Default: 1 + dilation (int): The factor of dilation. avg_down (bool): Use AvgPool instead of stride conv when downsampling in the bottleneck. Default: False conv_cfg (dict): Dictionary to construct and config conv layer. @@ -261,16 +268,16 @@ class SOTResLayer(nn.Sequential): """ def __init__(self, - block, - inplanes, - planes, - num_blocks, - stride=1, - dilation=1, - avg_down=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - downsample_first=True, + block: nn.Module, + inplanes: int, + planes: int, + num_blocks: int, + stride: int = 1, + dilation: int = 1, + avg_down: bool = False, + conv_cfg: Optional[dict] = None, + norm_cfg: dict = dict(type='BN'), + downsample_first: bool = True, **kwargs): self.block = block diff --git a/mmtrack/models/builder.py b/mmtrack/models/builder.py deleted file mode 100644 index 9b52e6e49..000000000 --- a/mmtrack/models/builder.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import MODELS as MMCV_MODELS -from mmcv.utils import Registry - -MODELS = Registry('models', parent=MMCV_MODELS) -TRACKERS = MODELS -MOTION = MODELS -REID = MODELS -AGGREGATORS = MODELS - - -def build_tracker(cfg): - """Build tracker.""" - return TRACKERS.build(cfg) - - -def build_motion(cfg): - """Build motion model.""" - return MOTION.build(cfg) - - -def build_reid(cfg): - """Build reid model.""" - return REID.build(cfg) - - -def build_aggregator(cfg): - """Build aggregator model.""" - return AGGREGATORS.build(cfg) - - -def build_model(cfg, train_cfg=None, test_cfg=None): - """Build model.""" - if train_cfg is None and test_cfg is None: - return MODELS.build(cfg) - else: - return MODELS.build(cfg, MODELS, - dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/mmtrack/models/data_preprocessors/__init__.py b/mmtrack/models/data_preprocessors/__init__.py new file mode 100644 index 000000000..c853b6d4c --- /dev/null +++ b/mmtrack/models/data_preprocessors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import TrackDataPreprocessor + +__all__ = ['TrackDataPreprocessor'] diff --git a/mmtrack/models/data_preprocessors/data_preprocessor.py b/mmtrack/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 000000000..f2b1e3fbf --- /dev/null +++ b/mmtrack/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from numbers import Number +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn as nn +from mmdet.structures.mask import BitmapMasks +from mmengine.model import BaseDataPreprocessor + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import stack_batch + + +@MODELS.register_module() +class TrackDataPreprocessor(BaseDataPreprocessor): + """Image pre-processor for tracking tasks. 
+
+    Accepts the data sampled by the dataloader, and preprocesses it into the
+    format of the model input. ``TrackDataPreprocessor`` provides the
+    tracking data pre-processing as follows:
+
+    - Collate and move data to the target device.
+    - Pad inputs to the maximum size of current batch with defined
+      ``pad_value``. The padding size is divisible by the defined
+      ``pad_size_divisor``.
+    - Stack inputs into a batched tensor.
+    - Convert inputs from bgr to rgb if the shape of input is (1, 3, H, W).
+    - Normalize image with defined std and mean.
+    - Do batch augmentations during training.
+    - Record the information of ``batch_input_shape`` and ``pad_shape``.
+
+    Args:
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
+            Defaults to False.
+        batch_augments (list[dict], optional): Batch-level augmentations.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 mean: Sequence[Number] = None,
+                 std: Sequence[Number] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Union[float, int] = 0,
+                 pad_mask: bool = False,
+                 mask_pad_value: int = 0,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False,
+                 batch_augments: Optional[List[dict]] = None):
+        super().__init__()
+        assert not (bgr_to_rgb and rgb_to_bgr), (
+            '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time')
+        assert (mean is None) == (std is None), (
+            'mean and std should be both None or tuple')
+        if mean is not None:
+            assert len(mean) == 3 or len(mean) == 1, (
+                'The length of mean should be 1 or 3 to be compatible with '
+                f'RGB or gray image, but got {len(mean)}')
+            assert len(std) == 3 or len(std) == 1, (  # type: ignore
+                'The length of std should be 1 or 3 to be compatible with RGB '  # type: ignore # noqa: E501
+                f'or gray image, but got {len(std)}')
+
+            # Enable the normalization in preprocessing.
+            self._enable_normalize = True
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(1, -1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(1, -1, 1, 1), False)
+        else:
+            self._enable_normalize = False
+
+        self.channel_conversion = rgb_to_bgr or bgr_to_rgb
+        self.pad_size_divisor = pad_size_divisor
+        self.pad_value = pad_value
+        self.pad_mask = pad_mask
+        self.mask_pad_value = mask_pad_value
+        if batch_augments is not None:
+            self.batch_augments = nn.ModuleList(
+                [MODELS.build(aug) for aug in batch_augments])
+        else:
+            self.batch_augments = None
+
+    def forward(self, data: dict, training: bool = False) -> Dict:
+        """Perform normalization, padding and bgr2rgb conversion based on
+        ``TrackDataPreprocessor``.
+
+        Args:
+            data (dict): data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            dict: Data in the same format as the model input.
+ """ + batch_pad_shape = self._get_pad_shape(data) + data = super().forward(data=data, training=training) + ori_inputs, data_samples = data['inputs'], data['data_samples'] + + inputs = dict() + for imgs_key, imgs in ori_inputs.items(): + # TODO: whether normalize should be after stack_batch + # imgs is a list contain multiple Tensor of imgs. + # The shape of imgs[0] is (T, C, H, W). + channel = imgs[0].size(1) + if self.channel_conversion and channel == 3: + imgs = [_img[:, [2, 1, 0], ...] for _img in imgs] + # change to `float` + imgs = [_img.float() for _img in imgs] + if self._enable_normalize: + imgs = [(_img - self.mean) / self.std for _img in imgs] + + inputs[imgs_key] = stack_batch(imgs, self.pad_size_divisor, + self.pad_value) + + if data_samples is not None: + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + for key, imgs in inputs.items(): + img_shape = tuple(imgs.size()[-2:]) + imgs_shape = [img_shape] * imgs.size(1) if imgs.size( + 1) > 1 else img_shape + ref_prefix = key[:-3] + for data_sample, pad_shapes in zip(data_samples, + batch_pad_shape[key]): + data_sample.set_metainfo({ + f'{ref_prefix}batch_input_shape': + imgs_shape, + f'{ref_prefix}pad_shape': + pad_shapes + }) + if self.pad_mask: + self.pad_gt_masks(data_samples, ref_prefix) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + # Only yolox need batch_aug, and yolox can only process + # `img` key. Therefore, only img is processed here. + # The shape of `img` is (N, T, C, H, W), hence, we use + # [:, 0] to change the shape to (N, C, H, W). + assert len(inputs) == 1 and 'img' in inputs + aug_inputs, data_samples = batch_aug(inputs['img'][:, 0], + data_samples) + inputs['img'] = aug_inputs.unsqueeze(1) + + return dict(inputs=inputs, data_samples=data_samples) + + def _get_pad_shape(self, data: dict) -> Dict[str, List]: + """Get the pad_shape of each image based on data and pad_size_divisor. + + Args: + data (dict): Data sampled from dataloader. + + Returns: + Dict[str, List]: The shape of padding. 
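+
+        Example:
+            With ``pad_size_divisor=32``, two clips of three 37x37 frames
+            under the ``'img'`` key give
+            ``{'img': [[(64, 64)] * 3, [(64, 64)] * 3]}``.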
+        """
+        batch_pad_shape = dict()
+        for imgs_key in data['inputs']:
+            pad_shape_list = []
+            for imgs in data['inputs'][imgs_key]:
+                pad_h = int(
+                    np.ceil(imgs.shape[-2] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                pad_w = int(
+                    np.ceil(imgs.shape[-1] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                pad_shapes = [
+                    (pad_h, pad_w)
+                ] * imgs.size(0) if imgs.size(0) > 1 else (pad_h, pad_w)
+                pad_shape_list.append(pad_shapes)
+            batch_pad_shape[imgs_key] = pad_shape_list
+        return batch_pad_shape
+
+    def pad_gt_masks(self, data_samples: Sequence[TrackDataSample],
+                     ref_prefix: str) -> None:
+        """Pad gt_masks to shape of batch_input_shape."""
+        if 'masks' in data_samples[0].get(f'{ref_prefix}gt_instances'):
+            for data_sample in data_samples:
+                masks = data_sample.get(f'{ref_prefix}gt_instances').masks
+                assert isinstance(masks, BitmapMasks)
+                batch_input_shape = data_sample.get(
+                    f'{ref_prefix}batch_input_shape')
+                # handle cases where the number of images is greater than 1
+                if isinstance(batch_input_shape, list):
+                    batch_input_shape = batch_input_shape[0]
+                data_sample.get(
+                    f'{ref_prefix}gt_instances').masks = masks.pad(
+                        batch_input_shape, pad_val=self.mask_pad_value)
diff --git a/mmtrack/models/filter/__init__.py b/mmtrack/models/filter/__init__.py
new file mode 100644
index 000000000..f81a7e6dc
--- /dev/null
+++ b/mmtrack/models/filter/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .filter_head import FilterInitializer
+from .filter_optimizer import PrDiMPFilterOptimizer
+
+__all__ = ['FilterInitializer', 'PrDiMPFilterOptimizer']
diff --git a/mmtrack/models/filter/filter_head.py b/mmtrack/models/filter/filter_head.py
new file mode 100644
index 000000000..7c6b78627
--- /dev/null
+++ b/mmtrack/models/filter/filter_head.py
@@ -0,0 +1,76 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import PrRoIPool
+from mmengine.model import BaseModule
+from torch import Tensor, nn
+
+from mmtrack.registry import MODELS
+
+
+@MODELS.register_module()
+class FilterInitializer(BaseModule):
+    """Initializes a target classification filter.
+
+    Args:
+        filter_size (int, optional): Size of the filter. Defaults to 4.
+        feature_dim (int, optional): Input feature dimensionality.
+            Defaults to 512.
+        feature_stride (int, optional): Input feature stride. Defaults to 16.
+    """
+
+    def __init__(self,
+                 filter_size: int = 4,
+                 feature_dim: int = 512,
+                 feature_stride: int = 16):
+        super().__init__()
+        self.filter_conv = nn.Conv2d(
+            feature_dim, feature_dim, kernel_size=3, padding=1)
+        self.filter_pool = PrRoIPool(filter_size, 1 / feature_stride)
+
+    def init_weights(self):
+        """Initialize the parameters of this module."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                m.weight.data.zero_()
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, feat: Tensor, bboxes: Tensor) -> Tensor:
+        """Runs the initializer module. Note that [] denotes an optional
+        dimension.
+
+        Args:
+            feat (Tensor): Input feature maps with shape
+                (images_in_sequence, [sequences], feat_dim, H, W).
+            bboxes (Tensor): Target bounding boxes with
+                (images_in_sequence, [sequences], 4) shape in [x1, y1, x2, y2]
+                format.
+
+        Returns:
+            filter_weights (Tensor): The output filter with shape
+                (images_in_sequence, c, filter_h, filter_w).
+ """ + + num_images = feat.shape[0] + + feat = self.filter_conv(feat.reshape(-1, *feat.shape[-3:])) + + # Add batch_index to rois + batch_index = torch.arange( + bboxes.shape[0], dtype=torch.float32).reshape(-1, + 1).to(bboxes.device) + roi = torch.cat((batch_index, bboxes), dim=1) + filter_weights = self.filter_pool(feat, roi) + + # If multiple input images, compute the initial filter + # as the average filter. + if num_images > 1: + filter_weights = torch.mean( + filter_weights.reshape(num_images, -1, + *filter_weights.shape[-3:]), + dim=0) + + return filter_weights diff --git a/mmtrack/models/filter/filter_optimizer.py b/mmtrack/models/filter/filter_optimizer.py new file mode 100644 index 000000000..7dd1d884f --- /dev/null +++ b/mmtrack/models/filter/filter_optimizer.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +from mmtrack.registry import MODELS +from ..task_modules.filter import filter as filter_layer + + +@MODELS.register_module() +class PrDiMPFilterOptimizer(BaseModule): + """Optimizer module of filter in PrDiMP. + + It unrolls the steepest descent with Newton iterations to optimize the + target filter. + + Args: + num_iters (int, optional): Number of default optimization iterations. + Defaults to 1. + feat_stride (int, optional): The stride of the input feature. + Defaults to 16. + init_step_length (float, optional): Initial scaling of the step length + (which is then learned). Defaults to 1.0. + init_filter_regular (float, optional): Initial filter regularization + weight (which is then learned). Defaults to 1e-2. + gauss_sigma (float, optional): The standard deviation to use for the + label density function. Defaults to 1.0. + min_filter_regular (float, optional): Enforce a minimum value on the + regularization (helps stability sometimes). Defaults to 1e-3. + alpha_eps (float, optional): Term in the denominator of the steepest + descent that stabalizes learning. Defaults to 0. + label_thres (float, optional): Threshold probabilities. Defaults to 0. + """ + + def __init__(self, + num_iters: int = 1, + feat_stride: int = 16, + init_step_length: float = 1.0, + init_filter_regular: float = 1e-2, + gauss_sigma: float = 1.0, + min_filter_regular: float = 1e-3, + alpha_eps: float = 0.0, + label_thres: float = 0.0): + super().__init__() + + self.num_iters = num_iters + self.feat_stride = feat_stride + self.log_step_length = nn.Parameter( + math.log(init_step_length) * torch.ones(1)) + self.filter_regular = nn.Parameter(init_filter_regular * torch.ones(1)) + self.gauss_sigma = gauss_sigma + self.min_filter_regular = min_filter_regular + self.alpha_eps = alpha_eps + self.label_thres = label_thres + + def gen_label_density(self, center_yx: Tensor, + output_size_hw: Tensor) -> Tensor: + """Generate label density. + + Args: + center_yx (Tensor): The center of score map. + output_size_hw (Tensor): The size of score map in [h, w] format. + + Returns: + Tensor: Label density with two possible shape: + - train mode: (num_img_per_seq, bs, h, w). + - test mode: (num_img_per_seq, 1, h, w). 
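+
+        Example:
+            With ``gauss_sigma=0`` the density degenerates to a one-hot map:
+            for ``output_size_hw=(5, 5)`` and a center at ``(2., 3.)``, the
+            returned map is zero everywhere except position ``(2, 3)``.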
+ """ + # convert to (num_img_per_seq, bs, 4) shape + center_yx = center_yx.reshape(center_yx.shape[0], -1, + center_yx.shape[-1]) + k0 = torch.arange( + output_size_hw[0], + dtype=torch.float32).reshape(1, 1, -1, 1).to(center_yx.device) + k1 = torch.arange( + output_size_hw[1], + dtype=torch.float32).reshape(1, 1, 1, -1).to(center_yx.device) + dist0 = (k0 - + center_yx[:, :, 0].reshape(*center_yx.shape[:2], 1, 1))**2 + dist1 = (k1 - + center_yx[:, :, 1].reshape(*center_yx.shape[:2], 1, 1))**2 + if self.gauss_sigma == 0: + dist0_view = dist0.reshape(-1, dist0.shape[-2]) + dist1_view = dist1.reshape(-1, dist1.shape[-1]) + one_hot0 = torch.zeros_like(dist0_view) + one_hot1 = torch.zeros_like(dist1_view) + one_hot0[torch.arange(one_hot0.shape[0]), + dist0_view.argmin(dim=-1)] = 1.0 + one_hot1[torch.arange(one_hot1.shape[0]), + dist1_view.argmin(dim=-1)] = 1.0 + gauss = one_hot0.reshape(dist0.shape) * one_hot1.reshape( + dist1.shape) + else: + g0 = torch.exp(-1.0 / (2 * self.gauss_sigma**2) * dist0) + g1 = torch.exp(-1.0 / (2 * self.gauss_sigma**2) * dist1) + gauss = (g0 / (2 * math.pi * self.gauss_sigma**2)) * g1 + gauss = gauss * (gauss > self.label_thres).float() + gauss_density = gauss / (gauss.sum(dim=(-2, -1), keepdim=True) + 1e-8) + return gauss_density + + def forward(self, + filter_weights: Tensor, + feat: Tensor, + bboxes: Tensor, + num_iters: Optional[int] = None, + sample_weights: Optional[Tensor] = None) -> Tuple[Tensor, ...]: + """Runs the optimizer module. + + Note that [] denotes an optional dimension. Generally speaking, inputs + in test mode don't have the dim of []. + + Args: + filter_weights (Tensor): Initial filter with shape + training mode: (bs, c, fitler_h, filter_w) + test mode: (1, c, fitler_h, filter_w) + feat (Tensor): Input feature maps with shape + (num_img_per_seq, [bs], c, H, W). + bboxes (Tensor): Target bounding boxes with shape + (num_img_per_seq, [bs], 4). in (cx, cy, x, y) format. + num_iters (int, optional): Number of iterations to run. + Defaults to None. + sample_weights (Tensor, optional): Optional weight for each + sample with shape (num_img_per_seq, [bs]). Defaults to None. + + Returns: + filter_weights (Tensor): The final oprimized filter. 
+ filter_iters (Tensor, optional): The filter computed in each + iteration (including initial input and final output), returned + only in training + losses (Tensor, optional): losses in all optimizer iterations, + returned only in training + """ + + # Sizes + num_iters = self.num_iters if num_iters is None else num_iters + num_img_per_seq = feat.shape[0] + batch_size = feat.shape[1] if feat.dim() == 5 else 1 + filter_size_hw = (filter_weights.shape[-2], filter_weights.shape[-1]) + output_size_hw = (feat.shape[-2] + (filter_weights.shape[-2] + 1) % 2, + feat.shape[-1] + (filter_weights.shape[-1] + 1) % 2) + + # Get learnable scalars + step_length_factor = torch.exp(self.log_step_length) + filter_regular = (self.filter_regular**2).clamp( + min=self.min_filter_regular**2) + + # Compute label density + if self.training: + assert bboxes.dim() == 3 + else: + assert bboxes.dim() == 2 + bboxes = bboxes.reshape([bboxes.shape[0], -1, bboxes.shape[-1]]) + offset = (torch.Tensor(filter_size_hw).to(bboxes.device) % 2) / 2.0 + center = bboxes[..., :2] / self.feat_stride + center_yx = center.flip((-1, )) - offset + label_density = self.gen_label_density(center_yx, output_size_hw) + + # Get total sample weights + if sample_weights is None: + sample_weights = torch.Tensor([1.0 / num_img_per_seq + ]).to(feat.device) + elif isinstance(sample_weights, torch.Tensor): + sample_weights = sample_weights.reshape(num_img_per_seq, + batch_size, 1, 1) + else: + raise NotImplementedError( + "Only support two types of 'sample_weights': " + 'torch.Tensor or None') + + filter_iters = [] + losses = [] + + for _ in range(num_iters): + # Get scores by applying the filter on the features + scores = filter_layer.apply_filter(feat, filter_weights) + scores = torch.softmax( + scores.reshape(num_img_per_seq, batch_size, -1), + dim=2).reshape(scores.shape) + + # Compute loss and record the filter of each iteration in training + # mode. + if self.training: + filter_iters.append(filter_weights) + losses.append( + self._compute_loss(scores, sample_weights, label_density, + filter_weights, filter_regular)) + + # Compute gradient and step_length + res = sample_weights * (scores - label_density) + filter_grad = filter_layer.apply_feat_transpose( + feat, res, filter_size_hw, + training=self.training) + filter_regular * filter_weights + + step_length = self.get_step_length(feat, sample_weights, scores, + filter_grad, filter_regular) + + # Update filter + filter_weights = filter_weights - ( + step_length_factor * + step_length.reshape(-1, 1, 1, 1)) * filter_grad + + if self.training: + filter_iters.append(filter_weights) + # Get scores by applying the final filter on the feature map + scores = filter_layer.apply_filter(feat, filter_weights) + losses.append( + self._compute_loss(scores, sample_weights, label_density, + filter_weights, filter_regular)) + return filter_weights, filter_iters, losses + else: + return filter_weights + + def get_step_length(self, feat: Tensor, sample_weights: Tensor, + scores: Tensor, filter_grad: Tensor, + filter_regular: Tensor) -> Tensor: + """Compute the step length of updating the filter. + + Args: + feat (Tensor): Input feature map with shape + (num_img_per_seq, [bs], feat_dim, H, W). + sample_weights (Tensor): The weights of all the samples. + scores (Tensor): The score map with two possible shape: + - train mode: (num_img_per_seq, bs, h, w). + - test mode: (num_img_per_seq, 1, h, w). + filter_grad (Tensor): The gradient of the filter with shape + (num_img_per_seq, c, fitler_h, filter_w). 
+ filter_regular (Tensor): The regulazation item of the filter, with + shape (1,). + + Returns: + alpha (Tensor): The updating factor with shape (1, ). + """ + num_img_per_seq = feat.shape[0] + batch_size = feat.shape[1] if feat.dim() == 5 else 1 + # Map the gradient with the Hessian + scores_grad = filter_layer.apply_filter(feat, filter_grad) + sm_scores_grad = scores * scores_grad + hes_scores_grad = sm_scores_grad - scores * torch.sum( + sm_scores_grad, dim=(-2, -1), keepdim=True) + grad_hes_grad = (scores_grad * hes_scores_grad).reshape( + num_img_per_seq, batch_size, -1).sum(dim=2).clamp(min=0) + grad_hes_grad = (sample_weights.reshape(sample_weights.shape[0], -1) * + grad_hes_grad).sum(dim=0) + + # Compute optimal step length + alpha_num = (filter_grad * filter_grad).sum(dim=(1, 2, 3)) + alpha_den = (grad_hes_grad + + (filter_regular + self.alpha_eps) * alpha_num).clamp(1e-8) + alpha = alpha_num / alpha_den + + return alpha + + def _compute_loss(self, scores: Tensor, sample_weights: Tensor, + label_density: Tensor, filter: Tensor, + filter_regular: Tensor) -> Tensor: + """Compute loss in the box optimization. + + Args: + scores (Tensor): The score map with shape + (num_img_per_seq, bs, h, w). + sample_weights (Tensor): The weights of all the samples with shape + (num_img_per_seq, bs, 1, 1) + label_density (Tensor):The label density with shape + (num_img_per_seq, bs, h, w). + filter (Tensor): The filter with shape + (num_img_per_seq, c, fitler_h, filter_w). + filter_regular (Tensor):The regulazation item of the filter, with + shape (1,). + + Returns: + Tensor: with shape (1,) + """ + + num_samples = sample_weights.shape[0] + sample_weights = sample_weights.reshape(sample_weights.shape[0], -1) + score_log_sum_exp = torch.log(scores.exp().sum(dim=(-2, -1))) + sum_scores = (label_density * scores).sum(dim=(-2, -1)) + + return torch.sum( + sample_weights * (score_log_sum_exp - sum_scores) + ) / num_samples + filter_regular * (filter**2).sum() / num_samples diff --git a/mmtrack/models/layers/__init__.py b/mmtrack/models/layers/__init__.py new file mode 100644 index 000000000..445fff36b --- /dev/null +++ b/mmtrack/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .positional_encoding import SinePositionalEncoding3D + +__all__ = ['SinePositionalEncoding3D'] diff --git a/mmtrack/models/layers/positional_encoding.py b/mmtrack/models/layers/positional_encoding.py new file mode 100644 index 000000000..657305085 --- /dev/null +++ b/mmtrack/models/layers/positional_encoding.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified from +# https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py +import math +from typing import Optional + +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmtrack.registry import MODELS + + +@MODELS.register_module() +class SinePositionalEncoding3D(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. 
+ scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_feats: int, + temperature: int = 10000, + normalize: bool = False, + scale: float = 2 * math.pi, + eps: float = 1e-6, + offset: float = 0., + init_cfg: Optional[dict] = None): + super(SinePositionalEncoding3D, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask: Tensor) -> Tensor: + """Forward function for `SinePositionalEncoding3D`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, t, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + assert mask.dim() == 4,\ + f'{mask.shape} should be a 4-dimensional Tensor,' \ + f' got {mask.dim()}-dimensional Tensor instead ' + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + z_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(2, dtype=torch.float32) + x_embed = not_mask.cumsum(3, dtype=torch.float32) + if self.normalize: + z_embed = (z_embed + self.offset) / \ + (z_embed[:, -1:, :, :] + self.eps) * self.scale + y_embed = (y_embed + self.offset) / \ + (y_embed[:, :, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + + dim_t_z = torch.arange((self.num_feats * 2), + dtype=torch.float32, + device=mask.device) + dim_t_z = self.temperature**(2 * (dim_t_z // 2) / (self.num_feats * 2)) + + pos_x = x_embed[:, :, :, :, None] / dim_t + pos_y = y_embed[:, :, :, :, None] / dim_t + pos_z = z_embed[:, :, :, :, None] / dim_t_z + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, T, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_z = torch.stack( + (pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str diff --git a/mmtrack/models/losses/__init__.py b/mmtrack/models/losses/__init__.py 
index b7991a1ff..947cb2e05 100644 --- a/mmtrack/models/losses/__init__.py +++ b/mmtrack/models/losses/__init__.py @@ -1,6 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .kl_loss import KLGridLoss, KLMCLoss from .l2_loss import L2Loss from .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss from .triplet_loss import TripletLoss -__all__ = ['L2Loss', 'TripletLoss', 'MultiPosCrossEntropyLoss'] +__all__ = [ + 'TripletLoss', 'MultiPosCrossEntropyLoss', 'L2Loss', 'KLGridLoss', + 'KLMCLoss' +] diff --git a/mmtrack/models/losses/kl_loss.py b/mmtrack/models/losses/kl_loss.py new file mode 100644 index 000000000..45b5d8ec3 --- /dev/null +++ b/mmtrack/models/losses/kl_loss.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmtrack.registry import MODELS + + +@MODELS.register_module() +class KLMCLoss(nn.Module): + """KL-divergence loss for probabilistic regression. + + It is computed using Monte Carlo (MC) samples from an arbitrary + distribution. + + Args: + eps (float, optional): Defaults to 0.0. + """ + + def __init__(self, eps: float = 0.0): + super().__init__() + self.eps = eps + + def forward(self, + scores: Tensor, + sample_density: Tensor, + gt_density: Tensor, + mc_dim: int = -1) -> Tensor: + """ + Args: + scores (Tensor): predicted score values. It has shape + (num_imgs, num_samples). + sample_density (Tensor): probability density of the sample + distribution. It has shape (num_imgs, num_samples). + gt_density (Tensor): probability density of the ground truth + distribution. It has shape (num_imgs, num_samples). + mc_dim (int): dimension of the MC samples. + + Returns: + torch.Tensor: Calculated loss + """ + exp_val = scores - torch.log(sample_density + self.eps) + + loss = torch.logsumexp( + exp_val, dim=mc_dim) - math.log(scores.shape[mc_dim]) - torch.mean( + scores * (gt_density / (sample_density + self.eps)), + dim=mc_dim) + + return loss.mean() + + +@MODELS.register_module() +class KLGridLoss(nn.Module): + """KL-divergence loss for probabilistic regression. + + It is computed using the grid integration strategy. + """ + + def forward(self, + scores: Tensor, + gt_density: Tensor, + grid_dim: Union[Tuple, int] = -1, + grid_scale: float = 1.0) -> Tensor: + """ + Args: + scores (Tensor): predicted score values. It has shape + (num_imgs_per_seq, bs, score_map_size, score_map_size). + gt_density (Tensor): probability density of the ground truth + distribution. It has shape + (num_imgs_per_seq, bs, score_map_size, score_map_size). + grid_dim (int): dimension(s) of the grid. + grid_scale (float): area of one grid cell. + + Returns: + torch.Tensor: Calculated loss + """ + score_corr = grid_scale * torch.sum(scores * gt_density, dim=grid_dim) + + loss = torch.logsumexp( + scores, dim=grid_dim) + math.log(grid_scale) - score_corr + + return loss.mean() diff --git a/mmtrack/models/losses/l2_loss.py b/mmtrack/models/losses/l2_loss.py index ca5b376d4..18221bfae 100644 --- a/mmtrack/models/losses/l2_loss.py +++ b/mmtrack/models/losses/l2_loss.py @@ -1,12 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. 
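# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): a toy numerical check of the grid
# form of the KL loss defined in ``KLGridLoss`` above,
#     L = logsumexp(s) + log(A) - A * sum(s * p_gt),
# where s are the scores, p_gt the ground-truth density and A the area of one
# grid cell. Shapes and values below are made up; the grid is flattened so a
# single ``dim=-1`` stands in for ``grid_dim``.
import math

import torch


def kl_grid_loss(scores, gt_density, grid_scale=1.0):
    score_corr = grid_scale * torch.sum(scores * gt_density, dim=-1)
    return (torch.logsumexp(scores, dim=-1) + math.log(grid_scale) -
            score_corr).mean()


if __name__ == '__main__':
    scores = torch.randn(2, 19 * 19)  # two images, flattened 19x19 score grid
    gt_density = torch.softmax(torch.randn(2, 19 * 19), dim=-1)  # sums to 1
    print(kl_grid_loss(scores, gt_density))  # scalar loss
# ---------------------------------------------------------------------------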
+from typing import Optional, Tuple, Union + import numpy as np import torch -import torch.nn as nn -from mmdet.models import LOSSES, weighted_loss +from mmdet.models import weighted_loss +from mmengine.model import BaseModule +from torch import Tensor + +from mmtrack.registry import MODELS @weighted_loss -def l2_loss(pred, target): +def l2_loss(pred: Tensor, target: Tensor) -> Tensor: """L2 loss. Args: @@ -21,8 +26,8 @@ def l2_loss(pred, target): return loss -@LOSSES.register_module() -class L2Loss(nn.Module): +@MODELS.register_module() +class L2Loss(BaseModule): """L2 loss. Args: @@ -32,12 +37,12 @@ class L2Loss(nn.Module): """ def __init__(self, - neg_pos_ub=-1, - pos_margin=-1, - neg_margin=-1, - hard_mining=False, - reduction='mean', - loss_weight=1.0): + neg_pos_ub: int = -1, + pos_margin: float = -1, + neg_margin: float = -1, + hard_mining: bool = False, + reduction: str = 'mean', + loss_weight: float = 1.0): super(L2Loss, self).__init__() self.neg_pos_ub = neg_pos_ub self.pos_margin = pos_margin @@ -47,11 +52,11 @@ def __init__(self, self.loss_weight = loss_weight def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None): + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None) -> Tensor: """Forward function. Args: @@ -59,8 +64,8 @@ def forward(self, target (torch.Tensor): The learning target of the prediction. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. + avg_factor (float, optional): Average factor that is used to + average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. @@ -74,7 +79,8 @@ def forward(self, pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss_bbox - def update_weight(self, pred, target, weight, avg_factor): + def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor, + avg_factor: float) -> Tuple[Tensor, Tensor, float]: """Update the weight according to targets.""" if weight is None: weight = target.new_ones(target.size()) @@ -115,7 +121,8 @@ def update_weight(self, pred, target, weight, avg_factor): return pred, weight, avg_factor @staticmethod - def random_choice(gallery, num): + def random_choice(gallery: Union[list, np.ndarray, Tensor], + num: int) -> np.ndarray: """Random select some elements from the gallery. It seems that Pytorch's implementation is slower than numpy so we use diff --git a/mmtrack/models/losses/multipos_cross_entropy_loss.py b/mmtrack/models/losses/multipos_cross_entropy_loss.py index 91297cf41..6a69c7925 100644 --- a/mmtrack/models/losses/multipos_cross_entropy_loss.py +++ b/mmtrack/models/losses/multipos_cross_entropy_loss.py @@ -1,24 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. 
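# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the weight-then-reduce pattern
# behind the ``@weighted_loss`` decorator used by ``l2_loss`` above, written
# out as a plain function. This is a simplified stand-in, not mmdet's actual
# implementation; the element-wise squared error is only an assumed example.
import torch


def weighted_l2(pred, target, weight=None, reduction='mean', avg_factor=None):
    loss = (pred - target) ** 2      # element-wise loss comes first
    if weight is not None:
        loss = loss * weight         # e.g. zero out ignored pairs
    if reduction == 'sum':
        return loss.sum()
    if reduction == 'mean':
        # ``avg_factor`` averages over a custom denominator, e.g. the number
        # of valid samples rather than the number of elements.
        return (loss.sum() / avg_factor
                if avg_factor is not None else loss.mean())
    return loss                      # reduction == 'none'


if __name__ == '__main__':
    pred, target = torch.rand(4, 2), torch.rand(4, 2)
    weight = torch.tensor([[1.], [1.], [0.], [1.]]).expand(4, 2)
    print(weighted_l2(pred, target, weight, avg_factor=weight.sum()))
# ---------------------------------------------------------------------------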
+from typing import Optional + import torch -import torch.nn as nn -from mmdet.models import LOSSES, weight_reduce_loss +from mmdet.models import weight_reduce_loss +from mmengine.model import BaseModule +from torch import Tensor + +from mmtrack.registry import MODELS -@LOSSES.register_module() -class MultiPosCrossEntropyLoss(nn.Module): +@MODELS.register_module() +class MultiPosCrossEntropyLoss(BaseModule): """multi-positive targets cross entropy loss.""" - def __init__(self, reduction='mean', loss_weight=1.0): + def __init__(self, reduction: str = 'mean', loss_weight: float = 1.0): super(MultiPosCrossEntropyLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def multi_pos_cross_entropy(self, - pred, - label, - weight=None, - reduction='mean', - avg_factor=None): + pred: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[float] = None) -> Tensor: """ Args: pred (torch.Tensor): The prediction. @@ -55,12 +60,12 @@ def multi_pos_cross_entropy(self, return loss def forward(self, - cls_score, - label, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): + cls_score: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: """Forward function. Args: @@ -69,7 +74,7 @@ def forward(self, weight (torch.Tensor): The element-wise weight. avg_factor (float): Average factor when computing the mean of losses. - reduction (str): Same as built-in losses of PyTorch. + reduction_override (str): Same as built-in losses of PyTorch. Returns: torch.Tensor: Calculated loss """ @@ -82,6 +87,5 @@ def forward(self, label, weight, reduction=reduction, - avg_factor=avg_factor, - **kwargs) + avg_factor=avg_factor) return loss_cls diff --git a/mmtrack/models/losses/triplet_loss.py b/mmtrack/models/losses/triplet_loss.py index 2519b8db0..87fbca31e 100644 --- a/mmtrack/models/losses/triplet_loss.py +++ b/mmtrack/models/losses/triplet_loss.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn -from mmdet.models import LOSSES +from mmengine.model import BaseModule +from mmtrack.registry import MODELS -@LOSSES.register_module() -class TripletLoss(nn.Module): + +@MODELS.register_module() +class TripletLoss(BaseModule): """Triplet loss with hard positive/negative mining. Reference: @@ -14,24 +16,34 @@ class TripletLoss(nn.Module): Imported from ``_. Args: - margin (float, optional): Margin for triplet loss. Default to 0.3. - loss_weight (float, optional): Weight of the loss. Default to 1.0. + margin (float, optional): Margin for triplet loss. Defaults to 0.3. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + hard_mining (bool, optional): Whether to perform hard mining. + Defaults to True. """ - def __init__(self, margin=0.3, loss_weight=1.0, hard_mining=True): + def __init__(self, + margin: float = 0.3, + loss_weight: float = 1.0, + hard_mining=True): super(TripletLoss, self).__init__() self.margin = margin self.ranking_loss = nn.MarginRankingLoss(margin=margin) self.loss_weight = loss_weight self.hard_mining = hard_mining - def hard_mining_triplet_loss_forward(self, inputs, targets): + def hard_mining_triplet_loss_forward( + self, inputs: torch.Tensor, + targets: torch.LongTensor) -> torch.Tensor: """ Args: inputs (torch.Tensor): feature matrix with shape (batch_size, feat_dim). 
targets (torch.LongTensor): ground truth labels with shape (num_classes). + + Returns: + torch.Tensor: triplet loss with hard mining. """ batch_size = inputs.size(0) @@ -58,7 +70,18 @@ def hard_mining_triplet_loss_forward(self, inputs, targets): y = torch.ones_like(dist_an) return self.loss_weight * self.ranking_loss(dist_an, dist_ap, y) - def forward(self, inputs, targets, **kwargs): + def forward(self, inputs: torch.Tensor, + targets: torch.LongTensor) -> torch.Tensor: + """ + Args: + inputs (torch.Tensor): feature matrix with shape + (batch_size, feat_dim). + targets (torch.LongTensor): ground truth labels with shape + (num_classes). + + Returns: + torch.Tensor: triplet loss. + """ if self.hard_mining: return self.hard_mining_triplet_loss_forward(inputs, targets) else: diff --git a/mmtrack/models/mot/__init__.py b/mmtrack/models/mot/__init__.py index 9135ed7dd..3682c6bfe 100644 --- a/mmtrack/models/mot/__init__.py +++ b/mmtrack/models/mot/__init__.py @@ -2,9 +2,13 @@ from .base import BaseMultiObjectTracker from .byte_track import ByteTrack from .deep_sort import DeepSORT +from .oc_sort import OCSORT from .qdtrack import QDTrack +from .qdtrack_sstg import QDTrackSSTG +from .strong_sort import StrongSORT from .tracktor import Tracktor __all__ = [ - 'BaseMultiObjectTracker', 'Tracktor', 'DeepSORT', 'ByteTrack', 'QDTrack' + 'BaseMultiObjectTracker', 'ByteTrack', 'DeepSORT', 'OCSORT', 'Tracktor', 'QDTrack', + 'QDTrackSSTG', 'StrongSORT' ] diff --git a/mmtrack/models/mot/base.py b/mmtrack/models/mot/base.py index 62d13dd75..c015a40a7 100644 --- a/mmtrack/models/mot/base.py +++ b/mmtrack/models/mot/base.py @@ -1,25 +1,31 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from collections import OrderedDict +from typing import Dict, List, Tuple, Union -import mmcv -import torch -import torch.distributed as dist -from mmcv.runner import BaseModule, auto_fp16 +from mmengine.model import BaseModel +from torch import Tensor -from mmtrack.core import imshow_tracks, results2outs -from mmtrack.utils import get_root_logger +from mmtrack.utils import (ForwardResults, OptConfigType, OptMultiConfig, + OptSampleList, SampleList) -class BaseMultiObjectTracker(BaseModule, metaclass=ABCMeta): - """Base class for multiple object tracking.""" +class BaseMultiObjectTracker(BaseModel, metaclass=ABCMeta): + """Base class for multiple object tracking. - def __init__(self, init_cfg=None): - super(BaseMultiObjectTracker, self).__init__(init_cfg) - self.logger = get_root_logger() - self.fp16_enabled = False + Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Initialization config dict. 
+ """ - def freeze_module(self, module): + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None: """Freeze module during training.""" if isinstance(module, str): modules = [module] @@ -35,246 +41,105 @@ def freeze_module(self, module): param.requires_grad = False @property - def with_detector(self): + def with_detector(self) -> bool: """bool: whether the framework has a detector.""" return hasattr(self, 'detector') and self.detector is not None @property - def with_reid(self): + def with_reid(self) -> bool: """bool: whether the framework has a reid model.""" return hasattr(self, 'reid') and self.reid is not None @property - def with_motion(self): + def with_motion(self) -> bool: """bool: whether the framework has a motion model.""" return hasattr(self, 'motion') and self.motion is not None @property - def with_track_head(self): + def with_track_head(self) -> bool: """bool: whether the framework has a track_head.""" return hasattr(self, 'track_head') and self.track_head is not None @property - def with_tracker(self): + def with_tracker(self) -> bool: """bool: whether the framework has a tracker.""" return hasattr(self, 'tracker') and self.tracker is not None - @abstractmethod - def forward_train(self, imgs, img_metas, **kwargs): - """ - Args: - img (list[Tensor]): List of tensors of shape (1, C, H, W). - Typically these should be mean centered and std scaled. - img_metas (list[dict]): List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys, see - :class:`mmdet.datasets.pipelines.Collect`. - kwargs (keyword arguments): Specific to concrete implementation. - """ - pass - - @abstractmethod - def simple_test(self, img, img_metas, **kwargs): - """Test function with a single scale.""" - pass - - def aug_test(self, imgs, img_metas, **kwargs): - """Test function with test time augmentation.""" - pass - - def forward_test(self, imgs, img_metas, **kwargs): - """ - Args: - imgs (List[Tensor]): the outer list indicates test-time - augmentations and inner Tensor should have a shape NxCxHxW, - which contains all images in the batch. - img_metas (List[List[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch. - """ - for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError(f'{name} must be a list, but got {type(var)}') - - num_augs = len(imgs) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(imgs)}) ' - f'!= num of image meta ({len(img_metas)})') - - if num_augs == 1: - # proposals (List[List[Tensor]]): the outer list indicates - # test-time augs (multiscale, flip, etc.) and the inner list - # indicates images in a batch. - # The Tensor should have a shape Px4, where P is the number of - # proposals. 
- if 'proposals' in kwargs: - kwargs['proposals'] = kwargs['proposals'][0] - return self.simple_test(imgs[0], img_metas[0], **kwargs) - else: - assert imgs[0].size(0) == 1, 'aug test does not support ' \ - 'inference with batch size ' \ - f'{imgs[0].size(0)}' - # TODO: support test augmentation for predefined proposals - assert 'proposals' not in kwargs - return self.aug_test(imgs, img_metas, **kwargs) - - @auto_fp16(apply_to=('img', )) - def forward(self, img, img_metas, return_loss=True, **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. - - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``resturn_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train(img, img_metas, **kwargs) - else: - return self.forward_test(img, img_metas, **kwargs) - - def _parse_losses(self, losses): - """Parse the raw outputs (losses) of the network. + def forward(self, + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + mode: str = 'predict', + **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. - Args: - losses (dict): Raw output of the network, which usually contain - losses and other necessary information. + The method should accept three modes: "tensor", "predict" and "loss": - Returns: - tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor - which may be a weighted sum of all losses, log_vars contains - all the variables to be sent to the logger. - """ - log_vars = OrderedDict() - for loss_name, loss_value in losses.items(): - if isinstance(loss_value, torch.Tensor): - log_vars[loss_name] = loss_value.mean() - elif isinstance(loss_value, list): - log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) - else: - raise TypeError( - f'{loss_name} is not a tensor or list of tensors') + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`TrackDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. - loss = sum(_value for _key, _value in log_vars.items() - if 'loss' in _key) - - log_vars['loss'] = loss - for loss_name, loss_value in log_vars.items(): - # reduce loss when distributed training - if dist.is_available() and dist.is_initialized(): - loss_value = loss_value.data.clone() - dist.all_reduce(loss_value.div_(dist.get_world_size())) - log_vars[loss_name] = loss_value.item() - - return loss, log_vars - - def train_step(self, data, optimizer): - """The iteration step during training. - - This method defines an iteration step during training, except for the - back propagation and optimizer updating, which are done in an optimizer - hook. Note that in some complicated cases or models, the whole process - including back propagation and optimizer updating is also defined in - this method, such as GAN. + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. Args: - data (dict): The output of dataloader. 
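To make the new unified entry point above concrete, a toy sketch follows; `_ToyTracker` is purely illustrative and not part of mmtrack. It shows how `mode='loss'` and `mode='predict'` dispatch to the abstract `loss()`/`predict()` hooks, and how `freeze_module` switches a named sub-module off for training.

```python
# Toy sketch (not part of mmtrack): mode dispatch of the unified forward()
# above, plus freeze_module() disabling gradients for a named sub-module.
from typing import Dict

import torch
import torch.nn as nn
from torch import Tensor

from mmtrack.models.mot import BaseMultiObjectTracker


class _ToyTracker(BaseMultiObjectTracker):
    """Illustrative subclass implementing the two abstract hooks."""

    def __init__(self):
        super().__init__()
        self.detector = nn.Linear(4, 2)

    def loss(self, inputs: Dict[str, Tensor], data_samples, **kwargs) -> dict:
        return dict(loss_toy=self.detector(inputs['img']).mean())

    def predict(self, inputs: Dict[str, Tensor], data_samples, **kwargs):
        return data_samples


model = _ToyTracker()
inputs = dict(img=torch.randn(2, 4))
print(model(inputs, None, mode='loss'))     # dispatches to loss()
print(model(inputs, None, mode='predict'))  # dispatches to predict()

model.freeze_module('detector')             # detector stops receiving gradients
assert not any(p.requires_grad for p in model.detector.parameters())
```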
- optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of - runner is passed to ``train_step()``. This argument is unused - and reserved. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to 'predict'. Returns: - dict: It should contain at least 3 keys: ``loss``, ``log_vars``, - ``num_samples``. + The return type depends on ``mode``. - - ``loss`` is a tensor for back propagation, which can be a - weighted sum of multiple losses. - - ``log_vars`` contains all the variables to be sent to the - logger. - - ``num_samples`` indicates the batch size (when the model is - DDP, it means the batch size on each GPU), which is used for - averaging the logs. + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`TrackDataSample`. + - If ``mode="loss"``, return a dict of tensor. """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) - - return outputs - - def val_step(self, data, optimizer): - """The iteration step during validation. - - This method shares the same signature as :func:`train_step`, but used - during val epochs. Note that the evaluation after training epochs is - not implemented with this method, but an evaluation hook. - """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'tensor': + return self._forward(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) + @abstractmethod + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass - return outputs + @abstractmethod + def predict(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass - def show_result(self, - img, - result, - score_thr=0.0, - thickness=1, - font_scale=0.5, - show=False, - out_file=None, - wait_time=0, - backend='cv2', - **kwargs): - """Visualize tracking results. + def _forward(self, + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + **kwargs): + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. - Args: - img (str | ndarray): Filename of loaded image. - result (dict): Tracking result. - - The value of key 'track_bboxes' is list with length - num_classes, and each element in list is ndarray with - shape(n, 6) in [id, tl_x, tl_y, br_x, br_y, score] format. - - The value of key 'det_bboxes' is list with length - num_classes, and each element in list is ndarray with - shape(n, 5) in [tl_x, tl_y, br_x, br_y, score] format. - thickness (int, optional): Thickness of lines. 
Defaults to 1. - font_scale (float, optional): Font scales of texts. Defaults - to 0.5. - show (bool, optional): Whether show the visualizations on the - fly. Defaults to False. - out_file (str | None, optional): Output filename. Defaults to None. - backend (str, optional): Backend to draw the bounding boxes, - options are `cv2` and `plt`. Defaults to 'cv2'. + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W). + data_samples (List[:obj:`TrackDataSample`], optional): The + Data Samples. It usually includes information such as + `gt_instance`. Returns: - ndarray: Visualized image. + tuple[list]: A tuple of features from ``head`` forward. """ - assert isinstance(result, dict) - track_bboxes = result.get('track_bboxes', None) - track_masks = result.get('track_masks', None) - if isinstance(img, str): - img = mmcv.imread(img) - outs_track = results2outs( - bbox_results=track_bboxes, - mask_results=track_masks, - mask_shape=img.shape[:2]) - img = imshow_tracks( - img, - outs_track.get('bboxes', None), - outs_track.get('labels', None), - outs_track.get('ids', None), - outs_track.get('masks', None), - classes=self.CLASSES, - score_thr=score_thr, - thickness=thickness, - font_scale=font_scale, - show=show, - out_file=out_file, - wait_time=wait_time, - backend=backend) - return img + raise NotImplementedError( + "_forward function (namely 'tensor' mode) is not supported now") diff --git a/mmtrack/models/mot/byte_track.py b/mmtrack/models/mot/byte_track.py index 23358322b..7040b67c5 100644 --- a/mmtrack/models/mot/byte_track.py +++ b/mmtrack/models/mot/byte_track.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional import torch -from mmdet.models import build_detector +from torch import Tensor -from mmtrack.core import outs2results, results2outs -from ..builder import MODELS, build_motion, build_tracker +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList from .base import BaseMultiObjectTracker @@ -19,77 +20,98 @@ class ByteTrack(BaseMultiObjectTracker): detector (dict): Configuration of detector. Defaults to None. tracker (dict): Configuration of tracker. Defaults to None. motion (dict): Configuration of motion. Defaults to None. - init_cfg (dict): Configuration of initialization. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. 
""" def __init__(self, - detector=None, - tracker=None, - motion=None, - init_cfg=None): - super().__init__(init_cfg) + detector: Optional[dict] = None, + tracker: Optional[dict] = None, + motion: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) if detector is not None: - self.detector = build_detector(detector) + self.detector = MODELS.build(detector) if motion is not None: - self.motion = build_motion(motion) + self.motion = TASK_UTILS.build(motion) if tracker is not None: - self.tracker = build_tracker(tracker) + self.tracker = MODELS.build(tracker) - def forward_train(self, *args, **kwargs): - """Forward function during training.""" - return self.detector.forward_train(*args, **kwargs) + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. - def simple_test(self, img, img_metas, rescale=False, **kwargs): - """Test without augmentations. + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + # modify the inputs shape to fit mmdet + img = inputs['img'] + assert img.size(1) == 1 + # convert 'inputs' shape to (N, C, H, W) + img = torch.squeeze(img, dim=1) + return self.detector.loss(img, data_samples, **kwargs) + + def predict(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - rescale (bool, optional): If False, then returned bboxes and masks - will fit the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. Returns: - dict[str : list(ndarray)]: The tracking results. + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_det_instances`` + or ``pred_track_instances``. """ - frame_id = img_metas[0].get('frame_id', -1) - if frame_id == 0: - self.tracker.reset() + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Bytetrack inference only support 1 batch size per gpu for now.' + img = img[0] - det_results = self.detector.simple_test( - img, img_metas, rescale=rescale) - assert len(det_results) == 1, 'Batch inference is not supported.' 
- bbox_results = det_results[0] - num_classes = len(bbox_results) + assert len(data_samples) == 1, \ + 'Bytetrack inference only support 1 batch size per gpu for now.' - outs_det = results2outs(bbox_results=bbox_results) - det_bboxes = torch.from_numpy(outs_det['bboxes']).to(img) - det_labels = torch.from_numpy(outs_det['labels']).to(img).long() + track_data_sample = data_samples[0] - track_bboxes, track_labels, track_ids = self.tracker.track( - img=img, - img_metas=img_metas, + det_results = self.detector.predict(img, data_samples) + assert len(det_results) == 1, 'Batch inference is not supported.' + track_data_sample.pred_det_instances = \ + det_results[0].pred_instances.clone() + + pred_track_instances = self.tracker.track( model=self, - bboxes=det_bboxes, - labels=det_labels, - frame_id=frame_id, - rescale=rescale, + img=img, + feats=None, + data_sample=track_data_sample, **kwargs) + track_data_sample.pred_track_instances = pred_track_instances - track_results = outs2results( - bboxes=track_bboxes, - labels=track_labels, - ids=track_ids, - num_classes=num_classes) - det_results = outs2results( - bboxes=det_bboxes, labels=det_labels, num_classes=num_classes) - - return dict( - det_bboxes=det_results['bbox_results'], - track_bboxes=track_results['bbox_results']) + return [track_data_sample] diff --git a/mmtrack/models/mot/deep_sort.py b/mmtrack/models/mot/deep_sort.py index eded87ab1..7027ea8c1 100644 --- a/mmtrack/models/mot/deep_sort.py +++ b/mmtrack/models/mot/deep_sort.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -import warnings +from typing import Dict, Optional -from mmdet.models import build_detector +from torch import Tensor -from mmtrack.core import outs2results -from ..builder import MODELS, build_motion, build_reid, build_tracker +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType, SampleList from .base import BaseMultiObjectTracker @@ -13,128 +13,103 @@ class DeepSORT(BaseMultiObjectTracker): """Simple online and realtime tracking with a deep association metric. Details can be found at `DeepSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. 
""" def __init__(self, - detector=None, - reid=None, - tracker=None, - motion=None, - pretrains=None, - init_cfg=None): - super().__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - if detector: - detector_pretrain = pretrains.get('detector', None) - if detector_pretrain: - detector.init_cfg = dict( - type='Pretrained', checkpoint=detector_pretrain) - else: - detector.init_cfg = None - if reid: - reid_pretrain = pretrains.get('reid', None) - if reid_pretrain: - reid.init_cfg = dict( - type='Pretrained', checkpoint=reid_pretrain) - else: - reid.init_cfg = None + detector: Optional[dict] = None, + reid: Optional[dict] = None, + tracker: Optional[dict] = None, + motion: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(data_preprocessor, init_cfg) if detector is not None: - self.detector = build_detector(detector) + self.detector = MODELS.build(detector) if reid is not None: - self.reid = build_reid(reid) + self.reid = MODELS.build(reid) if motion is not None: - self.motion = build_motion(motion) + self.motion = TASK_UTILS.build(motion) if tracker is not None: - self.tracker = build_tracker(tracker) + self.tracker = MODELS.build(tracker) + + self.preprocess_cfg = data_preprocessor - def forward_train(self, *args, **kwargs): - """Forward function during training.""" + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" raise NotImplementedError( 'Please train `detector` and `reid` models firstly, then \ inference with SORT/DeepSORT.') - def simple_test(self, - img, - img_metas, - rescale=False, - public_bboxes=None, - **kwargs): - """Test without augmentations. + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - rescale (bool, optional): If False, then returned bboxes and masks + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks will fit the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. - public_bboxes (list[Tensor], optional): Public bounding boxes from - the benchmark. Defaults to None. + will fit the scale of original image shape. Defaults to True. Returns: - dict[str : list(ndarray)]: The tracking results. + SampleList: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. 
""" - frame_id = img_metas[0].get('frame_id', -1) - if frame_id == 0: - self.tracker.reset() - - x = self.detector.extract_feat(img) - if hasattr(self.detector, 'roi_head'): - # TODO: check whether this is the case - if public_bboxes is not None: - public_bboxes = [_[0] for _ in public_bboxes] - proposals = public_bboxes - else: - proposals = self.detector.rpn_head.simple_test_rpn( - x, img_metas) - det_bboxes, det_labels = self.detector.roi_head.simple_test_bboxes( - x, - img_metas, - proposals, - self.detector.roi_head.test_cfg, - rescale=rescale) - # TODO: support batch inference - det_bboxes = det_bboxes[0] - det_labels = det_labels[0] - num_classes = self.detector.roi_head.bbox_head.num_classes - elif hasattr(self.detector, 'bbox_head'): - outs = self.detector.bbox_head(x) - result_list = self.detector.bbox_head.get_bboxes( - *outs, img_metas=img_metas, rescale=rescale) - # TODO: support batch inference - det_bboxes = result_list[0][0] - det_labels = result_list[0][1] - num_classes = self.detector.bbox_head.num_classes - else: - raise TypeError('detector must has roi_head or bbox_head.') - - track_bboxes, track_labels, track_ids = self.tracker.track( - img=img, - img_metas=img_metas, + img = inputs['img'] + + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + img = img[0] + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + + det_results = self.detector.predict(img, data_samples) + assert len(det_results) == 1, 'Batch inference is not supported.' + track_data_sample.pred_det_instances = det_results[ + 0].pred_instances.clone() + + pred_track_instances = self.tracker.track( model=self, - feats=x, - bboxes=det_bboxes, - labels=det_labels, - frame_id=frame_id, + img=img, + feats=None, + data_sample=track_data_sample, + data_preprocessor=self.preprocess_cfg, rescale=rescale, **kwargs) + track_data_sample.pred_track_instances = pred_track_instances - track_results = outs2results( - bboxes=track_bboxes, - labels=track_labels, - ids=track_ids, - num_classes=num_classes) - det_results = outs2results( - bboxes=det_bboxes, labels=det_labels, num_classes=num_classes) - - return dict( - det_bboxes=det_results['bbox_results'], - track_bboxes=track_results['bbox_results']) + return [track_data_sample] diff --git a/mmtrack/models/mot/my_deep_sort.py b/mmtrack/models/mot/my_deep_sort.py new file mode 100644 index 000000000..68ea827a2 --- /dev/null +++ b/mmtrack/models/mot/my_deep_sort.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional + +from torch import Tensor + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType, SampleList +from .base import BaseMultiObjectTracker + + +@MODELS.register_module() +class MyDeepSORT(BaseMultiObjectTracker): + """Simple online and realtime tracking with a deep association metric. + + Details can be found at `DeepSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. 
it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + pose: Optional[dict] = None, + tracker: Optional[dict] = None, + motion: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if reid is not None: + self.reid = MODELS.build(reid) + + if pose is not None: + self.pose = MODELS.build(pose) + + if motion is not None: + self.motion = TASK_UTILS.build(motion) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.preprocess_cfg = data_preprocessor + + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + raise NotImplementedError( + 'Please train `detector` and `reid` models firstly, then \ + inference with SORT/DeepSORT.') + + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + SampleList: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + img = img[0] + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + + det_results = self.detector.predict(img, data_samples) + assert len(det_results) == 1, 'Batch inference is not supported.' + track_data_sample.pred_det_instances = det_results[ + 0].pred_instances.clone() + + pred_track_instances = self.tracker.track( + model=self, + img=img, + feats=None, + data_sample=track_data_sample, + data_preprocessor=self.preprocess_cfg, + rescale=rescale, + **kwargs) + track_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmtrack/models/mot/oc_sort.py b/mmtrack/models/mot/oc_sort.py new file mode 100644 index 000000000..f96066fc7 --- /dev/null +++ b/mmtrack/models/mot/oc_sort.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
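Each component of the SORT-style trackers above (DeepSORT and the new MyDeepSORT) is optional and only built when its config is supplied, which the `with_*` properties on the base class reflect; a minimal sketch, with the nested config types left schematic:

```python
# Sketch: sub-modules of the SORT-style trackers above are optional; each is
# built only when its config is given, as the with_* properties reflect.
from mmtrack.models.mot import DeepSORT

model = DeepSORT()  # no detector / reid / motion / tracker configs supplied
print(model.with_detector, model.with_reid, model.with_motion,
      model.with_tracker)   # -> False False False False

# In practice the nested configs come from a config file, schematically:
#   DeepSORT(detector=dict(type=...), reid=dict(type=...),
#            motion=dict(type='KalmanFilter'), tracker=dict(type=...))
```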
+from typing import Dict, Optional + +import torch +from torch import Tensor + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType, SampleList + +from .base import BaseMultiObjectTracker + + +@MODELS.register_module() +class OCSORT(BaseMultiObjectTracker): + """OCOSRT: Observation-Centric SORT: Rethinking SORT for Robust + Multi-Object Tracking + This multi object tracker is the implementation of `OC-SORT + `_. + Args: + detector (dict): Configuration of detector. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + init_cfg (dict): Configuration of initialization. Defaults to None. + """ + + def __init__(self, + detector=None, + tracker=None, + motion=None, + data_preprocessor: OptConfigType = None, + init_cfg=None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if motion is not None: + self.motion = TASK_UTILS.build(motion) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + return self.detector.loss(inputs, data_samples, **kwargs) + + + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_det_instances`` + or ``pred_track_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'OCSORT inference only support ' \ + '1 batch size per gpu for now.' + img = img[0] + + assert len(data_samples) == 1, \ + 'OCSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + + det_results = self.detector.predict(img, data_samples) + assert len(det_results) == 1, 'Batch inference is not supported.' + track_data_sample.pred_det_instances = det_results[ + 0].pred_instances.clone() + + pred_track_instances = self.tracker.track( + model=self, + img=img, + feats=None, + data_sample=track_data_sample, + rescale=rescale, + **kwargs) + + track_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] \ No newline at end of file diff --git a/mmtrack/models/mot/qdtrack.py b/mmtrack/models/mot/qdtrack.py index 8a1dba7cc..93db693d3 100644 --- a/mmtrack/models/mot/qdtrack.py +++ b/mmtrack/models/mot/qdtrack.py @@ -1,10 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from typing import Dict, Optional + import torch -from mmdet.models import build_detector, build_head +from torch import Tensor -from mmtrack.core import outs2results, results2outs -from mmtrack.models.mot import BaseMultiObjectTracker -from ..builder import MODELS, build_tracker +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList +from .base import BaseMultiObjectTracker @MODELS.register_module() @@ -20,163 +24,172 @@ class QDTrack(BaseMultiObjectTracker): tracker (dict): Configuration of tracker. Defaults to None. freeze_detector (bool): If True, freeze the detector weights. Defaults to False. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. """ def __init__(self, - detector=None, - track_head=None, - tracker=None, - freeze_detector=False, - *args, - **kwargs): - super().__init__(*args, **kwargs) + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + freeze_detector: bool = False, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) if detector is not None: - self.detector = build_detector(detector) + self.detector = MODELS.build(detector) if track_head is not None: - self.track_head = build_head(track_head) + self.track_head = MODELS.build(track_head) if tracker is not None: - self.tracker = build_tracker(tracker) + self.tracker = MODELS.build(tracker) self.freeze_detector = freeze_detector if self.freeze_detector: self.freeze_module('detector') - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - gt_match_indices, - ref_img, - ref_img_metas, - ref_gt_bboxes, - ref_gt_labels, - gt_bboxes_ignore=None, - gt_masks=None, - ref_gt_bboxes_ignore=None, - ref_gt_masks=None, - **kwargs): - """Forward function during training. - - Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - each item has a shape (num_gts, 4). - gt_labels (list[Tensor]): Ground truth labels of all images. - each has a shape (num_gts,). - gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to - ref_gt_instance_ids of the same tracklet in a pair of images. - ref_img (Tensor): of shape (N, C, H, W) encoding input reference - images. Typically these should be mean centered and std scaled. - ref_img_metas (list[dict]): list of reference image info dict where - each dict has: 'img_shape', 'scale_factor', 'flip', and may - also contain 'filename', 'ori_shape', 'pad_shape', - and 'img_norm_cfg'. - ref_gt_bboxes (list[Tensor]): Ground truth bboxes of the - reference image, each item has a shape (num_gts, 4). - ref_gt_labels (list[Tensor]): Ground truth labels of all - reference images, each has a shape (num_gts,). - gt_masks (list[Tensor]) : Masks for each bbox, has a shape - (num_gts, h , w). - gt_bboxes_ignore (list[Tensor], None): Ground truth bboxes to be - ignored, each item has a shape (num_ignored_gts, 4). 
- ref_gt_bboxes_ignore (list[Tensor], None): Ground truth bboxes - of reference images to be ignored, - each item has a shape (num_ignored_gts, 4). - ref_gt_masks (list[Tensor]) : Masks for each reference bbox, - has a shape (num_gts, h , w). + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. Returns: - dict[str : Tensor]: All losses. + dict: A dictionary of loss components. """ + # modify the inputs shape to fit mmdet + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + img = img[:, 0] + + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(1) == 1, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + ref_img = ref_img[:, 0] + x = self.detector.extract_feat(img) + ref_x = self.detector.extract_feat(ref_img) losses = dict() # RPN forward and loss - if self.detector.with_rpn: - proposal_cfg = self.detector.train_cfg.get( - 'rpn_proposal', self.detector.test_cfg.rpn) - rpn_losses, proposal_list = self.detector.rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) - losses.update(rpn_losses) - - roi_losses = self.detector.roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, gt_masks, **kwargs) - - losses.update(roi_losses) - - ref_x = self.detector.extract_feat(ref_img) - ref_proposals = self.detector.rpn_head.simple_test_rpn( - ref_x, ref_img_metas) - - track_losses = self.track_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_match_indices, ref_x, ref_img_metas, ref_proposals, - ref_gt_bboxes, ref_gt_labels, gt_bboxes_ignore, gt_masks, - ref_gt_bboxes_ignore) - - losses.update(track_losses) + assert self.detector.with_rpn, \ + 'QDTrack only support detector with RPN.' + + proposal_cfg = self.detector.train_cfg.get('rpn_proposal', + self.detector.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.detector.rpn_head. 
\ + loss_and_predict(x, + rpn_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + data_samples, **kwargs) + losses.update(losses_detect) + + # adjust the key of ref_img in data_samples + ref_rpn_data_samples = [] + for data_sample in data_samples: + ref_rpn_data_sample = TrackDataSample() + ref_rpn_data_sample.set_metainfo( + metainfo=dict( + img_shape=data_sample.metainfo['ref_img_shape'], + scale_factor=data_sample.metainfo['ref_scale_factor'])) + ref_rpn_data_samples.append(ref_rpn_data_sample) + ref_rpn_results_list = self.detector.rpn_head.predict( + ref_x, ref_rpn_data_samples, **kwargs) + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + ref_rpn_results_list, data_samples, + **kwargs) + losses.update(losses_track) return losses - def simple_test(self, img, img_metas, rescale=False): - """Test forward. - - Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - rescale (bool): whether to rescale the bboxes. + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. Returns: - dict[str : Tensor]: Track results. + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_det_instances`` + or ``pred_track_instances``. """ - # TODO inherit from a base tracker - assert self.with_track_head, 'track head must be implemented.' # noqa - frame_id = img_metas[0].get('frame_id', -1) + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'QDTrack can only have 1 key frame.' + img = img[:, 0] + + assert len(data_samples) == 1, \ + 'QDTrack only support 1 batch size per gpu for now.' 
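The `rpn_` prefixing step in `QDTrack.loss` above is easy to miss, so here is a tiny self-contained sketch of what it does (loss values are made up); iterating over a snapshot of the keys keeps the dict safe to mutate inside the loop.

```python
# Sketch of the key renaming in QDTrack.loss above: RPN losses without an
# 'rpn' marker are prefixed so they don't collide with same-named RoI-head
# losses when the dicts are merged. Values are made up.
rpn_losses = {'loss_cls': 0.41, 'loss_bbox': 0.23}
for key in list(rpn_losses.keys()):   # iterate over a snapshot of the keys
    if 'loss' in key and 'rpn' not in key:
        rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
print(rpn_losses)   # {'rpn_loss_cls': 0.41, 'rpn_loss_bbox': 0.23}
```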
+ metainfo = data_samples[0].metainfo + frame_id = metainfo.get('frame_id', -1) if frame_id == 0: self.tracker.reset() x = self.detector.extract_feat(img) - proposal_list = self.detector.rpn_head.simple_test_rpn(x, img_metas) - - det_results = self.detector.roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) - - bbox_results = det_results[0] - num_classes = len(bbox_results) - outs_det = results2outs(bbox_results=bbox_results) + rpn_results_list = self.detector.rpn_head.predict(x, data_samples) + det_results = self.detector.roi_head.predict( + x, rpn_results_list, data_samples, rescale=rescale) - det_bboxes = torch.tensor(outs_det['bboxes']).to(img) - det_labels = torch.tensor(outs_det['labels']).to(img).long() + track_data_sample = data_samples[0] + track_data_sample.pred_det_instances = \ + det_results[0].clone() - track_bboxes, track_labels, track_ids = self.tracker.track( - img_metas=img_metas, - feats=x, + pred_track_instances = self.tracker.track( model=self, - bboxes=det_bboxes, - labels=det_labels, - frame_id=frame_id) - - track_bboxes = outs2results( - bboxes=track_bboxes, - labels=track_labels, - ids=track_ids, - num_classes=num_classes)['bbox_results'] + img=img, + feats=x, + data_sample=track_data_sample, + rescale=rescale, + **kwargs) + track_data_sample.pred_track_instances = pred_track_instances - return dict(det_bboxes=bbox_results, track_bboxes=track_bboxes) + return [track_data_sample] diff --git a/mmtrack/models/mot/qdtrack_sstg.py b/mmtrack/models/mot/qdtrack_sstg.py new file mode 100644 index 000000000..c2f69e7bb --- /dev/null +++ b/mmtrack/models/mot/qdtrack_sstg.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, Optional + +import torch +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList +from .base import BaseMultiObjectTracker +from mmdet.models.detectors import SingleStageDetector + +@MODELS.register_module() +class QDTrackSSTG(BaseMultiObjectTracker): + """Quasi-Dense Similarity Learning Single Stage for Multiple Object Tracking. + + This multi object tracker is the implementation of `QDTrack + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + freeze_detector (bool): If True, freeze the detector weights. + Defaults to False. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. 
+ """ + + def __init__(self, + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + freeze_detector: bool = False, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + train_cfg: OptMultiConfig = None, + test_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + if detector is not None: + self.detector = MODELS.build(detector) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.freeze_detector = freeze_detector + if self.freeze_detector: + self.freeze_module('detector') + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + # modify the inputs shape to fit mmdet + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + img = img[:, 0] + + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(1) == 1, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + ref_img = ref_img[:, 0] + + x = self.detector.extract_feat(img) + ref_x = self.detector.extract_feat(ref_img) + + losses = dict() + + detect_losses, proposal_results = self.detector.bbox_head.loss_and_predict(x, data_samples, **kwargs) + + losses.update(detect_losses) + + # adjust the key of ref_img in data_samples + ref_data_samples = [] + for data_sample in data_samples: + ref_rpn_data_sample = TrackDataSample() + ref_rpn_data_sample.set_metainfo( + metainfo=dict( + img_shape=data_sample.metainfo['ref_img_shape'], + scale_factor=data_sample.metainfo['ref_scale_factor'])) + ref_data_samples.append(ref_rpn_data_sample) + ref_proposal_results = self.detector.bbox_head.predict( + ref_x, ref_data_samples, **kwargs) + losses_track = self.track_head.loss(x, ref_x, proposal_results, + ref_proposal_results, data_samples, + **kwargs) + losses.update(losses_track) + + return losses + + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. 
Defaults to True. + + Returns: + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_det_instances`` + or ``pred_track_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'QDTrack can only have 1 key frame.' + img = img[:, 0] + + assert len(data_samples) == 1, \ + 'QDTrack only support 1 batch size per gpu for now.' + metainfo = data_samples[0].metainfo + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.tracker.reset() + + x = self.detector.extract_feat(img) + + det_results = self.detector.bbox_head.predict(x, data_samples, rescale=rescale) + + track_data_sample = data_samples[0] + track_data_sample.pred_det_instances = \ + det_results[0].clone() + + pred_track_instances = self.tracker.track( + model=self, + img=img, + feats=x, + data_sample=track_data_sample, + rescale=rescale, + **kwargs) + track_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmtrack/models/mot/strong_sort.py b/mmtrack/models/mot/strong_sort.py new file mode 100644 index 000000000..05ffd7040 --- /dev/null +++ b/mmtrack/models/mot/strong_sort.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType +from .deep_sort import DeepSORT + + +@MODELS.register_module() +class StrongSORT(DeepSORT): + """StrongSORT: Make DeepSORT Great Again. + + Details can be found at `StrongSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + kalman (dict): Configuration of Kalman filter. Defaults to None. + cmc (dict): Configuration of camera model compensation. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + tracker: Optional[dict] = None, + kalman: Optional[dict] = None, + cmc: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(detector, reid, tracker, kalman, data_preprocessor, + init_cfg) + + if kalman is not None: + self.kalman = TASK_UTILS.build(kalman) + + if cmc is not None: + self.cmc = TASK_UTILS.build(cmc) + + @property + def with_cmc(self): + """bool: whether the framework has a camera model compensation + model. + """ + return hasattr(self, 'cmc') and self.cmc is not None + + @property + def with_kalman_filter(self): + """bool: whether the framework has a Kalman filter.""" + return hasattr(self, 'kalman') and self.kalman is not None diff --git a/mmtrack/models/mot/tracktor.py b/mmtrack/models/mot/tracktor.py index 841c025fb..8502e8dbe 100644 --- a/mmtrack/models/mot/tracktor.py +++ b/mmtrack/models/mot/tracktor.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import warnings +from typing import Dict -from mmdet.models import build_detector +from torch import Tensor -from mmtrack.core import outs2results -from ..builder import MODELS, build_motion, build_reid, build_tracker +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import OptConfigType, SampleList from ..motion import CameraMotionCompensation, LinearMotion from .base import BaseMultiObjectTracker @@ -14,41 +14,35 @@ class Tracktor(BaseMultiObjectTracker): """Tracking without bells and whistles. Details can be found at `Tracktor`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. """ def __init__(self, - detector=None, - reid=None, - tracker=None, - motion=None, - pretrains=None, - init_cfg=None): - super().__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - if detector: - detector_pretrain = pretrains.get('detector', None) - if detector_pretrain: - detector.init_cfg = dict( - type='Pretrained', checkpoint=detector_pretrain) - else: - detector.init_cfg = None - if reid: - reid_pretrain = pretrains.get('reid', None) - if reid_pretrain: - reid.init_cfg = dict( - type='Pretrained', checkpoint=reid_pretrain) - else: - reid.init_cfg = None + detector: OptConfigType = None, + reid: OptConfigType = None, + tracker: OptConfigType = None, + motion: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(data_preprocessor, init_cfg) if detector is not None: - self.detector = build_detector(detector) + self.detector = MODELS.build(detector) if reid is not None: - self.reid = build_reid(reid) + self.reid = MODELS.build(reid) if motion is not None: - self.motion = build_motion(motion) + self.motion = TASK_UTILS.build(motion) if not isinstance(self.motion, list): self.motion = [self.motion] for m in self.motion: @@ -58,7 +52,9 @@ def __init__(self, self.linear_motion = m if tracker is not None: - self.tracker = build_tracker(tracker) + self.tracker = MODELS.build(tracker) + + self.preprocess_cfg = data_preprocessor @property def with_cmc(self): @@ -73,84 +69,72 @@ def with_linear_motion(self): return hasattr(self, 'linear_motion') and self.linear_motion is not None - def forward_train(self, *args, **kwargs): - """Forward function during training.""" + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" raise NotImplementedError( 'Please train `detector` and `reid` models firstly, then \ inference with Tracktor.') - def simple_test(self, - img, - img_metas, - rescale=False, - public_bboxes=None, - **kwargs): - """Test without augmentations. + def predict(self, + inputs: Dict[str, Tensor], + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. 
- Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - rescale (bool, optional): If False, then returned bboxes and masks + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks will fit the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. - public_bboxes (list[Tensor], optional): Public bounding boxes from - the benchmark. Defaults to None. + will fit the scale of original image shape. Defaults to True. Returns: - dict[str : list(ndarray)]: The tracking results. + SampleList: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. """ - frame_id = img_metas[0].get('frame_id', -1) - if frame_id == 0: - self.tracker.reset() + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Tracktor inference only support ' \ + '1 batch size per gpu for now.' + img = img[0] + + assert len(data_samples) == 1, \ + 'Tracktor inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + + assert hasattr(self.detector, 'roi_head'), \ + 'Tracktor must need "roi_head" to refine proposals.' x = self.detector.extract_feat(img) - if hasattr(self.detector, 'roi_head'): - # TODO: check whether this is the case - if public_bboxes is not None: - public_bboxes = [_[0] for _ in public_bboxes] - proposals = public_bboxes - else: - proposals = self.detector.rpn_head.simple_test_rpn( - x, img_metas) - det_bboxes, det_labels = self.detector.roi_head.simple_test_bboxes( - x, - img_metas, - proposals, - self.detector.roi_head.test_cfg, - rescale=rescale) - # TODO: support batch inference - det_bboxes = det_bboxes[0] - det_labels = det_labels[0] - num_classes = self.detector.roi_head.bbox_head.num_classes - elif hasattr(self.detector, 'bbox_head'): - num_classes = self.detector.bbox_head.num_classes - raise NotImplementedError( - 'Tracktor must need "roi_head" to refine proposals.') - else: - raise TypeError('detector must has roi_head or bbox_head.') - - track_bboxes, track_labels, track_ids = self.tracker.track( - img=img, - img_metas=img_metas, + rpn_results = self.detector.rpn_head.predict( + x, data_samples, rescale=False) + det_results = self.detector.roi_head.predict( + x, rpn_results, data_samples, rescale=rescale) + assert len(det_results) == 1, 'Batch inference is not supported.' 
+ track_data_sample.pred_det_instances = det_results[0] + + pred_track_instances = self.tracker.track( model=self, + img=img, feats=x, - bboxes=det_bboxes, - labels=det_labels, - frame_id=frame_id, + data_sample=track_data_sample, + data_preprocessor=self.preprocess_cfg, rescale=rescale, **kwargs) + track_data_sample.pred_track_instances = pred_track_instances - track_results = outs2results( - bboxes=track_bboxes, - labels=track_labels, - ids=track_ids, - num_classes=num_classes) - det_results = outs2results( - bboxes=det_bboxes, labels=det_labels, num_classes=num_classes) - - return dict( - det_bboxes=det_results['bbox_results'], - track_bboxes=track_results['bbox_results']) + return [track_data_sample] diff --git a/mmtrack/models/motion/__init__.py b/mmtrack/models/motion/__init__.py index cb4d1c576..8a8b383de 100644 --- a/mmtrack/models/motion/__init__.py +++ b/mmtrack/models/motion/__init__.py @@ -5,5 +5,5 @@ from .linear_motion import LinearMotion __all__ = [ - 'FlowNetSimple', 'CameraMotionCompensation', 'LinearMotion', 'KalmanFilter' + 'FlowNetSimple', 'KalmanFilter', 'CameraMotionCompensation', 'LinearMotion' ] diff --git a/mmtrack/models/motion/camera_motion_compensation.py b/mmtrack/models/motion/camera_motion_compensation.py index e34311b5c..1904dc1bf 100644 --- a/mmtrack/models/motion/camera_motion_compensation.py +++ b/mmtrack/models/motion/camera_motion_compensation.py @@ -2,29 +2,31 @@ import cv2 import numpy as np import torch +from torch import Tensor -from ..builder import MOTION +from mmtrack.registry import TASK_UTILS -@MOTION.register_module() -class CameraMotionCompensation(object): +@TASK_UTILS.register_module() +class CameraMotionCompensation: """Camera motion compensation. Args: warp_mode (str): Warp mode in opencv. - num_iters (int): Number of the iterations. - stop_eps (float): Terminate threshold. + Defaults to 'cv2.MOTION_EUCLIDEAN'. + num_iters (int): Number of the iterations. Defaults to 50. + stop_eps (float): Terminate threshold. Defaults to 0.001. 
""" def __init__(self, - warp_mode='cv2.MOTION_EUCLIDEAN', - num_iters=50, - stop_eps=0.001): + warp_mode: str = 'cv2.MOTION_EUCLIDEAN', + num_iters: int = 50, + stop_eps: float = 0.001): self.warp_mode = eval(warp_mode) self.num_iters = num_iters self.stop_eps = stop_eps - def get_warp_matrix(self, img, ref_img): + def get_warp_matrix(self, img: np.ndarray, ref_img: np.ndarray) -> Tensor: """Calculate warping matrix between two images.""" img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) ref_img = cv2.cvtColor(ref_img, cv2.COLOR_RGB2GRAY) @@ -38,7 +40,7 @@ def get_warp_matrix(self, img, ref_img): warp_matrix = torch.from_numpy(warp_matrix) return warp_matrix - def warp_bboxes(self, bboxes, warp_matrix): + def warp_bboxes(self, bboxes: Tensor, warp_matrix: Tensor) -> Tensor: """Warp bounding boxes according to the warping matrix.""" tl, br = bboxes[:, :2], bboxes[:, 2:] tl = torch.cat((tl, torch.ones(tl.shape[0], 1).to(bboxes.device)), @@ -50,7 +52,8 @@ def warp_bboxes(self, bboxes, warp_matrix): trans_bboxes = torch.cat((trans_tl, trans_br), dim=1) return trans_bboxes.to(bboxes.device) - def track(self, img, ref_img, tracks, num_samples, frame_id): + def track(self, img: Tensor, ref_img: Tensor, tracks: dict, + num_samples: int, frame_id: int) -> dict: """Tracking forward.""" img = img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) ref_img = ref_img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) diff --git a/mmtrack/models/motion/flownet_simple.py b/mmtrack/models/motion/flownet_simple.py index a84203fa8..f0168e137 100644 --- a/mmtrack/models/motion/flownet_simple.py +++ b/mmtrack/models/motion/flownet_simple.py @@ -1,13 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + import torch import torch.nn as nn from mmcv.cnn.bricks import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from ..builder import MOTION +from mmtrack.registry import MODELS -@MOTION.register_module() +@MODELS.register_module() class FlowNetSimple(BaseModule): """The simple version of FlowNet. @@ -40,12 +42,12 @@ class FlowNetSimple(BaseModule): } def __init__(self, - img_scale_factor, - out_indices=[2, 3, 4, 5, 6], - flow_scale_factor=5.0, - flow_img_norm_std=[255.0, 255.0, 255.0], - flow_img_norm_mean=[0.411, 0.432, 0.450], - init_cfg=None): + img_scale_factor: float, + out_indices: List[int] = [2, 3, 4, 5, 6], + flow_scale_factor: float = 5.0, + flow_img_norm_std: List[float] = [255.0, 255.0, 255.0], + flow_img_norm_mean: List[float] = [0.411, 0.432, 0.450], + init_cfg: Union[List[dict], dict] = None) -> None: super(FlowNetSimple, self).__init__(init_cfg) self.img_scale_factor = img_scale_factor self.out_indices = out_indices @@ -146,44 +148,49 @@ def __init__(self, conv_cfg=dict(type='Conv'), act_cfg=None) - def prepare_imgs(self, imgs, img_metas): + def prepare_imgs(self, imgs: torch.Tensor, metainfo: dict, + preprocess_cfg: dict) -> torch.Tensor: """Preprocess images pairs for computing flow. Args: imgs (Tensor): of shape (N, 6, H, W) encoding input images pairs. Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image information dict where each + metainfo (dict): image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. + `mmtrack/datasets/transforms/formatting.py:PackTrackInputs`. 
+ preprocess_cfg (dict): Model preprocessing config + for processing the input data. it usually includes + ``bgr_to_rgb``, ``rgb_to_bgr``, ``pad_size_divisor``, + ``pad_value``, ``mean``, ``std`` and ``batch_augments``. Returns: Tensor: of shape (N, 6, H, W) encoding the input images pairs for FlowNetSimple. """ if not hasattr(self, 'img_norm_mean'): - mean = img_metas[0]['img_norm_cfg']['mean'] - mean = torch.tensor(mean, dtype=imgs.dtype, device=imgs.device) + mean = preprocess_cfg['mean'] + mean = torch.tensor(mean).to(imgs) self.img_norm_mean = mean.repeat(2)[None, :, None, None] mean = self.flow_img_norm_mean - mean = torch.tensor(mean, dtype=imgs.dtype, device=imgs.device) + mean = torch.tensor(mean).to(imgs) self.flow_img_norm_mean = mean.repeat(2)[None, :, None, None] if not hasattr(self, 'img_norm_std'): - std = img_metas[0]['img_norm_cfg']['std'] - std = torch.tensor(std, dtype=imgs.dtype, device=imgs.device) + std = preprocess_cfg['std'] + std = torch.tensor(std).to(imgs) self.img_norm_std = std.repeat(2)[None, :, None, None] std = self.flow_img_norm_std - std = torch.tensor(std, dtype=imgs.dtype, device=imgs.device) + std = torch.tensor(std).to(imgs) self.flow_img_norm_std = std.repeat(2)[None, :, None, None] flow_img = imgs * self.img_norm_std + self.img_norm_mean flow_img = flow_img / self.flow_img_norm_std - self.flow_img_norm_mean - flow_img[:, :, img_metas[0]['img_shape'][0]:, :] = 0.0 - flow_img[:, :, :, img_metas[0]['img_shape'][1]:] = 0.0 + flow_img[:, :, metainfo['img_shape'][0]:, :] = 0.0 + flow_img[:, :, :, metainfo['img_shape'][1]:] = 0.0 flow_img = torch.nn.functional.interpolate( flow_img, scale_factor=self.img_scale_factor, @@ -191,22 +198,27 @@ def prepare_imgs(self, imgs, img_metas): align_corners=False) return flow_img - def forward(self, imgs, img_metas): + def forward(self, imgs: torch.Tensor, metainfo: dict, + preprocess_cfg: dict) -> torch.Tensor: """Compute the flow of images pairs. Args: imgs (Tensor): of shape (N, 6, H, W) encoding input images pairs. Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image information dict where each + metainfo (dict): image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. + `mmtrack/datasets/transforms/formatting.py:PackTrackInputs`. + preprocess_cfg (dict): Model preprocessing config + for processing the input data. it usually includes + ``to_rgb``, ``pad_size_divisor``, ``pad_value``, + ``mean`` and ``std``. Returns: Tensor: of shape (N, 2, H, W) encoding flow of images pairs. """ - x = self.prepare_imgs(imgs, img_metas) + x = self.prepare_imgs(imgs, metainfo, preprocess_cfg) conv_outs = [] for i, conv_name in enumerate(self.conv_layers, 1): conv_layer = getattr(self, conv_name) diff --git a/mmtrack/models/motion/kalman_filter.py b/mmtrack/models/motion/kalman_filter.py index 61b2a0ca7..2fc58268a 100644 --- a/mmtrack/models/motion/kalman_filter.py +++ b/mmtrack/models/motion/kalman_filter.py @@ -1,15 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. 
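# --- Illustrative call-site sketch (not part of the diff) for the
# FlowNetSimple interface changed above: the old `img_metas` list is replaced
# by a single `metainfo` dict plus the data-preprocessor config. `flownet` is
# assumed to be an already-built FlowNetSimple (e.g. from a DFF/FGFA config);
# the mean/std values are placeholders mirroring a typical
# TrackDataPreprocessor setting.
import torch

imgs = torch.randn(1, 6, 384, 640)      # key and reference image stacked on channels
metainfo = dict(img_shape=(375, 640))   # valid (un-padded) region, used to zero the padding
preprocess_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375])

# old: flow = flownet(imgs, img_metas)
# new: flow = flownet(imgs, metainfo, preprocess_cfg)  # -> (N, 2, H, W) flow field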
+from typing import Tuple + import numpy as np import scipy.linalg +import torch -from ..builder import MOTION +from mmtrack.registry import TASK_UTILS -@MOTION.register_module() -class KalmanFilter(object): +@TASK_UTILS.register_module() +class KalmanFilter: """A simple Kalman filter for tracking bounding boxes in image space. The implementation is referred to https://github.com/nwojke/deep_sort. + + Args: + center_only (bool): If True, distance computation is done with + respect to the bounding box center position only. + Defaults to False. + use_nsa (bool): Whether to use the NSA (Noise Scale Adaptive) Kalman + Filter, which adaptively modulates the noise scale according to + the quality of detections. More details in + https://arxiv.org/abs/2202.11983. Defaults to False. """ chi2inv95 = { 1: 3.8415, @@ -23,13 +35,14 @@ class KalmanFilter(object): 9: 16.919 } - def __init__(self, center_only=False): + def __init__(self, center_only: bool = False, use_nsa: bool = False): self.center_only = center_only if self.center_only: self.gating_threshold = self.chi2inv95[2] else: self.gating_threshold = self.chi2inv95[4] + self.use_nsa = use_nsa ndim, dt = 4, 1. # Create Kalman filter model matrices. @@ -44,7 +57,7 @@ def __init__(self, center_only=False): self._std_weight_position = 1. / 20 self._std_weight_velocity = 1. / 160 - def initiate(self, measurement): + def initiate(self, measurement: np.array) -> Tuple[np.array, np.array]: """Create track from unassociated measurement. Args: @@ -71,7 +84,8 @@ def initiate(self, measurement): covariance = np.diag(np.square(std)) return mean, covariance - def predict(self, mean, covariance): + def predict(self, mean: np.array, + covariance: np.array) -> Tuple[np.array, np.array]: """Run Kalman filter prediction step. Args: @@ -104,13 +118,18 @@ def predict(self, mean, covariance): return mean, covariance - def project(self, mean, covariance): + def project(self, + mean: np.array, + covariance: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: """Project state distribution to measurement space. Args: mean (ndarray): The state's mean vector (8 dimensional array). covariance (ndarray): The state's covariance matrix (8x8 dimensional). + bbox_score (float): The confidence score of the bbox. + Defaults to 0. Returns: (ndarray, ndarray): Returns the projected mean and covariance @@ -121,6 +140,10 @@ def project(self, mean, covariance): self._std_weight_position * mean[3], 1e-1, self._std_weight_position * mean[3] ] + + if self.use_nsa: + std = [(1 - bbox_score) * x for x in std] + innovation_cov = np.diag(np.square(std)) mean = np.dot(self._update_mat, mean) @@ -128,7 +151,11 @@ def project(self, mean, covariance): (self._update_mat, covariance, self._update_mat.T)) return mean, covariance + innovation_cov - def update(self, mean, covariance, measurement): + def update(self, + mean: np.array, + covariance: np.array, + measurement: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: """Run Kalman filter correction step. Args: @@ -138,13 +165,15 @@ def update(self, mean, covariance, measurement): measurement (ndarray): The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center position, a the aspect ratio, and h the height of the bounding box. - + bbox_score (float): The confidence score of the bbox. + Defaults to 0. Returns: (ndarray, ndarray): Returns the measurement-corrected state distribution. 
""" - projected_mean, projected_cov = self.project(mean, covariance) + projected_mean, projected_cov = \ + self.project(mean, covariance, bbox_score) chol_factor, lower = scipy.linalg.cho_factor( projected_cov, lower=True, check_finite=False) @@ -160,10 +189,10 @@ def update(self, mean, covariance, measurement): return new_mean, new_covariance def gating_distance(self, - mean, - covariance, - measurements, - only_position=False): + mean: np.array, + covariance: np.array, + measurements: np.array, + only_position: bool = False) -> np.array: """Compute gating distance between state distribution and measurements. A suitable distance threshold can be obtained from `chi2inv95`. If @@ -204,7 +233,8 @@ def gating_distance(self, squared_maha = np.sum(z * z, axis=0) return squared_maha - def track(self, tracks, bboxes): + def track(self, tracks: dict, + bboxes: torch.Tensor) -> Tuple[dict, np.array]: """Track forward. Args: @@ -212,7 +242,7 @@ def track(self, tracks, bboxes): bboxes (Tensor): Detected bounding boxes. Returns: - (dict[int:dict], Tensor): Updated tracks and bboxes. + (dict[int:dict], ndarray): Updated tracks and bboxes. """ costs = [] for id, track in tracks.items(): diff --git a/mmtrack/models/motion/linear_motion.py b/mmtrack/models/motion/linear_motion.py index 9a2105b63..426eee390 100644 --- a/mmtrack/models/motion/linear_motion.py +++ b/mmtrack/models/motion/linear_motion.py @@ -1,25 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch +from torch import Tensor -from ..builder import MOTION +from mmtrack.registry import TASK_UTILS -@MOTION.register_module() -class LinearMotion(object): +@TASK_UTILS.register_module() +class LinearMotion: """Linear motion while tracking. Args: num_samples (int, optional): Number of samples to calculate the - velocity. Default to 2. + velocity. Defaults to 2. center_motion (bool, optional): Whether use center location or - bounding box location to estimate the velocity. Default to False. + bounding box location to estimate the velocity. Defaults to False. 
""" - def __init__(self, num_samples=2, center_motion=False): + def __init__(self, num_samples: int = 2, center_motion: bool = False): self.num_samples = num_samples self.center_motion = center_motion - def center(self, bbox): + def center(self, bbox: Tensor) -> Tensor: """Get the center of the box.""" if bbox.ndim == 2: assert bbox.shape[0] == 1 @@ -27,7 +28,7 @@ def center(self, bbox): x1, y1, x2, y2 = bbox return torch.Tensor([(x2 + x1) / 2, (y2 + y1) / 2]).to(bbox.device) - def get_velocity(self, bboxes, num_samples=None): + def get_velocity(self, bboxes: Tensor, num_samples: int = None) -> Tensor: """Get velocities of the input objects.""" if num_samples is None: num_samples = min(len(bboxes), self.num_samples) @@ -41,7 +42,7 @@ def get_velocity(self, bboxes, num_samples=None): vs.append(v) return torch.stack(vs, dim=0).mean(dim=0) - def step(self, bboxes, velocity=None): + def step(self, bboxes: Tensor, velocity: Tensor = None) -> Tensor: """Step forward with the velocity.""" assert isinstance(bboxes, list) if velocity is None: @@ -62,7 +63,7 @@ def step(self, bboxes, velocity=None): bbox += velocity return bbox - def track(self, tracks, frame_id): + def track(self, tracks: dict, frame_id: int) -> dict: """Tracking forward.""" for k, v in tracks.items(): if int(v.frame_ids[-1]) == frame_id - 1: diff --git a/mmtrack/models/reid/base_reid.py b/mmtrack/models/reid/base_reid.py index f180631c2..1843a7d3c 100644 --- a/mmtrack/models/reid/base_reid.py +++ b/mmtrack/models/reid/base_reid.py @@ -1,37 +1,52 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcls.models import ImageClassifier -from mmcv.runner import auto_fp16 +from typing import List, Optional -from ..builder import REID +import torch +from mmcls.models.classifiers import ImageClassifier +from mmtrack.registry import MODELS +from mmtrack.structures import ReIDDataSample -@REID.register_module() + +@MODELS.register_module() class BaseReID(ImageClassifier): - """Base class for re-identification.""" - - def forward_train(self, img, gt_label, **kwargs): - """"Training forward function.""" - if img.ndim == 5: - # change the shape of image tensor from NxSxCxHxW to NSxCxHxW - # where S is the number of samples by triplet sampling - img = img.view(-1, *img.shape[2:]) - # change the shape of label tensor from NxS to NS - gt_label = gt_label.view(-1) - x = self.extract_feat(img) - head_outputs = self.head.forward_train(x[0]) - - losses = dict() - reid_loss = self.head.loss(gt_label, *head_outputs) - losses.update(reid_loss) - return losses - - @auto_fp16(apply_to=('img', ), out_fp32=True) - def simple_test(self, img, **kwargs): - """Test without augmentation.""" - if img.nelement() > 0: - x = self.extract_feat(img) - head_outputs = self.head.forward_train(x[0]) - feats = head_outputs[0] - return feats - else: - return img.new_zeros(0, self.head.out_channels) + """Base model for re-identification.""" + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[List[ReIDDataSample]] = None, + mode: str = 'tensor'): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`ReIDDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. 
+ + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, H, W) or (N, T, C, H, W). + data_samples (List[ReIDDataSample], optional): The annotation + data of every sample. It's required if ``mode="loss"``. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of + :obj:`ReIDDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if len(inputs.size()) == 5: + assert inputs.size(0) == 1 + inputs = inputs[0] + return super().forward(inputs, data_samples, mode) diff --git a/mmtrack/models/reid/fc_module.py b/mmtrack/models/reid/fc_module.py index cbc4eecf8..74bd69a02 100644 --- a/mmtrack/models/reid/fc_module.py +++ b/mmtrack/models/reid/fc_module.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer -from mmcv.runner import BaseModule +from mmengine.model import BaseModule +from mmtrack.registry import MODELS + +@MODELS.register_module() class FcModule(BaseModule): """Fully-connected layer module. @@ -15,16 +18,17 @@ class FcModule(BaseModule): act_cfg (dict, optional): Configuration of activation method after fc. Defaults to dict(type='ReLU'). inplace (bool, optional): Whether inplace the activatation module. - init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to True. + init_cfg (dict, optional): Initialization config dict. Defaults to dict(type='Kaiming', layer='Linear'). """ def __init__(self, - in_channels, - out_channels, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - inplace=True, + in_channels: int, + out_channels: int, + norm_cfg: dict = None, + act_cfg: dict = dict(type='ReLU'), + inplace: bool = True, init_cfg=dict(type='Kaiming', layer='Linear')): super(FcModule, self).__init__(init_cfg) assert norm_cfg is None or isinstance(norm_cfg, dict) diff --git a/mmtrack/models/reid/gap.py b/mmtrack/models/reid/gap.py index 1a514a9ea..23ab54984 100644 --- a/mmtrack/models/reid/gap.py +++ b/mmtrack/models/reid/gap.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch import torch.nn as nn -from mmcls.models.builder import NECKS -from mmcls.models.necks import GlobalAveragePooling as _GlobalAveragePooling +from mmengine.model import BaseModule +from mmtrack.registry import MODELS -@NECKS.register_module(force=True) -class GlobalAveragePooling(_GlobalAveragePooling): + +@MODELS.register_module() +class GlobalAveragePooling(BaseModule): """Global Average Pooling neck. Note that we use `view` to remove extra channel after pooling. 
We do not @@ -19,3 +21,20 @@ def __init__(self, kernel_size=None, stride=None): self.gap = nn.AdaptiveAvgPool2d((1, 1)) else: self.gap = nn.AvgPool2d(kernel_size, stride) + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple([ + out.view(x.size(0), + torch.tensor(out.size()[1:]).prod()) + for out, x in zip(outs, inputs) + ]) + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view( + inputs.size(0), + torch.tensor(outs.size()[1:]).prod()) + else: + raise TypeError('neck inputs should be tuple or torch.tensor') + return outs diff --git a/mmtrack/models/reid/linear_reid_head.py b/mmtrack/models/reid/linear_reid_head.py index 5acdea9ca..9977a8843 100644 --- a/mmtrack/models/reid/linear_reid_head.py +++ b/mmtrack/models/reid/linear_reid_head.py @@ -1,18 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import List, Optional, Tuple, Union +import torch import torch.nn as nn -from mmcls.models.builder import HEADS -from mmcls.models.heads.base_head import BaseHead -from mmcls.models.losses import Accuracy -from mmcv.runner import auto_fp16, force_fp32 -from mmdet.models.builder import build_loss +from mmcls.evaluation.metrics import Accuracy +from mmengine.model import BaseModule +from mmtrack.registry import MODELS +from mmtrack.structures import ReIDDataSample from .fc_module import FcModule -@HEADS.register_module() -class LinearReIDHead(BaseHead): +@MODELS.register_module() +class LinearReIDHead(BaseModule): """Linear head for re-identification. Args: @@ -25,30 +26,31 @@ class LinearReIDHead(BaseHead): act_cfg (dict, optional): Configuration of activation method after fc. Defaults to None. num_classes (int, optional): Number of the identities. Default to None. - loss (dict, optional): Cross entropy loss to train the - re-identificaiton module. - loss_pairwise (dict, optional): Triplet loss to train the - re-identificaiton module. - topk (int, optional): Calculate topk accuracy. Default to False. + loss_cls (dict, optional): Cross entropy loss to train the ReID module. + Defaults to None. + loss_triplet (dict, optional): Triplet loss to train the ReID module. + Defaults to None. + topk (int | Tuple[int]): Top-k accuracy. Defaults to ``(1, )``. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to dict(type='Normal',layer='Linear', mean=0, std=0.01, bias=0). 
""" def __init__(self, - num_fcs, - in_channels, - fc_channels, - out_channels, - norm_cfg=None, - act_cfg=None, - num_classes=None, - loss=None, - loss_pairwise=None, - topk=(1, ), - init_cfg=dict( + num_fcs: int, + in_channels: int, + fc_channels: int, + out_channels: int, + norm_cfg: Optional[dict] = None, + act_cfg: Optional[dict] = None, + num_classes: Optional[int] = None, + loss_cls: Optional[dict] = None, + loss_triplet: Optional[dict] = None, + topk: Union[int, Tuple[int]] = (1, ), + init_cfg: Union[dict, List[dict]] = dict( type='Normal', layer='Linear', mean=0, std=0.01, bias=0)): - super(LinearReIDHead, self).__init__(init_cfg) + super(LinearReIDHead, self).__init__(init_cfg=init_cfg) + assert isinstance(topk, (int, tuple)) if isinstance(topk, int): topk = (topk, ) @@ -56,19 +58,19 @@ def __init__(self, assert _topk > 0, 'Top-k should be larger than 0' self.topk = topk - if not loss: + if loss_cls is None: if isinstance(num_classes, int): warnings.warn('Since cross entropy is not set, ' 'the num_classes will be ignored.') - if not loss_pairwise: + if loss_triplet is None: raise ValueError('Please choose at least one loss in ' 'triplet loss and cross entropy loss.') elif not isinstance(num_classes, int): raise TypeError('The num_classes must be a current number, ' 'if there is cross entropy loss.') - self.loss_cls = build_loss(loss) if loss else None - self.loss_triplet = build_loss( - loss_pairwise) if loss_pairwise else None + self.loss_cls = MODELS.build(loss_cls) if loss_cls else None + self.loss_triplet = MODELS.build(loss_triplet) \ + if loss_triplet else None self.num_fcs = num_fcs self.in_channels = in_channels @@ -77,8 +79,6 @@ def __init__(self, self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.num_classes = num_classes - self.accuracy = Accuracy(topk=self.topk) - self.fp16_enabled = False self._init_layers() @@ -97,35 +97,95 @@ def _init_layers(self): self.bn = nn.BatchNorm1d(self.out_channels) self.classifier = nn.Linear(self.out_channels, self.num_classes) - @auto_fp16() - def forward_train(self, x): - """Model forward.""" + def forward(self, feats: Tuple[torch.Tensor]) -> torch.Tensor: + """The forward process.""" + # Multiple stage inputs are acceptable + # but only the last stage will be used. + feats = feats[-1] + for m in self.fcs: - x = m(x) - feats = self.fc_out(x) - if self.loss_cls: - feats_bn = self.bn(feats) - cls_score = self.classifier(feats_bn) - return (feats, cls_score) - return (feats, ) + feats = m(feats) + feats = self.fc_out(feats) + return feats + + def loss(self, feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample]) -> dict: + """Calculate losses. + + Args: + feats (tuple[Tensor]): The features extracted from the backbone. + data_samples (List[ReIDDataSample]): The annotation data of + every samples. 
- @force_fp32(apply_to=('feats', 'cls_score')) - def loss(self, gt_label, feats, cls_score=None): - """Compute losses.""" + Returns: + dict: a dictionary of loss components + """ + # The part can be traced by torch.fx + feats = self(feats) + + # The part can not be traced by torch.fx + losses = self.loss_by_feat(feats, data_samples) + return losses + + def loss_by_feat(self, feats: torch.Tensor, + data_samples: List[ReIDDataSample]) -> dict: + """Unpack data samples and compute loss.""" losses = dict() + gt_label = torch.cat([i.gt_label.label for i in data_samples]) if self.loss_triplet: losses['triplet_loss'] = self.loss_triplet(feats, gt_label) if self.loss_cls: - assert cls_score is not None + feats_bn = self.bn(feats) + cls_score = self.classifier(feats_bn) losses['ce_loss'] = self.loss_cls(cls_score, gt_label) - # compute accuracy - acc = self.accuracy(cls_score, gt_label) - assert len(acc) == len(self.topk) - losses['accuracy'] = { - f'top-{k}': a - for k, a in zip(self.topk, acc) - } + acc = Accuracy.calculate(cls_score, gt_label, topk=self.topk) + losses.update( + {f'accuracy_top-{k}': a + for k, a in zip(self.topk, acc)}) return losses + + def predict( + self, + feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Inference without augmentation. + + Args: + feats (Tuple[Tensor]): The features extracted from the backbone. + Multiple stage inputs are acceptable but only the last stage + will be used. + data_samples (List[ReIDDataSample], optional): The annotation + data of every samples. If not None, set ``pred_label`` of + the input data samples. Defaults to None. + + Returns: + List[ReIDDataSample]: A list of data samples which contains the + predicted results. + """ + # The part can be traced by torch.fx + feats = self(feats) + + # The part can not be traced by torch.fx + data_samples = self.predict_by_feat(feats, data_samples) + + return data_samples + + def predict_by_feat( + self, + feats: torch.Tensor, + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Add prediction features to data samples.""" + if data_samples is not None: + for data_sample, feat in zip(data_samples, feats): + data_sample.pred_feature = feat + else: + data_samples = [] + for feat in feats: + data_sample = ReIDDataSample() + data_sample.pred_feature = feat + data_samples.append(data_sample) + + return data_samples diff --git a/mmtrack/models/reid/my_reid.py b/mmtrack/models/reid/my_reid.py new file mode 100644 index 000000000..717320cec --- /dev/null +++ b/mmtrack/models/reid/my_reid.py @@ -0,0 +1,41 @@ +from mmengine.model import BaseModel + +from mmtrack.registry import MODELS + +from torchreid.reid.utils import FeatureExtractor + + +@MODELS.register_module() +class MyReID(BaseModel): + + def __init__(self, model_name: str, model_path: str, device: str): + super().__init__() + self.reid: FeatureExtractor = FeatureExtractor( + model_name=model_name, model_path=model_path, device=device) + + @property + def head(self): + + class Head: + out_channels = self.reid.model.feature_dim + + return Head() + + def forward(self, inputs, mode: str = 'tensor'): + # norm reid: + # norm_mean=[0.485, 0.456, 0.406] + # norm_std=[0.229, 0.224, 0.225] + + # norm deepsort: RGB + # mean=[123.675, 116.28, 103.53] + # std = [58.395, 57.12, 57.375] + + import torch + mean = torch.tensor([123.675, 116.28, 103.53], + device=inputs.device).reshape(1, 3, 1, 1) + std = torch.tensor([58.395, 57.12, 57.375], + device=inputs.device).reshape(1, 3, 1, 1) + + if 
(inputs.max() > 1): inputs = (inputs - mean) / std + + return self.reid(inputs) \ No newline at end of file diff --git a/mmtrack/models/reid/pose_reid.py b/mmtrack/models/reid/pose_reid.py new file mode 100644 index 000000000..239e4a444 --- /dev/null +++ b/mmtrack/models/reid/pose_reid.py @@ -0,0 +1,287 @@ +from typing import Optional, List +from mmtrack.registry import MODELS +from mmtrack.structures import ReIDDataSample +from mmengine.model import BaseModel +import torch +from mmengine.dataset import Compose +import numpy as np +from mmpose.datasets.transforms import LoadImage, GetBBoxCenterScale, PackPoseInputs + + +@MODELS.register_module() +class PoseReID(BaseModel): + + def __init__(self, + base_reid: Optional[dict] = None, + pose_model: Optional[dict] = None, + **kwargs): + super().__init__(**kwargs) + + self.base_reid = MODELS.build(base_reid) + self.pose_model = MODELS.build(pose_model) + + self.pose_pipeline = Compose( + [LoadImage(), + GetBBoxCenterScale(padding=1.0), + PackPoseInputs()]) + + self.pose_embbedder = FullBodyPoseEmbedder() + + @property + def head(self): + return self.base_reid.head + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[List[ReIDDataSample]] = None, + mode: str = 'tensor'): + if mode == 'tensor': + reid_results = self.base_reid(inputs, data_samples, mode) + + pose_data = [] + bboxes_ = [] + for input in inputs: + img = input.detach().moveaxis(0, -1).cpu().numpy() + height, width, _ = img.shape + + bboxes = np.array([[0, 0, width, height]], dtype=np.float32) + bboxes_.append(bboxes) + + data = self.pose_pipeline(dict(img=img, bbox=bboxes)) + pds = data['data_samples'] + + pds.gt_instances.bbox_scores = np.ones(1) + pds.set_field((width, height), + 'input_size', + field_type='metainfo') + pds.set_field( + (0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15), + 'flip_indices', + field_type='metainfo') + + pose_data.append(pds) + + pose_results = self.pose_model.predict(inputs, pose_data) + else: + raise NotImplementedError(f'PoseReID does not support mode {mode}') + + bboxes_ = np.concatenate(bboxes_, axis=0) + bboxes_ = torch.from_numpy(bboxes_).to(reid_results.device) + + pose_embedded = self.pose_embbedder(pose_results, + bboxes_).to(reid_results.device) + embedded = torch.cat((reid_results, pose_embedded), dim=1) + return embedded + + +class FullBodyPoseEmbedder(object): + """Converts 3D pose landmarks into 3D embedding.""" + + def __init__(self, torso_size_multiplier=2.5): + # Multiplier to apply to the torso to get minimal body size. + self._torso_size_multiplier = torso_size_multiplier + + # Names of the landmarks as they appear in the prediction. + self._landmark_names = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle', + ] + + def embbed(self, landmarks): + """ + Normalizes pose landmarks and converts to embedding + + Args: + landmarks - NumPy array with 3D landmarks of shape (N, 3). + + Result: + Numpy array with pose embedding of shape (M, 3) where `M` is the number of + pairwise distances defined in `_get_pose_distance_embedding`. + """ + assert landmarks.shape[0] == len( + self._landmark_names), 'Unexpected number of landmarks: {}'.format( + landmarks.shape[0]) + + # Get pose landmarks. + landmarks = np.copy(landmarks) + + # Normalize landmarks. 
+ landmarks = self._normalize_pose_landmarks(landmarks) + + # Get embedding. + embedding = self._get_pose_distance_embedding(landmarks) + + embedding = (embedding + 1) / 2 + + return embedding.reshape(-1) + + def __call__(self, pose_results, bboxes): + pose_embeddings = [] + for k in range(len(pose_results)): + w1, h1, w2, h2 = bboxes[k] + + landmarks = pose_results[k].pred_instances.keypoints.reshape(-1, 2) + for i in range(landmarks.shape[0]): + w, h = landmarks[i] + landmarks[i][0] = (w - w1) / (w2 - w1) + landmarks[i][1] = (h - h1) / (h2 - h1) + pose_embeddings.append(self.embbed(landmarks)) + + pose_embeddings = torch.from_numpy(np.stack(pose_embeddings, axis=0)) + + return pose_embeddings + + def _normalize_pose_landmarks(self, landmarks): + """Normalizes landmarks translation and scale.""" + landmarks = np.copy(landmarks) + + # Normalize translation. + pose_center = self._get_pose_center(landmarks) + landmarks -= pose_center + + # Normalize scale. + pose_size = self._get_pose_size(landmarks, self._torso_size_multiplier) + landmarks /= pose_size + # Multiplication by 100 is not required, but makes it easier to debug. + # landmarks *= 100 + + return landmarks + + def _get_pose_center(self, landmarks): + """Calculates pose center as point between hips.""" + left_hip = landmarks[self._landmark_names.index('left_hip')] + right_hip = landmarks[self._landmark_names.index('right_hip')] + center = (left_hip + right_hip) * 0.5 + return center + + def _get_pose_size(self, landmarks, torso_size_multiplier): + """Calculates pose size. + + It is the maximum of two values: + * Torso size multiplied by `torso_size_multiplier` + * Maximum distance from pose center to any pose landmark + """ + # This approach uses only 2D landmarks to compute pose size. + landmarks = landmarks[:, :2] + + # Hips center. + left_hip = landmarks[self._landmark_names.index('left_hip')] + right_hip = landmarks[self._landmark_names.index('right_hip')] + hips = (left_hip + right_hip) * 0.5 + + # Shoulders center. + left_shoulder = landmarks[self._landmark_names.index('left_shoulder')] + right_shoulder = landmarks[self._landmark_names.index( + 'right_shoulder')] + shoulders = (left_shoulder + right_shoulder) * 0.5 + + # Torso size as the minimum body size. + torso_size = np.linalg.norm(shoulders - hips) + + # Max dist to pose center. + pose_center = self._get_pose_center(landmarks) + max_dist = np.max(np.linalg.norm(landmarks - pose_center, axis=1)) + + return max(torso_size * torso_size_multiplier, max_dist) + + def _get_pose_distance_embedding(self, landmarks): + """Converts pose landmarks into 3D embedding. + + We use several pairwise 3D distances to form pose embedding. All distances + include X and Y components with sign. We use different types of pairs to cover + different pose classes. Feel free to remove some or add new. + + Args: + landmarks - NumPy array with 3D landmarks of shape (N, 3). + + Result: + Numpy array with pose embedding of shape (M, 3) where `M` is the number of + pairwise distances. + """ + embedding = np.array([ + # One joint.
+ self._get_distance( + self._get_average_by_names(landmarks, 'left_hip', 'right_hip'), + self._get_average_by_names(landmarks, 'left_shoulder', + 'right_shoulder')), + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_elbow'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_elbow'), + self._get_distance_by_names(landmarks, 'left_elbow', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_elbow', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_knee'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_knee'), + self._get_distance_by_names(landmarks, 'left_knee', 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_knee', + 'right_ankle'), + + # Two joints. + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_ankle'), + + # Four joints. + self._get_distance_by_names(landmarks, 'left_hip', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_wrist'), + + # Five joints. + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_ankle'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_wrist'), + + # Cross body. + self._get_distance_by_names(landmarks, 'left_elbow', + 'right_elbow'), + self._get_distance_by_names(landmarks, 'left_knee', 'right_knee'), + self._get_distance_by_names(landmarks, 'left_wrist', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_ankle', + 'right_ankle'), + + # Body bent direction. + + # self._get_distance( + # self._get_average_by_names(landmarks, 'left_wrist', 'left_ankle'), + # landmarks[self._landmark_names.index('left_hip')]), + # self._get_distance( + # self._get_average_by_names(landmarks, 'right_wrist', 'right_ankle'), + # landmarks[self._landmark_names.index('right_hip')]), + ]) + + return embedding + + def _get_average_by_names(self, landmarks, name_from, name_to): + lmk_from = landmarks[self._landmark_names.index(name_from)] + lmk_to = landmarks[self._landmark_names.index(name_to)] + return (lmk_from + lmk_to) * 0.5 + + def _get_distance_by_names(self, landmarks, name_from, name_to): + lmk_from = landmarks[self._landmark_names.index(name_from)] + lmk_to = landmarks[self._landmark_names.index(name_to)] + return self._get_distance(lmk_from, lmk_to) + + def _get_distance(self, lmk_from, lmk_to): + return lmk_to - lmk_from diff --git a/mmtrack/models/roi_heads/__init__.py b/mmtrack/models/roi_heads/__init__.py index ed60661d7..01510271c 100644 --- a/mmtrack/models/roi_heads/__init__.py +++ b/mmtrack/models/roi_heads/__init__.py @@ -1,8 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
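# --- Illustrative sketch (not part of the diff): running the
# FullBodyPoseEmbedder defined above in mmtrack/models/reid/pose_reid.py on
# dummy keypoints. Importing pose_reid assumes mmpose is installed, since the
# module pulls in mmpose transforms at import time. Keypoints are assumed to
# be 17 COCO-ordered (x, y) pairs already normalised to the person crop.
import numpy as np

from mmtrack.models.reid.pose_reid import FullBodyPoseEmbedder

rng = np.random.default_rng(0)
landmarks = rng.random((17, 2)).astype(np.float32)  # one person, 17 (x, y) keypoints

embedder = FullBodyPoseEmbedder(torso_size_multiplier=2.5)
embedding = embedder.embbed(landmarks)
# One (dx, dy) offset per keypoint pair listed above, passed through
# (embedding + 1) / 2 and flattened: 23 pairs -> a 46-dim vector.
print(embedding.shape)  # (46,)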
from .bbox_heads import SelsaBBoxHead -from .roi_extractors import SingleRoIExtractor, TemporalRoIAlign +from .roi_extractors import SingleRoIExtractor from .selsa_roi_head import SelsaRoIHead -__all__ = [ - 'SelsaRoIHead', 'SelsaBBoxHead', 'TemporalRoIAlign', 'SingleRoIExtractor' -] +__all__ = ['SelsaRoIHead', 'SelsaBBoxHead', 'SingleRoIExtractor'] diff --git a/mmtrack/models/roi_heads/bbox_heads/selsa_bbox_head.py b/mmtrack/models/roi_heads/bbox_heads/selsa_bbox_head.py index 2eccbe1fe..1ba4249fa 100644 --- a/mmtrack/models/roi_heads/bbox_heads/selsa_bbox_head.py +++ b/mmtrack/models/roi_heads/bbox_heads/selsa_bbox_head.py @@ -1,11 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch.nn as nn -from mmdet.models import HEADS, ConvFCBBoxHead +from mmdet.models import ConvFCBBoxHead +from torch import Tensor -from mmtrack.models import build_aggregator +from mmtrack.registry import MODELS +from mmtrack.utils import ConfigType -@HEADS.register_module() +@MODELS.register_module() class SelsaBBoxHead(ConvFCBBoxHead): """Selsa bbox head. @@ -13,17 +17,17 @@ class SelsaBBoxHead(ConvFCBBoxHead): Object Detection". `SELSA `_. Args: - aggregator (dict): Configuration of aggregator. + aggregator (ConfigType): Configuration of aggregator. """ - def __init__(self, aggregator, *args, **kwargs): + def __init__(self, aggregator: ConfigType, *args, **kwargs): super(SelsaBBoxHead, self).__init__(*args, **kwargs) self.aggregator = nn.ModuleList() for i in range(self.num_shared_fcs): - self.aggregator.append(build_aggregator(aggregator)) + self.aggregator.append(MODELS.build(aggregator)) self.inplace_false_relu = nn.ReLU(inplace=False) - def forward(self, x, ref_x): + def forward(self, x: Tensor, ref_x: Tensor) -> Tuple: """Computing the `cls_score` and `bbox_pred` of the features `x` of key frame proposals. @@ -34,8 +38,14 @@ def forward(self, x, ref_x): frame proposals. Returns: - tuple(cls_score, bbox_pred): The predicted score of classes and - the predicted regression offsets. + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all + scale levels, each is a 4D-tensor, the channels number + is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all + scale levels, each is a 4D-tensor, the channels number + is num_base_priors * 4. """ # shared part if self.num_shared_convs > 0: diff --git a/mmtrack/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmtrack/models/roi_heads/roi_extractors/single_level_roi_extractor.py index c239f940b..f1954c3dc 100644 --- a/mmtrack/models/roi_heads/roi_extractors/single_level_roi_extractor.py +++ b/mmtrack/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -1,11 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcv.runner import force_fp32 -from mmdet.models.builder import ROI_EXTRACTORS +from typing import Tuple + from mmdet.models.roi_heads.roi_extractors import \ SingleRoIExtractor as _SingleRoIExtractor +from torch import Tensor + +from mmtrack.registry import MODELS -@ROI_EXTRACTORS.register_module(force=True) +@MODELS.register_module() class SingleRoIExtractor(_SingleRoIExtractor): """Extract RoI features from a single level feature map. @@ -14,7 +17,19 @@ class SingleRoIExtractor(_SingleRoIExtractor): accept external arguments. 
""" - @force_fp32(apply_to=('feats', ), out_fp16=True) - def forward(self, feats, rois, roi_scale_factor=None, **kwargs): - """Forward function.""" + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: float = None, + **kwargs) -> Tensor: + """Forward function. + Args: + feats (Tuple[Tensor]): The feature maps. + rois (Tensor): The RoIs. + roi_scale_factor (float): Scale factor that RoI will be multiplied + by. + + Returns: + Tensor: RoI features. + """ return super().forward(feats, rois, roi_scale_factor) diff --git a/mmtrack/models/roi_heads/roi_extractors/temporal_roi_align.py b/mmtrack/models/roi_heads/roi_extractors/temporal_roi_align.py index 444c775bd..8bc0674d6 100644 --- a/mmtrack/models/roi_heads/roi_extractors/temporal_roi_align.py +++ b/mmtrack/models/roi_heads/roi_extractors/temporal_roi_align.py @@ -1,12 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + import torch from mmcv.cnn import ConvModule -from mmcv.runner import force_fp32 -from mmdet.models.builder import ROI_EXTRACTORS from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor +from torch import Tensor + +from mmtrack.registry import MODELS -@ROI_EXTRACTORS.register_module() +@MODELS.register_module() class TemporalRoIAlign(SingleRoIExtractor): """Temporal RoI Align module. @@ -25,8 +28,8 @@ class TemporalRoIAlign(SingleRoIExtractor): """ def __init__(self, - num_most_similar_points=2, - num_temporal_attention_blocks=4, + num_most_similar_points: int = 2, + num_temporal_attention_blocks: int = 4, *args, **kwargs): super(TemporalRoIAlign, self).__init__(*args, **kwargs) @@ -42,7 +45,8 @@ def __init__(self, norm_cfg=None, act_cfg=None) - def temporal_attentional_feature_aggregation(self, x, ref_x): + def temporal_attentional_feature_aggregation(self, x: Tensor, + ref_x: Tensor) -> Tensor: """Aggregate the RoI features `x` with the Most Similar RoI features `ref_x`. @@ -97,7 +101,8 @@ def temporal_attentional_feature_aggregation(self, x, ref_x): x = (x * ada_weights).sum(dim=0) return x - def most_similar_roi_align(self, roi_feats, ref_feats): + def most_similar_roi_align(self, roi_feats: Tensor, + ref_feats: Tensor) -> Tensor: """Extract the Most Similar RoI features from reference feature maps `ref_feats` based on RoI features `roi_feats`. @@ -181,9 +186,24 @@ def most_similar_roi_align(self, roi_feats, ref_feats): ref_roi_feats = ref_roi_feats.permute(0, 1, 4, 2, 3) return ref_roi_feats - @force_fp32(apply_to=('feats', 'ref_feats'), out_fp16=True) - def forward(self, feats, rois, roi_scale_factor=None, ref_feats=None): - """Forward function.""" + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None, + ref_feats: Optional[Tuple[Tensor]] = None, + **kwargs) -> Tensor: + """Forward function. + Args: + feats (Tuple[Tensor]): The feature maps. + rois (Tensor): The RoIs. + roi_scale_factor (Optional[float], optional): Scale factor that RoI + will be multiplied by. Defaults to None. + ref_feats (Tuple[Tensor], optional): The feature maps of ref_img. + Defaults to None. + + Returns: + Tensor: RoI features. + """ roi_feats = super().forward(feats, rois, roi_scale_factor) if ref_feats is None: diff --git a/mmtrack/models/roi_heads/selsa_roi_head.py b/mmtrack/models/roi_heads/selsa_roi_head.py index ddf55cb0c..1a183f3a7 100644 --- a/mmtrack/models/roi_heads/selsa_roi_head.py +++ b/mmtrack/models/roi_heads/selsa_roi_head.py @@ -1,84 +1,100 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmdet.core import bbox2result, bbox2roi -from mmdet.models import HEADS, StandardRoIHead +from typing import List, Tuple +from mmdet.models import StandardRoIHead +from mmdet.structures.bbox import bbox2roi +from torch import Tensor -@HEADS.register_module() +from mmtrack.registry import MODELS +from mmtrack.utils import ConfigType, InstanceList, SampleList + + +@MODELS.register_module() class SelsaRoIHead(StandardRoIHead): """selsa roi head.""" - def forward_train(self, - x, - ref_x, - img_metas, - proposal_list, - ref_proposal_list, - gt_bboxes, - gt_labels, - gt_bboxes_ignore=None, - gt_masks=None): + def loss(self, x: Tuple[Tensor], ref_x: Tuple[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, + data_samples: SampleList) -> dict: """ Args: - x (list[Tensor]): list of multi-level img features. - ref_x (list[Tensor]): list of multi-level ref_img features. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmdet/datasets/pipelines/formatting.py:Collect`. - proposal_list (list[Tensors]): list of region proposals. - ref_proposal_list (list[Tensors]): list of region proposals - from ref_imgs. - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): class indices corresponding to each box - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - gt_masks (None | Tensor) : true segmentation masks for each box - used if the architecture supports a segmentation task. + x (Tuple[Tensor]): list of multi-level img features. + ref_x (Tuple[Tensor]): list of multi-level ref_img features. + rpn_results_list (InstanceList): list of region proposals. + ref_rpn_results_list (InstanceList): list of region proposals + from reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` and 'metainfo'. Returns: - dict[str, Tensor]: a dictionary of loss components + dict[str, Tensor]: a dictionary of loss components. 
""" + assert len(rpn_results_list) == len(data_samples) + batch_gt_instances = [] + batch_gt_instances_ignore = [] + for data_sample in data_samples: + batch_gt_instances.append(data_sample.gt_instances) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + # assign gts and sample proposals if self.with_bbox or self.with_mask: - num_imgs = len(img_metas) - if gt_bboxes_ignore is None: - gt_bboxes_ignore = [None for _ in range(num_imgs)] + num_imgs = len(data_samples) sampling_results = [] for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') assign_result = self.bbox_assigner.assign( - proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], - gt_labels[i]) + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) sampling_result = self.bbox_sampler.sample( assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], + rpn_results, + batch_gt_instances[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) losses = dict() # bbox head forward and loss if self.with_bbox: - bbox_results = self._bbox_forward_train(x, ref_x, sampling_results, - ref_proposal_list, - gt_bboxes, gt_labels) + bbox_results = self.bbox_loss(x, ref_x, sampling_results, + ref_rpn_results_list) losses.update(bbox_results['loss_bbox']) # mask head forward and loss if self.with_mask: - mask_results = self._mask_forward_train(x, sampling_results, - bbox_results['bbox_feats'], - gt_masks, img_metas) - # TODO: Support empty tensor input. #2280 - if mask_results['loss_mask'] is not None: - losses.update(mask_results['loss_mask']) + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) return losses - def _bbox_forward(self, x, ref_x, rois, ref_rois): - """Box head forward function used in both training and testing.""" + def _bbox_forward(self, x: Tuple[Tensor], ref_x: Tuple[Tensor], + rois: Tensor, ref_rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (Tuple[Tensor]): List of multi-level img features. + ref_x (Tuple[Tensor]): List of multi-level reference img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + ref_rois (Tensor): Reference RoIs with the shape (n, 5) where the + first column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], @@ -96,73 +112,135 @@ def _bbox_forward(self, x, ref_x, rois, ref_rois): cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) return bbox_results - def _bbox_forward_train(self, x, ref_x, sampling_results, - ref_proposal_list, gt_bboxes, gt_labels): - """Run forward function and calculate loss for box head in training.""" + def bbox_loss(self, x: Tuple[Tensor], ref_x: Tuple[Tensor], + sampling_results: InstanceList, + ref_rpn_results_list: InstanceList): + """Run forward function and calculate loss for box head in training. 
+ + Args: + x (Tuple[Tensor]): list of multi-level img features. + ref_x (Tuple[Tensor]): list of multi-level ref_img features. + sampling_results (InstanceList): Sampleing results. + ref_rpn_results_list (InstanceList): list of region proposals + from reference images. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ rois = bbox2roi([res.bboxes for res in sampling_results]) - ref_rois = bbox2roi(ref_proposal_list) + ref_rois = bbox2roi([res.bboxes for res in ref_rpn_results_list]) bbox_results = self._bbox_forward(x, ref_x, rois, ref_rois) - bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, - gt_labels, self.train_cfg) - loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], - bbox_results['bbox_pred'], rois, - *bbox_targets) + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) - bbox_results.update(loss_bbox=loss_bbox) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) return bbox_results - def simple_test(self, - x, - ref_x, - proposals_list, - ref_proposals_list, - img_metas, - proposals=None, - rescale=False): - """Test without augmentation.""" + def predict(self, + x: Tuple[Tensor], + ref_x: Tuple[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, + data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (Tuple[Tensor]): All scale level feature maps of images. + ref_x (Tuple[Tensor]): All scale level feature maps of reference + mages. + rpn_results_list (InstanceList): list of region proposals. + ref_rpn_results_list (InstanceList): list of region + proposals from reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` and 'metainfo'. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ assert self.with_bbox, 'Bbox head must be implemented.' 
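# Illustrative sketch (not part of this patch): ``bbox2roi`` packs the
# per-image proposals used by ``bbox_loss``/``_bbox_forward`` into one
# (n, 5) tensor whose first column is the batch index of each RoI.
# The proposal tensors below are made-up values.
import torch
from mmdet.structures.bbox import bbox2roi

proposals_img0 = torch.tensor([[10., 20., 50., 60.], [30., 40., 70., 80.]])
proposals_img1 = torch.tensor([[15., 25., 55., 65.]])
rois = bbox2roi([proposals_img0, proposals_img1])
# rois.shape == (3, 5); rois[:, 0] is tensor([0., 0., 1.])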
- det_bboxes, det_labels = self.simple_test_bboxes( + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + + results_list = self.predict_bbox( x, ref_x, - proposals_list, - ref_proposals_list, - img_metas, + rpn_results_list, + ref_rpn_results_list, + batch_img_metas, self.test_cfg, rescale=rescale) - bbox_results = [ - bbox2result(det_bboxes[i], det_labels[i], - self.bbox_head.num_classes) - for i in range(len(det_bboxes)) - ] - if not self.with_mask: - return bbox_results - else: - mask_results = self.simple_test_mask( - x, img_metas, det_bboxes, det_labels, rescale=rescale) - return list(zip(bbox_results, mask_results)) - - def simple_test_bboxes(self, - x, - ref_x, - proposals, - ref_proposals, - img_metas, - rcnn_test_cfg, - rescale=False): - """Test only det bboxes without augmentation.""" - rois = bbox2roi(proposals) - ref_rois = bbox2roi(ref_proposals) + if self.with_mask: + results_list = self.predict_mask( + x, batch_img_metas, results_list, rescale=rescale) + + return results_list + + def predict_bbox(self, + x: Tuple[Tensor], + ref_x: Tuple[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, + batch_img_metas: List[dict], + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (Tuple[Tensor]): All scale level feature maps of images. + ref_x (Tuple[Tensor]): All scale level feature maps of reference + mages. + rpn_results_list (InstanceList): List of region proposals. + ref_rpn_results_list (InstanceList): List of region + proposals from reference images. + batch_img_metas (List[dict]): _List of image information. + rcnn_test_cfg (ConfigType): `test_cfg` of R-CNN. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
+ """ + + rois = bbox2roi([res.bboxes for res in rpn_results_list]) + ref_rois = bbox2roi([res.bboxes for res in ref_rpn_results_list]) bbox_results = self._bbox_forward(x, ref_x, rois, ref_rois) - img_shapes = tuple(meta['img_shape'] for meta in img_metas) - scale_factors = tuple(meta['scale_factor'] for meta in img_metas) # split batch bbox prediction back to each image cls_score = bbox_results['cls_score'] bbox_pred = bbox_results['bbox_pred'] - num_proposals_per_img = tuple(len(p) for p in proposals) + num_proposals_per_img = tuple(len(p) for p in rpn_results_list) rois = rois.split(num_proposals_per_img, 0) cls_score = cls_score.split(num_proposals_per_img, 0) # some detector with_reg is False, bbox_pred will be None @@ -171,17 +249,12 @@ def simple_test_bboxes(self, 0) if bbox_pred is not None else [None, None] # apply bbox post-processing to each image individually - det_bboxes = [] - det_labels = [] - for i in range(len(proposals)): - det_bbox, det_label = self.bbox_head.get_bboxes( - rois[i], - cls_score[i], - bbox_pred[i], - img_shapes[i], - scale_factors[i], - rescale=rescale, - cfg=rcnn_test_cfg) - det_bboxes.append(det_bbox) - det_labels.append(det_label) - return det_bboxes, det_labels + result_list = self.bbox_head.predict_by_feat( + rois, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale) + + return result_list diff --git a/mmtrack/models/sot/__init__.py b/mmtrack/models/sot/__init__.py index 6856faf74..92573a52f 100644 --- a/mmtrack/models/sot/__init__.py +++ b/mmtrack/models/sot/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .prdimp import PrDiMP from .siamrpn import SiamRPN from .stark import Stark -__all__ = ['SiamRPN', 'Stark'] +__all__ = ['SiamRPN', 'Stark', 'PrDiMP'] diff --git a/mmtrack/models/sot/base.py b/mmtrack/models/sot/base.py index 10982f13c..07d77bb9a 100644 --- a/mmtrack/models/sot/base.py +++ b/mmtrack/models/sot/base.py @@ -1,29 +1,37 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from collections import OrderedDict +from copy import deepcopy +from typing import Dict, List, Tuple, Union -import mmcv -import numpy as np -import torch -import torch.distributed as dist -from mmcv.runner import BaseModule, auto_fp16 +import addict +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmengine.model import BaseModel +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.utils import get_root_logger +from mmtrack.evaluation import bbox2region +from mmtrack.structures.bbox import calculate_region_overlap, quad2bbox_cxcywh +from mmtrack.utils import (ForwardResults, InstanceList, OptConfigType, + OptMultiConfig, OptSampleList, SampleList) -class BaseSingleObjectTracker(BaseModule, metaclass=ABCMeta): +class BaseSingleObjectTracker(BaseModel, metaclass=ABCMeta): """Base class for single object tracker. Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. init_cfg (dict or list[dict]): Initialization config dict. 
""" - def __init__(self, init_cfg): - super(BaseSingleObjectTracker, self).__init__(init_cfg) - self.logger = get_root_logger() - self.fp16_enabled = False + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) - def freeze_module(self, module): + def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None: """Freeze module during training.""" if isinstance(module, str): modules = [module] @@ -53,244 +61,250 @@ def with_head(self): """bool: whether the framework has a head""" return hasattr(self, 'head') and self.head is not None - @abstractmethod - def forward_train(self, imgs, img_metas, search_img, search_img_metas, - **kwargs): - """ + def forward(self, + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + mode: str = 'predict', + **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`TrackDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - search_img (Tensor): of shape (N, 1, C, H, W) encoding input search - images. 1 denotes there is only one search image for each - exemplar image. Typically H and W equal to 255. - - search_img_metas (list[list[dict]]): The second list only has one - element. The first list contains search image information dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to 'predict'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`TrackDataSample`. + - If ``mode="loss"``, return a dict of tensor. 
""" - pass + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'tensor': + return self._forward(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') @abstractmethod - def simple_test(self, img, img_metas, **kwargs): + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" pass - def aug_test(self, imgs, img_metas, **kwargs): - """Test function with test time augmentation.""" - pass + def predict(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. - def forward_test(self, imgs, img_metas, **kwargs): - """ Args: - imgs (List[Tensor]): the outer list indicates test-time - augmentations and inner Tensor should have a shape NxCxHxW, - which contains all images in the batch. - img_metas (List[List[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch. + inputs (dict[Tensor]): of shape (N, T, C, H, W) + encodingbinput images. Typically these should be mean centered + and stdbscaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + In test mode, T = 1 and there is only ``img`` and no + ``ref_img``. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and ``metainfo``. + + Returns: + list[obj:`TrackDataSample`]: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. """ - if isinstance(imgs, torch.Tensor): - imgs = [imgs] - elif not isinstance(imgs, list): - raise TypeError( - f'imgs must be a list or tensor, but got {type(imgs)}') - - assert isinstance(img_metas, list) - if isinstance(img_metas[0], dict): - img_metas = [img_metas] - elif not isinstance(img_metas[0], list): - raise TypeError( - 'img_metas must be a List[List[dict]] or List[dict]') - - num_augs = len(imgs) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(imgs)}) ' - f'!= num of image meta ({len(img_metas)})') - - if num_augs == 1: - # proposals (List[List[Tensor]]): the outer list indicates - # test-time augs (multiscale, flip, etc.) and the inner list - # indicates images in a batch. - # The Tensor should have a shape Px4, where P is the number of - # proposals. 
- if 'proposals' in kwargs: - kwargs['proposals'] = kwargs['proposals'][0] - return self.simple_test(imgs[0], img_metas[0], **kwargs) + test_mode = self.test_cfg.get('test_mode', 'OPE') + assert test_mode in ['OPE', 'VOT'] + if test_mode == 'VOT': + pred_track_instances = self.predict_vot(inputs, data_samples) else: - assert imgs[0].size(0) == 1, 'aug test does not support ' \ - 'inference with batch size ' \ - f'{imgs[0].size(0)}' - # TODO: support test augmentation for predefined proposals - assert 'proposals' not in kwargs - return self.aug_test(imgs, img_metas, **kwargs) - - @auto_fp16(apply_to=('img', 'search_img')) - def forward(self, - img, - img_metas, - search_img=None, - search_img_metas=None, - return_loss=True, - **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. - - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``resturn_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train( - img, - img_metas, - search_img=search_img, - search_img_metas=search_img_metas, - **kwargs) + pred_track_instances = self.predict_ope(inputs, data_samples) + track_data_samples = deepcopy(data_samples) + for _data_sample, _pred_instances in zip(track_data_samples, + pred_track_instances): + _data_sample.pred_track_instances = _pred_instances + return track_data_samples + + def predict_ope(self, inputs: dict, data_samples: SampleList): + + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Only support 1 batch size per gpu in test mode' + img = img[0] + + metainfo = data_samples[0].metainfo + frame_id = metainfo.get('frame_id', -1) + assert frame_id >= 0 + + if frame_id == 0: + # We store the current frame id, previous bbox and template + # information in ``self.memo``. + self.memo = addict.Dict() + self.memo.frame_id = frame_id + gt_bboxes = data_samples[0].gt_instances['bboxes'] + self.memo.bbox = quad2bbox_cxcywh(gt_bboxes) + self.init(img) + results = [InstanceData()] + results[0].bboxes = bbox_cxcywh_to_xyxy(self.memo.bbox)[None] + results[0].scores = gt_bboxes.new_tensor([-1.]) else: - return self.forward_test(img, img_metas, **kwargs) + self.memo.frame_id = frame_id + results = self.track(img, data_samples) + self.memo.bbox = bbox_xyxy_to_cxcywh(results[0].bboxes.squeeze()) + + return results - def _parse_losses(self, losses): - """Parse the raw outputs (losses) of the network. + def predict_vot(self, inputs: dict, + data_samples: SampleList) -> List[InstanceData]: + """Test using VOT test mode. Args: - losses (dict): Raw output of the network, which usually contain - losses and other necessary information. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + In test mode, T = 1 and there is only ``img`` and no + ``ref_img``. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. 
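# Illustrative usage sketch (not part of this patch): with the unified
# ``forward`` entry above, the caller picks a branch through ``mode``.
# ``tracker`` and ``batch`` are hypothetical names for a built model and
# one dataloader output that has already passed the data preprocessor.
losses = tracker(batch['inputs'], batch['data_samples'], mode='loss')
results = tracker(batch['inputs'], batch['data_samples'], mode='predict')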
Returns: - tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ - which may be a weighted sum of all losses, log_vars contains \ - all the variables to be sent to the logger. + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. """ - log_vars = OrderedDict() - for loss_name, loss_value in losses.items(): - if isinstance(loss_value, torch.Tensor): - log_vars[loss_name] = loss_value.mean() - elif isinstance(loss_value, list): - log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Only support 1 batch size per gpu in test mode' + img = img[0] + + gt_bboxes = data_samples[0].gt_instances['bboxes'] + metainfo = data_samples[0].metainfo + frame_id = metainfo.get('frame_id', -1) + assert frame_id >= 0 + + if frame_id == 0: + self.init_frame_id = 0 + if self.init_frame_id == frame_id: + # initialization + # We store the previous bbox、 current frame id and some template + # information in ``self.memo``. + self.memo = addict.Dict() + self.memo.frame_id = frame_id + self.memo.bbox = quad2bbox_cxcywh(gt_bboxes) + self.init(img) + # 1 denotes the initialization state + results = [InstanceData()] + results[0].bboxes = gt_bboxes.new_tensor([[1.]]) + results[0].scores = gt_bboxes.new_tensor([-1.]) + elif self.init_frame_id > frame_id: + # 0 denotes unknown state, namely the skipping frame after failure + self.memo.frame_id = frame_id + results = [InstanceData()] + results[0].bboxes = gt_bboxes.new_tensor([[0.]]) + results[0].scores = gt_bboxes.new_tensor([-1.]) + else: + # normal tracking state + self.memo.frame_id = frame_id + results = self.track(img, data_samples) + self.memo.bbox = bbox_xyxy_to_cxcywh(results[0].bboxes.squeeze()) + + # convert bbox to region for overlap calculation + track_bbox = results[0].bboxes[0].cpu().numpy() + track_region = bbox2region(track_bbox) + gt_region = bbox2region(gt_bboxes[0].cpu().numpy()) + + if 'img_shape' in metainfo: + image_shape = metainfo['img_shape'] + image_wh = (image_shape[1], image_shape[0]) else: - raise TypeError( - f'{loss_name} is not a tensor or list of tensors') - - loss = sum(_value for _key, _value in log_vars.items() - if 'loss' in _key) + image_wh = None + Warning('image shape are need when calculating bbox overlap') + overlap = calculate_region_overlap( + track_region, gt_region, bounds=image_wh) + if overlap <= 0: + # tracking failure + self.init_frame_id = frame_id + 5 + # 2 denotes the failure state + results[0].bboxes = img.new_tensor([[2.]]) + + return results - log_vars['loss'] = loss - for loss_name, loss_value in log_vars.items(): - # reduce loss when distributed training - if dist.is_available() and dist.is_initialized(): - loss_value = loss_value.data.clone() - dist.all_reduce(loss_value.div_(dist.get_world_size())) - log_vars[loss_name] = loss_value.item() - - return loss, log_vars + @abstractmethod + def init(img: Tensor): + """Initialize the single object tracker in the first frame. - def train_step(self, data, optimizer): - """The iteration step during training. + Args: + img (Tensor): of shape (1, C, H, W) encoding original input + image. 
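# Illustrative sketch (not part of this patch): ``InstanceData`` stores
# per-instance fields as attributes, which is how ``predict_ope`` /
# ``predict_vot`` above pack their outputs. Made-up values:
import torch
from mmengine.structures import InstanceData

result = InstanceData()
result.bboxes = torch.tensor([[10., 20., 50., 60.]])  # (1, 4) in x1, y1, x2, y2
result.scores = torch.tensor([-1.])                   # -1. marks the init frame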
+ """ + pass - This method defines an iteration step during training, except for the - back propagation and optimizer updating, which are done in an optimizer - hook. Note that in some complicated cases or models, the whole process - including back propagation and optimizer updating is also defined in - this method, such as GAN. + @abstractmethod + def track(img: Tensor, data_samples: SampleList) -> InstanceList: + """Track the box of previous frame to current frame `img`. Args: - data (dict): The output of dataloader. - optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of - runner is passed to ``train_step()``. This argument is unused - and reserved. + img (Tensor): of shape (1, C, H, W) encoding original input + image. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. Returns: - dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \ - ``num_samples``. - - - ``loss`` is a tensor for back propagation, which can be a \ - weighted sum of multiple losses. - - ``log_vars`` contains all the variables to be sent to the - logger. - - ``num_samples`` indicates the batch size (when the model is \ - DDP, it means the batch size on each GPU), which is used for \ - averaging the logs. + InstanceList: Tracking results of each image after the postprocess. + - scores: a Tensor denoting the score of best_bbox. + - bboxes: a Tensor of shape (4, ) in [x1, x2, y1, y2] + format, and denotes the best tracked bbox in current frame. """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) - - return outputs - - def val_step(self, data, optimizer): - """The iteration step during validation. + pass - This method shares the same signature as :func:`train_step`, but used - during val epochs. Note that the evaluation after training epochs is - not implemented with this method, but an evaluation hook. - """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) - - return outputs - - def show_result(self, - img, - result, - color='green', - thickness=1, - show=False, - win_name='', - wait_time=0, - out_file=None, - **kwargs): - """Visualize tracking results. + def _forward(self, + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + **kwargs): + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. - Args: - img (str or ndarray): The image to be displayed. - result (dict): Tracking result. - The value of key 'track_bboxes' is ndarray with shape (5, ) - in [tl_x, tl_y, br_x, br_y, score] format. - color (str or tuple or Color, optional): color of bbox. - Defaults to green. - thickness (int, optional): Thickness of lines. - Defaults to 1. - show (bool, optional): Whether to show the image. - Defaults to False. - win_name (str, optional): The window name. - Defaults to ''. - wait_time (int, optional): Value of waitKey param. - Defaults to 0. - out_file (str, optional): The filename to write the image. - Defaults to None. + Args: + inputs (Dict[str, Tensor]): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`TrackDataSample`], optional): The + Data Samples. It usually includes information such as + `gt_instance`. Returns: - ndarray: Visualized image. + tuple[list]: A tuple of features from ``head`` forward. 
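# Illustrative sketch (not part of this patch): ``predict_vot`` above
# encodes the VOT protocol state in the width of the returned "bbox":
# [[1.]] on the (re-)initialization frame, [[0.]] on frames skipped after
# a failure, [[2.]] on the failure frame, and a regular x1, y1, x2, y2 box
# otherwise. A hypothetical consumer could decode it as:
def decode_vot_state(bboxes):
    """Map the packed VOT bbox back to a state string (made-up helper)."""
    if bboxes.numel() == 1:
        return {1.0: 'init', 0.0: 'skip', 2.0: 'failure'}[bboxes.item()]
    return 'tracked'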
""" - assert isinstance(result, dict) - track_bboxes = result.get('track_bboxes', None) - assert track_bboxes.ndim == 1 - assert track_bboxes.shape[0] == 5 - - track_bboxes = track_bboxes[:4] - mmcv.imshow_bboxes( - img, - track_bboxes[np.newaxis, :], - colors=color, - thickness=thickness, - show=show, - win_name=win_name, - wait_time=wait_time, - out_file=out_file) - return img + raise NotImplementedError( + "_forward function (namely 'tensor' mode) is not supported now") diff --git a/mmtrack/models/sot/prdimp.py b/mmtrack/models/sot/prdimp.py new file mode 100644 index 000000000..62e40839d --- /dev/null +++ b/mmtrack/models/sot/prdimp.py @@ -0,0 +1,437 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from mmcv.image import imrotate +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmengine.structures import InstanceData +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.utils import (InstanceList, OptConfigType, OptMultiConfig, + SampleList, gauss_blur) +from .base import BaseSingleObjectTracker + + +@MODELS.register_module() +class PrDiMP(BaseSingleObjectTracker): + """PrDiMP: Probabilistic Regression for Visual Tracking. + + This single object tracker is the implementation of `PrDiMP + `_. + + args: + backbone (dict, optional): the configuration of backbone network. + Defaults to None. + cls_head (dict, optional): target classification module. + Defaults to None. + bbox_head (dict, optional): bounding box regression module. + Defaults to None. + init_cfg (dict, optional): the configuration of initialization. + Defaults to None. + test_cfg (dict, optional): the configuration of test. + Defaults to None. + + """ + + def __init__(self, + backbone: dict, + neck: Optional[dict] = None, + cls_head: Optional[dict] = None, + bbox_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(PrDiMP, self).__init__(data_preprocessor, init_cfg) + self.backbone = MODELS.build(backbone) + cls_head.update(test_cfg=test_cfg) + bbox_head.update(test_cfg=test_cfg) + self.classifier = MODELS.build(cls_head) + self.bbox_regressor = MODELS.build(bbox_head) + self.test_cfg = test_cfg + + def init(self, img: Tensor) -> Tuple[Tuple[Tensor, ...], Tensor]: + """Initialize tracker. + + Args: + img (Tensor): Input image of shape (1, C, H, W). + init_bbox (Tensor): of (4, ) shape in [cx, cy, w, h] format. + """ + self.frame_num = 1 + init_bbox = self.memo.bbox + + # Set size for image and cropped sample. img_size is in [w, h] format. 
+ self.img_size = torch.Tensor([img.shape[-1], + img.shape[-2]]).to(init_bbox.device) + sample_size = self.test_cfg['img_sample_size'] + sample_size = torch.Tensor([sample_size, sample_size] if isinstance( + sample_size, int) else sample_size) + self.sample_size = sample_size.to(init_bbox.device) + + # Compute expanded size and output size about augmentation + aug_expansion_factor = self.test_cfg['init_aug_cfg'][ + 'aug_expansion_factor'] + aug_expansion_size = (self.sample_size * aug_expansion_factor).float() + + # Crop image patches and bboxes + img_patch, patch_coord = self.get_cropped_img( + img, init_bbox.round(), + self.test_cfg['search_scale_factor'] * aug_expansion_factor, + aug_expansion_size) + resized_factor = (patch_coord[:, 2:4] / + aug_expansion_size).prod().sqrt() + init_bbox = self.generate_bbox(init_bbox, init_bbox[:2].round(), + resized_factor) + + # Crop patches from the image and perform augmentation on the image + # patches + aug_img_patches, aug_cls_bboxes = self.gen_aug_imgs_bboxes( + img_patch, init_bbox, self.sample_size) + + # `init_backbone_feats` is a tuple containing the features of `layer2` + # and `layer3` + init_backbone_feats = self.backbone(aug_img_patches) + + # Initialize the classifier with bboxes and features of `layer3` + # get the augmented bboxes on the augmented image patches + self.classifier.init_classifier( + init_backbone_feats[-1], aug_cls_bboxes, + self.test_cfg['init_aug_cfg']['augmentation']['dropout']) + + # Initialize IoUNet + # only use the features of the non-augmented image + init_iou_features = [x[:1] for x in init_backbone_feats] + self.bbox_regressor.init_iou_net(init_iou_features, init_bbox) + + def img_shift_crop(self, + img: Tensor, + output_size: Optional[List] = None, + shift: Optional[List] = None) -> Tensor: + """Shift and crop the image. + + Args: + img (Tensor): The image of shape (C, H, W). + output_size (list): in [w, h] format. + shift (list): in [x, y] fotmat. + + Returns: + Tensor: Augmented image. + """ + img_size = [img.shape[-1], img.shape[-2]] + # img_size = img.shape[-2:] + if output_size is None: + pad_h = 0 + pad_w = 0 + else: + pad_w = (output_size[0] - img_size[0]) / 2 + pad_h = (output_size[1] - img_size[1]) / 2 + + if shift is None: + shift = [0, 0] + + pad_left = math.floor(pad_w) + shift[0] + pad_right = math.ceil(pad_w) - shift[0] + pad_top = math.floor(pad_h) + shift[1] + pad_bottom = math.ceil(pad_h) - shift[1] + + return F.pad(img, (pad_left, pad_right, pad_top, pad_bottom), + 'replicate') + + def gen_aug_imgs_bboxes(self, img: Tensor, init_bbox: Tensor, + output_size: Tensor) -> Tensor: + """Perform data augmentation. + + Args: + img (Tensor): Cropped image of shape (1, C, H, W). + init_bbox (Tensor): of (4, ) shape in [cx, cy, w, h] format. + output_size (Tensor): of (2, ) shape in [w, h] format. + + Returns: + Tensor: The cropped augmented image patches. 
+ """ + output_size = output_size.long().cpu().tolist() + random_shift_factor = self.test_cfg['init_aug_cfg'][ + 'random_shift_factor'] + + def get_rand_shift(): + return ((torch.rand(2) - 0.5) * self.sample_size.cpu() * + random_shift_factor).long().tolist() + + augs = self.test_cfg['init_aug_cfg']['augmentation'] + aug_imgs = [self.img_shift_crop(img, output_size)] + aug_bboxes = [init_bbox] + + # All augmentations + if 'relativeshift' in augs: + for shift in augs['relativeshift']: + absulute_shift = (torch.Tensor(shift) * + self.sample_size.cpu() / 2).long().tolist() + aug_imgs.append( + self.img_shift_crop(img, output_size, absulute_shift)) + bbox_shift = torch.tensor( + absulute_shift + [0, 0], device=init_bbox.device) + aug_bboxes.append(init_bbox + bbox_shift) + + if 'fliplr' in augs and augs['fliplr']: + shift = get_rand_shift() + aug_imgs.append( + self.img_shift_crop(img.flip(3), output_size, shift)) + bbox_shift = torch.tensor(shift + [0, 0], device=init_bbox.device) + aug_bboxes.append(init_bbox + bbox_shift) + + if 'blur' in augs: + for sigma in augs['blur']: + kernel_size = [math.ceil(2 * s) for s in sigma] + img_blur = gauss_blur( + img, kernel_size=kernel_size, sigma=sigma) + shift = get_rand_shift() + aug_imgs.append( + self.img_shift_crop(img_blur, output_size, shift)) + bbox_shift = torch.tensor( + shift + [0, 0], device=init_bbox.device) + aug_bboxes.append(init_bbox + bbox_shift) + + if 'rotate' in augs: + for angle in augs['rotate']: + img_numpy = img.squeeze(0).permute(1, 2, 0).cpu().numpy() + assert img_numpy.ndim == 3 + rotated_img = imrotate( + img_numpy, angle, border_mode='replicate') + img_tensor = torch.from_numpy(rotated_img.transpose( + 2, 0, 1)).float().unsqueeze(0).to(img.device) + shift = get_rand_shift() + aug_imgs.append( + self.img_shift_crop(img_tensor, output_size, shift)) + bbox_shift = torch.tensor( + shift + [0, 0], device=init_bbox.device) + aug_bboxes.append(init_bbox + bbox_shift) + + if 'dropout' in augs: + for _ in range(len(augs['dropout'])): + aug_bboxes.append(init_bbox) + + aug_imgs = torch.cat(aug_imgs, dim=0) + aug_bboxes = torch.stack(aug_bboxes) + return aug_imgs, aug_bboxes + + def generate_bbox(self, bbox: Tensor, sample_center: Tensor, + resize_factor: float) -> Tensor: + """All inputs are based in original image coordinates and the outputs + are based on the resized cropped image sample. + + Args: + bbox (Tensor): of shape (4,) in [cx, cy, w, h] format + sample_center (Tensor): of shape (2,) + resize_factor (float): + + Return: + Tensor: in [cx, cy, w, h] format + """ + bbox_center = (bbox[:2] - sample_center) / resize_factor + ( + self.sample_size / 2) + bbox_size = bbox[2:4] / resize_factor + return torch.cat([bbox_center, bbox_size]) + + def track(self, img: Tensor, data_samples: SampleList) -> InstanceList: + """Track the box `bbox` of previous frame to current frame `img`. + + Args: + img (Tensor): of shape (1, C, H, W). + bbox (list | Tensor): The bbox in previous frame. The shape of the + bbox is (4, ) in [cx, cy, w, h] format. + + Returns: + conf_score (int): The confidence of predicted bbox. + bbox (Tensor): The predicted bbox in [cx, cy, w, h] format + """ + self.frame_num += 1 + bbox = self.memo.bbox.clone() + + # 1. 
Extract backbone features + img_patch, patch_coord = self.get_cropped_img( + img, + bbox.round(), + self.test_cfg['search_scale_factor'], + self.sample_size, + border_mode=self.test_cfg['border_mode'], + max_scale_change=self.test_cfg['patch_max_scale_change']) + + backbone_feats = self.backbone(img_patch) + + # location of sample + sample_center = patch_coord[:, :2].squeeze() + sample_scale_factor = (patch_coord[:, 2:] / + self.sample_size).prod(dim=1).sqrt() + + # 2. Locate the target roughly using score map. + new_bbox_center, score_map, state = self.classifier.predict( + backbone_feats, data_samples, bbox, sample_center, + sample_scale_factor) + + # 3. Refine position and scale of the target. + if state != 'not_found': + inside_offset = (self.test_cfg['bbox_inside_ratio'] - + 0.5) * bbox[2:4] + # clip the coordinates of the center of the target on the original + # image + bbox[:2] = torch.max( + torch.min(new_bbox_center, self.img_size - inside_offset), + inside_offset) + + cls_bboxes = self.generate_bbox(bbox, sample_center, + sample_scale_factor) + new_bbox = self.bbox_regressor.predict(backbone_feats, + data_samples, cls_bboxes, + sample_center, + sample_scale_factor) + if new_bbox is not None: + bbox = new_bbox + + # 4. Update the classifier + update_flag = state not in ['not_found', 'uncertain'] + # Update the classifier filter using the latest position and size of + # target + if update_flag: + # Create the target_bbox using the refined predicted boxes + target_bbox = self.generate_bbox(bbox, sample_center, + sample_scale_factor) + hard_neg_flag = (state == 'hard_negative') + # Update the filter of classifier using it's optimizer module + self.classifier.update_classifier(target_bbox, self.frame_num, + hard_neg_flag) + + result = [InstanceData()] + result[0].scores = torch.max(score_map[0]).unsqueeze(0) + result[0].bboxes = bbox_cxcywh_to_xyxy(bbox.unsqueeze(0)) + + return result + + def get_cropped_img( + self, + img: Tensor, + target_bbox: Tensor, + search_scale_factor: float, + output_size: Optional[Tensor] = None, + border_mode: str = 'replicate', + max_scale_change: Optional[float] = None) -> Tuple[Tensor, Tensor]: + """Get the cropped patch based on the original image. + + Args: + img (Tensor): The original image. + target_bbox (Tensor): The bbox of target in [cx, cy, w,h] format. + search_scale_factor (float): The ratio of cropped size to the size + of target bbox. + output_size (Optional[Tensor], optional): The output size. + Defaults to None. + border_mode (str, optional): The border mode. Defaults to + 'replicate'. + max_scale_change (Optional[float], optional): The max scale change. + Defaults to None. + is_mask (Optional[bool], optional): Whether is mask. + Defaults to False. + + Returns: + img_patch (Tensor): of (1, c, h, w) shape. + patch_coord (Tensor): of (1, 4) shape in [cx, cy, w, h] format. 
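# Illustrative sketch (not part of this patch): ``generate_bbox`` above
# maps a box from original-image coordinates into the resized crop.
# Worked example with made-up numbers:
import torch

sample_size = torch.tensor([352., 352.])     # test-time crop resolution
bbox = torch.tensor([400., 300., 80., 60.])  # cx, cy, w, h in the image
sample_center = torch.tensor([400., 300.])   # crop centred on the target
resize_factor = 2.0                          # crop was 2x larger than the output
center_in_crop = (bbox[:2] - sample_center) / resize_factor + sample_size / 2
size_in_crop = bbox[2:4] / resize_factor     # tensor([40., 30.])
# center_in_crop == tensor([176., 176.]), i.e. the middle of the crop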
+ """ + crop_size = target_bbox[2:4].prod().sqrt() * search_scale_factor + + # Get new sample size if forced inside the image + if border_mode == 'inside' or border_mode == 'inside_major': + img_sz = torch.Tensor([img.shape[3], + img.shape[2]]).to(target_bbox.device) + shrink_factor = (crop_size.float() / img_sz) + if border_mode == 'inside': + shrink_factor = shrink_factor.max() + elif border_mode == 'inside_major': + shrink_factor = shrink_factor.min() + shrink_factor.clamp_(min=1, max=max_scale_change) + crop_size = (crop_size.float() / shrink_factor).long() + + tl = (target_bbox[:2] - crop_size // 2).long() + br = (target_bbox[:2] + crop_size // 2).long() + + # Shift the crop to inside + if border_mode == 'inside' or border_mode == 'inside_major': + img2_sz = torch.LongTensor([img.shape[3], + img.shape[2]]).to(target_bbox.device) + shift = (-tl).clamp(0) - (br - img2_sz).clamp(0) + tl += shift + br += shift + + outside = torch.floor_divide( + ((-tl).clamp(0) + (br - img2_sz).clamp(0)), 2) + shift = (-tl - outside) * (outside > 0).long() + tl += shift + br += shift + + patch_coord = torch.cat((tl, br)).view(1, 4) + patch_coord = bbox_xyxy_to_cxcywh(patch_coord) + + # Crop image patch + img_patch = F.pad( + img, (-tl[0].item(), br[0].item() - img.shape[3], -tl[1].item(), + br[1].item() - img.shape[2]), + mode='replicate') + + if output_size is None: + return img_patch.clone(), patch_coord + + # Resize + img_patch = F.interpolate( + img_patch, + output_size.long().flip(0).tolist(), + mode='bilinear', + align_corners=True) + + return img_patch, patch_coord + + def loss(self, batch_inputs: dict, batch_data_samples: SampleList, + **kwargs) -> dict: + """ + Args: + batch_inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + + batch_data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instance``. + + Return: + dict: A dictionary of loss components. + """ + search_img = batch_inputs['search_img'] + assert search_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # has [T * N, C, H, W] shape and the first N images cover the entire + # mini-batch. + search_img = search_img.transpose(1, 0).contiguous().view( + -1, *search_img.shape[2:]) + + template_img = batch_inputs['img'] + assert template_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # has [T * N, C, H, W] shape and the first N images cover the entire + # mini-batch. + template_img = template_img.transpose(1, 0).contiguous().view( + -1, *template_img.shape[2:]) + + z_feat = self.backbone(template_img) + x_feat = self.backbone(search_img) + + losses = dict() + loss_cls = self.classifier.loss(z_feat, x_feat, batch_data_samples, + **kwargs) + + loss_bbox = self.bbox_regressor.loss(z_feat, x_feat, + batch_data_samples, **kwargs) + losses.update(loss_cls) + losses.update(loss_bbox) + return losses diff --git a/mmtrack/models/sot/siamrpn.py b/mmtrack/models/sot/siamrpn.py index 86291c079..1a2d0a8c2 100644 --- a/mmtrack/models/sot/siamrpn.py +++ b/mmtrack/models/sot/siamrpn.py @@ -1,18 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
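# Illustrative sketch (not part of this patch): ``PrDiMP.get_cropped_img``
# above crops and pads in a single ``F.pad`` call: negative padding amounts
# remove the margins that lie outside the crop window, while 'replicate'
# padding fills any part of the window that falls outside the image.
# Minimal demo of the negative-padding behaviour:
import torch
import torch.nn.functional as F

x = torch.arange(36.).view(1, 1, 6, 6)
crop = F.pad(x, (-1, -2, -1, -2))  # drop 1 col left, 2 right, 1 row top, 2 bottom
assert crop.shape == (1, 1, 3, 3)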
-import warnings +from typing import List, Optional, Tuple, Union -import numpy as np import torch -from addict import Dict -from mmdet.core.bbox import bbox_cxcywh_to_xyxy -from mmdet.models.builder import build_backbone, build_head, build_neck +from torch import Tensor from torch.nn.modules.batchnorm import _BatchNorm from torch.nn.modules.conv import _ConvNd -from mmtrack.core.bbox import (bbox_cxcywh_to_x1y1wh, bbox_xyxy_to_x1y1wh, - calculate_region_overlap, quad2bbox) -from mmtrack.core.evaluation import bbox2region -from ..builder import MODELS +from mmtrack.registry import MODELS +from mmtrack.utils import (InstanceList, OptConfigType, OptMultiConfig, + SampleList) from .base import BaseSingleObjectTracker @@ -25,30 +21,22 @@ class SiamRPN(BaseSingleObjectTracker): """ def __init__(self, - backbone, - neck=None, - head=None, - pretrains=None, - init_cfg=None, - frozen_modules=None, - train_cfg=None, - test_cfg=None): - super(SiamRPN, self).__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - backbone_pretrain = pretrains.get('backbone', None) - if backbone_pretrain: - backbone.init_cfg = dict( - type='Pretrained', checkpoint=backbone_pretrain) - else: - backbone.init_cfg = None - self.backbone = build_backbone(backbone) + backbone: dict, + neck: Optional[dict] = None, + head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + frozen_modules: Optional[Union[List[str], Tuple[str], + str]] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(SiamRPN, self).__init__(data_preprocessor, init_cfg) + self.backbone = MODELS.build(backbone) if neck is not None: - self.neck = build_neck(neck) + self.neck = MODELS.build(neck) head = head.copy() head.update(train_cfg=train_cfg.rpn, test_cfg=test_cfg.rpn) - self.head = build_head(head) + self.head = MODELS.build(head) self.test_cfg = test_cfg self.train_cfg = train_cfg @@ -74,7 +62,7 @@ def init_weights(self): if isinstance(m, _ConvNd) or isinstance(m, _BatchNorm): m.reset_parameters() - def forward_template(self, z_img): + def forward_template(self, z_img: Tensor) -> Tuple[Tensor]: """Extract the features of exemplar images. Args: @@ -82,7 +70,8 @@ def forward_template(self, z_img): images. Typically H and W equal to 127. Returns: - tuple(Tensor): Multi level feature map of exemplar images. + Tuple[Tensor, ...]: Multi level feature map of exemplar + images. """ z_feat = self.backbone(z_img) if self.with_neck: @@ -95,7 +84,7 @@ def forward_template(self, z_img): z_feat_center.append(z_feat[i][:, :, left:right, left:right]) return tuple(z_feat_center) - def forward_search(self, x_img): + def forward_search(self, x_img: Tensor) -> Tuple[Tensor, ...]: """Extract the features of search images. Args: @@ -103,15 +92,16 @@ def forward_search(self, x_img): images. Typically H and W equal to 255. Returns: - tuple(Tensor): Multi level feature map of search images. + Tuple[Tensor, ...]: Multi level feature map of search images. """ x_feat = self.backbone(x_img) if self.with_neck: x_feat = self.neck(x_feat) return x_feat - def get_cropped_img(self, img, center_xy, target_size, crop_size, - avg_channel): + def get_cropped_img(self, img: Tensor, center_xy: Tensor, + target_size: Tensor, crop_size: Tensor, + avg_channel: Tensor) -> Tensor: """Crop image. Only used during testing. 
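# Illustrative sketch (not part of this patch): sub-modules are now built
# through the MMEngine-style registry instead of ``build_backbone`` /
# ``build_neck`` / ``build_head``. The config below is a made-up minimal
# example; real configs carry more fields.
from mmtrack.registry import MODELS

backbone_cfg = dict(type='SOTResNet', depth=50)  # hypothetical backbone cfg
# backbone = MODELS.build(backbone_cfg)          # returns a torch.nn.Module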
@@ -124,15 +114,16 @@ def get_cropped_img(self, img, center_xy, target_size, crop_size, Args: img (Tensor): of shape (1, C, H, W) encoding original input image. - center_xy (Tensor): of shape (2, ) denoting the center point for - cropping image. + center_xy (Tensor): of shape (2, ) denoting the center point + for cropping image. target_size (int): The output size of cropped image. crop_size (Tensor): The size for cropping image. - avg_channel (Tensor): of shape (3, ) denoting the padding values. + avg_channel (Tensor): of shape (3, ) denoting the padding + values. Returns: - Tensor: of shape (1, C, target_size, target_size) encoding the - resized cropped image. + Tensor: of shape (1, C, target_size, target_size) encoding + the resized cropped image. """ N, C, H, W = img.shape context_xmin = int(center_xy[0] - crop_size / 2) @@ -176,263 +167,90 @@ def get_cropped_img(self, img, center_xy, target_size, crop_size, align_corners=False) return crop_img - def _bbox_clip(self, bbox, img_h, img_w): - """Clip the bbox with [cx, cy, w, h] format.""" - bbox[0] = bbox[0].clamp(0., img_w) - bbox[1] = bbox[1].clamp(0., img_h) - bbox[2] = bbox[2].clamp(10., img_w) - bbox[3] = bbox[3].clamp(10., img_h) - return bbox - - def init(self, img, bbox): + def init(self, img: Tensor) -> Tuple[Tuple[Tensor, ...], Tensor]: """Initialize the single object tracker in the first frame. Args: img (Tensor): of shape (1, C, H, W) encoding original input image. - bbox (Tensor): The given instance bbox of first frame that need be - tracked in the following frames. The shape of the box is (4, ) - with [cx, cy, w, h] format. - - Returns: - tuple(z_feat, avg_channel): z_feat is a tuple[Tensor] that - contains the multi level feature maps of exemplar image, - avg_channel is Tensor with shape (3, ), and denotes the padding - values. """ + bbox = self.memo.bbox z_width = bbox[2] + self.test_cfg.context_amount * (bbox[2] + bbox[3]) z_height = bbox[3] + self.test_cfg.context_amount * (bbox[2] + bbox[3]) z_size = torch.round(torch.sqrt(z_width * z_height)) + # used for padding when cropping the image. avg_channel = torch.mean(img, dim=(0, 2, 3)) z_crop = self.get_cropped_img(img, bbox[0:2], self.test_cfg.exemplar_size, z_size, avg_channel) - z_feat = self.forward_template(z_crop) - return z_feat, avg_channel + self.memo.z_feat = self.forward_template(z_crop) + self.memo.avg_channel = avg_channel - def track(self, img, bbox, z_feat, avg_channel): - """Track the box `bbox` of previous frame to current frame `img`. + def track(self, img: Tensor, data_samples: SampleList) -> InstanceList: + """Track the box of previous frame to current frame `img`. Args: img (Tensor): of shape (1, C, H, W) encoding original input image. - bbox (Tensor): The bbox in previous frame. The shape of the box is - (4, ) in [cx, cy, w, h] format. - z_feat (tuple[Tensor]): The multi level feature maps of exemplar - image in the first frame. - avg_channel (Tensor): of shape (3, ) denoting the padding values. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. Returns: - tuple(best_score, best_bbox): best_score is a Tensor denoting the - score of best_bbox, best_bbox is a Tensor of shape (4, ) in - [cx, cy, w, h] format, and denotes the best tracked bbox in - current frame. + InstanceList: Tracking results of each image after the postprocess. + - scores: a Tensor denoting the score of best_bbox. 
+ - bboxes: a Tensor of shape (4, ) in [x1, x2, y1, y2] + format, and denotes the best tracked bbox in current frame. """ - z_width = bbox[2] + self.test_cfg.context_amount * (bbox[2] + bbox[3]) - z_height = bbox[3] + self.test_cfg.context_amount * (bbox[2] + bbox[3]) + prev_bbox = self.memo.bbox + z_width = prev_bbox[2] + self.test_cfg.context_amount * ( + prev_bbox[2] + prev_bbox[3]) + z_height = prev_bbox[3] + self.test_cfg.context_amount * ( + prev_bbox[2] + prev_bbox[3]) z_size = torch.sqrt(z_width * z_height) x_size = torch.round( z_size * (self.test_cfg.search_size / self.test_cfg.exemplar_size)) - x_crop = self.get_cropped_img(img, bbox[0:2], + x_crop = self.get_cropped_img(img, prev_bbox[0:2], self.test_cfg.search_size, x_size, - avg_channel) + self.memo.avg_channel) x_feat = self.forward_search(x_crop) - cls_score, bbox_pred = self.head(z_feat, x_feat) scale_factor = self.test_cfg.exemplar_size / z_size - best_score, best_bbox = self.head.get_bbox(cls_score, bbox_pred, bbox, - scale_factor) - - # clip boundary - best_bbox = self._bbox_clip(best_bbox, img.shape[2], img.shape[3]) - return best_score, best_bbox - - def simple_test_vot(self, img, frame_id, gt_bboxes, img_metas=None): - """Test using VOT test mode. - - Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - frame_id (int): the id of current frame in the video. - gt_bboxes (list[Tensor]): list of ground truth bboxes for each - image with shape (1, 4) in [tl_x, tl_y, br_x, br_y] format or - shape (1, 8) in [x1, y1, x2, y2, x3, y3, x4, y4]. - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - Returns: - bbox_pred (Tensor): in [tl_x, tl_y, br_x, br_y] format. - best_score (Tensor): the tracking bbox confidence in range [0,1], - and the score of initial frame is -1. - """ - if frame_id == 0: - self.init_frame_id = 0 - if self.init_frame_id == frame_id: - # initialization - gt_bboxes = gt_bboxes[0][0] - self.memo = Dict() - self.memo.bbox = quad2bbox(gt_bboxes) - self.memo.z_feat, self.memo.avg_channel = self.init( - img, self.memo.bbox) - # 1 denotes the initialization state - bbox_pred = img.new_tensor([1.]) - best_score = -1. - elif self.init_frame_id > frame_id: - # 0 denotes unknown state, namely the skipping frame after failure - bbox_pred = img.new_tensor([0.]) - best_score = -1. 
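# Illustrative sketch (not part of this patch): worked example of the
# exemplar size computed in ``init``/``track`` above, with made-up box
# dimensions and context_amount = 0.5:
import torch

w, h, context_amount = torch.tensor(60.), torch.tensor(40.), 0.5
z_width = w + context_amount * (w + h)                # 110.
z_height = h + context_amount * (w + h)               # 90.
z_size = torch.round(torch.sqrt(z_width * z_height))  # 99.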
- else: - # normal tracking state - best_score, self.memo.bbox = self.track(img, self.memo.bbox, - self.memo.z_feat, - self.memo.avg_channel) - # convert bbox to region - track_bbox = bbox_cxcywh_to_x1y1wh(self.memo.bbox).cpu().numpy() - track_region = bbox2region(track_bbox) - gt_bbox = gt_bboxes[0][0] - if len(gt_bbox) == 4: - gt_bbox = bbox_xyxy_to_x1y1wh(gt_bbox) - gt_region = bbox2region(gt_bbox.cpu().numpy()) - - if img_metas is not None and 'img_shape' in img_metas[0]: - image_shape = img_metas[0]['img_shape'] - image_wh = (image_shape[1], image_shape[0]) - else: - image_wh = None - Warning('image shape are need when calculating bbox overlap') - overlap = calculate_region_overlap( - track_region, gt_region, bounds=image_wh) - if overlap <= 0: - # tracking failure - self.init_frame_id = frame_id + 5 - # 2 denotes the failure state - bbox_pred = img.new_tensor([2.]) - else: - bbox_pred = bbox_cxcywh_to_xyxy(self.memo.bbox) - - return bbox_pred, best_score - - def simple_test_ope(self, img, frame_id, gt_bboxes): - """Test using OPE test mode. - - Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - frame_id (int): the id of current frame in the video. - gt_bboxes (list[Tensor]): list of ground truth bboxes for each - image with shape (1, 4) in [tl_x, tl_y, br_x, br_y] format or - shape (1, 8) in [x1, y1, x2, y2, x3, y3, x4, y4]. - - Returns: - bbox_pred (Tensor): in [tl_x, tl_y, br_x, br_y] format. - best_score (Tensor): the tracking bbox confidence in range [0,1], - and the score of initial frame is -1. - """ - if frame_id == 0: - gt_bboxes = gt_bboxes[0][0] - self.memo = Dict() - self.memo.bbox = quad2bbox(gt_bboxes) - self.memo.z_feat, self.memo.avg_channel = self.init( - img, self.memo.bbox) - best_score = -1. - else: - best_score, self.memo.bbox = self.track(img, self.memo.bbox, - self.memo.z_feat, - self.memo.avg_channel) - bbox_pred = bbox_cxcywh_to_xyxy(self.memo.bbox) - return bbox_pred, best_score + results = self.head.predict(self.memo.z_feat, x_feat, data_samples, + prev_bbox, scale_factor) - def simple_test(self, img, img_metas, gt_bboxes, **kwargs): - """Test without augmentation. - - Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - gt_bboxes (list[Tensor]): list of ground truth bboxes for each - image with shape (1, 4) in [tl_x, tl_y, br_x, br_y] format or - shape (1, 8) in [x1, y1, x2, y2, x3, y3, x4, y4]. - - Returns: - dict[str : ndarray]: The tracking results. 
- """ - frame_id = img_metas[0].get('frame_id', -1) - assert frame_id >= 0 - assert len(img) == 1, 'only support batch_size=1 when testing' - - test_mode = self.test_cfg.get('test_mode', 'OPE') - assert test_mode in ['OPE', 'VOT'] - if test_mode == 'VOT': - bbox_pred, best_score = self.simple_test_vot( - img, frame_id, gt_bboxes, img_metas) - else: - bbox_pred, best_score = self.simple_test_ope( - img, frame_id, gt_bboxes) - - results = dict() - if best_score == -1.: - results['track_bboxes'] = np.concatenate( - (bbox_pred.cpu().numpy(), np.array([best_score]))) - else: - results['track_bboxes'] = np.concatenate( - (bbox_pred.cpu().numpy(), best_score.cpu().numpy()[None])) return results - def forward_train(self, img, img_metas, gt_bboxes, search_img, - search_img_metas, search_gt_bboxes, is_positive_pairs, - **kwargs): + def loss(self, inputs: dict, data_samples: SampleList, **kwargs) -> dict: """ Args: - img (Tensor): of shape (N, C, H, W) encoding input exemplar images. - Typically H and W equal to 127. - - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each exemplar - image with shape (1, 4) in [tl_x, tl_y, br_x, br_y] format. - - search_img (Tensor): of shape (N, 1, C, H, W) encoding input search - images. 1 denotes there is only one search image for each - exemplar image. Typically H and W equal to 255. - - search_img_metas (list[list[dict]]): The second list only has one - element. The first list contains search image information dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - search_gt_bboxes (list[Tensor]): Ground truth bboxes for each - search image with shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] - format. - - is_positive_pairs (list[bool]): list of bool denoting whether each - exemplar image and corresponding search image is positive pair. - - Returns: - dict[str, Tensor]: a dictionary of loss components. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instance``. + + Return: + dict: A dictionary of loss components. """ + search_img = inputs['search_img'] + assert search_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' search_img = search_img[:, 0] - z_feat = self.forward_template(img) - x_feat = self.forward_search(search_img) - cls_score, bbox_pred = self.head(z_feat, x_feat) - - losses = dict() - bbox_targets = self.head.get_targets(search_gt_bboxes, - cls_score.shape[2:], - is_positive_pairs) - head_losses = self.head.loss(cls_score, bbox_pred, *bbox_targets) - losses.update(head_losses) + template_img = inputs['img'] + assert template_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' 
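# Illustrative note (not part of this patch): training inputs arrive as
# (N, T, C, H, W); SiamRPN uses a single template/search frame per sample,
# so the T axis is dropped with ``[:, 0]`` just below. Made-up shapes:
import torch

template_batch = torch.rand(2, 1, 3, 127, 127)  # hypothetical (N, T, C, H, W)
assert template_batch[:, 0].shape == (2, 3, 127, 127)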
+ template_img = template_img[:, 0] + z_feat = self.forward_template(template_img) + x_feat = self.forward_search(search_img) + losses = self.head.loss(z_feat, x_feat, data_samples, **kwargs) return losses diff --git a/mmtrack/models/sot/stark.py b/mmtrack/models/sot/stark.py index b9a3c56d2..d85753022 100644 --- a/mmtrack/models/sot/stark.py +++ b/mmtrack/models/sot/stark.py @@ -1,18 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. import math from copy import deepcopy +from typing import List, Optional, Tuple, Union -import numpy as np import torch import torch.nn.functional as F -from addict import Dict -from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh -from mmdet.models.builder import build_backbone, build_head, build_neck +from mmdet.structures.bbox.transforms import bbox_xyxy_to_cxcywh +from torch import Tensor from torch.nn.modules.batchnorm import _BatchNorm from torch.nn.modules.conv import _ConvNd -from torchvision.transforms.functional import normalize -from ..builder import MODELS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import (InstanceList, OptConfigType, OptMultiConfig, + SampleList) from .base import BaseSingleObjectTracker @@ -40,24 +41,32 @@ class Stark(BaseSingleObjectTracker): """ def __init__(self, - backbone, - neck=None, - head=None, - init_cfg=None, - frozen_modules=None, - train_cfg=None, - test_cfg=None): - super(Stark, self).__init__(init_cfg) - self.backbone = build_backbone(backbone) - self.neck = build_neck(neck) - self.head = build_head(head) + backbone: dict, + neck: Optional[dict] = None, + head: Optional[dict] = None, + pretrains: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + frozen_modules: Optional[Union[List[str], Tuple[str], + str]] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(Stark, self).__init__(data_preprocessor, init_cfg) + head.update(test_cfg=test_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) + self.head = MODELS.build(head) self.test_cfg = test_cfg self.train_cfg = train_cfg + self.num_templates = self.test_cfg['num_templates'] + # Set the update interval - self.update_intervals = self.test_cfg['update_intervals'] - self.num_extra_template = len(self.update_intervals) + self.update_intervals = self.test_cfg.get('update_intervals', None) + if isinstance(self.update_intervals, (int, float)): + self.update_intervals = [int(self.update_intervals) + ] * self.num_templates if frozen_modules is not None: self.freeze_module(frozen_modules) @@ -78,7 +87,7 @@ def init_weights(self): if self.with_head: self.head.init_weights() - def extract_feat(self, img): + def extract_feat(self, img: Tensor) -> Tensor: """Extract the features of the input image. 
Args: @@ -92,8 +101,9 @@ def extract_feat(self, img): feat = self.neck(feat) return feat - def get_cropped_img(self, img, target_bbox, search_area_factor, - output_size): + def get_cropped_img(self, img: Tensor, target_bbox: Tensor, + search_area_factor: float, + output_size: float) -> Union[Tensor, float, Tensor]: """ Crop Image Only used during testing This function mainly contains two steps: @@ -103,8 +113,8 @@ def get_cropped_img(self, img, target_bbox, search_area_factor, args: img (Tensor): of shape (1, C, H, W) - target_bbox (list | ndarray): in [cx, cy, w, h] format - search_area_factor (float): Ratio of crop size to target size + target_bbox (Tensor): in [cx, cy, w, h] format + search_area_factor (float): Ratio of crop size to target size. output_size (float): the size of output cropped image (always square). returns: @@ -165,262 +175,131 @@ def get_cropped_img(self, img, target_bbox, search_area_factor, return img_crop_padded, resize_factor, padding_mask - def init(self, img, bbox): + def init(self, img: Tensor): """Initialize the single object tracker in the first frame. Args: img (Tensor): input image of shape (1, C, H, W). - bbox (list | Tensor): in [cx, cy, w, h] format. """ - self.z_dict_list = [] # store templates + self.memo.z_dict_list = [] # store templates # get the 1st template z_patch, _, z_mask = self.get_cropped_img( - img, bbox, self.test_cfg['template_factor'], + img, self.memo.bbox, self.test_cfg['template_factor'], self.test_cfg['template_size'] ) # z_patch of shape [1,C,H,W]; z_mask of shape [1,H,W] - z_patch = normalize( - z_patch.squeeze() / 255., - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]).unsqueeze(0) - with torch.no_grad(): - z_feat = self.extract_feat(z_patch) + z_feat = self.extract_feat(z_patch) self.z_dict = dict(feat=z_feat, mask=z_mask) - self.z_dict_list.append(self.z_dict) + self.memo.z_dict_list.append(self.z_dict) # get other templates - for _ in range(self.num_extra_template): - self.z_dict_list.append(deepcopy(self.z_dict)) + for _ in range(self.num_templates - 1): + self.memo.z_dict_list.append(deepcopy(self.z_dict)) - def update_template(self, img, bbox, conf_score): + def update_template(self, img: Tensor, bbox: Union[List, Tensor], + conf_score: float): """Update the dymanic templates. Args: img (Tensor): of shape (1, C, H, W). - bbox (list | ndarray): in [cx, cy, w, h] format. + bbox (list | Tensor): in [cx, cy, w, h] format. conf_score (float): the confidence score of the predicted bbox. """ for i, update_interval in enumerate(self.update_intervals): - if self.frame_id % update_interval == 0 and conf_score > 0.5: + if self.memo.frame_id % update_interval == 0 and conf_score > 0.5: z_patch, _, z_mask = self.get_cropped_img( img, bbox, self.test_cfg['template_factor'], output_size=self.test_cfg['template_size']) - z_patch = normalize( - z_patch.squeeze() / 255., - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]).unsqueeze(0) - with torch.no_grad(): - z_feat = self.extract_feat(z_patch) + z_feat = self.extract_feat(z_patch) # the 1st element of z_dict_list is the template from the 1st # frame - self.z_dict_list[i + 1] = dict(feat=z_feat, mask=z_mask) + self.memo.z_dict_list[i + 1] = dict(feat=z_feat, mask=z_mask) - def mapping_bbox_back(self, pred_bboxes, prev_bbox, resize_factor): - """Mapping the `prediction bboxes` from resized cropped image to - original image. The coordinate origins of them are both the top left - corner. 
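As a rough standalone illustration of the cropping described by ``get_cropped_img`` above: a square region of side ``sqrt(w * h) * search_area_factor`` is taken around the target centre, zero-padded wherever it leaves the image, and resized to ``output_size``. The exact crop-size rounding and mask handling of the real implementation may differ; this sketch only shows the geometry.

import math
import torch
import torch.nn.functional as F

def crop_target_patch(img, target_bbox, search_area_factor, output_size):
    """img: (1, C, H, W); target_bbox: (cx, cy, w, h) in image coordinates."""
    cx, cy, w, h = target_bbox
    crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)
    x1 = int(round(cx - crop_sz / 2))
    y1 = int(round(cy - crop_sz / 2))
    x2, y2 = x1 + crop_sz, y1 + crop_sz
    H, W = img.shape[-2:]
    # zero-pad wherever the square crop sticks out of the image
    pad = (max(0, -x1), max(0, x2 - W), max(0, -y1), max(0, y2 - H))
    patch = F.pad(img[..., max(0, y1):min(H, y2), max(0, x1):min(W, x2)], pad)
    resize_factor = output_size / crop_sz
    patch = F.interpolate(
        patch, size=(output_size, output_size), mode='bilinear',
        align_corners=False)
    return patch, resize_factor

patch, rf = crop_target_patch(
    torch.randn(1, 3, 480, 640), (320.0, 240.0, 80.0, 60.0), 2.0, 128)
print(patch.shape)   # torch.Size([1, 3, 128, 128])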
- - Args: - pred_bboxes (Tensor): the predicted bbox of shape (B, Nq, 4), in - [tl_x, tl_y, br_x, br_y] format. The coordinates are based in - the resized cropped image. - prev_bbox (Tensor): the previous bbox of shape (B, 4), - in [cx, cy, w, h] format. The coordinates are based in the - original image. - resize_factor (float): the ratio of original image scale to cropped - image scale. - Returns: - (Tensor): in [tl_x, tl_y, br_x, br_y] format. - """ - # based in the resized croped image - pred_bboxes = pred_bboxes.view(-1, 4) - # based in the original croped image - pred_bbox = pred_bboxes.mean(dim=0) / resize_factor - - # the half size of the original croped image - cropped_img_half_size = 0.5 * self.test_cfg[ - 'search_size'] / resize_factor - # (x_shift, y_shift) is the coordinate of top left corner of the - # cropped image based in the original image. - x_shift, y_shift = prev_bbox[0] - cropped_img_half_size, prev_bbox[ - 1] - cropped_img_half_size - pred_bbox[0:4:2] += x_shift - pred_bbox[1:4:2] += y_shift - - return pred_bbox - - def _bbox_clip(self, bbox, img_h, img_w, margin=0): - """Clip the bbox in [tl_x, tl_y, br_x, br_y] format.""" - bbox_w, bbox_h = bbox[2] - bbox[0], bbox[3] - bbox[1] - bbox[0] = bbox[0].clamp(0, img_w - margin) - bbox[1] = bbox[1].clamp(0, img_h - margin) - bbox_w = bbox_w.clamp(margin, img_w) - bbox_h = bbox_h.clamp(margin, img_h) - bbox[2] = bbox[0] + bbox_w - bbox[3] = bbox[1] + bbox_h - return bbox - - def track(self, img, bbox): - """Track the box `bbox` of previous frame to current frame `img`. + def track(self, img: Tensor, data_samples: SampleList) -> InstanceList: + """Track the box of previous frame to current frame `img`. Args: img (Tensor): of shape (1, C, H, W). - bbox (list | Tensor): The bbox in previous frame. The shape of the - bbox is (4, ) in [x, y, w, h] format. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. Returns: + InstanceList: Tracking results of each image after the postprocess. + - scores: a Tensor denoting the score of best_bbox. + - bboxes: a Tensor of shape (4, ) in [x1, x2, y1, y2] + format, and denotes the best tracked bbox in current frame. """ - H, W = img.shape[2:] - # get the t-th search region + # get the search patches x_patch, resize_factor, x_mask = self.get_cropped_img( - img, bbox, self.test_cfg['search_factor'], + img, self.memo.bbox, self.test_cfg['search_factor'], self.test_cfg['search_size'] ) # bbox: of shape (x1, y1, w, h), x_mask: of shape (1, h, w) - x_patch = normalize( - x_patch.squeeze() / 255., - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]).unsqueeze(0) - - with torch.no_grad(): - x_feat = self.extract_feat(x_patch) - x_dict = dict(feat=x_feat, mask=x_mask) - head_inputs = self.z_dict_list + [x_dict] - # run the transformer - track_results = self.head(head_inputs) - - final_bbox = self.mapping_bbox_back(track_results['pred_bboxes'], - self.memo.bbox, resize_factor) - final_bbox = self._bbox_clip(final_bbox, H, W, margin=10) - - conf_score = -1. - if self.head.cls_head is not None: - # get confidence score (whether the search region is reliable) - conf_score = track_results['pred_logits'].view(-1).sigmoid().item() - crop_bbox = bbox_xyxy_to_cxcywh(final_bbox) - self.update_template(img, crop_bbox, conf_score) - - return conf_score, final_bbox - def simple_test(self, img, img_metas, gt_bboxes, **kwargs): - """Test without augmentation. 
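The coordinate mapping that the removed ``mapping_bbox_back`` used to perform (and that now lives in the head's post-processing) is easy to keep as a standalone reference; ``search_size=320`` below is only the usual test-time setting for this tracker, not something fixed by this file.

import torch

def map_bbox_back(pred_bboxes, prev_bbox, resize_factor, search_size=320):
    """Map boxes from the resized search crop back to original image coords.

    pred_bboxes: (B, Nq, 4) in [tl_x, tl_y, br_x, br_y] on the resized crop.
    prev_bbox:   (4,) in [cx, cy, w, h] on the original image.
    resize_factor: resized-crop scale divided by original-crop scale.
    """
    # average the queries, then undo the crop resizing
    pred_bbox = pred_bboxes.view(-1, 4).mean(dim=0) / resize_factor
    half_crop = 0.5 * search_size / resize_factor
    # (x_shift, y_shift) is the crop's top-left corner in original-image coords
    x_shift = prev_bbox[0] - half_crop
    y_shift = prev_bbox[1] - half_crop
    pred_bbox[0::2] += x_shift
    pred_bbox[1::2] += y_shift
    return pred_bbox

bbox = map_bbox_back(
    torch.rand(1, 1, 4) * 320, torch.tensor([300., 200., 60., 40.]), 0.4)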
+ x_feat = self.extract_feat(x_patch) + x_dict = dict(feat=x_feat, mask=x_mask) + head_inputs = self.memo.z_dict_list + [x_dict] + results = self.head.predict(head_inputs, data_samples, self.memo.bbox, + resize_factor) - Args: - img (Tensor): input image of shape (1, C, H, W). - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - gt_bboxes (list[Tensor]): list of ground truth bboxes for each - image with shape (1, 4) in [tl_x, tl_y, br_x, br_y] format. + if results[0].scores.item() != -1: + # get confidence score (whether the search region is reliable) + crop_bbox = bbox_xyxy_to_cxcywh(results[0].bboxes.squeeze()) + self.update_template(img, crop_bbox, results[0].scores.item()) - Returns: - dict(str : ndarray): the tracking results. - """ - frame_id = img_metas[0].get('frame_id', -1) - assert frame_id >= 0 - assert len(img) == 1, 'only support batch_size=1 when testing' - self.frame_id = frame_id - - if frame_id == 0: - bbox_pred = gt_bboxes[0][0] - self.memo = Dict() - self.memo.bbox = bbox_xyxy_to_cxcywh(bbox_pred) - self.init(img, self.memo.bbox) - best_score = -1. - else: - best_score, bbox_pred = self.track(img, self.memo.bbox) - self.memo.bbox = bbox_xyxy_to_cxcywh(bbox_pred) - - results = dict() - results['track_bboxes'] = np.concatenate( - (bbox_pred.cpu().numpy(), np.array([best_score]))) return results - def forward_train(self, - img, - img_metas, - search_img, - search_img_metas, - gt_bboxes, - padding_mask, - search_gt_bboxes, - search_padding_mask, - search_gt_labels=None, - **kwargs): - """forward of training. + def predict_vot(self, inputs: dict, data_samples: List[TrackDataSample]): + raise NotImplementedError( + 'STARK does not support testing on VOT datasets yet.') - Args: - img (Tensor): template images of shape (N, num_templates, C, H, W). - Typically, there are 2 template images, and - H and W are both equal to 128. - - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - search_img (Tensor): of shape (N, 1, C, H, W) encoding input search - images. 1 denotes there is only one search image for each - template image. Typically H and W are both equal to 320. - - search_img_metas (list[list[dict]]): The second list only has one - element. The first list contains search image information dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for template - images with shape (N, 4) in [tl_x, tl_y, br_x, br_y] format. - - padding_mask (Tensor): padding mask of template images. - It's of shape (N, num_templates, H, W). - Typically, there are 2 padding masks of template images, and - H and W are both equal to that of template images. - - search_gt_bboxes (list[Tensor]): Ground truth bboxes for search - images with shape (N, 5) in [0., tl_x, tl_y, br_x, br_y] - format. - - search_padding_mask (Tensor): padding mask of search images. 
- Its of shape (N, 1, H, W). - There are 1 padding masks of search image, and - H and W are both equal to that of search image. - - search_gt_labels (list[Tensor], optional): Ground truth labels for - search images with shape (N, 2). + def loss(self, inputs: dict, data_samples: List[TrackDataSample], + **kwargs) -> dict: + """Forward of training. - Returns: - dict[str, Tensor]: a dictionary of loss components. + Args: + inputs (dict[Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Return: + dict: A dictionary of loss components. """ + template_padding_mask = [ + data_sample.padding_mask for data_sample in data_samples + ] + template_padding_mask = torch.stack(template_padding_mask, dim=0) + search_padding_mask = [ + data_sample.search_padding_mask for data_sample in data_samples + ] + search_padding_mask = torch.stack(search_padding_mask, dim=0) + + search_img = inputs['search_img'] + assert search_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + template_img = inputs['img'] + assert template_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + head_inputs = [] - for i in range(self.num_extra_template + 1): - z_feat = self.extract_feat(img[:, i]) - z_dict = dict(feat=z_feat, mask=padding_mask[:, i]) + for i in range(self.num_templates): + z_feat = self.extract_feat(template_img[:, i]) + z_dict = dict(feat=z_feat, mask=template_padding_mask[:, i]) head_inputs.append(z_dict) x_feat = self.extract_feat(search_img[:, 0]) x_dict = dict(feat=x_feat, mask=search_padding_mask[:, 0]) head_inputs.append(x_dict) - # run the transformer - ''' - `track_results` is a dict containing the following keys: - - 'pred_bboxes': bboxes of (N, num_query, 4) shape in - [tl_x, tl_y, br_x, br_y] format. - - 'pred_logits': bboxes of (N, num_query, 1) shape. - Typically `num_query` is equal to 1. - ''' - track_results = self.head(head_inputs) - - losses = dict() - head_losses = self.head.loss(track_results, search_gt_bboxes, - search_gt_labels, - search_img[:, 0].shape[-2:]) - - losses.update(head_losses) + + losses = self.head.loss(head_inputs, data_samples) return losses diff --git a/mmtrack/models/task_modules/__init__.py b/mmtrack/models/task_modules/__init__.py new file mode 100644 index 000000000..26ac870f2 --- /dev/null +++ b/mmtrack/models/task_modules/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
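For the ``Stark.loss`` shown above, the sketch below lays out how the per-sample padding masks and the template/search tensors end up in the ``head_inputs`` list. Batch size, image sizes and the use of raw images as stand-ins for backbone features are illustrative assumptions only.

import torch

num_templates = 2
inputs = dict(
    img=torch.randn(2, num_templates, 3, 128, 128),  # (N, T, C, H, W) templates
    search_img=torch.randn(2, 1, 3, 320, 320),       # one search frame per sample
)
template_padding_mask = torch.zeros(2, num_templates, 128, 128, dtype=torch.bool)
search_padding_mask = torch.zeros(2, 1, 320, 320, dtype=torch.bool)

head_inputs = []
for i in range(num_templates):
    head_inputs.append(dict(
        feat=inputs['img'][:, i],            # stand-in for extract_feat(...)
        mask=template_padding_mask[:, i]))
head_inputs.append(dict(
    feat=inputs['search_img'][:, 0],
    mask=search_padding_mask[:, 0]))
assert len(head_inputs) == num_templates + 1  # templates first, search last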
+from .anchor import * # noqa: F401,F403 +from .filter import * # noqa: F401,F403 +from .motion import * # noqa: F401,F403 +from .track import * # noqa: F401,F403 diff --git a/mmtrack/core/anchor/__init__.py b/mmtrack/models/task_modules/anchor/__init__.py similarity index 100% rename from mmtrack/core/anchor/__init__.py rename to mmtrack/models/task_modules/anchor/__init__.py diff --git a/mmtrack/core/anchor/sot_anchor_generator.py b/mmtrack/models/task_modules/anchor/sot_anchor_generator.py similarity index 79% rename from mmtrack/core/anchor/sot_anchor_generator.py rename to mmtrack/models/task_modules/anchor/sot_anchor_generator.py index a43db9bf1..c78b4474a 100644 --- a/mmtrack/core/anchor/sot_anchor_generator.py +++ b/mmtrack/models/task_modules/anchor/sot_anchor_generator.py @@ -1,10 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + import numpy as np import torch -from mmdet.core.anchor import ANCHOR_GENERATORS, AnchorGenerator +from mmdet.models.task_modules import AnchorGenerator +from torch import Tensor + +from mmtrack.registry import TASK_UTILS -@ANCHOR_GENERATORS.register_module() +@TASK_UTILS.register_module() class SiameseRPNAnchorGenerator(AnchorGenerator): """Anchor generator for siamese rpn. @@ -12,16 +17,19 @@ class SiameseRPNAnchorGenerator(AnchorGenerator): for detailed docstring. """ - def __init__(self, strides, *args, **kwargs): + def __init__(self, strides: List[Union[int, Tuple[int, int]]], *args, + **kwargs): assert len(strides) == 1, 'only support one feature map level' super(SiameseRPNAnchorGenerator, self).__init__(strides, *args, **kwargs) - def gen_2d_hanning_windows(self, featmap_sizes, device='cuda'): + def gen_2d_hanning_windows(self, + featmap_sizes: List[torch.Size], + device='cuda') -> List[Tensor]: """Generate 2D hanning window. Args: - featmap_sizes (list[torch.size]): List of torch.size recording the + featmap_sizes (list[torch.Size]): List of torch.Size recording the resolution (height, width) of the multi-level feature maps. device (str): Device the tensor will be put on. Defaults to 'cuda'. @@ -39,11 +47,12 @@ def gen_2d_hanning_windows(self, featmap_sizes, device='cuda'): multi_level_windows.append(torch.from_numpy(window).to(device)) return multi_level_windows - def gen_single_level_base_anchors(self, - base_size, - scales, - ratios, - center=None): + def gen_single_level_base_anchors( + self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) -> Tensor: """Generate base anchors of a single level feature map. Args: diff --git a/mmtrack/models/task_modules/filter/__init__.py b/mmtrack/models/task_modules/filter/__init__.py new file mode 100644 index 000000000..b68faa0d6 --- /dev/null +++ b/mmtrack/models/task_modules/filter/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .filter import apply_feat_transpose, apply_filter + +__all__ = ['apply_filter', 'apply_feat_transpose'] diff --git a/mmtrack/models/task_modules/filter/filter.py b/mmtrack/models/task_modules/filter/filter.py new file mode 100644 index 000000000..695e5eacb --- /dev/null +++ b/mmtrack/models/task_modules/filter/filter.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
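``gen_2d_hanning_windows`` above returns one penalty window per feature-map level; the 2D Hanning window itself is conventionally the outer product of two 1D Hanning windows, as in this standalone sketch (the per-anchor tiling and flattening done by the real implementation are omitted here).

import numpy as np
import torch

def hanning_2d(featmap_size, device='cpu'):
    h, w = featmap_size
    window = np.outer(np.hanning(h), np.hanning(w))
    return torch.from_numpy(window).to(device)

win = hanning_2d((25, 25))
print(win.shape)   # torch.Size([25, 25])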
+# The codes are modified from https://github.com/visionml/pytracking/blob/master/ltr/models/layers/filter.py # noqa: E501 +import torch.nn.functional as F +from torch import Tensor + + +def apply_filter(feat: Tensor, filter_weights: Tensor) -> Tensor: + """Applies the filter on the input features. + + The number of groups is automatically calculated. + args: + feat (Tensor): The input features with two possible shapes in the + different modes: + - training mode: of shape (num_img_per_seq, bs, c, h, w) + - test mode: of shape (num_img_per_seq, c, h, w). + filter_weights (Tensor): The filter to be applied on the `feat`. There + are two possible shapes in the different modes: + - training mode: of shape (bs, c, filter_h, filter_w) + - test mode: of shape (1, c, filter_h, filter_w) + output: + scores (Tenosr): Output of filtering. + - train mode: (num_img_per_seq, bs, h, w) + - test mode: (num_img_per_seq, 1, h, w) + """ + padding = (filter_weights.shape[-2] // 2, filter_weights.shape[-1] // 2) + num_groups = feat.shape[1] if feat.dim() == 5 else 1 + scores = F.conv2d( + feat.reshape(feat.shape[0], -1, *feat.shape[-2:]), + filter_weights, + padding=padding, + groups=num_groups) + return scores + + +def apply_feat_transpose(feat: Tensor, + activation: Tensor, + filter_size_hw: Tensor, + training: bool = True) -> Tensor: + """The transposed operation of `apply_filter` w.r.t the filter. It can be + used to compute the filter gradient. There are two implements: the one + forwards slowly and backwards fast, which used in training, and the other + is the opposite, which used in test. + + Args: + feat (Tensor): The input features with two possible shapes in the + different modes: + - training mode: of shape (num_img_per_seq, bs, c, h, w) + - test mode: of shape (num_img_per_seq, c, h, w). + activation (Tensor): The activation (e.g. residuals between output and + label). There are two possible shapes in the different modes: + - training mode: of shape (num_img_per_seq, bs, size_h, size_w) + - test mode: of shape (num_img_per_seq, 1, size_h, size_w) + filter_size_hw (Tensor): of shape (2,) shape in [h, w] format. + training (bool, optional): Whether training mode or not. The faster + implementation is chose according to this. + + Returns: + Tensor. There are two possible shape in the + different mode: + - training mode: of shape (bs, c, filter_h, fiter_w). + - test mode: of shape (1, c, filter_h, fiter_w). 
+ """ + + if isinstance(filter_size_hw, int): + filter_size_hw = (filter_size_hw, filter_size_hw) + + if training: + # slow forward and fast backward + # TODO: check the pad difference in training and test mode + transpose_pad = [sz // 2 for sz in filter_size_hw] + num_img_per_seq = feat.shape[0] + batch_size = feat.shape[1] if feat.dim() == 5 else 1 + + filter_grad = F.conv2d( + feat.reshape(-1, *feat.shape[-3:]).permute(1, 0, 2, 3), + activation.reshape(-1, 1, *activation.shape[-2:]), + padding=transpose_pad, + groups=num_img_per_seq * batch_size) + + # The shape of returns is (bs, c, filter_h, fiter_w) + if num_img_per_seq == 1: + return filter_grad.permute(1, 0, 2, 3) + + filter_grad = filter_grad.view(-1, num_img_per_seq, batch_size, + *filter_grad.shape[-2:]) + return filter_grad.sum(dim=1).permute(1, 0, 2, 3) + else: + # fast forwward and slow backward + transpose_pad = [(sz - 1) // 2 for sz in filter_size_hw] + batch_size = feat.shape[0] + filter_grad = F.conv2d( + activation.reshape(1, -1, *activation.shape[-2:]), + feat.reshape(-1, 1, *feat.shape[-2:]), + padding=transpose_pad, + groups=batch_size) + + filter_grad = filter_grad.view(batch_size, 1, -1, + *filter_grad.shape[-2:]) + # The shape of returns is (1, c, filter_h, fiter_w) + return filter_grad.sum(dim=0).flip((2, 3)) diff --git a/mmtrack/core/motion/__init__.py b/mmtrack/models/task_modules/motion/__init__.py similarity index 100% rename from mmtrack/core/motion/__init__.py rename to mmtrack/models/task_modules/motion/__init__.py diff --git a/mmtrack/core/motion/flow.py b/mmtrack/models/task_modules/motion/flow.py similarity index 72% rename from mmtrack/core/motion/flow.py rename to mmtrack/models/task_modules/motion/flow.py index 1fd25d735..8022617eb 100644 --- a/mmtrack/core/motion/flow.py +++ b/mmtrack/models/task_modules/motion/flow.py @@ -2,7 +2,7 @@ import torch -def flow_warp_feats(x, flow): +def flow_warp_feats(x: torch.Tensor, flow: torch.Tensor) -> torch.Tensor: """Use flow to warp feature map. Args: @@ -12,13 +12,15 @@ def flow_warp_feats(x, flow): Returns: Tensor: The warpped feature map with shape (N, C, H_x, W_x). """ - assert len(x.shape) == 4 - assert len(flow.shape) == 4 and flow.shape[1] == 2 + assert x.dim() == 4 + assert flow.dim() == 4 and flow.size(1) == 2 # 1. resize the resolution of flow to be the same as x. - scale_factor = float(x.shape[-1]) / flow.shape[-1] + scale_factor_w = float(x.shape[-1]) / flow.shape[-1] + scale_factor_h = float(x.shape[-2]) / flow.shape[-2] flow = torch.nn.functional.interpolate( - flow, scale_factor=scale_factor, mode='bilinear', align_corners=False) - flow = flow * scale_factor + flow, size=x.shape[-2:], mode='bilinear', align_corners=False) + flow[:, 0] = flow[:, 0] * scale_factor_w + flow[:, 1] = flow[:, 1] * scale_factor_h # 2. compute the flow_field (grid in the code) used to warp features. H, W = x.shape[-2:] diff --git a/mmtrack/models/task_modules/track/__init__.py b/mmtrack/models/task_modules/track/__init__.py new file mode 100644 index 000000000..151242c95 --- /dev/null +++ b/mmtrack/models/task_modules/track/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .aflink import AppearanceFreeLink +from .correlation import depthwise_correlation +from .interpolation import InterpolateTracklets +from .similarity import embed_similarity + +__all__ = [ + 'depthwise_correlation', 'embed_similarity', 'InterpolateTracklets', + 'AppearanceFreeLink' +] diff --git a/mmtrack/models/task_modules/track/aflink.py b/mmtrack/models/task_modules/track/aflink.py new file mode 100644 index 000000000..8a45848d8 --- /dev/null +++ b/mmtrack/models/task_modules/track/aflink.py @@ -0,0 +1,281 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import Tuple + +import numpy as np +import torch +from mmengine.model import BaseModule +from mmengine.runner.checkpoint import load_checkpoint +from scipy.optimize import linear_sum_assignment +from torch import Tensor, nn + +from mmtrack.registry import TASK_UTILS + +INFINITY = 1e5 + + +class TemporalBlock(BaseModule): + """The temporal block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. + """ + + def __init__(self, + in_channel: int, + out_channel: int, + kernel_size: tuple = (7, 1)): + super(TemporalBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, kernel_size, bias=False) + self.relu = nn.ReLU(inplace=True) + self.bnf = nn.BatchNorm1d(out_channel) + self.bnx = nn.BatchNorm1d(out_channel) + self.bny = nn.BatchNorm1d(out_channel) + + def bn(self, x: Tensor) -> Tensor: + x[:, :, :, 0] = self.bnf(x[:, :, :, 0]) + x[:, :, :, 1] = self.bnx(x[:, :, :, 1]) + x[:, :, :, 2] = self.bny(x[:, :, :, 2]) + return x + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class FusionBlock(BaseModule): + """The fusion block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. + """ + + def __init__(self, in_channel: int, out_channel: int): + super(FusionBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, (1, 3), bias=False) + self.bn = nn.BatchNorm2d(out_channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Classifier(BaseModule): + """The classifier of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. 
+ """ + + def __init__(self, in_channel: int, out_channel: int): + super(Classifier, self).__init__() + self.fc1 = nn.Linear(in_channel * 2, in_channel // 2) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Linear(in_channel // 2, out_channel) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + x = torch.cat((x1, x2), dim=1) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + +class AFLinkModel(BaseModule): + """Appearance-Free Link Model.""" + + def __init__(self, + temporal_module_channels: list = [1, 32, 64, 128, 256], + fusion_module_channels: list = [256, 256], + classifier_channels: list = [256, 2]): + super(AFLinkModel, self).__init__() + self.TemporalModule_1 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.TemporalModule_2 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.FusionBlock_1 = FusionBlock(*fusion_module_channels) + self.FusionBlock_2 = FusionBlock(*fusion_module_channels) + + self.pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.classifier = Classifier(*classifier_channels) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + assert not self.training, 'Only testing is supported for AFLink.' + x1 = x1[:, :, :, :3] + x2 = x2[:, :, :, :3] + x1 = self.TemporalModule_1(x1) # [B,1,30,3] -> [B,256,6,3] + x2 = self.TemporalModule_2(x2) + x1 = self.FusionBlock_1(x1) + x2 = self.FusionBlock_2(x2) + x1 = self.pooling(x1).squeeze(-1).squeeze(-1) + x2 = self.pooling(x2).squeeze(-1).squeeze(-1) + y = self.classifier(x1, x2) + y = torch.softmax(y, dim=1)[0, 1] + return y + + +@TASK_UTILS.register_module() +class AppearanceFreeLink(BaseModule): + """Appearance-Free Link method. + + This method is proposed in + "StrongSORT: Make DeepSORT Great Again" + `StrongSORT`_. + + Args: + checkpoint (str): Checkpoint path. + temporal_threshold (tuple, optional): The temporal constraint + for tracklets association. Defaults to (0, 30). + spatial_threshold (int, optional): The spatial constraint for + tracklets association. Defaults to 75. + confidence_threshold (float, optional): The minimum confidence + threshold for tracklets association. Defaults to 0.95. + """ + + def __init__(self, + checkpoint: str, + temporal_threshold: tuple = (0, 30), + spatial_threshold: int = 75, + confidence_threshold: float = 0.95): + super(AppearanceFreeLink, self).__init__() + self.temporal_threshold = temporal_threshold + self.spatial_threshold = spatial_threshold + self.confidence_threshold = confidence_threshold + + self.model = AFLinkModel() + if checkpoint: + load_checkpoint(self.model, checkpoint) + if torch.cuda.is_available(): + self.model.cuda() + self.model.eval() + + self.device = next(self.model.parameters()).device + self.fn_l2 = lambda x, y: np.sqrt(x**2 + y**2) + + def data_transform(self, + track1: np.ndarray, + track2: np.ndarray, + length: int = 30) -> Tuple[np.ndarray]: + """Data Transformation. This is used to standardize the length of + tracks to a unified length. Then perform min-max normalization to the + motion embeddings. + + Args: + track1 (ndarray): the first track with shape (N,C). + track2 (ndarray): the second track with shape (M,C). + length (int): the unified length of tracks. Defaults to 30. + + Returns: + Tuple[ndarray]: the transformed track1 and track2. 
+ """ + # fill or cut track1 + length_1 = track1.shape[0] + track1 = track1[-length:] if length_1 >= length else \ + np.pad(track1, ((length - length_1, 0), (0, 0))) + + # fill or cut track1 + length_2 = track2.shape[0] + track2 = track2[:length] if length_2 >= length else \ + np.pad(track2, ((0, length - length_2), (0, 0))) + + # min-max normalization + min_ = np.concatenate((track1, track2), axis=0).min(axis=0) + max_ = np.concatenate((track1, track2), axis=0).max(axis=0) + subtractor = (max_ + min_) / 2 + divisor = (max_ - min_) / 2 + 1e-5 + track1 = (track1 - subtractor) / divisor + track2 = (track2 - subtractor) / divisor + + return track1, track2 + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The linked tracks with shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score) + """ + # sort tracks by the frame id + pred_tracks = pred_tracks[np.argsort(pred_tracks[:, 0])] + + # gather tracks information + id2info = defaultdict(list) + for row in pred_tracks: + frame_id, track_id, x1, y1, x2, y2 = row[:6] + id2info[track_id].append([frame_id, x1, y1, x2 - x1, y2 - y1]) + id2info = {k: np.array(v) for k, v in id2info.items()} + num_track = len(id2info) + track_ids = np.array(list(id2info)) + cost_matrix = np.full((num_track, num_track), INFINITY) + + # compute the cost matrix + for i, id_i in enumerate(track_ids): + for j, id_j in enumerate(track_ids): + if id_i == id_j: + continue + info_i, info_j = id2info[id_i], id2info[id_j] + frame_i, box_i = info_i[-1][0], info_i[-1][1:3] + frame_j, box_j = info_j[0][0], info_j[0][1:3] + # temporal constraint + if not self.temporal_threshold[0] <= \ + frame_j - frame_i <= self.temporal_threshold[1]: + continue + # spatial constraint + if self.fn_l2(box_i[0] - box_j[0], box_i[1] - box_j[1]) \ + > self.spatial_threshold: + continue + # confidence constraint + track_i, track_j = self.data_transform(info_i, info_j) + + # numpy to torch + track_i = torch.tensor( + track_i, dtype=torch.float).to(self.device) + track_j = torch.tensor( + track_j, dtype=torch.float).to(self.device) + track_i = track_i.unsqueeze(0).unsqueeze(0) + track_j = track_j.unsqueeze(0).unsqueeze(0) + + confidence = self.model(track_i, + track_j).detach().cpu().numpy() + if confidence >= self.confidence_threshold: + cost_matrix[i, j] = 1 - confidence + + # linear assignment + indices = linear_sum_assignment(cost_matrix) + _id2id = dict() # the temporary assignment results + id2id = dict() # the final assignment results + for i, j in zip(indices[0], indices[1]): + if cost_matrix[i, j] < INFINITY: + _id2id[i] = j + for k, v in _id2id.items(): + if k in id2id: + id2id[v] = id2id[k] + else: + id2id[v] = k + + # link + for k, v in id2id.items(): + pred_tracks[pred_tracks[:, 1] == k, 1] = v + + # deduplicate + _, index = np.unique(pred_tracks[:, :2], return_index=True, axis=0) + + return pred_tracks[index] diff --git a/mmtrack/core/track/correlation.py b/mmtrack/models/task_modules/track/correlation.py similarity index 73% rename from mmtrack/core/track/correlation.py rename to mmtrack/models/task_modules/track/correlation.py index 944139259..716a055c0 100644 --- a/mmtrack/core/track/correlation.py +++ b/mmtrack/models/task_modules/track/correlation.py @@ -17,8 +17,8 @@ def depthwise_correlation(x, kernel): """ batch = kernel.size(0) channel = kernel.size(1) - x = x.view(1, batch * channel, x.size(2), 
x.size(3)) - kernel = kernel.view(batch * channel, 1, kernel.size(2), kernel.size(3)) + x = x.reshape(1, batch * channel, x.size(2), x.size(3)) + kernel = kernel.reshape(batch * channel, 1, kernel.size(2), kernel.size(3)) out = F.conv2d(x, kernel, groups=batch * channel) - out = out.view(batch, channel, out.size(2), out.size(3)) + out = out.reshape(batch, channel, out.size(2), out.size(3)) return out diff --git a/mmtrack/models/task_modules/track/interpolation.py b/mmtrack/models/task_modules/track/interpolation.py new file mode 100644 index 000000000..af51aeaa2 --- /dev/null +++ b/mmtrack/models/task_modules/track/interpolation.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from sklearn.gaussian_process import GaussianProcessRegressor as GPR +from sklearn.gaussian_process.kernels import RBF + +from mmtrack.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class InterpolateTracklets: + """Interpolate tracks to make tracks more complete. + + Args: + min_num_frames (int, optional): The minimum length of a track that will + be interpolated. Defaults to 5. + max_num_frames (int, optional): The maximum disconnected length in + a track. Defaults to 20. + use_gsi (bool, optional): Whether to use the GSI (Gaussian-smoothed + interpolation) method. Defaults to False. + smooth_tau (int, optional): smoothing parameter in GSI. Defaults to 10. + """ + + def __init__(self, + min_num_frames: int = 5, + max_num_frames: int = 20, + use_gsi: bool = False, + smooth_tau: int = 10): + self.min_num_frames = min_num_frames + self.max_num_frames = max_num_frames + self.use_gsi = use_gsi + self.smooth_tau = smooth_tau + + def _interpolate_track(self, + track: np.ndarray, + track_id: int, + max_num_frames: int = 20) -> np.ndarray: + """Interpolate a track linearly to make the track more complete. + + This function is proposed in + "ByteTrack: Multi-Object Tracking by Associating Every Detection Box." + `ByteTrack`_. + + Args: + track (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + max_num_frames (int, optional): The maximum disconnected length in + the track. Defaults to 20. + + Returns: + ndarray: The interpolated track with shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score) + """ + assert (track[:, 1] == track_id).all(), \ + 'The track id should not changed when interpolate a track.' + + frame_ids = track[:, 0] + interpolated_track = np.zeros((0, 7)) + # perform interpolation for the disconnected frames in the track. + for i in np.where(np.diff(frame_ids) > 1)[0]: + left_frame_id = frame_ids[i] + right_frame_id = frame_ids[i + 1] + num_disconnected_frames = int(right_frame_id - left_frame_id) + + if 1 < num_disconnected_frames < max_num_frames: + left_bbox = track[i, 2:6] + right_bbox = track[i + 1, 2:6] + + # perform interpolation for two adjacent tracklets. + for j in range(1, num_disconnected_frames): + cur_bbox = j / (num_disconnected_frames) * ( + right_bbox - left_bbox) + left_bbox + cur_result = np.ones((7, )) + cur_result[0] = j + left_frame_id + cur_result[1] = track_id + cur_result[2:6] = cur_bbox + + interpolated_track = np.concatenate( + (interpolated_track, cur_result[None]), axis=0) + + interpolated_track = np.concatenate((track, interpolated_track), + axis=0) + return interpolated_track + + def gaussian_smoothed_interpolation(self, + track: np.ndarray, + smooth_tau: int = 10) -> np.ndarray: + """Gaussian-Smoothed Interpolation. 
+ + This function is proposed in + "StrongSORT: Make DeepSORT Great Again" + `StrongSORT`_. + + Args: + track (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + smooth_tau (int, optional): smoothing parameter in GSI. + Defaults to 10. + + Returns: + ndarray: The interpolated tracks with shape (N, 7). Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score) + """ + len_scale = np.clip(smooth_tau * np.log(smooth_tau**3 / len(track)), + smooth_tau**-1, smooth_tau**2) + gpr = GPR(RBF(len_scale, 'fixed')) + t = track[:, 0].reshape(-1, 1) + x1 = track[:, 2].reshape(-1, 1) + y1 = track[:, 3].reshape(-1, 1) + x2 = track[:, 4].reshape(-1, 1) + y2 = track[:, 5].reshape(-1, 1) + gpr.fit(t, x1) + x1_gpr = gpr.predict(t) + gpr.fit(t, y1) + y1_gpr = gpr.predict(t) + gpr.fit(t, x2) + x2_gpr = gpr.predict(t) + gpr.fit(t, y2) + y2_gpr = gpr.predict(t) + gsi_track = [[ + t[i, 0], track[i, 1], x1_gpr[i], y1_gpr[i], x2_gpr[i], y2_gpr[i], + track[i, 6] + ] for i in range(len(t))] + return np.array(gsi_track) + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The interpolated tracks with shape (N, 7). Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score). + """ + max_track_id = int(np.max(pred_tracks[:, 1])) + min_track_id = int(np.min(pred_tracks[:, 1])) + + # perform interpolation for each track + interpolated_tracks = [] + for track_id in range(min_track_id, max_track_id + 1): + inds = pred_tracks[:, 1] == track_id + track = pred_tracks[inds] + num_frames = len(track) + if num_frames <= 2: + continue + + if num_frames > self.min_num_frames: + interpolated_track = self._interpolate_track( + track, track_id, self.max_num_frames) + else: + interpolated_track = track + + if self.use_gsi: + interpolated_track = self.gaussian_smoothed_interpolation( + interpolated_track, self.smooth_tau) + + interpolated_tracks.append(interpolated_track) + + interpolated_tracks = np.concatenate(interpolated_tracks) + return interpolated_tracks[interpolated_tracks[:, 0].argsort()] diff --git a/mmtrack/core/track/similarity.py b/mmtrack/models/task_modules/track/similarity.py similarity index 80% rename from mmtrack/core/track/similarity.py rename to mmtrack/models/task_modules/track/similarity.py index 67305e056..730e43b86 100644 --- a/mmtrack/core/track/similarity.py +++ b/mmtrack/models/task_modules/track/similarity.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn.functional as F +from torch import Tensor -def embed_similarity(key_embeds, - ref_embeds, - method='dot_product', - temperature=-1): +def embed_similarity(key_embeds: Tensor, + ref_embeds: Tensor, + method: str = 'dot_product', + temperature: int = -1) -> Tensor: """Calculate feature similarity from embeddings. Args: diff --git a/mmtrack/models/track_heads/__init__.py b/mmtrack/models/track_heads/__init__.py index 3e4d3819a..558f38781 100644 --- a/mmtrack/models/track_heads/__init__.py +++ b/mmtrack/models/track_heads/__init__.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
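A hedged usage sketch for the ``InterpolateTracklets`` class added above. The import assumes the wildcard re-exports introduced by the new ``task_modules`` ``__init__`` files; the toy track has a four-frame gap that the linear interpolation step fills in.

import numpy as np
from mmtrack.models.task_modules import InterpolateTracklets

interp = InterpolateTracklets(min_num_frames=5, max_num_frames=20)
# rows: (frame_id, track_id, x1, y1, x2, y2, score)
track = np.array([[f, 1, 10.0 + f, 20.0, 30.0 + f, 40.0, 0.9]
                  for f in (0, 1, 2, 3, 4, 5, 10, 11, 12)])
out = interp.forward(track)
print(len(track), '->', len(out))   # 9 -> 13: frames 6-9 were filled in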
+from .iounet_head import IouNetHead +from .mask2former_head import Mask2FormerHead +from .prdimp_cls_head import PrDiMPClsHead from .quasi_dense_embed_head import QuasiDenseEmbedHead from .quasi_dense_track_head import QuasiDenseTrackHead from .roi_embed_head import RoIEmbedHead @@ -9,5 +12,5 @@ __all__ = [ 'CorrelationHead', 'SiameseRPNHead', 'RoIEmbedHead', 'RoITrackHead', 'StarkHead', 'CornerPredictorHead', 'QuasiDenseEmbedHead', - 'QuasiDenseTrackHead' + 'QuasiDenseTrackHead', 'PrDiMPClsHead', 'IouNetHead', 'Mask2FormerHead' ] diff --git a/mmtrack/models/track_heads/iounet_head.py b/mmtrack/models/track_heads/iounet_head.py new file mode 100644 index 000000000..aef453ee4 --- /dev/null +++ b/mmtrack/models/track_heads/iounet_head.py @@ -0,0 +1,653 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Tuple + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import ConvModule +from mmcv.ops import PrRoIPool +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmengine.model import BaseModule +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.structures.bbox import (bbox_cxcywh_to_x1y1wh, + bbox_rel_cxcywh_to_xywh, + bbox_xywh_to_rel_cxcywh, + bbox_xyxy_to_x1y1wh) +from mmtrack.utils import OptConfigType, SampleList + + +class LinearBlock(nn.Module): + """The linear block. The full pipeline: FC > BN > ReLU. + + Args: + in_planes (int): The dim of input. + out_planes (int): The dim of output. + input_size (int): The size of input. + bias (bool, optional): Whether to have bias in linear layer. Defaults + to True. + batch_norm (bool, optional): Whether to have BN after linear layer. + Defaults to True. + relu (bool, optional): Whether to have ReLU at last. Defaults to True. + """ + + def __init__(self, + in_planes: int, + out_planes: int, + input_size: int, + bias: bool = True, + batch_norm: bool = True, + relu: bool = True): + super().__init__() + self.linear = nn.Linear( + in_planes * input_size * input_size, out_planes, bias=bias) + self.bn = nn.BatchNorm2d(out_planes) if batch_norm else None + self.relu = nn.ReLU(inplace=True) if relu else None + + def forward(self, x): + x = self.linear(x.reshape(x.shape[0], -1)) + if self.bn is not None: + x = self.bn(x.reshape(x.shape[0], x.shape[1], 1, 1)) + if self.relu is not None: + x = self.relu(x) + return x.reshape(x.shape[0], -1) + + +@MODELS.register_module() +class IouNetHead(BaseModule): + """Module for IoU prediction. + + Refer to the ATOM paper for a detailed illustration of the architecture. + `ATOM: `_. + + Args: + in_dim (tuple(int), optional): Feature dimensionality from the two + input backbone layers. Defaults to (128, 256). + pred_in_dim (tuple(int), optional): Input dimensionality of the + prediction network. Defaults to (256, 256). + pred_inter_dim (tuple(int), optional): Intermediate dimensionality in + the prediction network. Defaults to (256, 256). + bbox_cfg (dict, optional): The configuration of bbox refinement. + Defaults to None. Defaults to None. + train_cfg (dict, optional): The configuration of training. + Defaults to None. + loss_bbox (dict, optional): The configuration of loss. + Defaults to None. 
+ """ + + def __init__(self, + in_dim: Tuple = (128, 256), + pred_in_dim: Tuple = (256, 256), + pred_inter_dim: Tuple = (256, 256), + bbox_cfg: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_bbox: OptConfigType = None, + **kwargs): + super().__init__() + self.bbox_cfg = bbox_cfg + self.test_cfg = test_cfg + self.train_cfg = train_cfg + + self.loss_bbox = MODELS.build(loss_bbox) + + def conv_module(in_planes, out_planes, kernel_size=3, padding=1): + # The module's pipeline: Conv -> BN -> ReLU. + return ConvModule( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + padding=padding, + bias=False, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + inplace=True) + + # `*_temp` denotes template branch, `*_search` denotes search branch + # The `number` in the names of variables are block indexes in the + # backbone or indexes of head layer. + self.conv3_temp = conv_module(in_dim[0], 128) + self.roi3_temp = PrRoIPool(3, 1 / 8) + self.fc3_temp = conv_module(128, 256, padding=0) + self.fc34_3_temp = conv_module( + 256 + 256, pred_in_dim[0], kernel_size=1, padding=0) + + self.conv4_temp = conv_module(in_dim[1], 256) + self.roi4_temp = PrRoIPool(1, 1 / 16) + self.fc34_4_temp = conv_module( + 256 + 256, pred_in_dim[1], kernel_size=1, padding=0) + + self.conv3_search = nn.Sequential( + conv_module(in_dim[0], 256), conv_module(256, pred_in_dim[0])) + self.roi3_search = PrRoIPool(5, 1 / 8) + self.fc3_search = LinearBlock(pred_in_dim[0], pred_inter_dim[0], 5) + + self.conv4_search = nn.Sequential( + conv_module(in_dim[1], 256), conv_module(256, pred_in_dim[1])) + self.roi4_search = PrRoIPool(3, 1 / 16) + self.fc4_search = LinearBlock(pred_in_dim[1], pred_inter_dim[1], 3) + + self.iou_predictor = nn.Linear( + pred_inter_dim[0] + pred_inter_dim[1], 1, bias=True) + + def init_weights(self): + """Initialize the parameters of this module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance( + m, nn.ConvTranspose2d) or isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight.data, mode='fan_in') + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.uniform_() + m.bias.data.zero_() + + def predict_iou(self, modulations: Tuple[Tensor], feats: Tensor, + proposals: Tensor) -> Tensor: + """Predicts IoU for the give proposals. + + Args: + modulations (Tuple(Tensor)): contains the features from two layers. + The inner tensors are of shape (bs, c, 1, 1) + feats (Tuple(Tensor)): IoU features for test images. + The inner tensors are of shape (bs, c, h, w). + proposals (Tuple[Tensor]): Proposal boxes for which the IoU will + be predicted (bs, num_proposals, 4). + + Returns: + Tensor: IoU between the proposals with the groundtruth boxes. It's + of shape (bs, num_proposals). + """ + # `*_temp` denotes template branch, `*_search` denotes search branch + # The `number` in the names of variables are block indexes in the + # backbone or indexes of head layer. 
+ fc34_3_temp, fc34_4_temp = modulations + conv3_search, conv4_search = feats + batch_size = conv3_search.shape[0] + + # Modulation + conv3_search_att = conv3_search * fc34_3_temp + conv4_search_att = conv4_search * fc34_4_temp + + # Push the different rois for the same image along the batch dimension + num_proposals_per_batch = proposals.shape[1] + + # input proposals2 is in format xywh, convert it to x0y0x1y1 format + proposals_xyxy = torch.cat( + (proposals[..., 0:2], proposals[..., 0:2] + proposals[..., 2:4]), + dim=2) + + # Add batch_index to rois + batch_index = torch.arange( + batch_size, dtype=torch.float32).reshape(-1, 1, + 1).to(proposals.device) + roi = torch.cat((batch_index.expand(-1, num_proposals_per_batch, + -1), proposals_xyxy), + dim=2) + roi = roi.reshape(-1, 5) + + roi3_search = self.roi3_search(conv3_search_att, roi) + roi4_search = self.roi4_search(conv4_search_att, roi) + + fc3_search = self.fc3_search(roi3_search) + fc4_search = self.fc4_search(roi4_search) + + fc34_search_cat = torch.cat((fc3_search, fc4_search), dim=1) + + iou_pred = self.iou_predictor(fc34_search_cat).reshape( + batch_size, num_proposals_per_batch) + + return iou_pred + + def get_modulation(self, feats: Tuple[Tensor], + bboxes: Tensor) -> Tuple[Tensor, Tensor]: + """Get modulation vectors for the targets in the search branch. + + Args: + feats (tuple(Tensor)): Backbone features from template branch. + It's of shape (bs, c, h, w). + bboxes (Tensor): Target boxes (x1, y1, x2, y2) in image coords in + the template branch. It's of shape (bs, 4). + + Returns: + fc34_3_temp (Tensor): of shape (bs, c, 1, 1). + fc34_4_temp (Tensor): of shape (bs, c, 1, 1). + """ + + # Add batch_index to rois + batch_size = bboxes.shape[0] + batch_index = torch.arange( + batch_size, dtype=torch.float32).reshape(-1, 1).to(bboxes.device) + roi = torch.cat((batch_index, bboxes), dim=1) + + # Perform conv and prpool on the feature maps from the backbone + # `*_temp` denotes template branch, `*_search` denotes search branch + # The `number` in the names of variables are block indexes in the + # backbone or indexes of head layer. + feat3_temp, feat4_temp = feats + conv3_temp = self.conv3_temp(feat3_temp) + roi3_temp = self.roi3_temp(conv3_temp, roi) + fc3_temp = self.fc3_temp(roi3_temp) + + c4_temp = self.conv4_temp(feat4_temp) + roi4_temp = self.roi4_temp(c4_temp, roi) + + # Concatenate from block 3 and 4 + fc34_temp = torch.cat((fc3_temp, roi4_temp), dim=1) + + fc34_3_temp = self.fc34_3_temp(fc34_temp) + fc34_4_temp = self.fc34_4_temp(fc34_temp) + + return fc34_3_temp, fc34_4_temp + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor, Tensor]: + """Get IoU prediction features from a 4 or 5 dimensional backbone + input. + + Args: + feats (tuple(Tensor)): Containing the features from backbone with + shape (bs, c, h, w) + + Returns: + conv3_search (Tensor): Features from the `conv3_search` branch. + conv4_search (Tensor): Features from the `conv4_search` branch. + """ + # `*_temp` denotes template branch, `*_search` denotes search branch + # The `number` in the names of variables are block indexes in the + # backbone or indexes of head layer. 
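In isolation, the RoI packing performed by ``predict_iou`` above: per-image proposals in [x, y, w, h] are converted to [x1, y1, x2, y2], prefixed with their batch index, and flattened into the (num_rois, 5) layout that the ``PrRoIPool`` layers expect. Shapes here are arbitrary.

import torch

bs, num_proposals = 2, 8
proposals = torch.rand(bs, num_proposals, 4)            # [x, y, w, h]
proposals_xyxy = torch.cat(
    (proposals[..., :2], proposals[..., :2] + proposals[..., 2:4]), dim=2)
batch_index = torch.arange(bs, dtype=torch.float32).view(-1, 1, 1)
rois = torch.cat(
    (batch_index.expand(-1, num_proposals, -1), proposals_xyxy),
    dim=2).reshape(-1, 5)
print(rois.shape)   # torch.Size([16, 5])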
+ feats = [ + f.reshape(-1, *f.shape[-3:]) if f.dim() == 5 else f for f in feats + ] + feat3_search, feat4_search = feats + conv3_search = self.conv3_search(feat3_search) + conv4_search = self.conv4_search(feat4_search) + + return conv3_search, conv4_search + + def init_iou_net(self, iou_backbone_feats: Tensor, bboxes: Tensor): + """Initialize the IoUNet with feature are from the 'layer2' and + 'layer3' of backbone. + + Args: + iou_backbone_feats (tuple(Tensor)): The features from the backbone. + bboxes (Tensor): of shape (4, ) or (1, 4) in [cx, cy, w, h] format. + """ + bboxes = bbox_cxcywh_to_xyxy(bboxes.view(-1, 4)) + # Get modulation vector + self.iou_modulation = self.get_modulation(iou_backbone_feats, bboxes) + + def optimize_bboxes(self, iou_features: Tuple[Tensor], + init_bboxes: Tensor) -> Tuple[Tensor, Tensor]: + """Optimize the bboxes. + + Args: + iou_features (tuple(Tensor)): The features used to predict IoU. + init_bboxes (Tensor): The initialized bboxes with shape (N,4) in + [cx, cy, w, h] format. + + Returns: + Tensor: The optimized bboxes with shape (N,4) in [x, y, w, h] + format. + Tensor: The predict IoU of the optimized bboxes with shape (N, ). + """ + step_length = self.bbox_cfg['box_refine_step_length'] + if isinstance(step_length, (tuple, list)): + step_length = torch.Tensor([ + step_length[0], step_length[0], step_length[1], step_length[1] + ]).to(iou_features[0].device).view(1, 1, 4) + + # TODO: simplify this series of transform + output_bboxes = bbox_cxcywh_to_x1y1wh(init_bboxes) + output_bboxes = output_bboxes.view(1, -1, 4) + bboxes_sz_norm = output_bboxes[:, :1, 2:].clone() + output_bboxes_rel = bbox_xywh_to_rel_cxcywh(output_bboxes, + bboxes_sz_norm) + + with torch.set_grad_enabled(True): + for _ in range(self.bbox_cfg['box_refine_iter']): + # Forward + bboxes_init_rel = output_bboxes_rel.clone().detach() + bboxes_init_rel.requires_grad = True + + bboxes_init = bbox_rel_cxcywh_to_xywh(bboxes_init_rel, + bboxes_sz_norm) + iou_outputs = self.predict_iou(self.iou_modulation, + iou_features, bboxes_init) + # Backward + iou_outputs.backward(gradient=torch.ones_like(iou_outputs)) + + # Update bboxes + output_bboxes_rel = bboxes_init_rel + ( + step_length * bboxes_init_rel.grad) + output_bboxes_rel.detach_() + + step_length *= self.bbox_cfg['box_refine_step_decay'] + + output_bboxes = bbox_rel_cxcywh_to_xywh(output_bboxes_rel, + bboxes_sz_norm) + + return output_bboxes.view(-1, 4), iou_outputs.detach().view(-1) + + def predict(self, backbone_feats: Tensor, data_samples: SampleList, + init_bbox: Tensor, sample_center: Tensor, + scale_factor: float) -> Tensor: + """Refine the target bounding box. + + Args: + init_bbox (Tensor): of shape (4, ) or (1, 4) in [cx, cy, w, h] + formmat. + backbone_feats (tuple(Tensor)): of shape (1, c, h, w) + sample_center (Tensor): The center of the cropped + sample on the original image. It's in [x, y] format. + scale_factor (float): The size ratio of the cropped patch to the + resized image. + + Returns: + Tensor: The refined target bbox in [cx, cy, w, h] format. + """ + init_bbox = init_bbox.squeeze() + sample_center = sample_center.squeeze() + assert sample_center.dim() == 1 + + iou_features = self(backbone_feats) + return self.predict_by_feat(iou_features, init_bbox, sample_center, + scale_factor) + + def predict_by_feat(self, iou_features: Tensor, init_bbox: Tensor, + sample_center: Tensor, scale_factor: Tensor) -> Tensor: + """Refine the target bounding box. + + Args: + init_bbox (Tensor): The init target bbox. 
+ iou_features (Tensor): The features for IoU prefiction. + sample_center (Tensor): The coordinate of the sample center based + on the original image. + scale_factor (float): The size ratio of the cropped patch to the + resized image. + + Returns: + Tensor: The refined target bbox in [cx, cy, w, h] format. + """ + + # Generate some random initial boxes based on the `init_bbox` + init_bbox = init_bbox.view(1, 4) + init_bboxes = init_bbox.clone() + if self.bbox_cfg['num_init_random_boxes'] > 0: + square_box_sz = init_bbox[0, 2:].prod().sqrt().item() + rand_factor = square_box_sz * torch.cat([ + self.bbox_cfg['box_jitter_pos'] * torch.ones(2), + self.bbox_cfg['box_jitter_sz'] * torch.ones(2) + ]) + min_edge_size = init_bbox[0, 2:].min() / 3 + rand_bboxes_jitter = (torch.rand( + self.bbox_cfg['num_init_random_boxes'], 4) - 0.5) * rand_factor + rand_bboxes_jitter = rand_bboxes_jitter.to(init_bbox.device) + new_size = (init_bbox[:, 2:] + + rand_bboxes_jitter[:, 2:]).clamp(min_edge_size) + new_center = init_bbox[:, :2] + rand_bboxes_jitter[:, :2] + init_bboxes = torch.cat([new_center, new_size], 1) + init_bboxes = torch.cat([init_bbox, init_bboxes]) + + # Optimize the boxes + out_bboxes, out_iou = self.optimize_bboxes(iou_features, init_bboxes) + + return self._bbox_post_process(out_bboxes, out_iou, sample_center, + scale_factor) + + def _bbox_post_process(self, out_bboxes: Tensor, out_ious: Tensor, + sample_center: Tensor, + scale_factor: float) -> Tensor: + """The post process about bbox. + + Args: + out_bboxes (Tensor): The several optimized bboxes. + out_ious (Tensor): The IoUs about the optimized bboxes. + sample_center (Tensor): The coordinate of the sample center based + on the original image. + scale_factor (float): The size ratio of the cropped patch to the + resized image. + + Returns: + Tensor: The refined target bbox in [cx, cy, w, h] format. + """ + # Remove weird boxes according to the ratio of aspect + out_bboxes[:, 2:].clamp_(1) + aspect_ratio = out_bboxes[:, 2] / out_bboxes[:, 3] + keep_ind = (aspect_ratio < self.bbox_cfg['max_aspect_ratio']) * ( + aspect_ratio > 1 / self.bbox_cfg['max_aspect_ratio']) + out_bboxes = out_bboxes[keep_ind, :] + out_ious = out_ious[keep_ind] + + # If no box found + if out_bboxes.shape[0] == 0: + return None + + # Predict box + k = self.bbox_cfg['iounet_topk'] + topk = min(k, out_bboxes.shape[0]) + _, inds = torch.topk(out_ious, topk) + # in [x,y,w,h] format + predicted_box = out_bboxes[inds, :].mean(0) + + # Convert the bbox of the cropped sample to that of original image. + # TODO: this postprocess about mapping back can be moved to other place + new_bbox_center = predicted_box[:2] + predicted_box[2:] / 2 + new_bbox_center = (new_bbox_center - self.test_cfg['img_sample_size'] / + 2) * scale_factor + sample_center + new_target_size = predicted_box[2:] * scale_factor + + return torch.cat([new_bbox_center, new_target_size], dim=-1) + + def _gauss_density_centered(self, x: Tensor, sigma: Tensor) -> Tensor: + """Evaluate the probability density of a Gaussian centered at zero. + + args: + x (Tensor): of (num_smples, 4) shape. + sigma (Tensor): Standard deviations with (1, 4, 2) shape. + """ + + return torch.exp(-0.5 * (x / sigma)**2) / ( + math.sqrt(2 * math.pi) * sigma) + + def _gmm_density_centered(self, x: Tensor, sigma: Tensor) -> Tensor: + """Evaluate the probability density of a GMM centered at zero. + + args: + x(Tensor): of (num_smples, 4) shape. + sigma (Tensor): Tensor of standard deviations with (1, 4, 2) shape. 
+ """ + if x.dim() == sigma.dim() - 1: + x = x[..., None] + elif not (x.dim() == sigma.dim() and x.shape[-1] == 1): + raise ValueError('Last dimension must be the gmm sigmas.') + + # ``product`` along feature dim of ``bbox```, ``mean``` along component + # dim of ``sigma``. + return self._gauss_density_centered(x, sigma).prod(-2).mean(-1) + + def _sample_gmm_centered(self, + sigma: Tensor, + num_samples: int = 1) -> Tuple[Tensor, Tensor]: + """Sample from a GMM distribution centered at zero. + + Args: + sigma (Tensor): Standard deviations of bbox coordinates with + [4, 2] shape. + num_samples (int, optional): The number of samples. + + Returns: + x_centered (Tensor): of shape (num_samples, num_dims) + prob_density (Tensor): of shape (num_samples, ) + """ + num_components = sigma.shape[-1] + num_dims = sigma.shape[-2] + + sigma = sigma.reshape(1, num_dims, num_components) + + # Sampling component index + k = torch.randint( + num_components, size=(num_samples, ), dtype=torch.int64) + sigma_samples = sigma[0, :, k].t() + + x_centered = sigma_samples * torch.randn(num_samples, num_dims).to( + sigma_samples.device) + prob_density = self._gmm_density_centered(x_centered, sigma) + + return x_centered, prob_density + + def get_targets(self, bbox: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + """Generate the training targets for search images. + + Args: + bbox (Tensor): The bbox of (N, 4) shape in [x, y, w, h] format. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + ``proposals``: proposals with [num_samples, 4] shape. + ``proposal_density``: proposal density with [num_samples, ] + shape. + ``gt_density``: groundtruth density with [num_samples, ] shape. + """ + bbox = bbox.clone().reshape(-1, 4) + bbox_wh = bbox[:, 2:] + + if not hasattr(self, 'proposals_sigma'): + center_sigma = torch.tensor( + [s[0] for s in self.train_cfg['proposals_sigma']]) + size_sigma = torch.tensor( + [s[1] for s in self.train_cfg['proposals_sigma']]) + # of shape (4, len(train_cfg['proposal_sigma'])) + self.proposals_sigma = torch.stack( + (center_sigma, center_sigma, size_sigma, size_sigma), + dim=0).to(bbox.device) + + if not hasattr(self, 'gt_bboxes_sigma'): + # of shape (1, 4) + self.gt_bboxes_sigma = torch.tensor( + (self.train_cfg['gt_bboxes_sigma'][0], + self.train_cfg['gt_bboxes_sigma'][0], + self.train_cfg['gt_bboxes_sigma'][1], + self.train_cfg['gt_bboxes_sigma'][1]), + dtype=torch.float32, + device=bbox.device).reshape(-1, 4) + + # Sample boxes + proposals_rel_centered, proposal_density = self._sample_gmm_centered( + self.proposals_sigma, + self.train_cfg['num_samples'] * bbox.shape[0]) + + # Add mean and map back + # of shape (num_seq*bs, 4) + mean_box_rel = bbox_xywh_to_rel_cxcywh(bbox, bbox_wh) + # the first num_samples elements along zero dim are + # from the same image + # of shape (num_samples*bbox.shape[0] , bbox.shape[-1]) + mean_box_rel = mean_box_rel.unsqueeze(1).expand( + -1, self.train_cfg['num_samples'], + -1).reshape(-1, mean_box_rel.shape[-1]) + proposals_rel = proposals_rel_centered + mean_box_rel + # of shape (num_samples * bbox.shape[0], bbox.shape[-1]) + bbox_wh = bbox_wh.unsqueeze(1).expand(-1, + self.train_cfg['num_samples'], + -1).reshape( + -1, bbox_wh.shape[-1]) + proposals = bbox_rel_cxcywh_to_xywh(proposals_rel, bbox_wh) + + # of shape (num_samples, ) + gt_density = self._gauss_density_centered( + proposals_rel_centered, self.gt_bboxes_sigma).prod(-1) + + if self.train_cfg['add_first_bbox']: + proposals = torch.cat((bbox, proposals)) + proposal_density = torch.cat( + (torch.tensor([-1.]), 
proposal_density)) + gt_density = torch.cat((torch.tensor([1.]), gt_density)) + + return proposals, proposal_density, gt_density + + def loss(self, template_feats: Tuple[Tensor], search_feats: Tuple[Tensor], + batch_data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the tracking + head on the features of the upstream network. + + Args: + template_feats (tuple[Tensor, ...]): Tuple of Tensor with + shape (N, C, H, W) denoting the multi level feature maps of + exemplar images. + search_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of + search images. + batch_data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + batch_size = len(batch_data_samples) + batch_gt_bboxes = [] + batch_img_metas = [] + batch_search_gt_bboxes = [] + + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_bboxes.append(data_sample.gt_instances['bboxes']) + batch_search_gt_bboxes.append( + data_sample.search_gt_instances['bboxes']) + + # Extract the first train sample in each sequence + template_feats = [feat[:batch_size, ...] for feat in template_feats] + batch_gt_bboxes = torch.stack(batch_gt_bboxes, dim=1) + + # Get modulation vector + modulations = self.get_modulation(template_feats, batch_gt_bboxes[0]) + + iou_feats = self(search_feats) + num_search_imgs_per_seq = batch_data_samples[0].search_gt_instances[ + 'bboxes'].shape[0] + # (num_seq*bs, c). The first `bs` tensors along + # zero-dim are from different images of a batch. + modulations = [ + feat.repeat(num_search_imgs_per_seq, 1, 1, 1) + for feat in modulations + ] + + return self.loss_by_feat(modulations, iou_feats, batch_gt_bboxes, + batch_search_gt_bboxes) + + def loss_by_feat(self, modulations: Tuple[Tensor], + iou_feats: Tuple[Tensor], batch_gt_bboxes: Tensor, + batch_search_gt_bboxes: Tensor) -> dict: + """Compute loss. + + Args: + modulations (Tuple[Tensor]): The modulation features. + iou_feats (Tuple[Tensor]): The features for iou prediction. + batch_gt_bboxes (Tensor): The gt_bboxes in a batch. + batch_search_gt_bboxes (Tensor): The search gt_bboxes in a batch. + + Returns: + dict: a dictionary of loss components. + """ + batch_search_gt_bboxes = torch.stack( + batch_search_gt_bboxes, dim=1).view(-1, 4) + batch_search_gt_bboxes_xywh = bbox_xyxy_to_x1y1wh( + batch_search_gt_bboxes) + (proposals, proposals_density, search_gt_bboxes_density + ) = self.get_targets(batch_search_gt_bboxes_xywh) + + proposals = proposals.view(-1, self.train_cfg['num_samples'], + proposals.shape[-1]) + proposals_density = proposals_density.view( + -1, self.train_cfg['num_samples']) + search_gt_bboxes_density = search_gt_bboxes_density.view( + -1, self.train_cfg['num_samples']) + pred_iou = self.predict_iou(modulations, iou_feats, proposals) + + loss_bbox = self.loss_bbox( + pred_iou, + sample_density=proposals_density, + gt_density=search_gt_bboxes_density, + mc_dim=1) * self.train_cfg['loss_weights']['bbox'] + + return dict(loss_bbox=loss_bbox) diff --git a/mmtrack/models/track_heads/mask2former_head.py b/mmtrack/models/track_heads/mask2former_head.py new file mode 100644 index 000000000..c668a07c4 --- /dev/null +++ b/mmtrack/models/track_heads/mask2former_head.py @@ -0,0 +1,710 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
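# Illustrative sketch (refers to the PrDiMP bbox head above, not to this
# file): the zero-centered GMM proposal sampling implemented by
# ``_sample_gmm_centered``/``get_targets``, reduced to plain PyTorch. The
# sigma values below are made-up examples, not values from any released
# config.
import math

import torch


def sample_gmm_centered(sigma, num_samples=8):
    """Sample a zero-mean GMM whose components are axis-aligned Gaussians
    with stds ``sigma`` of shape (num_dims, num_components)."""
    num_dims, num_components = sigma.shape
    # pick one mixture component per sample
    k = torch.randint(num_components, (num_samples, ))
    sigma_samples = sigma[:, k].t()                 # (num_samples, num_dims)
    x = sigma_samples * torch.randn(num_samples, num_dims)

    # mixture density: per-dim Gaussian, product over dims, mean over
    # components (mirrors ``_gmm_density_centered``)
    dens = torch.exp(-0.5 * (x[..., None] / sigma)**2) / (
        math.sqrt(2 * math.pi) * sigma)
    prob_density = dens.prod(dim=-2).mean(dim=-1)   # (num_samples, )
    return x, prob_density


# one std pair per bbox coordinate (cx, cy, w, h), two mixture components
sigma = torch.tensor([[0.05, 0.5],
                      [0.05, 0.5],
                      [0.10, 0.5],
                      [0.10, 0.5]])
offsets, proposal_density = sample_gmm_centered(sigma, num_samples=4)
# ``offsets`` perturb the ground-truth box in relative [cx, cy, w, h] space;
# ``proposal_density`` is the density later consumed by the KL-based bbox loss.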
+import copy +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmcv.ops import point_sample +from mmdet.models.dense_heads import AnchorFreeHead +from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead +from mmdet.models.utils import get_uncertain_point_coords_with_randomness +from mmdet.structures.mask import mask2bbox +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig, reduce_mean +from mmengine.model import ModuleList +from mmengine.model.weight_init import caffe2_xavier_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import InstanceList, SampleList + + +@MODELS.register_module() +class Mask2FormerHead(MMDET_MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_classes (int): Number of VIS classes. + num_queries (int): Number of query in Transformer decoder. + Defaults to 100. + num_transformer_feat_level (int): Number of feats levels. + Defaults to 3. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of transformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. + Defaults to `SinePositionalEncoding3D`. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to 'CrossEntropyLoss'. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to 'DiceLoss'. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + Mask2Former head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + Mask2Former head. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_classes: int, + num_frames: int, + num_queries: int = 100, + num_transformer_feat_level: int = 3, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + type='SinePositionalEncoding3D', + num_feats=128, + normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.num_queries = num_queries + self.num_frames = num_frames + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.transformerlayers. \ + attn_cfgs.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels \ + == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder_) + self.transformer_decoder = MODELS.build(transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = MODELS.build(positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def init_weights(self) -> None: + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + 
self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def preprocess_gt(self, batch_gt_instances: InstanceList) -> InstanceList: + """Preprocess the ground truth for all images. + + It aims to reorganize the `gt`. For example, in the + `batch_data_sample.gt_instances.mask`, its shape is + `(all_num_gts, h, w)`, but we don't know each gt belongs to which `img` + (assume `num_frames` is 2). So, this func used to reshape the `gt_mask` + to `(num_gts_per_img, num_frames, h, w)`. In addition, we can't + guarantee that the number of instances in these two images is equal, + so `-1` refers to nonexistent instances. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + ground truth labels of each bbox, with shape (num_gts, ) + and ``masks``, each is ground truth masks of each instances + of an image, shape (num_gts, h, w). + + Returns: + list[obj:`InstanceData`]: each contains the following keys + + - labels (Tensor): Ground truth class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Ground truth mask for a\ + image, with shape (n, t, h, w). + """ + final_batch_gt_instances = [] + for gt_instances in batch_gt_instances: + _device = gt_instances.labels.device + gt_instances.masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=_device) + + all_ins_id = gt_instances.instances_id.unique().tolist() + map_ins_id = dict() + for i, ins_id in enumerate(all_ins_id): + map_ins_id[ins_id] = i + per_frame_gts = [] + # a list used to record which image each instance belongs to + map_info = gt_instances.map_instances_to_img_idx + for frame_id in range(self.num_frames): + ins_index = (map_info == frame_id) + per_frame_gts.append(gt_instances[ins_index]) + + num_instances = len(all_ins_id) + mask_shape = [ + num_instances, self.num_frames, gt_instances.masks.shape[1], + gt_instances.masks.shape[2] + ] + gt_masks_per_video = torch.zeros( + mask_shape, dtype=torch.bool, device=_device) + gt_ids_per_video = torch.full((num_instances, self.num_frames), + -1, + dtype=torch.long, + device=_device) + gt_labels_per_video = torch.full((num_instances, ), + -1, + dtype=torch.long, + device=_device) + for frame_id in range(self.num_frames): + cur_frame_gts = per_frame_gts[frame_id] + ins_ids = cur_frame_gts.instances_id.tolist() + for i, id in enumerate(ins_ids): + gt_masks_per_video[map_ins_id[id], + frame_id, :, :] = cur_frame_gts.masks[i] + gt_ids_per_video[map_ins_id[id], + frame_id] = cur_frame_gts.instances_id[i] + gt_labels_per_video[ + map_ins_id[id]] = cur_frame_gts.labels[i] + + tmp_instances = InstanceData( + labels=gt_labels_per_video, + masks=gt_masks_per_video.long(), + instances_id=gt_ids_per_video) + final_batch_gt_instances.append(tmp_instances) + + return final_batch_gt_instances + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, num_frames, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. 
+ + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, num_frames, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + # (num_gts, ) + gt_labels = gt_instances.labels + # (num_gts, num_frames, h, w) + gt_masks = gt_instances.masks + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + + # shape (num_queries, num_points) + mask_points_pred = point_sample(mask_pred, + point_coords.repeat(num_queries, 1, + 1)).flatten(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample(gt_masks.float(), + point_coords.repeat(num_gts, 1, + 1)).flatten(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances, + img_meta=img_meta) + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should include + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, num_frames,h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, num_frames, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, num_frames, h, w) + # -> (num_total_gts, num_frames, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.flatten(0, 1).unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts * num_frames, h, w) -> + # (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.flatten(0, 1).unsqueeze(1).float(), + points_coords).squeeze(1) + # shape (num_total_gts * num_frames, num_points) + mask_point_preds = point_sample( + mask_preds.flatten(0, 1).unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_total_gts * num_frames, num_points) -> + # (num_total_gts * num_frames * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points / self.num_frames) + + return loss_cls, loss_mask, loss_dice + + def _forward_head( + self, decoder_out: Tensor, mask_feature: Tensor, + attn_mask_target_size: Tuple[int, + int]) -> Tuple[Tensor, Tensor, Tensor]: + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (num_queries, batch_size, c). + mask_feature (Tensor): in shape (batch_size, t, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). + - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). 
+ """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + decoder_out = decoder_out.transpose(0, 1) + # shape (batch_size, num_queries, c) + cls_pred = self.cls_embed(decoder_out) + # shape (batch_size, num_queries, c) + mask_embed = self.mask_embed(decoder_out) + # shape (batch_size, num_queries, t, h, w) + mask_pred = torch.einsum('bqc,btchw->bqthw', mask_embed, mask_feature) + b, q, t, _, _ = mask_pred.shape + + attn_mask = F.interpolate( + mask_pred.flatten(0, 1), + attn_mask_target_size, + mode='bilinear', + align_corners=False).view(b, q, t, attn_mask_target_size[0], + attn_mask_target_size[1]) + + # shape (batch_size, num_queries, t, h, w) -> + # (batch_size, num_queries, t*h*w) -> + # (batch_size, num_head, num_queries, t*h*w) -> + # (batch_size*num_head, num_queries, t*h*w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( + (1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward(self, x: List[Tensor], + data_samples: SampleList) -> Tuple[List[Tensor], List[Tensor]]: + """Forward function. + + Args: + x (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + tuple[list[Tensor]]: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). 
+ """ + mask_features, multi_scale_memorys = self.pixel_decoder(x) + bt, c_m, h_m, w_m = mask_features.shape + batch_size = bt // self.num_frames if self.training else 1 + t = bt // batch_size + mask_features = mask_features.view(batch_size, t, c_m, h_m, w_m) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + decoder_input = decoder_input.flatten(2) + level_embed = self.level_embed.weight[i][None, :, None] + decoder_input = decoder_input + level_embed + _, c, hw = decoder_input.shape + # shape (batch_size*t, c, h, w) -> + # (batch_size, t, c, hw) -> + # (t*h*w, batch_size, c) + decoder_input = decoder_input.view(batch_size, t, c, + hw).permute(1, 3, 0, + 2).flatten(0, 1) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + mask = decoder_input.new_zeros( + (batch_size, t) + multi_scale_memorys[i].shape[-2:], + dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding( + mask) + decoder_positional_encoding = decoder_positional_encoding.flatten( + 3).permute(1, 3, 0, 2).flatten(0, 1) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (num_queries, batch_size, c) + query_feat = self.query_feat.weight.unsqueeze(1).repeat( + (1, batch_size, 1)) + query_embed = self.query_embed.weight.unsqueeze(1).repeat( + (1, batch_size, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + attn_masks = [attn_mask, None] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + attn_masks=attn_masks, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list + + def loss( + self, + x: Tuple[Tensor], + data_samples: SampleList, + ) -> Dict[str, Tensor]: + """Perform forward propagation and loss calculation of the track head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + # forward + all_cls_scores, all_mask_preds = self(x, data_samples) + + # preprocess ground truth + batch_gt_instances = self.preprocess_gt(batch_gt_instances) + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, + x: Tuple[Tensor], + data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Test without augmentation. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). + """ + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + all_cls_scores, all_mask_preds = self(x, data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + mask_cls_results = mask_cls_results[0] + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results[0], + size=(img_shape[0], img_shape[1]), + mode='bilinear', + align_corners=False) + + results = self.predict_by_feat(mask_cls_results, mask_pred_results, + batch_img_metas) + return results + + def predict_by_feat(self, + mask_cls_results: List[Tensor], + mask_pred_results: List[Tensor], + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Get top-10 predictions. + + Args: + mask_cls_results (Tensor): Mask classification logits,\ + shape (batch_size, num_queries, cls_out_channels). + Note `cls_out_channels` should include background. + mask_pred_results (Tensor): Mask logits, shape \ + (batch_size, num_queries, h, w). + batch_img_metas (list[dict]): List of image meta information. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). 
+ """ + results = [] + if len(mask_cls_results) > 0: + scores = F.softmax(mask_cls_results, dim=-1)[:, :-1] + labels = torch.arange(self.num_classes).unsqueeze(0).repeat( + self.num_queries, 1).flatten(0, 1) + # keep top-10 predictions + scores_per_image, topk_indices = scores.flatten(0, 1).topk( + 10, sorted=False) + labels_per_image = labels[topk_indices] + topk_indices = topk_indices // self.num_classes + mask_pred_results = mask_pred_results[topk_indices] + + img_shape = batch_img_metas[0]['img_shape'] + mask_pred_results = \ + mask_pred_results[:, :, :img_shape[0], :img_shape[1]] + if rescale: + # return result in original resolution + ori_height, ori_width = batch_img_metas[0]['ori_shape'][:2] + mask_pred_results = F.interpolate( + mask_pred_results, + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False) + + masks = mask_pred_results > 0. + + # format top-10 predictions + for img_idx in range(len(batch_img_metas)): + pred_track_instances = InstanceData() + + pred_track_instances.masks = masks[:, img_idx] + pred_track_instances.bboxes = mask2bbox(masks[:, img_idx]) + pred_track_instances.labels = labels_per_image + pred_track_instances.scores = scores_per_image + pred_track_instances.instances_id = torch.arange(10) + + results.append(pred_track_instances) + + return results diff --git a/mmtrack/models/track_heads/prdimp_cls_head.py b/mmtrack/models/track_heads/prdimp_cls_head.py new file mode 100644 index 000000000..48b8a15e2 --- /dev/null +++ b/mmtrack/models/track_heads/prdimp_cls_head.py @@ -0,0 +1,710 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn.functional as F +from addict import Dict +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmengine.model import BaseModule +from torch import Tensor, nn + +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType, SampleList, max_last2d +from ..task_modules.filter import filter as filter_layer + + +@MODELS.register_module() +class PrDiMPClsHead(BaseModule): + """PrDiMP classification head. + + Args: + in_dim (int, optional): The dim of input feature. Defaults to 1024. + out_dim (int, optional): The dim of output. Defaults to 512. + filter_initializer (dict, optional): The configuration of filter + initializer. Defaults to None. + filter_optimizer (dict, optional): The configuration of filter + optimizer. Defaults to None. + locate_cfg (dict, optional): The configuration of bbox location. + Defaults to None. + update_cfg (dict, optional): The configuration of updating tracking + state in memory. Defaults to None. + optimizer_cfg (dict, optional): The configuration of optimizer. + Defaults to None. + loss_cls (dict, optional): The configuration of classification + loss. Defaults to None. + train_cfg (dict, optional): The configuration of training. + Defaults to None. 
+ """ + + def __init__(self, + in_dim: int = 1024, + out_dim: int = 512, + filter_initializer: OptConfigType = None, + filter_optimizer: OptConfigType = None, + locate_cfg: OptConfigType = None, + update_cfg: OptConfigType = None, + optimizer_cfg: OptConfigType = None, + loss_cls: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + **kwargs): + super().__init__() + filter_size = filter_initializer['filter_size'] + self.filter_initializer = MODELS.build(filter_initializer) + self.filter_optimizer = MODELS.build(filter_optimizer) + self.feat_norm_scale = math.sqrt(1.0 / + (out_dim * filter_size * filter_size)) + self.channel_mapping = nn.Sequential( + nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1, bias=False)) + + self.locate_cfg = locate_cfg + self.update_cfg = update_cfg + self.optimizer_cfg = optimizer_cfg + self.test_cfg = test_cfg + self.train_cfg = train_cfg + + self.loss_cls = MODELS.build(loss_cls) + + if isinstance(filter_size, (int, float)): + filter_size = [filter_size, filter_size] + self.filter_size = torch.tensor(filter_size, dtype=torch.float32) + self.feat_size = torch.tensor( + train_cfg['feat_size'], dtype=torch.float32) + self.img_size = torch.tensor( + train_cfg['img_size'], dtype=torch.float32) + + def init_weights(self): + """Initialize the parameters of this module.""" + for m in self.channel_mapping: + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + self.filter_initializer.init_weights() + + def get_cls_feats(self, backbone_feats: Tensor) -> Tensor: + """Get features for classification. + + Args: + backbone_feats (Tensor): The features from backbone. + + Returns: + Tensor: The features for classification. + """ + cls_feats = self.channel_mapping(backbone_feats) + scale_factor = (torch.tensor(cls_feats.shape[1:]).prod() / (torch.sum( + (cls_feats**2).reshape(cls_feats.shape[0], 1, 1, -1), + dim=3, + keepdim=True) + 1e-5)).sqrt() + cls_feats = cls_feats * self.feat_norm_scale * scale_factor + + return cls_feats + + def init_classifier(self, + backbone_feats: Tensor, + target_bboxes: Tensor, + dropout_probs: Optional[List] = None): + """Initialize the filter and memory in the classifier. + + Args: + backbone_feats (Tensor): The features from backbone. + target_bboxes (Tensor): in [cx, cy, w, h] format. + dropout_probs (list, optional): Defaults to None. 
+ """ + cls_feats = self.get_cls_feats(backbone_feats) + + # add features through the augmentation of `dropout` + if dropout_probs is not None: + aug_feats = [] + for i, prob in enumerate(dropout_probs): + aug_feat = F.dropout2d( + cls_feats[:1, ...], p=prob, training=True) + aug_feats.append(aug_feat) + cls_feats = torch.cat([cls_feats] + aug_feats) + + # Get target filter by running the discriminative model prediction + # module + target_bboxes_xyxy = bbox_cxcywh_to_xyxy(target_bboxes) + init_filter = self.filter_initializer(cls_feats, target_bboxes_xyxy) + self.target_filter = self.filter_optimizer( + init_filter, + feat=cls_feats, + bboxes=target_bboxes, + num_iters=self.optimizer_cfg['init_update_iters']) + + # Initialize memory + self.init_memory(cls_feats, target_bboxes) + + def init_memory(self, aug_feats: Tensor, target_bboxes: Tensor): + """Initialize the some properties about training samples in memory: + + - `bboxes` (N, 4): the gt_bboxes of all samples in [cx, cy, w, h] + format. + - `training_samples` (N, C, H, W): the features of training samples + - `sample_weights` (N,): the weights of all samples + - `num_samples` (int): the number of all the samples fed into + memory, including the outdated samples. + - `replace_ind` (int): the index of samples in memory which would + be replaced by the next new samples. + + Args: + aug_feats (Tensor): The augmented features. + target_bboxes (Tensor): of shape (N, 4) in [cx, cy, w, h] format. + """ + self.memo = Dict() + + self.memo.bboxes = target_bboxes.new_zeros( + self.update_cfg['sample_memory_size'], 4) + self.memo.bboxes[:target_bboxes.shape[0], :] = target_bboxes + + # Initialize first-frame spatial training samples + self.memo.num_samples = self.num_init_samples = aug_feats.size(0) + # the index of the replaced memory samples in the next frame + self.memo.replace_ind = None + + self.memo.sample_weights = aug_feats.new_zeros( + self.update_cfg['sample_memory_size']) + self.memo.sample_weights[:self.num_init_samples] = aug_feats.new_ones( + 1) / aug_feats.shape[0] + + self.memo.training_samples = aug_feats.new_zeros( + self.update_cfg['sample_memory_size'], *aug_feats.shape[1:]) + self.memo.training_samples[:self.num_init_samples] = aug_feats + + def forward(self, backbone_feats: Tensor) -> Tuple[Tensor, Tensor]: + """Run classifier on the backbone features. + + Args: + backbone_feats (Tensor): the features from the last layer of + backbone + + Returns: + scores (Tensor): of shape (bs, 1, h, w) + feats (Tensor): features for classification. + """ + feats = self.get_cls_feats(backbone_feats) + scores = filter_layer.apply_filter(feats, self.target_filter) + return scores, feats + + def update_memory(self, + target_bbox: Tensor, + learning_rate: Optional[float] = None): + """Update the tracking state in memory. + + Args: + target_bbox (Tensor): of shape (1,4) in [x, y, w, h] format. + learning_rate (float, optional): The learning rate about updating. + Defaults to None. + """ + # Update weights and get replace ind + replace_ind = self.update_sample_weights(learning_rate) + self.memo.replace_ind = replace_ind + + # Update training samples and bboxes in memory + self.memo.training_samples[replace_ind:replace_ind + 1, + ...] = self.memo.sample_feat + self.memo.bboxes[replace_ind, :] = target_bbox + self.memo.num_samples += 1 + + def update_sample_weights(self, learning_rate=None) -> int: + """Update the weights of samples in memory. + + Args: + learning_rate (int, optional): The learning rate of updating + samples in memory. 
Defaults to None. + + Returns: + (int): the index of updated samples in memory. + """ + + init_sample_weight = self.update_cfg['init_samples_min_weight'] + if init_sample_weight == 0: + init_sample_weight = None + + replace_start_ind = 0 if init_sample_weight is None else \ + self.num_init_samples + + if self.memo.num_samples == 0 or learning_rate == 1: + self.memo.sample_weights[:] = 0 + self.memo.sample_weights[0] = 1 + replace_ind = 0 + else: + # Get index to replace + if self.memo.num_samples < self.memo.sample_weights.shape[0]: + replace_ind = self.memo.num_samples + else: + _, replace_ind = torch.min( + self.memo.sample_weights[replace_start_ind:], 0) + replace_ind = replace_ind.item() + replace_start_ind + + # Update weights + if self.memo.replace_ind is None: + self.memo.sample_weights /= 1 - learning_rate + self.memo.sample_weights[replace_ind] = learning_rate + else: + self.memo.sample_weights[ + replace_ind] = self.memo.sample_weights[ + self.memo.replace_ind] / (1 - learning_rate) + + self.memo.sample_weights /= self.memo.sample_weights.sum() + if (init_sample_weight is not None + and self.memo.sample_weights[:self.num_init_samples].sum() < + init_sample_weight): + # TODO werid! the sum of samples_weights is not equal to 1. + self.memo.sample_weights /= ( + init_sample_weight + + self.memo.sample_weights[self.num_init_samples:].sum()) + self.memo.sample_weights[:self.num_init_samples] = ( + init_sample_weight / self.num_init_samples) + + return replace_ind + + def update_classifier(self, + target_bbox: Tensor, + frame_num: int, + hard_neg_flag: Optional[bool] = False): + """Update the classifier with the refined bbox. + + Args: + target_bbox (Tensor): of shape (1, 4) in [x, y, w, h] format. + frame_num (int): The ID of frame. + hard_neg_flag (bool, optional): Whether is the hard negative + sample. Defaults to False. + """ + # Set flags and learning rate + learning_rate = self.update_cfg[ + 'normal_lr'] if not hard_neg_flag else self.update_cfg[ + 'hard_neg_lr'] + + # Update the tracker memory + if hard_neg_flag: + self.update_memory(target_bbox, learning_rate) + + # Decide the number of iterations to run + num_iters = 0 + if hard_neg_flag: + num_iters = self.optimizer_cfg['hard_neg_iters'] + elif (frame_num - 1) % self.update_cfg['train_skipping'] == 0: + num_iters = self.optimizer_cfg['update_iters'] + + if num_iters > 0: + # Get inputs for the DiMP filter optimizer module + samples = self.memo.training_samples[:self.memo.num_samples, ...] + target_bboxes = self.memo.bboxes[:self.memo.num_samples, :].clone() + sample_weights = self.memo.sample_weights[:self.memo.num_samples] + + # Run the filter optimizer module + self.target_filter = self.filter_optimizer( + self.target_filter, + num_iters=num_iters, + feat=samples, + bboxes=target_bboxes, + sample_weights=sample_weights) + + def predict(self, backbone_feats: Tuple[Tensor], data_samples: SampleList, + prev_bbox: Tensor, sample_center: Tensor, + scale_factor: float) -> Tuple[Tensor, Tensor, bool]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + backbone_feats (Tuple[Tensor]): The features from backbone. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + sample_center (Tensor): The coordinate of the sample center based + on the original image. + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (float): scale factor. 
+ + Returns: + new_bbox_center (Tensor): The center of the new bbox. + scores_map (Tensor): The score map from the classifier. + state (Tensor): The tracking state. + """ + # ``self.memo.sample_feat`` is used to update the training samples + # in the memory on some conditions. + scores_raw, self.memo.sample_feat = self(backbone_feats[-1]) + scores_map = torch.softmax( + scores_raw.view(-1), dim=0).view(scores_raw.shape) + + new_bbox_center, state = self.predict_by_feat(scores_map, prev_bbox, + sample_center, + scale_factor) + + return new_bbox_center, scores_map, state + + def _gen_2d_hanning_windows(self, + size: Sequence, + device: str = 'cuda') -> Tensor: + """Generate 2D hanning window. + + Args: + size (Sequence): The size of 2d hanning window. + device (str): Device the tensor will be put on. Defaults to 'cuda'. + + Returns: + Tensor: 2D hanning window with shape + (num_base_anchors[i] * featmap_sizes[i][0] * featmap_sizes[i][1]). + """ + + def hanning_1d(s): + return 0.5 * (1 - torch.cos( + (2 * math.pi / (s + 1)) * torch.arange(1, s + 1).float())) + + hanning_win = hanning_1d(size[0]).reshape(-1, 1) * hanning_1d( + size[1]).reshape(1, -1) + + return hanning_win.to(device) + + def predict_by_feat(self, scores: Tensor, prev_bbox: Tensor, + sample_center: Tensor, + scale_factor: float) -> Tuple[Tensor, bool]: + """Track `prev_bbox` to current frame based on the output of network. + + Args: + scores (Tensor): It's of shape (1, h, w) or (h, w). + prev_bbox (Tensor): It's of shape (4,) in [cx, cy, w, h] format. + sample_center (Tensor): The center of the cropped + sample on the original image. It's of shape (1,2) or (2,) in + [x, y] format. + scale_factor (float): The scale of the cropped sample. + It's of shape (1,) when it's a tensor. + + Return: + Tensor: The displacement of the target to the center of original + image + bool: The tracking state. + """ + sample_center = sample_center.squeeze() + scores = scores.squeeze() + prev_bbox = prev_bbox.squeeze() + assert scores.dim() == 2 + assert sample_center.dim() == prev_bbox.dim() == 1 + + score_size = torch.tensor([scores.shape[-1], scores.shape[-2]]) + output_size = (score_size - (self.filter_size + 1) % 2).to( + scores.device) + score_center = (score_size / 2).to(scores.device) + + scores_hn = scores + if self.locate_cfg.get('hanning_window', False): + scores_hn = scores * self._gen_2d_hanning_windows( + score_size, device=scores.device) + + max_score, max_pos = max_last2d(scores_hn) + max_pos = max_pos.flip(0).float() + # the displacement of target to the center of score map + target_disp_score_map = max_pos - score_center + # the ratio of the size of original image to to that of score map + ratio_size = (self.test_cfg['img_sample_size'] / + output_size) * scale_factor + # the displcement of the target to the center of original image + target_disp = target_disp_score_map * ratio_size + + # Handle different cases + # 1. 
Target is not found + if max_score.item() < self.locate_cfg['no_target_min_score']: + return target_disp + sample_center, 'not_found' + + # Analysis whether there is a distractor + # Calculate the size of neighborhood near the current target + target_neigh_sz = self.locate_cfg['target_neighborhood_scale'] * ( + prev_bbox[2:4] / ratio_size) + + top_left = (max_pos - target_neigh_sz / 2).round().long() + top_left = torch.clamp_min(top_left, 0).tolist() + bottom_right = (max_pos + target_neigh_sz / 2).round().long() + bottom_right = torch.clamp_max(bottom_right, + score_size.min().item()).tolist() + scores_masked = scores.clone() + scores_masked[top_left[1]:bottom_right[1], + top_left[0]:bottom_right[0]] = 0 + + # Find new maximum except the neighborhood of the target + second_max_score, second_max_pos = max_last2d(scores_masked) + second_max_pos = second_max_pos.flip(0).float().view(-1) + distractor_disp_score_map = second_max_pos - score_center + distractor_disp = distractor_disp_score_map * ratio_size + # The displacement of previout target bbox to the center of the score + # map. + # Note that `sample_center`` may not be equal to the center of previous + # tracking bbox due to different cropping mode + prev_target_disp_score_map = (prev_bbox[:2] - + sample_center) / ratio_size + + # 2. There is a distractor + if second_max_score > self.locate_cfg['distractor_thres'] * max_score: + target_disp_diff = torch.sqrt( + torch.sum( + (target_disp_score_map - prev_target_disp_score_map)**2)) + # `distractor_disp_diff` is the displacement between current + # tracking bbox and previous tracking bbox. + distractor_disp_diff = torch.sqrt( + torch.sum((distractor_disp_score_map - + prev_target_disp_score_map)**2)) + disp_diff_thres = self.locate_cfg[ + 'dispalcement_scale'] * score_size.prod().float().sqrt() / 2 + + if (distractor_disp_diff > disp_diff_thres + and target_disp_diff < disp_diff_thres): + return target_disp + sample_center, 'hard_negative' + if (distractor_disp_diff < disp_diff_thres + and target_disp_diff > disp_diff_thres): + # The true target may be the `distractor` instead of the + # `target` on this frame + return distractor_disp + sample_center, 'hard_negative' + else: + # If both the displacement of target and distractor is larger + # or smaller than the threshold, return the displacement of the + # highest score. + return target_disp + sample_center, 'uncertain' + + # 3. There is a hard negative object + if (second_max_score > self.locate_cfg['hard_neg_thres'] * max_score + and second_max_score > self.locate_cfg['no_target_min_score']): + return target_disp + sample_center, 'hard_negative' + + # 4. Normal target + return target_disp + sample_center, 'normal' + + def _gauss_1d(self, + size: Tensor, + sigma: float, + center: Tensor, + end_pad: int = 0, + return_density: bool = True): + """Generate gauss labels. + + Args: + size (Tensor): The size of score map with (2, ) shape. + sigma (float): Standard deviations. + center (Tensor): of (N, ) shape. + end_pad (int, optional): The padding size.. Defaults to 0. + return_density (bool, optional): Whether to return density. + Defaults to True. + + Returns: + Tensor: Gauss labels with (N, score_map_size) shape. 
+ """ + k = torch.arange(-(size - 1) / 2, (size + 1) / 2 + end_pad).reshape( + 1, -1).to(center.device) + gauss = torch.exp(-1.0 / (2 * sigma**2) * + (k - center.reshape(-1, 1))**2) + if not return_density: + return gauss + else: + return gauss / (math.sqrt(2 * math.pi) * sigma) + + def _gauss_2d(self, + size: Tensor, + sigma: float, + center: Tensor, + end_pad: Union[Tensor, List, Tuple] = (0, 0), + return_density: bool = True): + """Generate gauss labels. + + Args: + size (Tensor): The size of score map with (2, ) shape. + sigma (Tensor): Standard deviations. + center (Tensor): The center of bbox with (N, 2) shape. + end_pad (Union[Tensor, List, Tuple], optional): The padding size. + Defaults to (0, 0). + return_density (bool, optional): Whether to return density. + Defaults to True. + + Returns: + Tensor: Gauss labels with (N, score_map_size, score_map_size) + shape. + """ + if isinstance(sigma, (float, int)): + sigma = (sigma, sigma) + + gauss_1d_out1 = self._gauss_1d( + size[0].item(), + sigma[0], + center[:, 0], + end_pad[0], + return_density=return_density) + gauss_1d_out1 = gauss_1d_out1.reshape(center.shape[0], 1, -1) + + gauss_1d_out2 = self._gauss_1d( + size[1].item(), + sigma[1], + center[:, 1], + end_pad[1], + return_density=return_density) + gauss_1d_out2 = gauss_1d_out2.reshape(center.shape[0], -1, 1) + + return gauss_1d_out1 * gauss_1d_out2 + + def get_targets(self, target_center: Tensor) -> Tensor: + """Generate the training targets for search images. + + Args: + target_center (Tensor): The center of target bboxes with (N, 2) + shape, in [x, y] format. + + Returns: + Tensor: The distribution of gauss labels with + (N, score_map_size, score_map_size) shape. + """ + # TODO simplify the different devices + img_size = self.img_size.to(target_center.device) + filter_size = self.filter_size.to(target_center.device) + feat_size = self.feat_size.to(target_center.device) + + # set the center of image as coordinate origin + target_center_norm = (target_center - img_size / 2.) 
/ img_size + + center = feat_size * target_center_norm + 0.5 * torch.fmod( + filter_size + 1, 2) + + sigma = self.train_cfg['sigma_factor'] * feat_size.prod().sqrt().item() + + if self.train_cfg['end_pad_if_even']: + end_pad = (torch.fmod(filter_size, 2) == 0).to(filter_size) + else: + end_pad = torch.zeros(2).to(filter_size) + + # generate gauss labels and density + # Both of them are of shape (N, feat_size, feat_size) + gauss_label_density = self._gauss_2d( + feat_size, sigma, center, end_pad, + self.train_cfg['use_gauss_density']) + # continue to process label density + feat_area = (feat_size + end_pad).prod() + gauss_label_density = (1.0 - self.train_cfg['gauss_label_bias'] + ) * gauss_label_density + self.train_cfg[ + 'gauss_label_bias'] / feat_area + + mask = (gauss_label_density > self.train_cfg['label_density_threshold'] + ).to(gauss_label_density) + gauss_label_density *= mask + + if self.train_cfg['label_density_norm']: + g_sum = gauss_label_density.sum(dim=(-2, -1)) + valid = g_sum > 0.01 + gauss_label_density[valid, :, :] /= g_sum[valid].reshape(-1, 1, 1) + gauss_label_density[~valid, :, :] = 1.0 / ( + gauss_label_density.shape[-2] * gauss_label_density.shape[-1]) + gauss_label_density *= 1.0 - self.train_cfg['label_density_shrink'] + + return gauss_label_density + + def loss(self, template_feats: Tuple[Tensor], search_feats: Tuple[Tensor], + batch_data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the tracking + head on the features of the upstream network. + + Args: + template_feats (tuple[Tensor, ...]): Tuple of Tensor with + shape (N, C, H, W) denoting the multi level feature maps of + exemplar images. + search_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of + search images. + batch_data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_bboxes = [] + batch_img_metas = [] + batch_search_gt_bboxes = [] + + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_bboxes.append(data_sample.gt_instances['bboxes']) + batch_search_gt_bboxes.append( + data_sample.search_gt_instances['bboxes']) + + num_imgs_per_seq = batch_data_samples[0].gt_instances['bboxes'].shape[ + 0] + num_search_imgs_per_seq = batch_data_samples[0].search_gt_instances[ + 'bboxes'].shape[0] + + # Extract features + template_feats = self.get_cls_feats(template_feats[-1]) + template_feats = template_feats.reshape(num_imgs_per_seq, -1, + *template_feats.shape[-3:]) + search_feats = self.get_cls_feats(search_feats[-1]) + search_feats = search_feats.reshape(num_search_imgs_per_seq, -1, + *search_feats.shape[-3:]) + + return self.loss_by_feat(template_feats, search_feats, batch_gt_bboxes, + batch_search_gt_bboxes) + + def loss_by_feat(self, template_feats: Tensor, search_feats: Tensor, + batch_gt_bboxes: Tensor, + batch_search_gt_bboxes: Tensor) -> dict: + """Compute loss. + + Args: + modulations (Tuple[Tensor]): The modulation features. + iou_feats (Tuple[Tensor]): The features for iou prediction. + batch_gt_bboxes (Tensor): The gt_bboxes in a batch. + batch_search_gt_bboxes (Tensor): The search gt_bboxes in a batch. + + Returns: + dict: a dictionary of loss components. 
+ """ + # Train filter + batch_gt_bboxes = torch.stack(batch_gt_bboxes, dim=1) + # filter_iter is a list, and each of them is of shape + # (bs, C, self.filter_size, self.filter_size) + init_filter = self.filter_initializer(template_feats, + batch_gt_bboxes.view(-1, 4)) + bboxes_cxcywh = bbox_xyxy_to_cxcywh(batch_gt_bboxes) + _, filters_all_iters, _ = self.filter_optimizer( + init_filter, feat=template_feats, bboxes=bboxes_cxcywh) + + # Classify samples using all filters + # each item in ``target_scores`` is of shape + # (num_search_imgs_per_seq, bs, score_map_size, score_map_size) + target_scores = [ + filter_layer.apply_filter(search_feats, filter_weight) + for filter_weight in filters_all_iters + ] + + batch_search_gt_bboxes = torch.stack(batch_search_gt_bboxes, dim=1) + search_gt_bboxes = batch_search_gt_bboxes.view(-1, 4) + target_center = (search_gt_bboxes[:, :2] + + search_gt_bboxes[:, 2:4]) / 2. + prob_labels_density = self.get_targets(target_center) + # of shape (num_search_imgs_per_seq, bs, score_map_size, score_map_size) # noqa: E501 + prob_labels_density = prob_labels_density.view( + batch_search_gt_bboxes.shape[0], -1, + *prob_labels_density.shape[-2:]) + + # compute loss + loss_cls_list = [ + self.loss_cls(score, prob_labels_density, grid_dim=(-2, -1)) + for score in target_scores + ] + loss_cls_final = self.train_cfg['loss_weights'][ + 'cls_final'] * loss_cls_list[-1] + loss_cls_init = self.train_cfg['loss_weights'][ + 'cls_init'] * loss_cls_list[0] + + if isinstance(self.train_cfg['loss_weights']['cls_iter'], list): + loss_cls_iters = sum([ + weight * loss for weight, loss in zip( + self.train_cfg['loss_weights']['cls_iter'], + loss_cls_list[1:-1]) + ]) + else: + loss_cls_iters = (self.train_cfg['loss_weights']['cls_iter'] / + (len(loss_cls_list) - 2)) * sum( + loss_cls_list[1:-1]) + losses = dict( + loss_cls_init=loss_cls_init, + loss_cls_iter=loss_cls_iters, + loss_cls_final=loss_cls_final) + + return losses diff --git a/mmtrack/models/track_heads/quasi_dense_embed_head.py b/mmtrack/models/track_heads/quasi_dense_embed_head.py index 9df2706d1..b754701d7 100644 --- a/mmtrack/models/track_heads/quasi_dense_embed_head.py +++ b/mmtrack/models/track_heads/quasi_dense_embed_head.py @@ -1,13 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + import torch import torch.nn as nn -from mmdet.models import HEADS, build_loss +from mmdet.models.task_modules import SamplingResult +from torch import Tensor -from mmtrack.core import embed_similarity +from mmtrack.registry import MODELS +from ..task_modules.track import embed_similarity from .roi_embed_head import RoIEmbedHead -@HEADS.register_module() +@MODELS.register_module() class QuasiDenseEmbedHead(RoIEmbedHead): """The quasi-dense roi embed head. 
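# Illustrative sketch (refers to the PrDiMP classification head above): the
# separable 2D Gaussian label density built by ``PrDiMPClsHead._gauss_1d``/
# ``_gauss_2d``/``get_targets``, reduced to plain PyTorch. The map size and
# sigma are made-up examples; end padding for even-sized filters and the
# label-density post-processing are omitted here.
import math

import torch


def gauss_1d(size, sigma, center, density=True):
    # grid of ``size`` cells centered on the score-map origin
    k = torch.arange(-(size - 1) / 2., (size + 1) / 2.).reshape(1, -1)
    g = torch.exp(-0.5 / sigma**2 * (k - center.reshape(-1, 1))**2)
    return g / (math.sqrt(2 * math.pi) * sigma) if density else g


def gauss_2d(size, sigma, center, density=True):
    # outer product of two 1D Gaussians -> (N, size, size)
    gx = gauss_1d(size, sigma, center[:, 0], density).unsqueeze(1)
    gy = gauss_1d(size, sigma, center[:, 1], density).unsqueeze(2)
    return gy * gx


# two targets, given in score-map cells relative to the map center
centers = torch.tensor([[0.0, 0.0], [3.5, -2.0]])
label_density = gauss_2d(size=18, sigma=1.2, center=centers)
assert label_density.shape == (2, 18, 18)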
@@ -22,17 +26,16 @@ class QuasiDenseEmbedHead(RoIEmbedHead): """ def __init__(self, - embed_channels=256, - softmax_temp=-1, - loss_track=dict( - type='MultiPosCrossEntropyLoss', loss_weight=0.25), - loss_track_aux=dict( + embed_channels: int = 256, + softmax_temp: int = -1, + loss_track: Optional[dict] = None, + loss_track_aux: dict = dict( type='L2Loss', sample_ratio=3, margin=0.3, loss_weight=1.0, hard_mining=True), - init_cfg=dict( + init_cfg: dict = dict( type='Xavier', layer='Linear', distribution='uniform', @@ -43,21 +46,22 @@ def __init__(self, mean=0, std=0.01, bias=0)), - *args, **kwargs): - super(QuasiDenseEmbedHead, self).__init__( - init_cfg=init_cfg, *args, **kwargs) + super(QuasiDenseEmbedHead, self).__init__(init_cfg=init_cfg, **kwargs) - self.fc_embed = nn.Linear(self.last_layer_dim, embed_channels) + if loss_track is None: + loss_track = dict( + type='MultiPosCrossEntropyLoss', loss_weight=0.25) + self.fc_embed = nn.Linear(self.last_layer_dim, embed_channels) self.softmax_temp = softmax_temp - self.loss_track = build_loss(loss_track) + self.loss_track = MODELS.build(loss_track) if loss_track_aux is not None: - self.loss_track_aux = build_loss(loss_track_aux) + self.loss_track_aux = MODELS.build(loss_track_aux) else: self.loss_track_aux = None - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """Forward the input `x`.""" if self.num_convs > 0: @@ -70,18 +74,20 @@ def forward(self, x): x = self.fc_embed(x) return x - def get_targets(self, gt_match_indices, key_sampling_results, - ref_sampling_results): + def get_targets( + self, gt_match_indices: List[Tensor], + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult]) -> Tuple[List, List]: """Calculate the track targets and track weights for all samples in a batch according to the sampling_results. Args: - key_sampling_results (List[obj:SamplingResults]): Assign results of - all images in a batch after sampling. - ref_sampling_results (List[obj:SamplingResults]): Assign results of - all reference images in a batch after sampling. gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to ref_gt_instance_ids of the same tracklet in a pair of images. + key_sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResult]): Assign results of + all reference images in a batch after sampling. Returns: Tuple[list[Tensor]]: Association results. @@ -113,8 +119,11 @@ def get_targets(self, gt_match_indices, key_sampling_results, track_weights.append(weights) return track_targets, track_weights - def match(self, key_embeds, ref_embeds, key_sampling_results, - ref_sampling_results): + def match( + self, key_embeds: Tensor, ref_embeds: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult] + ) -> Tuple[List[Tensor], List[Tensor]]: """Calculate the dist matrixes for loss measurement. Args: @@ -122,7 +131,7 @@ def match(self, key_embeds, ref_embeds, key_sampling_results, of key image. ref_embeds (Tensor): Embeds of all bboxes in sampling results of the reference image. - keysampling_results (List[obj:SamplingResults]): Assign results of + key_sampling_results (List[obj:SamplingResults]): Assign results of all images in a batch after sampling. ref_sampling_results (List[obj:SamplingResults]): Assign results of all reference images in a batch after sampling. 
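# Illustrative sketch: the dot-product and cosine similarity matrices that
# ``match`` above builds between key and reference embeddings via
# ``embed_similarity``. The helpers below are a plain-PyTorch stand-in for
# illustration only; the temperature handling is an assumption, not the
# exact signature of ``embed_similarity``.
import torch
import torch.nn.functional as F


def dot_product_similarity(key_embeds, ref_embeds, temperature=-1):
    # (num_key, c) @ (c, num_ref) -> (num_key, num_ref)
    sims = torch.mm(key_embeds, ref_embeds.t())
    if temperature > 0:
        sims = sims / temperature
    return sims


def cosine_similarity(key_embeds, ref_embeds):
    key = F.normalize(key_embeds, p=2, dim=1)
    ref = F.normalize(ref_embeds, p=2, dim=1)
    return torch.mm(key, ref.t())


key = torch.randn(5, 256)    # embeddings of positive key-frame proposals
ref = torch.randn(12, 256)   # embeddings of all reference-frame proposals
dists = dot_product_similarity(key, ref)   # fed to the track loss
cos_dists = cosine_similarity(key, ref)    # fed to the auxiliary track loss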
@@ -160,21 +169,24 @@ def match(self, key_embeds, ref_embeds, key_sampling_results, cos_dists.append(None) return dists, cos_dists - def loss(self, dists, cos_dists, targets, weights): + def loss(self, key_roi_feats: Tensor, ref_roi_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: """Calculate the track loss and the auxiliary track loss. Args: - dists (list[Tensor]): Dot-product dists between - key_embeds and ref_embeds. - cos_dists (list[Tensor]): Cosine dists between - key_embeds and ref_embeds. - targets (list[Tensor]): The mapping instance ids from all - positive proposals in the key image to all proposals - in the reference image, each tensor in list has - shape (len(key_pos_bboxes), len(ref_bboxes)). - weights (list[Tensor]): Loss weights for all positive - proposals in a batch, each tensor in list has - shape (len(key_pos_bboxes),). + key_roi_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_roi_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from gt_instances_id + to ref_gt_instances_id of the same tracklet in a pair of + images. Returns: Dict [str: Tensor]: Calculation results. @@ -183,7 +195,46 @@ def loss(self, dists, cos_dists, targets, weights): - loss_track (Tensor): Results of loss_track function. - loss_track_aux (Tensor): Results of loss_track_aux function. """ + key_track_feats = self(key_roi_feats) + ref_track_feats = self(ref_roi_feats) + losses = self.loss_by_feat(key_track_feats, ref_track_feats, + key_sampling_results, ref_sampling_results, + gt_match_indices_list) + return losses + + def loss_by_feat(self, key_track_feats: Tensor, ref_track_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: + """Calculate the track loss and the auxiliary track loss. + + Args: + key_track_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_track_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from gt_instances_id + to ref_gt_instances_id of the same tracklet in a pair of + images. + + Returns: + Dict [str: Tensor]: Calculation results. + Containing the following list of Tensors: + + - loss_track (Tensor): Results of loss_track function. + - loss_track_aux (Tensor): Results of loss_track_aux function. + """ + dists, cos_dists = self.match(key_track_feats, ref_track_feats, + key_sampling_results, + ref_sampling_results) + targets, weights = self.get_targets(gt_match_indices_list, + key_sampling_results, + ref_sampling_results) losses = dict() loss_track = 0. 
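# --- Editor's note: illustrative sketch, not part of the upstream diff -------
# The refactor above splits the old ``loss()`` into ``loss()`` (run the head
# on the extracted RoI features) and ``loss_by_feat()`` (build targets and
# compute the actual losses from the resulting embeddings). A minimal skeleton
# of that calling convention; ``ToyEmbedHead`` and its BCE objective are
# placeholders, not the mmtrack head or its MultiPosCrossEntropyLoss.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class ToyEmbedHead(nn.Module):
    """Tiny stand-in for an embed head: features -> embeddings -> loss."""

    def __init__(self, in_channels: int = 256, embed_channels: int = 64):
        super().__init__()
        self.fc_embed = nn.Linear(in_channels, embed_channels)

    def forward(self, feats: Tensor) -> Tensor:
        return self.fc_embed(feats)

    def loss(self, key_feats: Tensor, ref_feats: Tensor,
             match_targets: Tensor) -> dict:
        # 1) run the head on both sets of features ...
        key_embeds = self(key_feats)
        ref_embeds = self(ref_feats)
        # 2) ... then delegate the actual loss computation.
        return self.loss_by_feat(key_embeds, ref_embeds, match_targets)

    def loss_by_feat(self, key_embeds: Tensor, ref_embeds: Tensor,
                     match_targets: Tensor) -> dict:
        dists = torch.mm(key_embeds, ref_embeds.t())
        loss = F.binary_cross_entropy_with_logits(dists,
                                                  match_targets.float())
        return dict(loss_track=loss)


# toy usage: 4 key boxes, 6 reference boxes, flattened 256-d RoI features
head = ToyEmbedHead()
targets = torch.zeros(4, 6)
targets[0, 1] = targets[2, 3] = 1.  # matching pairs
print(head.loss(torch.randn(4, 256), torch.randn(6, 256), targets))
# -----------------------------------------------------------------------------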
@@ -200,3 +251,16 @@ def loss(self, dists, cos_dists, targets, weights): losses['loss_track_aux'] = loss_track_aux / len(dists) return losses + + def predict(self, bbox_feats: Tensor) -> Tensor: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + bbox_feats: The extracted roi features. + + Returns: + Tensor: The extracted track features. + """ + track_feats = self(bbox_feats) + return track_feats diff --git a/mmtrack/models/track_heads/quasi_dense_track_head.py b/mmtrack/models/track_heads/quasi_dense_track_head.py index 98db34b39..eb29ee366 100644 --- a/mmtrack/models/track_heads/quasi_dense_track_head.py +++ b/mmtrack/models/track_heads/quasi_dense_track_head.py @@ -1,125 +1,131 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmdet.core import bbox2roi -from mmdet.models import HEADS +from typing import List +from mmdet.structures.bbox import bbox2roi +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.utils import InstanceList, SampleList from .roi_track_head import RoITrackHead -@HEADS.register_module() +@MODELS.register_module() class QuasiDenseTrackHead(RoITrackHead): """The quasi-dense track head.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def forward_train(self, - x, - img_metas, - proposal_list, - gt_bboxes, - gt_labels, - gt_match_indices, - ref_x, - ref_img_metas, - ref_proposals, - ref_gt_bboxes, - ref_gt_labels, - gt_bboxes_ignore=None, - gt_masks=None, - ref_gt_bboxes_ignore=None, - ref_gt_mask=None, - *args, - **kwargs): - """Forward function during training. - - Args: - x (list[Tensor]): list of multi-level image features. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - proposal_list (list[Tensors]): list of region proposals. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - each item has a shape (num_gts, 4). - gt_labels (list[Tensor]): Ground truth labels of all images. - each has a shape (num_gts,). - gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to - ref_gt_instance_ids of the same tracklet in a pair of images. - ref_x (list[Tensor]): list of multi-level ref_img features. - ref_img_metas (list[dict]): list of reference image info dict where - each dict has: 'img_shape', 'scale_factor', 'flip', and may - also contain 'filename', 'ori_shape', 'pad_shape', - and 'img_norm_cfg'. - ref_proposal_list (list[Tensors]): list of ref_img - region proposals. - ref_gt_bboxes (list[Tensor]): Ground truth bboxes of the - reference image, each item has a shape (num_gts, 4). - ref_gt_labels (list[Tensor]): Ground truth labels of all - reference images, each has a shape (num_gts,). - gt_bboxes_ignore (list[Tensor], None): Ground truth bboxes to be - ignored, each item has a shape (num_ignored_gts, 4). - gt_masks (list[Tensor]) : Masks for each bbox, has a shape - (num_gts, h , w). - ref_gt_bboxes_ignore (list[Tensor], None): Ground truth bboxes - of reference images to be ignored, - each item has a shape (num_ignored_gts, 4). - ref_gt_masks (list[Tensor]) : Masks for each reference bbox, - has a shape (num_gts, h , w). + def extract_roi_feats(self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tensor: + """Extract roi features. + + Args: + feats (list[Tensor]): list of multi-level image features. + bboxes (list[Tensor]): list of bboxes in sampling result. 
Returns: - dict[str : Tensor]: Track losses. + Tensor: The extracted roi features. """ + rois = bbox2roi(bboxes) + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], + rois) + return bbox_feats + + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of key img. + ref_rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of ref img. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ assert self.with_track - num_imgs = len(img_metas) - if gt_bboxes_ignore is None: - gt_bboxes_ignore = [None for _ in range(num_imgs)] - if ref_gt_bboxes_ignore is None: - ref_gt_bboxes_ignore = [None for _ in range(num_imgs)] + num_imgs = len(data_samples) + batch_gt_instances = [] + ref_batch_gt_instances = [] + batch_gt_instances_ignore = [] + gt_match_indices_list = [] + for data_sample in data_samples: + batch_gt_instances.append(data_sample.gt_instances) + ref_batch_gt_instances.append(data_sample.ref_gt_instances) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + # get gt_match_indices + ins_ids = data_sample.gt_instances.instances_id.tolist() + ref_ins_ids = data_sample.ref_gt_instances.instances_id.tolist() + match_indices = Tensor([ + ref_ins_ids.index(i) if (i in ref_ins_ids and i > 0) else -1 + for i in ins_ids + ]).to(key_feats[0].device) + gt_match_indices_list.append(match_indices) + key_sampling_results, ref_sampling_results = [], [] for i in range(num_imgs): - assign_result = self.bbox_assigner.assign(proposal_list[i], - gt_bboxes[i], - gt_bboxes_ignore[i], - gt_labels[i]) + rpn_results = rpn_results_list[i] + ref_rpn_results = ref_rpn_results_list[i] + # rename ref_rpn_results.bboxes to ref_rpn_results.priors + ref_rpn_results.priors = ref_rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) sampling_result = self.bbox_sampler.sample( assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], - feats=[lvl_feat[i][None] for lvl_feat in x]) + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) key_sampling_results.append(sampling_result) ref_assign_result = self.bbox_assigner.assign( - ref_proposals[i], ref_gt_bboxes[i], ref_gt_bboxes_ignore[i], - ref_gt_labels[i]) + ref_rpn_results, ref_batch_gt_instances[i], + batch_gt_instances_ignore[i]) ref_sampling_result = self.bbox_sampler.sample( ref_assign_result, - ref_proposals[i], - ref_gt_bboxes[i], - ref_gt_labels[i], - feats=[lvl_feat[i][None] for lvl_feat in ref_x]) + ref_rpn_results, + ref_batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in ref_feats]) ref_sampling_results.append(ref_sampling_result) key_bboxes = [res.pos_bboxes for res in key_sampling_results] - key_feats = self.extract_bbox_feats(x, key_bboxes) + key_roi_feats = self.extract_roi_feats(key_feats, key_bboxes) ref_bboxes = [res.bboxes for res in 
ref_sampling_results] - ref_feats = self.extract_bbox_feats(ref_x, ref_bboxes) + ref_roi_feats = self.extract_roi_feats(ref_feats, ref_bboxes) - match_feats = self.embed_head.match(key_feats, ref_feats, - key_sampling_results, - ref_sampling_results) - asso_targets = self.embed_head.get_targets(gt_match_indices, - key_sampling_results, - ref_sampling_results) - loss_track = self.embed_head.loss(*match_feats, *asso_targets) + loss_track = self.embed_head.loss(key_roi_feats, ref_roi_feats, + key_sampling_results, + ref_sampling_results, + gt_match_indices_list) return loss_track - def extract_bbox_feats(self, x, bboxes): - """Extract roi features.""" + def predict(self, feats: List[Tensor], + rescaled_bboxes: List[Tensor]) -> Tensor: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. - rois = bbox2roi(bboxes) - track_feats = self.roi_extractor(x[:self.roi_extractor.num_inputs], - rois) - track_feats = self.embed_head(track_feats) + Args: + feats (list[Tensor]): Multi level feature maps of `img`. + rescaled_bboxes (list[Tensor]): list of rescaled bboxes in sampling + result. + + Returns: + Tensor: The extracted track features. + """ + bbox_feats = self.extract_roi_feats(feats, rescaled_bboxes) + track_feats = self.embed_head.predict(bbox_feats) return track_feats diff --git a/mmtrack/models/track_heads/roi_embed_head.py b/mmtrack/models/track_heads/roi_embed_head.py index b9992b632..7619b17e8 100644 --- a/mmtrack/models/track_heads/roi_embed_head.py +++ b/mmtrack/models/track_heads/roi_embed_head.py @@ -1,18 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict +from typing import List, Optional, Tuple import torch import torch.nn as nn from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule, auto_fp16, force_fp32 -from mmdet.models import HEADS, build_loss from mmdet.models.losses import accuracy +from mmdet.models.task_modules import SamplingResult +from mmengine.model import BaseModule +from torch import Tensor from torch.nn.modules.utils import _pair -from mmtrack.core import embed_similarity +from mmtrack.registry import MODELS +from ..task_modules.track import embed_similarity -@HEADS.register_module() +@MODELS.register_module() class RoIEmbedHead(BaseModule): """The roi embed head. 
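# --- Editor's note: illustrative sketch, not part of the upstream diff -------
# The hunk above builds ``gt_match_indices`` by looking up every key-frame
# instance id in the reference-frame id list (-1 when the instance has no
# match). The same mapping in isolation, on plain Python lists;
# ``build_match_indices`` is an illustrative name only.
import torch
from torch import Tensor


def build_match_indices(key_ids, ref_ids) -> Tensor:
    """Map each key instance id to its index in ``ref_ids`` (or -1)."""
    return torch.tensor([
        ref_ids.index(i) if (i in ref_ids and i > 0) else -1 for i in key_ids
    ])


# toy usage: ids 7 and 9 also appear in the reference frame, id 3 does not
print(build_match_indices([7, 3, 9], [9, 5, 7]))  # tensor([ 2, -1,  0])
# -----------------------------------------------------------------------------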
@@ -42,20 +45,20 @@ class RoIEmbedHead(BaseModule): """ def __init__(self, - num_convs=0, - num_fcs=0, - roi_feat_size=7, - in_channels=256, - conv_out_channels=256, - with_avg_pool=False, - fc_out_channels=1024, - conv_cfg=None, - norm_cfg=None, - loss_match=dict( - type='CrossEntropyLoss', + num_convs: int = 0, + num_fcs: int = 0, + roi_feat_size: int = 7, + in_channels: int = 256, + conv_out_channels: int = 256, + with_avg_pool: bool = False, + fc_out_channels: int = 1024, + conv_cfg: Optional[dict] = None, + norm_cfg: Optional[dict] = None, + loss_match: dict = dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - init_cfg=None, + init_cfg: Optional[dict] = None, **kwargs): super(RoIEmbedHead, self).__init__(init_cfg=init_cfg) self.num_convs = num_convs @@ -68,7 +71,7 @@ def __init__(self, self.fc_out_channels = fc_out_channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg - self.loss_match = build_loss(loss_match) + self.loss_match = MODELS.build(loss_match) self.fp16_enabled = False if self.with_avg_pool: @@ -78,8 +81,9 @@ def __init__(self, self.num_convs, self.num_fcs, self.in_channels) self.relu = nn.ReLU(inplace=True) - def _add_conv_fc_branch(self, num_branch_convs, num_branch_fcs, - in_channels): + def _add_conv_fc_branch( + self, num_branch_convs: int, num_branch_fcs: int, + in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]: """Add shared or separable branch. convs -> avg pool (optional) -> fcs @@ -119,8 +123,9 @@ def _add_conv_fc_branch(self, num_branch_convs, num_branch_fcs, def custom_activation(self): return getattr(self.loss_match, 'custom_activation', False) - def _forward(self, x, num_x_per_img): - """Forward the input `x`, and split the output to a list. + def extract_feat(self, x: Tensor, + num_x_per_img: List[int]) -> Tuple[Tensor]: + """Extract feature from the input `x`, and split the output to a list. Args: x (Tensor): of shape [N, C, H, W]. N is the number of proposals. @@ -148,8 +153,10 @@ def _forward(self, x, num_x_per_img): x_split = torch.split(x, num_x_per_img, dim=0) return x_split - @auto_fp16(apply_to=('x', 'ref_x')) - def forward(self, x, ref_x, num_x_per_img, num_x_per_ref_img): + def forward( + self, x: Tensor, ref_x: Tensor, num_x_per_img: List[int], + num_x_per_ref_img: List[int] + ) -> Tuple[Tuple[Tensor], Tuple[Tensor]]: """Computing the similarity scores between `x` and `ref_x`. Args: @@ -165,28 +172,22 @@ def forward(self, x, ref_x, num_x_per_img, num_x_per_ref_img): proposals for each reference image. Returns: - list[Tensor]: The predicted similarity_logits of each pair of key - image and reference image. + tuple[tuple[Tensor], tuple[Tensor]]: Each tuple of tensor denotes + the embed features belonging to an image in a batch. 
""" - x_split = self._forward(x, num_x_per_img) - ref_x_split = self._forward(ref_x, num_x_per_ref_img) + x_split = self.extract_feat(x, num_x_per_img) + ref_x_split = self.extract_feat(ref_x, num_x_per_ref_img) - similarity_logits = [] - for one_x, one_ref_x in zip(x_split, ref_x_split): - similarity_logit = embed_similarity( - one_x, one_ref_x, method='dot_product') - dummy = similarity_logit.new_zeros(one_x.shape[0], 1) - similarity_logit = torch.cat((dummy, similarity_logit), dim=1) - similarity_logits.append(similarity_logit) - return similarity_logits + return x_split, ref_x_split - def get_targets(self, sampling_results, gt_instance_ids, - ref_gt_instance_ids): + def get_targets(self, sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor]) -> Tuple[List, List]: """Calculate the ground truth for all samples in a batch according to the sampling_results. Args: - sampling_results (List[obj:SamplingResults]): Assign results of + sampling_results (List[obj:SamplingResult]): Assign results of all images in a batch after sampling. gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of all images in a batch, each tensor has shape (num_gt, ). @@ -227,35 +228,92 @@ def get_targets(self, sampling_results, gt_instance_ids, return track_id_targets, track_id_weights - @force_fp32(apply_to=('similarity_logits', )) - def loss(self, - similarity_logits, - track_id_targets, - track_id_weights, - reduction_override=None): + def loss( + self, + bbox_feats: Tensor, + ref_bbox_feats: Tensor, + num_bbox_per_img: int, + num_bbox_per_ref_img: int, + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None, + ) -> dict: """Calculate the loss in a batch. Args: - similarity_logits (list[Tensor]): The predicted similarity_logits - of each pair of key image and reference image. - track_id_targets (list[Tensor]): The instance ids of Gt_labels for - all proposals in a batch, each tensor in list has shape - (num_proposals,). - track_id_weights (list[Tensor]): Labels_weights for - all proposals in a batch, each tensor in list has shape - (num_proposals,). + bbox_feats (Tensor): of shape [N, C, H, W]. N is the number of + bboxes. + ref_bbox_feats (Tensor): of shape [M, C, H, W]. M is the number of + reference bboxes. + num_bbox_per_img (list[int]): The `bbox_feats` contains proposals + of multi-images. `num_bbox_per_img` denotes the number of + proposals for each key image. + num_bbox_per_ref_img (list[int]): The `ref_bbox_feats` contains + proposals of multi-images. `num_bbox_per_ref_img` denotes the + number of proposals for each reference image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). reduction_override (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". Returns: dict[str, Tensor]: a dictionary of loss components. 
""" - assert isinstance(similarity_logits, list) + x_split, ref_x_split = self(bbox_feats, ref_bbox_feats, + num_bbox_per_img, num_bbox_per_ref_img) + + losses = self.loss_by_feat(x_split, ref_x_split, sampling_results, + gt_instance_ids, ref_gt_instance_ids, + reduction_override) + return losses + + def loss_by_feat(self, + x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor], + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None) -> dict: + """Calculate losses. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + track_id_targets, track_id_weights = self.get_targets( + sampling_results, gt_instance_ids, ref_gt_instance_ids) assert isinstance(track_id_targets, list) assert isinstance(track_id_weights, list) - assert len(similarity_logits) == len(track_id_targets) assert len(track_id_weights) == len(track_id_targets) + losses = defaultdict(list) + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + assert isinstance(similarity_logits, list) + assert len(similarity_logits) == len(track_id_targets) for similarity_logit, track_id_target, track_id_weight in zip( similarity_logits, track_id_targets, track_id_weights): @@ -289,3 +347,45 @@ def loss(self, for key, value in losses.items(): losses[key] = sum(losses[key]) / len(similarity_logits) return losses + + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + x_split, ref_x_split = self(roi_feats, prev_roi_feats, + [roi_feats.shape[0]], + [prev_roi_feats.shape[0]]) + + similarity_logits = self.predict_by_feat(x_split, ref_x_split) + + return similarity_logits + + def predict_by_feat(self, x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor]) -> List[Tensor]: + """Get similarity_logits. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. 
+ """ + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + return similarity_logits diff --git a/mmtrack/models/track_heads/roi_track_head.py b/mmtrack/models/track_heads/roi_track_head.py index 338891261..568f9c0b4 100644 --- a/mmtrack/models/track_heads/roi_track_head.py +++ b/mmtrack/models/track_heads/roi_track_head.py @@ -1,12 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta +from typing import List, Optional, Tuple -from mmcv.runner import BaseModule -from mmdet.core import bbox2roi, build_assigner, build_sampler -from mmdet.models import HEADS, build_head, build_roi_extractor +from mmdet.models.task_modules import SamplingResult +from mmdet.structures.bbox import bbox2roi +from mmengine.model import BaseModule +from torch import Tensor +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import InstanceList, SampleList -@HEADS.register_module() + +@MODELS.register_module() class RoITrackHead(BaseModule, metaclass=ABCMeta): """The roi track head. @@ -22,12 +27,12 @@ class RoITrackHead(BaseModule, metaclass=ABCMeta): """ def __init__(self, - roi_extractor=None, - embed_head=None, - regress_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, + roi_extractor: Optional[dict] = None, + embed_head: Optional[dict] = None, + regress_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, *args, **kwargs): super().__init__(init_cfg=init_cfg) @@ -42,131 +47,146 @@ def __init__(self, self.init_assigner_sampler() - def init_embed_head(self, roi_extractor, embed_head): + def init_embed_head(self, roi_extractor, embed_head) -> None: """Initialize ``embed_head``""" - self.roi_extractor = build_roi_extractor(roi_extractor) - self.embed_head = build_head(embed_head) + self.roi_extractor = MODELS.build(roi_extractor) + self.embed_head = MODELS.build(embed_head) - def init_assigner_sampler(self): + def init_assigner_sampler(self) -> None: """Initialize assigner and sampler.""" self.bbox_assigner = None self.bbox_sampler = None if self.train_cfg: - self.bbox_assigner = build_assigner(self.train_cfg.assigner) - self.bbox_sampler = build_sampler( - self.train_cfg.sampler, context=self) + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) @property - def with_track(self): - """bool: whether the mulit-object tracker has a embed head""" + def with_track(self) -> bool: + """bool: whether the multi-object tracker has an embed head""" return hasattr(self, 'embed_head') and self.embed_head is not None - def extract_roi_feats(self, x, bboxes): - """Extract roi features.""" + def extract_roi_feats( + self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tuple[Tuple[Tensor], List[int]]: + """Extract roi features. + + Args: + feats (list[Tensor]): list of multi-level image features. + bboxes (list[Tensor]): list of bboxes in sampling result. + + Returns: + tuple[tuple[Tensor], list[int]]: The extracted roi features and + the number of bboxes in each image. 
+ """ rois = bbox2roi(bboxes) - bbox_feats = self.roi_extractor(x[:self.roi_extractor.num_inputs], + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], rois) num_bbox_per_img = [len(bbox) for bbox in bboxes] return bbox_feats, num_bbox_per_img - def forward_train(self, - x, - ref_x, - img_metas, - proposal_list, - gt_bboxes, - ref_gt_bboxes, - gt_labels, - gt_instance_ids, - ref_gt_instance_ids, - gt_bboxes_ignore=None, - **kwargs): - """ - Args: - x (list[Tensor]): list of multi-level image features. - - ref_x (list[Tensor]): list of multi-level ref_img features. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - proposal_list (list[Tensors]): list of region proposals. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - - ref_gt_bboxes (list[Tensor]): Ground truth bboxes for each - reference image with shape (num_gts, 4) in - [tl_x, tl_y, br_x, br_y] format. - - gt_labels (list[Tensor]): class indices corresponding to each box. + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. - gt_instance_ids (None | list[Tensor]): specify the instance id for - each ground truth bbox. - - ref_gt_instance_ids (None | list[Tensor]): specify the instance id - for each ground truth bbox of reference images. + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. Returns: - dict[str, Tensor]: a dictionary of loss components + dict: A dictionary of loss components. 
""" - # assign gts and sample proposals + assert len(rpn_results_list) == len(data_samples) + batch_gt_instances = [] + batch_gt_instances_ignore = [] + for data_sample in data_samples: + batch_gt_instances.append(data_sample.gt_instances) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + if self.with_track: - num_imgs = len(img_metas) - if gt_bboxes_ignore is None: - gt_bboxes_ignore = [None for _ in range(num_imgs)] + num_imgs = len(data_samples) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs sampling_results = [] for i in range(num_imgs): + rpn_results = rpn_results_list[i] + assign_result = self.bbox_assigner.assign( - proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], - gt_labels[i]) + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) sampling_result = self.bbox_sampler.sample( assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], - feats=[lvl_feat[i][None] for lvl_feat in x]) + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) sampling_results.append(sampling_result) losses = dict() if self.with_track: - track_results = self._track_forward_train(x, ref_x, - sampling_results, - ref_gt_bboxes, - gt_instance_ids, - ref_gt_instance_ids) + track_results = self.track_loss(key_feats, ref_feats, + sampling_results, data_samples) losses.update(track_results['loss_track']) return losses - def _track_forward_train(self, x, ref_x, sampling_results, ref_gt_bboxes, - gt_instance_ids, ref_gt_instance_ids, **kwargs): - """Run forward function and calculate loss for track head in - training.""" + def track_loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + sampling_results: List[SamplingResult], + data_samples: SampleList, **kwargs) -> dict: + """Run forward function and calculate loss for track head in training. + + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + sampling_results (list[:obj:`SamplingResult`]): List of Bbox + sampling result. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. 
+ """ bboxes = [res.bboxes for res in sampling_results] - bbox_feats, num_bbox_per_img = self.extract_roi_feats(x, bboxes) + bbox_feats, num_bbox_per_img = self.extract_roi_feats( + key_feats, bboxes) + # batch_size is 1 + ref_gt_bboxes = [data_samples[0].ref_gt_instances.bboxes] ref_bbox_feats, num_bbox_per_ref_img = self.extract_roi_feats( - ref_x, ref_gt_bboxes) + ref_feats, ref_gt_bboxes) - similarity_logits = self.embed_head(bbox_feats, ref_bbox_feats, - num_bbox_per_img, - num_bbox_per_ref_img) + gt_instance_ids = [data_samples[0].gt_instances.instances_id] + ref_gt_instance_ids = [data_samples[0].ref_gt_instances.instances_id] - track_targets = self.embed_head.get_targets(sampling_results, - gt_instance_ids, - ref_gt_instance_ids) - loss_track = self.embed_head.loss(similarity_logits, *track_targets) + loss_track = self.embed_head.loss(bbox_feats, ref_bbox_feats, + num_bbox_per_img, + num_bbox_per_ref_img, + sampling_results, gt_instance_ids, + ref_gt_instance_ids) track_results = dict(loss_track=loss_track) return track_results - def simple_test(self, roi_feats, prev_roi_feats): - """Test without augmentations.""" - return self.embed_head(roi_feats, prev_roi_feats, [roi_feats.shape[0]], - [prev_roi_feats.shape[0]])[0] + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + return self.embed_head.predict(roi_feats, prev_roi_feats)[0] diff --git a/mmtrack/models/track_heads/siamese_rpn_head.py b/mmtrack/models/track_heads/siamese_rpn_head.py index aecd7d3f2..36e4562f4 100644 --- a/mmtrack/models/track_heads/siamese_rpn_head.py +++ b/mmtrack/models/track_heads/siamese_rpn_head.py @@ -1,17 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + import torch import torch.nn as nn from mmcv.cnn.bricks import ConvModule -from mmcv.runner import BaseModule, auto_fp16, force_fp32 -from mmdet.core import build_assigner, build_bbox_coder, build_sampler -from mmdet.core.anchor import build_prior_generator -from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh -from mmdet.models import HEADS, build_loss +from mmdet.structures.bbox.transforms import (bbox_cxcywh_to_xyxy, + bbox_xyxy_to_cxcywh) +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.core.track import depthwise_correlation +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.utils import InstanceList, SampleList +from ..task_modules.track import depthwise_correlation -@HEADS.register_module() +@MODELS.register_module() class CorrelationHead(BaseModule): """Correlation head module. @@ -28,18 +32,18 @@ class CorrelationHead(BaseModule): Defaults to dict(type='BN'). act_cfg (dict): Configuration of activation method after each conv. Defaults to dict(type='ReLU'). - init_cfg (dict or list[dict], optional): Initialization config dict. + init_cfg (Optional(dict)): Initialization config dict. Defaults to None. 
""" def __init__(self, - in_channels, - mid_channels, - out_channels, - kernel_size=3, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - init_cfg=None, + in_channels: int, + mid_channels: int, + out_channels: int, + kernel_size: int = 3, + norm_cfg: dict = dict(type='BN'), + act_cfg: dict = dict(type='ReLU'), + init_cfg: Optional[dict] = None, **kwargs): super(CorrelationHead, self).__init__(init_cfg) self.kernel_convs = ConvModule( @@ -69,7 +73,16 @@ def __init__(self, kernel_size=1, act_cfg=None)) - def forward(self, kernel, search): + def forward(self, kernel: Tensor, search: Tensor) -> Tensor: + """Forward function. + + Args: + kernel (Tensor): The feature map of the template. + search (Tensor): The feature map of search images. + + Returns: + Tensor: The correlation results. + """ kernel = self.kernel_convs(kernel) search = self.search_convs(search) correlation_maps = depthwise_correlation(search, kernel) @@ -77,7 +90,7 @@ def forward(self, kernel, search): return out -@HEADS.register_module() +@MODELS.register_module() class SiameseRPNHead(BaseModule): """Siamese RPN head. @@ -119,32 +132,32 @@ class SiameseRPNHead(BaseModule): """ def __init__(self, - anchor_generator, - in_channels, - kernel_size=3, - norm_cfg=dict(type='BN'), - weighted_sum=False, - bbox_coder=dict( + anchor_generator: dict, + in_channels: int, + kernel_size: int = 3, + norm_cfg: dict = dict(type='BN'), + weighted_sum: bool = False, + bbox_coder: dict = dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[1., 1., 1., 1.]), - loss_cls=dict( + loss_cls: dict = dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), - loss_bbox=dict( + loss_bbox: dict = dict( type='L1Loss', reduction='sum', loss_weight=1.2), - train_cfg=None, - test_cfg=None, - init_cfg=None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, *args, **kwargs): super(SiameseRPNHead, self).__init__(init_cfg) - self.anchor_generator = build_prior_generator(anchor_generator) - self.bbox_coder = build_bbox_coder(bbox_coder) + self.anchor_generator = TASK_UTILS.build(anchor_generator) + self.bbox_coder = TASK_UTILS.build(bbox_coder) self.train_cfg = train_cfg self.test_cfg = test_cfg - self.assigner = build_assigner(self.train_cfg.assigner) - self.sampler = build_sampler(self.train_cfg.sampler) + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.sampler = TASK_UTILS.build(self.train_cfg.sampler) self.fp16_enabled = False self.cls_heads = nn.ModuleList() @@ -164,78 +177,104 @@ def __init__(self, self.cls_weight = nn.Parameter(torch.ones(len(in_channels))) self.reg_weight = nn.Parameter(torch.ones(len(in_channels))) - self.loss_cls = build_loss(loss_cls) - self.loss_bbox = build_loss(loss_bbox) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) - @auto_fp16() - def forward(self, z_feats, x_feats): - """Forward with features `z_feats` of exemplar images and features - `x_feats` of search images. + def forward(self, template_feats: Tuple[Tensor, ...], + search_feats: Tuple[Tensor, ...]) -> Tuple[Tensor, Tensor]: + """Forward with features `template_feats` of template images and + features `search_feats` of search images. Args: - z_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W) - denoting the multi level feature maps of exemplar images. - Typically H and W equal to 7. - x_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W) - denoting the multi level feature maps of search images. 
- Typically H and W equal to 31. + template_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of template + images. Typically H and W equal to 7. + search_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of search + images. Typically H and W equal to 31. Returns: - tuple(cls_score, bbox_pred): cls_score is a Tensor with shape - (N, 2 * num_base_anchors, H, W), bbox_pred is a Tensor with shape - (N, 4 * num_base_anchors, H, W), Typically H and W equal to 25. + tuple(Tensor, Tensor): It contains + - ``cls_score``: a Tensor with shape + (N, 2 * num_base_anchors, H, W) + - ``bbox_pred``: a Tensor with shape + (N, 4 * num_base_anchors, H, W). + Typically H and W equal to 25. """ - assert isinstance(z_feats, tuple) and isinstance(x_feats, tuple) - assert len(z_feats) == len(x_feats) and len(z_feats) == len( - self.cls_heads) + assert isinstance(template_feats, tuple) and isinstance( + search_feats, tuple) + assert len(template_feats) == len(search_feats) and len( + template_feats) == len(self.cls_heads) if self.weighted_sum: cls_weight = nn.functional.softmax(self.cls_weight, dim=0) reg_weight = nn.functional.softmax(self.reg_weight, dim=0) else: reg_weight = cls_weight = [ - 1.0 / len(z_feats) for i in range(len(z_feats)) + 1.0 / len(template_feats) for i in range(len(template_feats)) ] cls_score = 0 bbox_pred = 0 - for i in range(len(z_feats)): - cls_score_single = self.cls_heads[i](z_feats[i], x_feats[i]) - bbox_pred_single = self.reg_heads[i](z_feats[i], x_feats[i]) + for i in range(len(template_feats)): + cls_score_single = self.cls_heads[i](template_feats[i], + search_feats[i]) + bbox_pred_single = self.reg_heads[i](template_feats[i], + search_feats[i]) cls_score += cls_weight[i] * cls_score_single bbox_pred += reg_weight[i] * bbox_pred_single return cls_score, bbox_pred - def _get_init_targets(self, gt_bbox, score_maps_size): + def _get_init_targets(self, bboxes: Tensor, + score_maps_size: torch.Size) -> Tuple[Tensor, ...]: """Initialize the training targets based on flattened anchors of the - last score map.""" + last score map. + + Args: + bboxes (Tensor): The generated anchors. + score_maps_size (torch.Size): denoting the output size + (height, width) of the network. + + Returns: + tuple(Tensor, ...): It contains + - ``labels``: in shape (N, H * W * num_base_anchors) + - ``labels_weights``: in shape (N, H * W * num_base_anchors) + - ``bbox_targets``: in shape (N, H * W * num_base_anchors, 4) + - ``bbox_weights``: in shape (N, H * W * num_base_anchors, 4) + """ num_base_anchors = self.anchor_generator.num_base_anchors[0] H, W = score_maps_size num_anchors = H * W * num_base_anchors - labels = gt_bbox.new_zeros((num_anchors, ), dtype=torch.long) - labels_weights = gt_bbox.new_zeros((num_anchors, )) - bbox_weights = gt_bbox.new_zeros((num_anchors, 4)) - bbox_targets = gt_bbox.new_zeros((num_anchors, 4)) + labels = bboxes.new_zeros((num_anchors, ), dtype=torch.long) + labels_weights = bboxes.new_zeros((num_anchors, )) + bbox_weights = bboxes.new_zeros((num_anchors, 4)) + bbox_targets = bboxes.new_zeros((num_anchors, 4)) return labels, labels_weights, bbox_targets, bbox_weights - def _get_positive_pair_targets(self, gt_bbox, score_maps_size): + def _get_positive_pair_targets( + self, gt_instances: InstanceData, + score_maps_size: torch.Size) -> Tuple[Tensor, ...]: """Generate the training targets for positive exemplar image and search image pair. 
Args: - gt_bbox (Tensor): Ground truth bboxes of an search image with - shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format. - score_maps_size (torch.size): denoting the output size + gt_instances (:obj:`InstanceData`): Groundtruth instances. It + usually includes ``bboxes`` and ``labels`` attributes. + ``bboxes`` of each search image is of + shape (1, 4) in [tl_x, tl_y, br_x, br_y] format. + score_maps_size (torch.Size): denoting the output size (height, width) of the network. Returns: - tuple(labels, labels_weights, bbox_targets, bbox_weights): the - shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,), - (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4) - respectively. All of them are Tensor. + tuple(Tensor, ...): It contains + - ``labels``: in shape (N, H * W * num_base_anchors) + - ``labels_weights``: in shape (N, H * W * num_base_anchors) + - ``bbox_targets``: in shape (N, H * W * num_base_anchors, 4) + - ``bbox_weights``: in shape (N, H * W * num_base_anchors, 4) """ - (labels, labels_weights, _, + gt_bbox = gt_instances.bboxes + (labels, labels_weights, bbox_targets, bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size) if not hasattr(self, 'anchors'): @@ -255,9 +294,10 @@ def _get_positive_pair_targets(self, gt_bbox, score_maps_size): # the searched image. anchors += self.train_cfg.search_size // 2 - assign_result = self.assigner.assign(anchors, gt_bbox[:, 1:]) - sampling_result = self.sampler.sample(assign_result, anchors, - gt_bbox[:, 1:]) + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, gt_instances) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds neg_upper_bound = int(self.sampler.num * @@ -274,30 +314,37 @@ def _get_positive_pair_targets(self, gt_bbox, score_maps_size): labels[neg_inds] = 0 labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2 - bbox_targets = self.bbox_coder.encode( - anchors, gt_bbox[:, 1:].repeat(anchors.shape[0], 1)) + bbox_targets[pos_inds, :] = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + return labels, labels_weights, bbox_targets, bbox_weights - def _get_negative_pair_targets(self, gt_bbox, score_maps_size): + def _get_negative_pair_targets( + self, gt_instances: InstanceData, + score_maps_size: torch.Size) -> Tuple[Tensor, ...]: """Generate the training targets for negative exemplar image and search image pair. Args: - gt_bbox (Tensor): Ground truth bboxes of an search image with - shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format. - score_maps_size (torch.size): denoting the output size + gt_instances (:obj:`InstanceData`): Groundtruth instances. It + usually includes ``bboxes`` and ``labels`` attributes. + ``bboxes`` of each search image is of + shape (1, 4) in [tl_x, tl_y, br_x, br_y] format. + score_maps_size (torch.Size): denoting the output size (height, width) of the network. Returns: - tuple(labels, labels_weights, bbox_targets, bbox_weights): the - shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,), - (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4) - respectively. All of them are Tensor. 
+ tuple(Tensor, ...): It contains + - ``labels``: in shape (N, H * W * num_base_anchors) + - ``labels_weights``: in shape (N, H * W * num_base_anchors) + - ``bbox_targets``: in shape (N, H * W * num_base_anchors, 4) + - ``bbox_weights``: in shape (N, H * W * num_base_anchors, 4) """ + gt_bbox = gt_instances.bboxes (labels, labels_weights, bbox_targets, bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size) H, W = score_maps_size - target_cx, target_cy, _, _ = bbox_xyxy_to_cxcywh(gt_bbox[:, 1:])[0] + target_cx, target_cy, _, _ = bbox_xyxy_to_cxcywh(gt_bbox)[0] anchor_stride = self.anchor_generator.strides[0] cx = W // 2 @@ -328,42 +375,45 @@ def _get_negative_pair_targets(self, gt_bbox, score_maps_size): if len(neg_inds) > 0: labels[neg_inds] = 0 labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2 + + # TODO: check it whether it's right. labels[...] = 0 return labels, labels_weights, bbox_targets, bbox_weights - def get_targets(self, gt_bboxes, score_maps_size, is_positive_pairs): + def get_targets(self, batch_gt_instances: InstanceList, + score_maps_size: torch.Size) -> Tuple[Tensor, ...]: """Generate the training targets for exemplar image and search image pairs. Args: - gt_bboxes (list[Tensor]): Ground truth bboxes of each - search image with shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] - format. - score_maps_size (torch.size): denoting the output size + batch_gt_instances (list[InstanceData]): Batch of + groundtruth instances. It usually includes ``bboxes`` and + ``labels`` attributes. ``bboxes`` of each search image is of + shape (1, 4) in [tl_x, tl_y, br_x, br_y] format. + score_maps_size (torch.Size): denoting the output size (height, width) of the network. - is_positive_pairs (bool): list of bool denoting whether each ground - truth bbox in `gt_bboxes` is positive. Returns: - tuple(all_labels, all_labels_weights, all_bbox_targets, - all_bbox_weights): the shape is (N, H * W * num_base_anchors), - (N, H * W * num_base_anchors), (N, H * W * num_base_anchors, 4), - (N, H * W * num_base_anchors, 4), respectively. All of them are - Tensor. 
+ tuple(Tensor, ...): It contains + - ``all_labels``: in shape (N, H * W * num_base_anchors) + - ``all_labels_weights``: in shape (N, H * W * num_base_anchors) + - ``all_bbox_targets``: in shape (N, H * W * num_base_anchors, 4) + - ``all_bbox_weights``: in shape (N, H * W * num_base_anchors, 4) """ (all_labels, all_labels_weights, all_bbox_targets, all_bbox_weights) = [], [], [], [] - for gt_bbox, is_positive_pair in zip(gt_bboxes, is_positive_pairs): + for gt_instances in batch_gt_instances: + is_positive_pair = gt_instances['labels'][0] if is_positive_pair: (labels, labels_weights, bbox_targets, bbox_weights) = self._get_positive_pair_targets( - gt_bbox, score_maps_size) + gt_instances, score_maps_size) else: (labels, labels_weights, bbox_targets, bbox_weights) = self._get_negative_pair_targets( - gt_bbox, score_maps_size) + gt_instances, score_maps_size) all_labels.append(labels) all_labels_weights.append(labels_weights) @@ -380,24 +430,62 @@ def get_targets(self, gt_bboxes, score_maps_size, is_positive_pairs): return (all_labels, all_labels_weights, all_bbox_targets, all_bbox_weights) - @force_fp32(apply_to=('cls_score', 'bbox_pred')) - def loss(self, cls_score, bbox_pred, labels, labels_weights, bbox_targets, - bbox_weights): + def loss(self, template_feats: Tuple[Tensor], search_feats: Tuple[Tensor], + data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the tracking + head on the features of the upstream network. + + Args: + template_feats (tuple[Tensor, ...]): Tuple of Tensor with + shape (N, C, H, W) denoting the multi level feature maps of + exemplar images. Typically H and W equal to 7. + search_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of + search images. Typically H and W equal to 31. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + outs = self(template_feats, search_feats) + + batch_gt_instances = [] + batch_img_metas = [] + batch_search_gt_instances = [] + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + batch_search_gt_instances.append(data_sample.search_gt_instances) + + loss_inputs = outs + (batch_gt_instances, batch_search_gt_instances, + batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat(self, cls_score: Tensor, bbox_pred: Tensor, + batch_gt_instances: InstanceList, + batch_search_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> dict: """Compute loss. Args: cls_score (Tensor): of shape (N, 2 * num_base_anchors, H, W). bbox_pred (Tensor): of shape (N, 4 * num_base_anchors, H, W). - labels (Tensor): of shape (N, H * W * num_base_anchors). - labels_weights (Tensor): of shape (N, H * W * num_base_anchors). - bbox_targets (Tensor): of shape (N, H * W * num_base_anchors, 4). - bbox_weights (Tensor): of shape (N, H * W * num_base_anchors, 4). + batch_gt_instances (InstanceList): the instances in a batch. + batch_search_gt_instances (InstanceList): the search instances in a + batch. + batch_img_metas (List[dict]): the meta information of all images in + a batch. Returns: - dict[str, Tensor]: a dictionary of loss components + dict[str, Tensor]: A dictionary of loss components. 
""" - losses = {} N, _, H, W = cls_score.shape + (labels, labels_weights, bbox_targets, + bbox_weights) = self.get_targets(batch_search_gt_instances, (H, W)) + + losses = {} cls_score = cls_score.view(N, 2, -1, H, W) cls_score = cls_score.permute(0, 3, 4, 2, 1).contiguous().view(-1, 2) @@ -408,6 +496,7 @@ def loss(self, cls_score, bbox_pred, labels, labels_weights, bbox_targets, bbox_pred = bbox_pred.view(N, 4, -1, H, W) bbox_pred = bbox_pred.permute(0, 3, 4, 2, 1).contiguous().view(-1, 4) + bbox_targets = bbox_targets.view(-1, 4) bbox_weights = bbox_weights.view(-1, 4) losses['loss_rpn_bbox'] = self.loss_bbox( @@ -415,21 +504,62 @@ def loss(self, cls_score, bbox_pred, labels, labels_weights, bbox_targets, return losses - @force_fp32(apply_to=('cls_score', 'bbox_pred')) - def get_bbox(self, cls_score, bbox_pred, prev_bbox, scale_factor): + def predict(self, template_feats: Tuple[Tensor], + search_feats: Tuple[Tensor], data_samples: SampleList, + prev_bbox: Tensor, scale_factor: Tensor) -> InstanceList: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + template_feats (tuple[Tensor, ...]): Tuple of Tensor with + shape (N, C, H, W) denoting the multi level feature maps of + exemplar images. Typically H and W equal to 7. + search_feats (tuple[Tensor, ...]): Tuple of Tensor with shape + (N, C, H, W) denoting the multi level feature maps of + search images. Typically H and W equal to 31. + + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (Tensor): scale factor. + + Returns: + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + outs = self(template_feats, search_feats) + predictions = self.predict_by_feat( + *outs, + prev_bbox=prev_bbox, + scale_factor=scale_factor, + batch_img_metas=batch_img_metas) + return predictions + + def predict_by_feat(self, cls_score: Tensor, bbox_pred: Tensor, + prev_bbox: Tensor, scale_factor: Tensor, + batch_img_metas: List[dict]) -> InstanceList: """Track `prev_bbox` to current frame based on the output of network. Args: cls_score (Tensor): of shape (1, 2 * num_base_anchors, H, W). bbox_pred (Tensor): of shape (1, 4 * num_base_anchors, H, W). prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. - scale_factor (Tensr): scale factor. + scale_factor (Tensor): scale factor. + batch_img_metas (List[dict]): the meta information of all images in + a batch. Returns: - tuple(best_score, best_bbox): best_score is a Tensor denoting the - score of `best_bbox`, best_bbox is a Tensor of shape (4, ) - with [cx, cy, w, h] format, which denotes the best tracked - bbox in current frame. + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. 
""" score_maps_size = [(cls_score.shape[2:])] if not hasattr(self, 'anchors'): @@ -454,7 +584,37 @@ def get_bbox(self, cls_score, bbox_pred, prev_bbox, scale_factor): bbox_pred = bbox_pred.view(4, -1, H, W) bbox_pred = bbox_pred.permute(2, 3, 1, 0).contiguous().view(-1, 4) bbox_pred = self.bbox_coder.decode(self.anchors, bbox_pred) - bbox_pred = bbox_xyxy_to_cxcywh(bbox_pred) + + result = InstanceData() + result.scores = cls_score + result.bboxes = bbox_pred + + return self._bbox_post_process([result], + prev_bbox, + scale_factor, + batch_img_metas=batch_img_metas) + + def _bbox_post_process(self, results: InstanceList, prev_bbox: Tensor, + scale_factor: Tensor, batch_img_metas: List[dict], + **kwargs) -> InstanceList: + """The postprocess of tracking bboxes. + + Args: + results (InstanceList): tracking results. + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (Tensor): scale factor. + batch_img_metas (List[dict]): the meta information of all images in + a batch. + + Returns: + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. + """ + result = results[0] + bbox_pred = bbox_xyxy_to_cxcywh(result.bboxes) def change_ratio(ratio): return torch.max(ratio, 1. / ratio) @@ -476,16 +636,16 @@ def enlarge_size(w, h): # penalize cls_score penalty = torch.exp(-(aspect_ratio_penalty * scale_penalty - 1) * self.test_cfg.penalty_k) - penalty_score = penalty * cls_score + penalty_score = penalty * result.scores # window penalty penalty_score = penalty_score * (1 - self.test_cfg.window_influence) \ + self.windows * self.test_cfg.window_influence best_idx = torch.argmax(penalty_score) - best_score = cls_score[best_idx] - best_bbox = bbox_pred[best_idx, :] / scale_factor + result = result[best_idx.item()] + best_bbox = bbox_pred[best_idx, :] / scale_factor final_bbox = torch.zeros_like(best_bbox) # map the bbox center from the searched image to the original image. @@ -493,8 +653,34 @@ def enlarge_size(w, h): final_bbox[1] = best_bbox[1] + prev_bbox[1] # smooth bbox - lr = penalty[best_idx] * cls_score[best_idx] * self.test_cfg.lr + lr = penalty[best_idx] * result.scores * self.test_cfg.lr final_bbox[2] = prev_bbox[2] * (1 - lr) + best_bbox[2] * lr final_bbox[3] = prev_bbox[3] * (1 - lr) + best_bbox[3] * lr - return best_score, final_bbox + # clip boundary + img_shape = batch_img_metas[0]['ori_shape'] + # rather than [x1, x2, y1, y2] format. + final_bbox = self._bbox_clip(final_bbox, img_shape[0], img_shape[1]) + + result.bboxes = bbox_cxcywh_to_xyxy(final_bbox)[None] + + return [result] + + def _bbox_clip(self, bbox: Tensor, img_h: int, img_w: int) -> Tensor: + """Clip the bbox with [cx, cy, w, h] format. + + Args: + img (Tensor): of shape (1, C, H, W) encoding original input + image. + bbox (Tensor): The given instance bbox of first frame that + need be tracked in the following frames. The shape of the box + is of (4, ) shape in [cx, cy, w, h] format. + + Returns: + Tensor: The clipped boxes. 
+ """ + bbox[0] = bbox[0].clamp(0., img_w) + bbox[1] = bbox[1].clamp(0., img_h) + bbox[2] = bbox[2].clamp(10., img_w) + bbox[3] = bbox[3].clamp(10., img_h) + return bbox diff --git a/mmtrack/models/track_heads/stark_head.py b/mmtrack/models/track_heads/stark_head.py index 1caf5e8c7..3dbf1adce 100644 --- a/mmtrack/models/track_heads/stark_head.py +++ b/mmtrack/models/track_heads/stark_head.py @@ -1,19 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict +from typing import Dict, List, Tuple import torch import torch.nn.functional as F from mmcv.cnn.bricks import ConvModule from mmcv.cnn.bricks.transformer import build_positional_encoding -from mmcv.runner.base_module import BaseModule -from mmdet.models import HEADS -from mmdet.models.builder import build_head, build_loss -from mmdet.models.utils import Transformer, build_transformer -from mmdet.models.utils.builder import TRANSFORMER -from torch import nn +from mmdet.models.layers import Transformer +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor, nn +from mmtrack.registry import MODELS +from mmtrack.utils import InstanceList, OptConfigType, SampleList -@HEADS.register_module() + +@MODELS.register_module() class CornerPredictorHead(BaseModule): """Corner Predictor head. @@ -24,13 +26,20 @@ class CornerPredictorHead(BaseModule): stride (int): the stride of feature map from the backbone """ - def __init__(self, inplanes, channel, feat_size=20, stride=16): + def __init__(self, + inplanes: int, + channel: int, + feat_size: int = 20, + stride: int = 16): super(CornerPredictorHead, self).__init__() self.feat_size = feat_size self.stride = stride self.img_size = self.feat_size * self.stride - def conv_module(in_planes, out_planes, kernel_size=3, padding=1): + def conv_module(in_planes: int, + out_planes: int, + kernel_size: int = 3, + padding: int = 1): # The module's pipeline: Conv -> BN -> ReLU. return ConvModule( in_channels=in_planes, @@ -55,7 +64,7 @@ def conv_module(in_planes, out_planes, kernel_size=3, padding=1): conv_module(channel // 4, channel // 8), nn.Conv2d(channel // 8, 1, kernel_size=1)) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """Forward pass with input x. Args: @@ -68,7 +77,7 @@ def forward(self, x): coorx_br, coory_br = self.soft_argmax(score_map_br) return torch.stack((coorx_tl, coory_tl, coorx_br, coory_br), dim=1) - def get_score_map(self, x): + def get_score_map(self, x: Tensor) -> Tuple[Tensor, Tensor]: """Score map branch. Args: @@ -83,11 +92,11 @@ def get_score_map(self, x): score_map_br = self.br_corner_pred(x) return score_map_tl, score_map_br - def soft_argmax(self, score_map): + def soft_argmax(self, score_map: Tensor) -> Tuple[Tensor, Tensor]: """Get soft-argmax coordinate for the given score map. Args: - score_map (self.feat_size, self.feat_size): the last score map + score_map (Tensor): the last score map in bbox_head branch Returns: @@ -116,7 +125,7 @@ def soft_argmax(self, score_map): return soft_argmax_x, soft_argmax_y -@HEADS.register_module() +@MODELS.register_module() class ScoreHead(nn.Module): """Predict the confidence score of target in current frame. 
@@ -132,11 +141,11 @@ class ScoreHead(nn.Module): """ def __init__(self, - input_dim, - hidden_dim, - output_dim, - num_layers, - use_bn=False): + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + use_bn: bool = False): super(ScoreHead, self).__init__() self.num_layers = num_layers hidden_dims = [hidden_dim] * (num_layers - 1) @@ -151,21 +160,23 @@ def __init__(self, for n, k in zip([input_dim] + hidden_dims, hidden_dims + [output_dim])) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """Forward function for `ScoreHead`. Args: x (Tensor): of shape (1, bs, num_query, c). Returns: - Tensor: of shape (bs, num_query, 1). + Tensor: of shape (bs * num_query, 1). """ + # TODO: Perform sigmoid to the last output here rather than in loss + # calculation. for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x.squeeze(0) + return x.view(-1, 1) -@TRANSFORMER.register_module() +@MODELS.register_module() class StarkTransformer(Transformer): """The transformer head used in STARK. `STARK. @@ -176,19 +187,25 @@ class StarkTransformer(Transformer): `_ for details. Args: - encoder (`mmcv.ConfigDict` | Dict): Config of + encoder (`mmengine.ConfigDict` | Dict): Config of TransformerEncoder. Defaults to None. - decoder ((`mmcv.ConfigDict` | Dict)): Config of + decoder ((`mmengine.ConfigDict` | Dict)): Config of TransformerDecoder. Defaults to None - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization. Defaults to None. """ - def __init__(self, encoder=None, decoder=None, init_cfg=None): + def __init__( + self, + encoder: OptConfigType = None, + decoder: OptConfigType = None, + init_cfg: OptConfigType = None, + ): super(StarkTransformer, self).__init__( encoder=encoder, decoder=decoder, init_cfg=init_cfg) - def forward(self, x, mask, query_embed, pos_embed): + def forward(self, x: Tensor, mask: Tensor, query_embed: Tensor, + pos_embed: Tensor) -> Tuple[Tensor, Tensor]: """Forward function for `StarkTransformer`. The difference with transofrmer module in `MMCV` is the input shape. @@ -213,7 +230,8 @@ def forward(self, x, mask, query_embed, pos_embed): 'x_feat_h' and 'x_feat_w' denote the height and width of search features respectively. Returns: - tuple[Tensor]: results of decoder containing the following tensor. + tuple[Tensor, Tensor]: results of decoder containing the following + tensor. - out_dec: Output from decoder. If return_intermediate_dec \ is True, output has shape [num_dec_layers, bs, num_query, embed_dims], else has shape [1, bs, \ @@ -245,7 +263,7 @@ def forward(self, x, mask, query_embed, pos_embed): return out_dec, enc_mem -@HEADS.register_module() +@MODELS.register_module() class StarkHead(BaseModule): """STARK head module for bounding box regression and prediction of confidence score of tracking bbox. @@ -256,23 +274,23 @@ class StarkHead(BaseModule): Args: num_query (int): Number of query in transformer. - transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. + transformer (obj:`mmengine.ConfigDict`|dict): Config for transformer. Default: None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): + positional_encoding (obj:`mmengine.ConfigDict`|dict): Config for position encoding. - bbox_head (obj:`mmcv.ConfigDict`|dict, optional): Config for bbox head. - Defaults to None. 
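`ScoreHead` is a small MLP whose output is now flattened to `(bs * num_query, 1)`, with the sigmoid deferred to the loss/post-processing as noted in the TODO. A minimal sketch of that contract (layer count and the optional BN branch are simplified):

```python
# Minimal MLP confidence head mirroring the shape contract described above:
# input (1, bs, num_query, c), output (bs * num_query, 1).
import torch
import torch.nn.functional as F
from torch import nn


class TinyScoreHead(nn.Module):

    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int = 3):
        super().__init__()
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [1]
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip(dims[:-1], dims[1:]))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < len(self.layers) - 1 else layer(x)
        # flatten (1, bs, num_query, 1) -> (bs * num_query, 1); the sigmoid is
        # left to the caller, matching the TODO in the diff.
        return x.view(-1, 1)


# e.g. TinyScoreHead(256, 256)(torch.rand(1, 2, 1, 256)).shape == (2, 1)
```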
- cls_head (obj:`mmcv.ConfigDict`|dict, optional): Config for + bbox_head (obj:`mmengine.ConfigDict`|dict, optional): Config for bbox + head. Defaults to None. + cls_head (obj:`mmengine.ConfigDict`|dict, optional): Config for classification head. Defaults to None. - loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the + loss_cls (obj:`mmengine.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. - loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the bbox + loss_bbox (obj:`mmengine.ConfigDict`|dict): Config of the bbox regression loss. Default `L1Loss`. - loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the bbox + loss_iou (obj:`mmengine.ConfigDict`|dict): Config of the bbox regression iou loss. Default `GIoULoss`. - tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of + tran_cfg (obj:`mmengine.ConfigDict`|dict): Training config of transformer head. - test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of + test_cfg (obj:`mmengine.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None @@ -300,20 +318,20 @@ def __init__(self, frozen_modules=None, **kwargs): super(StarkHead, self).__init__(init_cfg=init_cfg) - self.transformer = build_transformer(transformer) + self.transformer = MODELS.build(transformer) self.positional_encoding = build_positional_encoding( positional_encoding) assert bbox_head is not None - self.bbox_head = build_head(bbox_head) + self.bbox_head = MODELS.build(bbox_head) if cls_head is None: # the stage-1 training - self.loss_bbox = build_loss(loss_bbox) - self.loss_iou = build_loss(loss_iou) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_iou = MODELS.build(loss_iou) self.cls_head = None else: # the stage-2 training - self.cls_head = build_head(cls_head) - self.loss_cls = build_loss(loss_cls) + self.cls_head = MODELS.build(cls_head) + self.loss_cls = MODELS.build(loss_cls) self.embed_dims = self.transformer.embed_dims self.num_query = num_query self.query_embedding = nn.Embedding(self.num_query, self.embed_dims) @@ -336,7 +354,7 @@ def init_weights(self): """Parameters initialization.""" self.transformer.init_weights() - def _merge_template_search(self, inputs): + def _merge_template_search(self, inputs: List[Dict[str, Tensor]]) -> dict: """Merge the data of template and search images. The merge includes 3 steps: flatten, premute and concatenate. Note: the data of search image must be in the last place. @@ -379,12 +397,12 @@ def _merge_template_search(self, inputs): seq_dict[name] = torch.cat(x, dim=0) return seq_dict - def forward_bbox_head(self, feat, enc_mem): + def forward_bbox_head(self, feat: Tensor, enc_mem: Tensor) -> Tensor: """ Args: - feat: output embeddings of decoder, with shape + feat (Tensor): output embeddings of decoder, with shape (1, bs, num_query, c). - enc_mem: output embeddings of encoder, with shape + enc_mem (Tensor): output embeddings of encoder, with shape (feats_flatten_len, bs, C) Here, 'feats_flatten_len' = z_feat_h*z_feat_w*2 + \ @@ -394,7 +412,7 @@ def forward_bbox_head(self, feat, enc_mem): 'x_feat_h' and 'x_feat_w' denote the height and width of search features respectively. Returns: - Tensor: of shape (bs, num_query, 4). The bbox is in + Tensor: of shape (bs * num_query, 4). The bbox is in [tl_x, tl_y, br_x, br_y] format. 
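`_merge_template_search` flattens each `(bs, c, h, w)` map into a `(h*w, bs, c)` sequence and concatenates template and search sequences with the search features last, which is where the `feats_flatten_len = z_feat_h*z_feat_w*2 + x_feat_h*x_feat_w` term in the docstrings comes from. A feature-only sketch (masks and positional embeddings are merged the same way and omitted here):

```python
# Sketch of the flatten -> permute -> concatenate merge described for
# `_merge_template_search`.
import torch


def merge_template_search(feats):
    """feats: list of (bs, c, h, w) tensors, search feature last."""
    seqs = []
    for feat in feats:
        bs, c, h, w = feat.shape
        seqs.append(feat.flatten(2).permute(2, 0, 1))   # (h*w, bs, c)
    return torch.cat(seqs, dim=0)                       # (sum(h*w), bs, c)


# Two 20x20 template maps + one 32x32 search map give a merged sequence of
# length 20*20*2 + 32*32, i.e. `feats_flatten_len` in the docstrings above.
```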
""" z_feat_len = self.bbox_head.feat_size**2 @@ -413,10 +431,9 @@ def forward_bbox_head(self, feat, enc_mem): self.bbox_head.feat_size) # run the corner prediction head outputs_coord = self.bbox_head(bbox_feat) - outputs_coord = outputs_coord.view(bs, num_query, 4) return outputs_coord - def forward(self, inputs): + def forward(self, inputs: List[dict]) -> dict: """" Args: inputs (list[dict(tuple(Tensor))]): The list contains the @@ -429,10 +446,10 @@ def forward(self, inputs): image respectively. `stride` is the stride of feature map. Returns: - (dict): - - 'pred_bboxes': (Tensor) of shape (bs, num_query, 4), in + dict: + - 'pred_bboxes': (Tensor) of shape (bs * num_query, 4), in [tl_x, tl_y, br_x, br_y] format - - 'pred_logit': (Tensor) of shape (bs, num_query, 1) + - 'pred_logit': (Tensor) of shape (bs * num_query, 1) """ # 1. preprocess inputs for transformer all_inputs = [] @@ -454,53 +471,256 @@ def forward(self, inputs): all_inputs['pos_embed']) # 3. forward bbox head and classification head - track_results = {} if not self.training: + pred_logits = None if self.cls_head is not None: # forward the classification head - track_results['pred_logits'] = self.cls_head(outs_dec) - track_results['pred_bboxes'] = self.forward_bbox_head( - outs_dec, enc_mem) + pred_logits = self.cls_head(outs_dec) + pred_bboxes = self.forward_bbox_head(outs_dec, enc_mem) else: if self.cls_head is not None: # stage-1 training: forward the classification head - track_results['pred_logits'] = self.cls_head(outs_dec) + pred_logits = self.cls_head(outs_dec) + pred_bboxes = None else: # stage-2 training: forward the box prediction head - track_results['pred_bboxes'] = self.forward_bbox_head( - outs_dec, enc_mem) - return track_results + pred_logits = None + pred_bboxes = self.forward_bbox_head(outs_dec, enc_mem) + + return pred_logits, pred_bboxes + + def predict(self, inputs: List[dict], data_samples: SampleList, + prev_bbox: Tensor, scale_factor: Tensor) -> InstanceList: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + inputs (list[dict(tuple(Tensor))]): The list contains the + multi-level features and masks of template or search images. + - 'feat': (tuple(Tensor)), the Tensor is of shape + (bs, c, h//stride, w//stride). + - 'mask': (Tensor), of shape (bs, h, w). + + Here, `h` and `w` denote the height and width of input + image respectively. `stride` is the stride of feature map. + + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (Tensor): scale factor. - def loss(self, track_results, gt_bboxes, gt_labels, img_size=None): + Returns: + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. 
+ """ + batch_img_metas = [ + data_samples.metainfo for data_samples in data_samples + ] + outs = self(inputs) + predictions = self.predict_by_feat( + *outs, + prev_bbox=prev_bbox, + scale_factor=scale_factor, + batch_img_metas=batch_img_metas) + return predictions + + def predict_by_feat(self, pred_logits: Tensor, pred_bboxes: Tensor, + prev_bbox: Tensor, scale_factor: Tensor, + batch_img_metas: List[dict]) -> InstanceList: + """Track `prev_bbox` to current frame based on the output of network. + + Args: + pred_logit: (Tensor) of shape (bs * num_query, 1). This item + only exists when the model has classification head. + pred_bboxes: (Tensor) of shape (bs * num_query, 4), in + [tl_x, tl_y, br_x, br_y] format + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (Tensor): scale factor. + batch_img_metas (List[dict]): the meta information of all images in + a batch. + + Returns: + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. + """ + result = InstanceData() + if pred_logits is not None: + result.scores = pred_logits.view(-1).sigmoid() + else: + result.scores = prev_bbox.new_tensor([-1.]) + result.bboxes = pred_bboxes + + return self._bbox_post_process([result], + prev_bbox, + scale_factor, + batch_img_metas=batch_img_metas) + + def _bbox_post_process(self, results: InstanceList, prev_bbox: Tensor, + scale_factor: Tensor, batch_img_metas: List[dict], + **kwargs) -> InstanceList: + """The postprocess of tracking bboxes. + + Args: + results (InstanceList): tracking results. + prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format. + scale_factor (Tensor): scale factor. + batch_img_metas (List[dict]): the meta information of all images in + a batch. + + Returns: + List[:obj:`InstanceData`]: Object tracking results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape (1, ) + - bboxes (Tensor): Has a shape (1, 4), + the last dimension 4 arrange as [x1, y1, x2, y2]. + """ + result = results[0] + final_bbox = self._mapping_bbox_back(result.bboxes, prev_bbox, + scale_factor) + img_shape = batch_img_metas[0]['ori_shape'] + final_bbox = self._bbox_clip( + final_bbox, img_shape[0], img_shape[1], margin=10) + result.bboxes = final_bbox[None] + + return [result] + + def _mapping_bbox_back(self, pred_bboxes: Tensor, prev_bbox: Tensor, + resize_factor: float) -> Tensor: + """Mapping the `prediction bboxes` from resized cropped image to + original image. The coordinate origins of them are both the top left + corner. + + Args: + pred_bboxes (Tensor): the predicted bbox of shape + (bs * num_query, 4), in [tl_x, tl_y, br_x, br_y] format. + The coordinates are based in the resized cropped image. + prev_bbox (Tensor): the previous bbox of shape (B, 4), + in [cx, cy, w, h] format. The coordinates are based in the + original image. + resize_factor (float): the ratio of original image scale to cropped + image scale. + Returns: + (Tensor): of (4, ) shape, in [tl_x, tl_y, br_x, br_y] format. 
+ """ + # based in the original croped image + pred_bbox = pred_bboxes.mean(dim=0) / resize_factor + + # the half size of the original croped image + cropped_img_half_size = 0.5 * self.test_cfg[ + 'search_size'] / resize_factor + # (x_shift, y_shift) is the coordinate of top left corner of the + # cropped image based in the original image. + x_shift, y_shift = prev_bbox[0] - cropped_img_half_size, prev_bbox[ + 1] - cropped_img_half_size + pred_bbox[0:4:2] += x_shift + pred_bbox[1:4:2] += y_shift + + return pred_bbox + + def _bbox_clip(self, + bbox: Tensor, + img_h: int, + img_w: int, + margin: int = 0) -> Tensor: + """Clip the bbox in [tl_x, tl_y, br_x, br_y] format. + + Args: + bbox (Tensor): Bounding bbox. + img_h (int): The height of the image. + img_w (int): The width of the image. + margin (int, optional): The distance from image boundary. + Defaults to 0. + + Returns: + Tensor: The clipped bounding box. + """ + + bbox_w, bbox_h = bbox[2] - bbox[0], bbox[3] - bbox[1] + bbox[0] = bbox[0].clamp(0, img_w - margin) + bbox[1] = bbox[1].clamp(0, img_h - margin) + bbox_w = bbox_w.clamp(margin, img_w) + bbox_h = bbox_h.clamp(margin, img_h) + bbox[2] = bbox[0] + bbox_w + bbox[3] = bbox[1] + bbox_h + return bbox + + # TODO: unify the `sefl.predict`, `self.loss` and so on in all the heads of + # SOT. + def loss(self, inputs: List[dict], data_samples: SampleList, + **kwargs) -> dict: """Compute loss. Args: - track_results (dict): it may contains the following keys: - - 'pred_bboxes': bboxes of (N, num_query, 4) shape in - [tl_x, tl_y, br_x, br_y] format. - - 'pred_logits': bboxes of (N, num_query, 1) shape. - gt_bboxes (list[Tensor]): ground truth bboxes for search images - with shape (N, 5) in [0., tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): ground truth labels for - search images with shape (N, 2). - img_size (tuple, optional): the size (h, w) of original - search image. Defaults to None. + inputs (list[dict(tuple(Tensor))]): The list contains the + multi-level features and masks of template or search images. + - 'feat': (tuple(Tensor)), the Tensor is of shape + (bs, c, h//stride, w//stride). + - 'mask': (Tensor), of shape (bs, h, w). + Here, `h` and `w` denote the height and width of input + image respectively. `stride` is the stride of feature map. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance` + and 'metainfo'. Returns: dict[str, Tensor]: a dictionary of loss components. """ - losses = dict() + outs = self(inputs) + + batch_gt_instances = [] + batch_img_metas = [] + batch_search_gt_instances = [] + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + batch_search_gt_instances.append(data_sample.search_gt_instances) + loss_inputs = outs + (batch_gt_instances, batch_search_gt_instances, + batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat(self, pred_logits: Tensor, pred_bboxes: Tensor, + batch_gt_instances: InstanceList, + batch_search_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> dict: + """Compute loss. + + Args: + pred_logits: (Tensor) of shape (bs * num_query, 1). This item + only exists when the model has classification head. + pred_bboxes: (Tensor) of shape (bs * num_query, 4), in + [tl_x, tl_y, br_x, br_y] format + batch_gt_instances (InstanceList): the instances in a batch. + batch_search_gt_instances (InstanceList): the search instances in a + batch. 
+ batch_img_metas (List[dict]): the meta information of all images in + a batch. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + losses = dict() if self.cls_head is None: # the stage-1 training - assert img_size is not None - pred_bboxes = track_results['pred_bboxes'][:, 0] # shape [N, 4] - pred_bboxes[:, 0:4:2] = pred_bboxes[:, 0:4:2] / float(img_size[1]) - pred_bboxes[:, 1:4:2] = pred_bboxes[:, 1:4:2] / float(img_size[0]) - - gt_bboxes = torch.cat(gt_bboxes, dim=0).type(torch.float32)[:, 1:] - gt_bboxes[:, 0:4:2] = gt_bboxes[:, 0:4:2] / float(img_size[1]) - gt_bboxes[:, 1:4:2] = gt_bboxes[:, 1:4:2] / float(img_size[0]) + assert pred_bboxes is not None + img_shape = batch_img_metas[0]['search_img_shape'] + pred_bboxes[:, 0:4:2] = pred_bboxes[:, 0:4:2] / float(img_shape[1]) + pred_bboxes[:, 1:4:2] = pred_bboxes[:, 1:4:2] / float(img_shape[0]) + + gt_bboxes = [ + instance['bboxes'] for instance in batch_search_gt_instances + ] + gt_bboxes = torch.cat(gt_bboxes, dim=0).type(torch.float32) + gt_bboxes[:, 0:4:2] = gt_bboxes[:, 0:4:2] / float(img_shape[1]) + gt_bboxes[:, 1:4:2] = gt_bboxes[:, 1:4:2] / float(img_shape[0]) gt_bboxes = gt_bboxes.clamp(0., 1.) # regression IoU loss, default GIoU loss @@ -515,10 +735,14 @@ def loss(self, track_results, gt_bboxes, gt_labels, img_size=None): losses['loss_bbox'] = self.loss_bbox(pred_bboxes, gt_bboxes) else: # the stage-2 training - assert gt_labels is not None - pred_logits = track_results['pred_logits'][:, 0].squeeze() + assert pred_logits is not None + pred_logits = pred_logits.squeeze() + + gt_labels = [ + instance['labels'] for instance in batch_search_gt_instances + ] gt_labels = torch.cat( - gt_labels, dim=0).type(torch.float32)[:, 1:].squeeze() + gt_labels, dim=0).type(torch.float32).squeeze() losses['loss_cls'] = self.loss_cls(pred_logits, gt_labels) return losses diff --git a/mmtrack/models/trackers/__init__.py b/mmtrack/models/trackers/__init__.py index bd5094e9c..5fd511386 100644 --- a/mmtrack/models/trackers/__init__.py +++ b/mmtrack/models/trackers/__init__.py @@ -2,12 +2,15 @@ from .base_tracker import BaseTracker from .byte_tracker import ByteTracker from .masktrack_rcnn_tracker import MaskTrackRCNNTracker +from .ocsort_tracker import OCSORTTracker from .quasi_dense_tao_tracker import QuasiDenseTAOTracker from .quasi_dense_tracker import QuasiDenseTracker -from .sort_tracker import SortTracker +from .sort_tracker import SORTTracker +from .strongsort_tracker import StrongSORTTracker from .tracktor_tracker import TracktorTracker __all__ = [ - 'BaseTracker', 'TracktorTracker', 'SortTracker', 'MaskTrackRCNNTracker', - 'ByteTracker', 'QuasiDenseTracker', 'QuasiDenseTAOTracker' + 'BaseTracker', 'ByteTracker', 'MaskTrackRCNNTracker', 'OCSORTTracker', 'SORTTracker', + 'QuasiDenseTracker', 'QuasiDenseTAOTracker', 'TracktorTracker', + 'StrongSORTTracker' ] diff --git a/mmtrack/models/trackers/base_tracker.py b/mmtrack/models/trackers/base_tracker.py index 0b10c6d02..b441873ac 100644 --- a/mmtrack/models/trackers/base_tracker.py +++ b/mmtrack/models/trackers/base_tracker.py @@ -1,30 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod +from typing import List, Optional, Tuple import torch import torch.nn.functional as F from addict import Dict -from mmcv.runner import BaseModule -from mmtrack.models import TRACKERS - -@TRACKERS.register_module() -class BaseTracker(BaseModule, metaclass=ABCMeta): +class BaseTracker(metaclass=ABCMeta): """Base tracker model. 
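In the stage-1 branch of `loss_by_feat`, both predicted and ground-truth boxes are normalized by the search-image shape (and the targets clamped to `[0, 1]`) before the regression terms. A sketch of that normalization with a plain L1 term; the real head adds the configured mmdet `GIoULoss` on the same normalized boxes.

```python
# Resolution-independent box regression as in the stage-1 STARK loss:
# normalize by the search-image shape, then apply L1 (GIoU is analogous).
import torch
import torch.nn.functional as F


def stage1_l1_loss(pred_bboxes, gt_bboxes, search_img_shape):
    """pred/gt: (N, 4) in [tl_x, tl_y, br_x, br_y]; shape: (h, w)."""
    h, w = float(search_img_shape[0]), float(search_img_shape[1])
    scale = pred_bboxes.new_tensor([w, h, w, h])
    pred = pred_bboxes / scale
    gt = (gt_bboxes / scale).clamp(0., 1.)
    return F.l1_loss(pred, gt)
```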
Args: momentums (dict[str:float], optional): Momentums to update the buffers. The `str` indicates the name of the buffer while the `float` - indicates the momentum. Default to None. + indicates the momentum. Defaults to None. num_frames_retain (int, optional). If a track is disappeared more than `num_frames_retain` frames, it will be deleted in the memo. - init_cfg (dict or list[dict], optional): Initialization config dict. - Defaults to None. + Defaults to 10. """ - def __init__(self, momentums=None, num_frames_retain=10, init_cfg=None): - super().__init__(init_cfg) + def __init__(self, + momentums: Optional[dict] = None, + num_frames_retain: int = 10) -> None: + super().__init__() if momentums is not None: assert isinstance(momentums, dict), 'momentums must be a dict' self.momentums = momentums @@ -33,27 +31,27 @@ def __init__(self, momentums=None, num_frames_retain=10, init_cfg=None): self.reset() - def reset(self): + def reset(self) -> None: """Reset the buffer of the tracker.""" self.num_tracks = 0 self.tracks = dict() @property - def empty(self): + def empty(self) -> bool: """Whether the buffer is empty or not.""" return False if self.tracks else True @property - def ids(self): + def ids(self) -> List[dict]: """All ids in the tracker.""" return list(self.tracks.keys()) @property - def with_reid(self): + def with_reid(self) -> bool: """bool: whether the framework has a reid model""" return hasattr(self, 'reid') and self.reid is not None - def update(self, **kwargs): + def update(self, **kwargs) -> None: """Update the tracker. Args: @@ -92,7 +90,7 @@ def update(self, **kwargs): self.pop_invalid_tracks(frame_id) - def pop_invalid_tracks(self, frame_id): + def pop_invalid_tracks(self, frame_id: int) -> None: """Pop out invalid tracks.""" invalid_ids = [] for k, v in self.tracks.items(): @@ -101,7 +99,7 @@ def pop_invalid_tracks(self, frame_id): for invalid_id in invalid_ids: self.tracks.pop(invalid_id) - def update_track(self, id, obj): + def update_track(self, id: int, obj: Tuple[torch.Tensor]): """Update a track.""" for k, v in zip(self.memo_items, obj): v = v[None] @@ -111,7 +109,7 @@ def update_track(self, id, obj): else: self.tracks[id][k].append(v) - def init_track(self, id, obj): + def init_track(self, id: int, obj: Tuple[torch.Tensor]): """Initialize a track.""" self.tracks[id] = Dict() for k, v in zip(self.memo_items, obj): @@ -122,7 +120,7 @@ def init_track(self, id, obj): self.tracks[id][k] = [v] @property - def memo(self): + def memo(self) -> dict: """Return all buffers in the tracker.""" outs = Dict() for k in self.memo_items: @@ -142,12 +140,16 @@ def memo(self): outs[k] = torch.cat(v, dim=0) return outs - def get(self, item, ids=None, num_samples=None, behavior=None): + def get(self, + item: str, + ids: Optional[list] = None, + num_samples: Optional[int] = None, + behavior: Optional[str] = None) -> torch.Tensor: """Get the buffer of a specific item. Args: item (str): The demanded item. - ids (list[int]): The demanded ids. + ids (list[int], optional): The demanded ids. Defaults to None. num_samples (int, optional): Number of samples to calculate the results. Defaults to None. behavior (str, optional): Behavior to calculate the results. @@ -182,14 +184,18 @@ def track(self, *args, **kwargs): """Tracking forward function.""" pass - def crop_imgs(self, img, img_metas, bboxes, rescale=False): + def crop_imgs(self, + img: torch.Tensor, + meta_info: dict, + bboxes: torch.Tensor, + rescale: bool = False) -> torch.Tensor: """Crop the images according to some bounding boxes. 
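`BaseTracker.update_track` applies one of two rules per buffered field: an exponential moving average when a momentum is configured for that field, otherwise a plain append (with stale tracks dropped after `num_frames_retain` frames by `pop_invalid_tracks`). A minimal sketch of the two rules, assuming the `(1 - m) * old + m * new` form used by the tracker:

```python
# Per-field buffer update: EMA for fields listed in `momentums`, history
# append for everything else.
import torch

momentums = {'embeds': 0.9}                          # field name -> momentum
track = {'embeds': torch.zeros(1, 4), 'bboxes': []}

new_embed = torch.ones(1, 4)
m = momentums['embeds']
track['embeds'] = (1 - m) * track['embeds'] + m * new_embed   # EMA update

track['bboxes'].append(torch.tensor([[0., 0., 10., 10.]]))    # plain history
```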
Typically for re- identification sub-module. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. + img (Tensor): of shape (T, C, H, W) encoding input image. Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict + meta_info (dict): image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. bboxes (Tensor): of shape (N, 4) or (N, 5). @@ -197,22 +203,25 @@ def crop_imgs(self, img, img_metas, bboxes, rescale=False): rescaled to fit the scale of the image. Defaults to False. Returns: - Tensor: Image tensor of shape (N, C, H, W). + Tensor: Image tensor of shape (T, C, H, W). """ - h, w, _ = img_metas[0]['img_shape'] + h, w = meta_info['img_shape'] img = img[:, :, :h, :w] if rescale: - bboxes[:, :4] *= torch.tensor(img_metas[0]['scale_factor']).to( - bboxes.device) - bboxes[:, 0::2] = torch.clamp(bboxes[:, 0::2], min=0, max=w) - bboxes[:, 1::2] = torch.clamp(bboxes[:, 1::2], min=0, max=h) + factor_x, factor_y = meta_info['scale_factor'] + bboxes[:, :4] *= torch.tensor( + [factor_x, factor_y, factor_x, factor_y]).to(bboxes.device) + bboxes[:, 0] = torch.clamp(bboxes[:, 0], min=0, max=w - 1) + bboxes[:, 1] = torch.clamp(bboxes[:, 1], min=0, max=h - 1) + bboxes[:, 2] = torch.clamp(bboxes[:, 2], min=1, max=w) + bboxes[:, 3] = torch.clamp(bboxes[:, 3], min=1, max=h) crop_imgs = [] for bbox in bboxes: x1, y1, x2, y2 = map(int, bbox) - if x2 == x1: + if x2 <= x1: x2 = x1 + 1 - if y2 == y1: + if y2 <= y1: y2 = y1 + 1 crop_img = img[:, :, y1:y2, x1:x2] if self.reid.get('img_scale', False): @@ -225,5 +234,8 @@ def crop_imgs(self, img, img_metas, bboxes, rescale=False): if len(crop_imgs) > 0: return torch.cat(crop_imgs, dim=0) + elif self.reid.get('img_scale', False): + _h, _w = self.reid['img_scale'] + return img.new_zeros((0, 3, _h, _w)) else: - return img.new_zeros((0, )) + return img.new_zeros((0, 3, h, w)) diff --git a/mmtrack/models/trackers/byte_tracker.py b/mmtrack/models/trackers/byte_tracker.py index 5e994fc8c..a35ba2123 100644 --- a/mmtrack/models/trackers/byte_tracker.py +++ b/mmtrack/models/trackers/byte_tracker.py @@ -1,16 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + import lap import numpy as np import torch -from mmcv.runner import force_fp32 -from mmdet.core import bbox_overlaps +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData -from mmtrack.core.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.structures.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah from .base_tracker import BaseTracker -@TRACKERS.register_module() +@MODELS.register_module() class ByteTracker(BaseTracker): """Tracker for ByteTrack. @@ -30,19 +33,16 @@ class ByteTracker(BaseTracker): tracklets. Defaults to 0.3. num_tentatives (int, optional): Number of continuous frames to confirm a track. Defaults to 3. - init_cfg (dict or list[dict], optional): Initialization config dict. - Defaults to None. 
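`crop_imgs` clamps each box to the (padded) image, guards against degenerate one-pixel boxes, and resizes every patch to the ReID `img_scale`; when nothing is cropped it now returns a correctly shaped empty tensor. A self-contained sketch of that flow; the bilinear `F.interpolate` resize is an assumption, since the resize call itself is not part of the hunk.

```python
# Sketch of ReID patch extraction: clamp, crop, resize to a fixed scale.
import torch
import torch.nn.functional as F


def crop_reid_patches(img, bboxes, img_scale=(256, 128)):
    """img: (1, C, H, W); bboxes: (N, 4) xyxy; returns (N, C, *img_scale)."""
    _, c, h, w = img.shape
    patches = []
    for bbox in bboxes:
        x1, y1, x2, y2 = map(int, bbox.tolist())
        x1 = min(max(x1, 0), w - 1)
        y1 = min(max(y1, 0), h - 1)
        x2 = min(max(x2, x1 + 1), w)     # keep at least one pixel
        y2 = min(max(y2, y1 + 1), h)
        patch = img[:, :, y1:y2, x1:x2]
        patches.append(
            F.interpolate(patch, size=img_scale, mode='bilinear',
                          align_corners=False))
    if patches:
        return torch.cat(patches, dim=0)
    return img.new_zeros((0, c, *img_scale))   # shaped empty fallback
```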
""" def __init__(self, - obj_score_thrs=dict(high=0.6, low=0.1), - init_track_thr=0.7, - weight_iou_with_det_scores=True, - match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), - num_tentatives=3, - init_cfg=None, + obj_score_thrs: dict = dict(high=0.6, low=0.1), + init_track_thr: float = 0.7, + weight_iou_with_det_scores: bool = True, + match_iou_thrs: dict = dict(high=0.1, low=0.5, tentative=0.3), + num_tentatives: int = 3, **kwargs): - super().__init__(init_cfg=init_cfg, **kwargs) + super().__init__(**kwargs) self.obj_score_thrs = obj_score_thrs self.init_track_thr = init_track_thr @@ -52,18 +52,18 @@ def __init__(self, self.num_tentatives = num_tentatives @property - def confirmed_ids(self): + def confirmed_ids(self) -> List: """Confirmed ids in the tracker.""" ids = [id for id, track in self.tracks.items() if not track.tentative] return ids @property - def unconfirmed_ids(self): + def unconfirmed_ids(self) -> List: """Unconfirmed ids in the tracker.""" ids = [id for id, track in self.tracks.items() if track.tentative] return ids - def init_track(self, id, obj): + def init_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: """Initialize a track.""" super().init_track(id, obj) if self.tracks[id].frame_ids[-1] == 0: @@ -76,7 +76,7 @@ def init_track(self, id, obj): self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( bbox) - def update_track(self, id, obj): + def update_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: """Update a track.""" super().update_track(id, obj) if self.tracks[id].tentative: @@ -92,7 +92,7 @@ def update_track(self, id, obj): self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( self.tracks[id].mean, self.tracks[id].covariance, bbox) - def pop_invalid_tracks(self, frame_id): + def pop_invalid_tracks(self, frame_id: int) -> None: """Pop out invalid tracks.""" invalid_ids = [] for k, v in self.tracks.items(): @@ -105,17 +105,22 @@ def pop_invalid_tracks(self, frame_id): for invalid_id in invalid_ids: self.tracks.pop(invalid_id) - def assign_ids(self, - ids, - det_bboxes, - det_labels, - weight_iou_with_det_scores=False, - match_iou_thr=0.5): + def assign_ids( + self, + ids: List[int], + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5 + ) -> Tuple[np.ndarray, np.ndarray]: """Assign ids. Args: ids (list[int]): Tracking ids. - det_bboxes (Tensor): of shape (N, 5) + det_bboxes (Tensor): of shape (N, 4) + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) weight_iou_with_det_scores (bool, optional): Whether using detection scores to weight IOU which is used for matching. Defaults to False. @@ -123,7 +128,7 @@ def assign_ids(self, Defaults to 0.5. Returns: - tuple(int): The assigning ids. + tuple(np.ndarray, np.ndarray): The assigning ids. 
""" # get track_bboxes track_bboxes = np.zeros((0, 4)) @@ -134,10 +139,9 @@ def assign_ids(self, track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes) # compute distance - ious = bbox_overlaps(track_bboxes, det_bboxes[:, :4]) + ious = bbox_overlaps(track_bboxes, det_bboxes) if weight_iou_with_det_scores: - ious *= det_bboxes[:, 4][None] - + ious *= det_scores # support multi-class association track_labels = torch.tensor([ self.tracks[id]['labels'][-1] for id in ids @@ -158,39 +162,40 @@ def assign_ids(self, col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 return row, col - @force_fp32(apply_to=('img', 'bboxes')) - def track(self, - img, - img_metas, - model, - bboxes, - labels, - frame_id, - rescale=False, - **kwargs): + def track(self, model: torch.nn.Module, img: torch.Tensor, + feats: List[torch.Tensor], data_sample: TrackDataSample, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. model (nn.Module): MOT model. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - frame_id (int): The id of current frame, 0-index. - rescale (bool, optional): If True, the bounding boxes should be - rescaled to fit the original scale of the image. Defaults to - False. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + ByteTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + Returns: - tuple: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() if not hasattr(self, 'kf'): self.kf = model.motion if self.empty or bboxes.size(0) == 0: - valid_inds = bboxes[:, -1] > self.init_track_thr + valid_inds = scores > self.init_track_thr + scores = scores[valid_inds] bboxes = bboxes[valid_inds] labels = labels[valid_inds] num_new_tracks = bboxes.size(0) @@ -206,16 +211,18 @@ def track(self, device=labels.device) # get the detection bboxes for the first association - first_det_inds = bboxes[:, -1] > self.obj_score_thrs['high'] + first_det_inds = scores > self.obj_score_thrs['high'] first_det_bboxes = bboxes[first_det_inds] first_det_labels = labels[first_det_inds] + first_det_scores = scores[first_det_inds] first_det_ids = ids[first_det_inds] # get the detection bboxes for the second association second_det_inds = (~first_det_inds) & ( - bboxes[:, -1] > self.obj_score_thrs['low']) + scores > self.obj_score_thrs['low']) second_det_bboxes = bboxes[second_det_inds] second_det_labels = labels[second_det_inds] + second_det_scores = scores[second_det_inds] second_det_ids = ids[second_det_inds] # 1. use Kalman Filter to predict current location @@ -230,7 +237,8 @@ def track(self, # 2. 
first match first_match_track_inds, first_match_det_inds = self.assign_ids( self.confirmed_ids, first_det_bboxes, first_det_labels, - self.weight_iou_with_det_scores, self.match_iou_thrs['high']) + first_det_scores, self.weight_iou_with_det_scores, + self.match_iou_thrs['high']) # '-1' mean a detection box is not matched with tracklets in # previous frame valid = first_match_det_inds > -1 @@ -239,11 +247,13 @@ def track(self, first_match_det_bboxes = first_det_bboxes[valid] first_match_det_labels = first_det_labels[valid] + first_match_det_scores = first_det_scores[valid] first_match_det_ids = first_det_ids[valid] assert (first_match_det_ids > -1).all() first_unmatch_det_bboxes = first_det_bboxes[~valid] first_unmatch_det_labels = first_det_labels[~valid] + first_unmatch_det_scores = first_det_scores[~valid] first_unmatch_det_ids = first_det_ids[~valid] assert (first_unmatch_det_ids == -1).all() @@ -252,7 +262,8 @@ def track(self, (tentative_match_track_inds, tentative_match_det_inds) = self.assign_ids( self.unconfirmed_ids, first_unmatch_det_bboxes, - first_unmatch_det_labels, self.weight_iou_with_det_scores, + first_unmatch_det_labels, first_unmatch_det_scores, + self.weight_iou_with_det_scores, self.match_iou_thrs['tentative']) valid = tentative_match_det_inds > -1 first_unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ @@ -270,7 +281,7 @@ def track(self, second_match_track_inds, second_match_det_inds = self.assign_ids( first_unmatch_track_ids, second_det_bboxes, second_det_labels, - False, self.match_iou_thrs['low']) + second_det_scores, False, self.match_iou_thrs['low']) valid = second_match_det_inds > -1 second_det_ids[valid] = torch.tensor(first_unmatch_track_ids)[ second_match_det_inds[valid]].to(ids) @@ -287,6 +298,10 @@ def track(self, (first_match_det_labels, first_unmatch_det_labels), dim=0) labels = torch.cat((labels, second_det_labels[valid]), dim=0) + scores = torch.cat( + (first_match_det_scores, first_unmatch_det_scores), dim=0) + scores = torch.cat((scores, second_det_scores[valid]), dim=0) + ids = torch.cat((first_match_det_ids, first_unmatch_det_ids), dim=0) ids = torch.cat((ids, second_det_ids[valid]), dim=0) @@ -298,5 +313,18 @@ def track(self, self.num_tracks + new_track_inds.sum()).to(labels) self.num_tracks += new_track_inds.sum() - self.update(ids=ids, bboxes=bboxes, labels=labels, frame_ids=frame_id) - return bboxes, labels, ids + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/trackers/masktrack_rcnn_tracker.py b/mmtrack/models/trackers/masktrack_rcnn_tracker.py index df355e3b8..29eecf7c6 100644 --- a/mmtrack/models/trackers/masktrack_rcnn_tracker.py +++ b/mmtrack/models/trackers/masktrack_rcnn_tracker.py @@ -1,13 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List + import torch -from mmcv.runner import force_fp32 -from mmdet.core import bbox_overlaps +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample from .base_tracker import BaseTracker -@TRACKERS.register_module() +@MODELS.register_module() class MaskTrackRCNNTracker(BaseTracker): """Tracker for MaskTrack R-CNN. @@ -21,30 +25,29 @@ class MaskTrackRCNNTracker(BaseTracker): score. - det_label (float): The coefficient of `label_deltas` when computing match score. - - init_cfg (dict or list[dict], optional): Initialization config dict. - Defaults to None. """ def __init__(self, - match_weights=dict(det_score=1.0, iou=2.0, det_label=10.0), - init_cfg=None, + match_weights: dict = dict( + det_score=1.0, iou=2.0, det_label=10.0), **kwargs): - super().__init__(init_cfg=init_cfg, **kwargs) + super().__init__(**kwargs) self.match_weights = match_weights - def get_match_score(self, bboxes, labels, prev_bboxes, prev_labels, - similarity_logits): + def get_match_score(self, bboxes: Tensor, labels: Tensor, scores: Tensor, + prev_bboxes: Tensor, prev_labels: Tensor, + similarity_logits: Tensor) -> Tensor: """Get the match score. Args: - bboxes (torch.Tensor): of shape (num_current_bboxes, 5) in - [tl_x, tl_y, br_x, br_y, score] format. Denoting the detection + bboxes (torch.Tensor): of shape (num_current_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection bboxes of current frame. labels (torch.Tensor): of shape (num_current_bboxes, ) - prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 5) in - [tl_x, tl_y, br_x, br_y, score] format. Denoting the - detection bboxes of previous frame. + scores (torch.Tensor): of shape (num_current_bboxes, ) + prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection bboxes + of previous frame. prev_labels (torch.Tensor): of shape (num_previous_bboxes, ) similarity_logits (torch.Tensor): of shape (num_current_bboxes, num_previous_bboxes + 1). 
Denoting the similarity logits from @@ -56,7 +59,7 @@ def get_match_score(self, bboxes, labels, prev_bboxes, prev_labels, """ similarity_scores = similarity_logits.softmax(dim=1) - ious = bbox_overlaps(bboxes[:, :4], prev_bboxes[:, :4]) + ious = bbox_overlaps(bboxes, prev_bboxes) iou_dummy = ious.new_zeros(ious.shape[0], 1) ious = torch.cat((iou_dummy, ious), dim=1) @@ -66,13 +69,13 @@ def get_match_score(self, bboxes, labels, prev_bboxes, prev_labels, match_score = similarity_scores.log() match_score += self.match_weights['det_score'] * \ - bboxes[:, 4].view(-1, 1).log() + scores.view(-1, 1).log() match_score += self.match_weights['iou'] * ious match_score += self.match_weights['det_label'] * label_deltas return match_score - def assign_ids(self, match_scores): + def assign_ids(self, match_scores: Tensor): num_prev_bboxes = match_scores.shape[1] - 1 _, match_ids = match_scores.max(dim=1) @@ -91,47 +94,54 @@ def assign_ids(self, match_scores): best_match_scores[match_id - 1] = match_score return ids, best_match_scores - @force_fp32(apply_to=('img', 'feats', 'bboxes')) def track(self, - img, - img_metas, - model, - feats, - bboxes, - labels, - masks, - frame_id, - rescale=False, - **kwargs): + model: torch.nn.Module, + img: torch.Tensor, + feats: List[torch.Tensor], + data_sample: TrackDataSample, + rescale=True, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img (Tensor): of shape (1, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. model (nn.Module): VIS model. - feats (tuple): Backbone features of the input image. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - masks (Tensor): of shape (N, H, W) - frame_id (int): The id of current frame, 0-index. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + MaskTrackRCNN method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. rescale (bool, optional): If True, the bounding boxes should be rescaled to fit the original scale of the image. Defaults to - False. + True. Returns: - tuple: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
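`get_match_score` combines the log-softmax track-similarity logits with weighted detection-score, IoU and label-consistency terms, where column 0 stands for the "new object" hypothesis. A sketch of that combination; the value of the dummy label column is an assumption (set to 1 here so a new track is not penalized on the label term), as it is not visible in the hunk.

```python
# MaskTrack R-CNN style match score: log similarity + weighted score/IoU/label
# terms, with a dummy column for the "start a new track" hypothesis.
import torch
from mmdet.structures.bbox import bbox_overlaps


def match_score(similarity_logits, det_scores, bboxes, prev_bboxes,
                labels, prev_labels,
                weights=dict(det_score=1.0, iou=2.0, det_label=10.0)):
    similarity = similarity_logits.softmax(dim=1)              # (N, M + 1)
    ious = bbox_overlaps(bboxes, prev_bboxes)                  # (N, M)
    ious = torch.cat((ious.new_zeros(ious.shape[0], 1), ious), dim=1)
    label_sim = (labels[:, None] == prev_labels[None, :]).float()
    label_deltas = torch.cat(
        (label_sim.new_ones(label_sim.shape[0], 1), label_sim), dim=1)

    score = similarity.log()
    score = score + weights['det_score'] * det_scores.view(-1, 1).log()
    score = score + weights['iou'] * ious
    score = score + weights['det_label'] * label_deltas
    return score                                               # (N, M + 1)
```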
""" + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + masks = data_sample.pred_det_instances.masks + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + if bboxes.shape[0] == 0: ids = torch.zeros_like(labels) - return bboxes, labels, masks, ids + pred_track_instances = data_sample.pred_det_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances rescaled_bboxes = bboxes.clone() if rescale: - rescaled_bboxes[:, :4] *= torch.tensor( - img_metas[0]['scale_factor']).to(bboxes.device) + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor roi_feats, _ = model.track_head.extract_roi_feats( feats, [rescaled_bboxes]) @@ -147,16 +157,18 @@ def track(self, prev_labels = self.get('labels') prev_roi_feats = self.get('roi_feats') - similarity_logits = model.track_head.simple_test( + similarity_logits = model.track_head.predict( roi_feats, prev_roi_feats) - match_scores = self.get_match_score(bboxes, labels, prev_bboxes, - prev_labels, similarity_logits) + match_scores = self.get_match_score(bboxes, labels, scores, + prev_bboxes, prev_labels, + similarity_logits) ids, best_match_scores = self.assign_ids(match_scores) valid_inds = ids > -1 ids = ids[valid_inds] bboxes = bboxes[valid_inds] labels = labels[valid_inds] + scores = scores[valid_inds] masks = masks[valid_inds] roi_feats = roi_feats[valid_inds] @@ -164,7 +176,15 @@ def track(self, ids=ids, bboxes=bboxes, labels=labels, + scores=scores, masks=masks, roi_feats=roi_feats, frame_ids=frame_id) - return bboxes, labels, masks, ids + # update pred_track_instances + pred_track_instances.bboxes = bboxes + pred_track_instances.masks = masks + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/trackers/my_sort_tracker.py b/mmtrack/models/trackers/my_sort_tracker.py new file mode 100644 index 000000000..033972279 --- /dev/null +++ b/mmtrack/models/trackers/my_sort_tracker.py @@ -0,0 +1,638 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +from motmetrics.lap import linear_sum_assignment +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.structures.bbox import bbox_xyxy_to_cxcyah +from mmtrack.utils import OptConfigType, imrenormalize +from .base_tracker import BaseTracker + +from mmengine.dataset import Compose +import numpy as np +from mmpose.datasets.transforms import LoadImage, GetBBoxCenterScale, PackPoseInputs + + +@MODELS.register_module() +class MySORTTracker(BaseTracker): + """Tracker for SORT/DeepSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.3. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to 10. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). + - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. 
+ - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 2.0. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. + """ + + def __init__(self, + obj_score_thr: float = 0.3, + reid: dict = dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr: float = 0.7, + num_tentatives: int = 3, + biou: bool = False, + pose: bool = False, + **kwargs): + super().__init__(**kwargs) + self.obj_score_thr = obj_score_thr + self.reid = reid + self.match_iou_thr = match_iou_thr + self.num_tentatives = num_tentatives + self.biou = biou + self.pose = pose + + self.pose_pipeline = Compose( + [LoadImage(), + GetBBoxCenterScale(padding=1.0), + PackPoseInputs()]) + + self.pose_embbedder = FullBodyPoseEmbedder() + + @property + def confirmed_ids(self) -> List: + """Confirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if not track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Initialize a track.""" + super().init_track(id, obj) + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + # case1: disappeared frames >= self.num_frames_retrain + case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain + # case2: tentative tracks but not matched in this frame + case2 = v.tentative and v['frame_ids'][-1] != frame_id + if case1 or case2: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def track(self, + model: torch.nn.Module, + img: Tensor, + feats: List[Tensor], + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. 
+ + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = model.motion + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + to_bgr=data_preprocessor['rgb_to_bgr']) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + print() + print('number of bboxes: ', bboxes.shape[0]) + print('reid_image:', reid_img.shape) + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + if self.pose: + pose_results, pose_embedded = self.get_pose_embedded( + bboxes.clone(), scores.clone(), metainfo, reid_img, + crops, model.pose) + embeds = torch.cat( + (embeds, pose_embedded.to(embeds.device)), dim=1) + else: + pose_embedded = crops.new_zeros((0, 46)) + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion_bboxes = self.get('bboxes', self.ids) + # print('motion_bboxes: ', len(motion_bboxes)) + # print(motion_bboxes[0]) + + # print('number of tracks: ', len(self.tracks)) + # print(self.tracks[0].keys()) + # print(self.tracks[0]['ids']) + # print(self.tracks[0]['bboxes']) + # print(self.tracks[0]['frame_ids']) + # print(self.tracks[0]['mean'].shape) + # print(self.tracks[0]['covariance'].shape) + + # motion + if model.with_motion: + # print('motion') + self.tracks, costs = model.motion.track( + self.tracks, bbox_xyxy_to_cxcyah(bboxes)) + + # motion_bboxes = self.get('bboxes', self.ids) + # print(motion_bboxes[0]) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + + embeds = model.reid(crops, mode='tensor') + + if self.pose: + pose_results, pose_embedded = self.get_pose_embedded( + bboxes.clone(), scores.clone(), metainfo, reid_img, + crops, model.pose) + + embeds = torch.cat( + (embeds, pose_embedded.to(embeds.device)), dim=1) + + print('reid_mtaching') + print('active_ids: ', len(active_ids)) + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + + reid_dists = torch.cdist(track_embeds, embeds) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + reid_dists = (reid_dists + cate_cost).cpu().numpy() + + valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(costs[valid_inds, :])] = 
np.nan + + row, col = linear_sum_assignment(reid_dists) + for r, c in zip(row, col): + dist = reid_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + + print('biou_mtaching') + print('active_ids: ', len(active_ids)) + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + + # print(active_ids) + # print(track_bboxes[0]) + + if self.biou: + ious = bbox_overlaps( + expanse_bboxes(track_bboxes), + expanse_bboxes(bboxes[active_dets])) + + else: + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + if self.with_reid and self.pose: + pred_track_instances.pose = pose_results + + return pred_track_instances + + def prepare_pose_data(self, img, bboxes, scores, crops): + print('prepare_pose_data') + pose_data = [] + + for bbox, score, crop in zip(bboxes, scores, crops): + data = self.pose_pipeline(dict(img=img, + bbox=bbox[None])) # shape (1, 4) + pds = data['data_samples'] + pds.gt_instances.bbox_scores = score.reshape(1) + pds.set_field( + (crop.shape[2], crop.shape[1]), # w, h + 'input_size', + field_type='metainfo') + pds.set_field( + (0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15), + 'flip_indices', + field_type='metainfo') + + pose_data.append(pds) + return pose_data + + def draw_img(self, bboxes, img, pose_results): + print('draw_img') + import cv2 + mean = np.array([[[123.675, 116.28, 103.53]]]) + std = np.array([[[58.395, 57.12, 57.375]]]) + img = img * std + mean + + cv2.imwrite('image.jpg', img[:, :, ::-1]) + img = cv2.imread('image.jpg') + + color = (255, 255, 0) + thickness = 2 + + for k in range(bboxes.shape[0]): + start_point = (int(bboxes[k][0]), int(bboxes[k][1])) + end_point = (int(bboxes[k][2]), int(bboxes[k][3])) + img = cv2.rectangle(img, start_point, end_point, color, thickness) + + landmarks = pose_results[k].pred_instances.keypoints.reshape(-1, 2) + for i in range(landmarks.shape[0]): + center_coordinates = (int(landmarks[i][0]), + int(landmarks[i][1])) + radius = 3 + color = (100, 255, 100) + thickness = 1 + img = cv2.circle(img, center_coordinates, radius, color, + thickness) + + cv2.imwrite('image1.jpg', img) + + def get_pose_embedded(self, bboxes, scores, metainfo, img, crops, + pose_estimator): + + bboxes = bboxes.detach().cpu().numpy() + scores = 
scores.detach().cpu().numpy() + img = img.squeeze().detach().moveaxis(0, -1).cpu().numpy() + + factor_x, factor_y = metainfo['scale_factor'] + bboxes_scale = bboxes[:, :4] * np.array( + [factor_x, factor_y, factor_x, factor_y]) + + pose_data = self.prepare_pose_data(img, bboxes_scale, scores, crops) + pose_results = pose_estimator.predict(crops, pose_data) + self.draw_img(bboxes_scale, img, pose_results) + + pose_embedded = self.pose_embbedder(pose_results, bboxes_scale) + + for k in range(len(pose_results)): + keypoints = pose_results[k].pred_instances.keypoints[0] + keypoints /= np.array([factor_x, factor_y]) + return pose_results, pose_embedded + + +def expanse_bboxes(bboxes, b: float = 0.3): + w = bboxes[:, 2] - bboxes[:, 0] + h = bboxes[:, 3] - bboxes[:, 1] + + w_center = (bboxes[:, 2] + bboxes[:, 0]) / 2 + h_center = (bboxes[:, 3] + bboxes[:, 1]) / 2 + + w_expanse = (2 * b + 1) * w + h_expanse = (2 * b + 1) * h + + return torch.stack(( + (w_center - w_expanse / 2), + (h_center - h_expanse / 2), + (w_center + w_expanse / 2), + (h_center + h_expanse / 2), + ), + dim=1) + + +class FullBodyPoseEmbedder(object): + """Converts 3D pose landmarks into 3D embedding.""" + + def __init__(self, torso_size_multiplier=2.5): + # Multiplier to apply to the torso to get minimal body size. + self._torso_size_multiplier = torso_size_multiplier + + # Names of the landmarks as they appear in the prediction. + self._landmark_names = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle', + ] + + def embbed(self, landmarks): + """ + Normalizes pose landmarks and converts to embedding + + Args: + landmarks - NumPy array with 3D landmarks of shape (N, 3). + + Result: + Numpy array with pose embedding of shape (M, 3) where `M` is the number of + pairwise distances defined in `_get_pose_distance_embedding`. + """ + assert landmarks.shape[0] == len( + self._landmark_names), 'Unexpected number of landmarks: {}'.format( + landmarks.shape[0]) + + # Get pose landmarks. + landmarks = np.copy(landmarks) + + # Normalize landmarks. + landmarks = self._normalize_pose_landmarks(landmarks) + + # Get embedding. + embedding = self._get_pose_distance_embedding(landmarks) + + embedding = (embedding + 1) / 2 + + return embedding.reshape(-1) + + def __call__(self, pose_results, bboxes): + print('pose embedded') + pose_embeddings = [] + for k in range(len(pose_results)): + w1, h1, w2, h2 = bboxes[k] + + landmarks = np.copy( + pose_results[k].pred_instances.keypoints.reshape(-1, 2)) + + for i in range(landmarks.shape[0]): + w, h = landmarks[i] + landmarks[i][0] = (w - w1) / (w2 - w1) + landmarks[i][1] = (h - h1) / (h2 - h1) + # print(landmarks) + pose_embeddings.append(self.embbed(landmarks)) + + pose_embeddings = torch.from_numpy(np.stack(pose_embeddings, axis=0)) + # print(pose_embeddings[0]) + return pose_embeddings + + def _normalize_pose_landmarks(self, landmarks): + """Normalizes landmarks translation and scale.""" + landmarks = np.copy(landmarks) + + # Normalize translation. + pose_center = self._get_pose_center(landmarks) + landmarks -= pose_center + + # Normalize scale. + pose_size = self._get_pose_size(landmarks, self._torso_size_multiplier) + landmarks /= pose_size + # Multiplication by 100 is not required, but makes it eaasier to debug. 
+ # landmarks *= 100 + + return landmarks + + def _get_pose_center(self, landmarks): + """Calculates pose center as point between hips.""" + left_hip = landmarks[self._landmark_names.index('left_hip')] + right_hip = landmarks[self._landmark_names.index('right_hip')] + center = (left_hip + right_hip) * 0.5 + return center + + def _get_pose_size(self, landmarks, torso_size_multiplier): + """Calculates pose size. + + It is the maximum of two values: + * Torso size multiplied by `torso_size_multiplier` + * Maximum distance from pose center to any pose landmark + """ + # This approach uses only 2D landmarks to compute pose size. + landmarks = landmarks[:, :2] + + # Hips center. + left_hip = landmarks[self._landmark_names.index('left_hip')] + right_hip = landmarks[self._landmark_names.index('right_hip')] + hips = (left_hip + right_hip) * 0.5 + + # Shoulders center. + left_shoulder = landmarks[self._landmark_names.index('left_shoulder')] + right_shoulder = landmarks[self._landmark_names.index( + 'right_shoulder')] + shoulders = (left_shoulder + right_shoulder) * 0.5 + + # Torso size as the minimum body size. + torso_size = np.linalg.norm(shoulders - hips) + + # Max dist to pose center. + pose_center = self._get_pose_center(landmarks) + max_dist = np.max(np.linalg.norm(landmarks - pose_center, axis=1)) + + return max(torso_size * torso_size_multiplier, max_dist) + + def _get_pose_distance_embedding(self, landmarks): + """Converts pose landmarks into 3D embedding. + + We use several pairwise 3D distances to form pose embedding. All distances + include X and Y components with sign. We differnt types of pairs to cover + different pose classes. Feel free to remove some or add new. + + Args: + landmarks - NumPy array with 3D landmarks of shape (N, 3). + + Result: + Numpy array with pose embedding of shape (M, 3) where `M` is the number of + pairwise distances. + """ + embedding = np.array([ + # One joint. + self._get_distance( + self._get_average_by_names(landmarks, 'left_hip', 'right_hip'), + self._get_average_by_names(landmarks, 'left_shoulder', + 'right_shoulder')), + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_elbow'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_elbow'), + self._get_distance_by_names(landmarks, 'left_elbow', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_elbow', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_knee'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_knee'), + self._get_distance_by_names(landmarks, 'left_knee', 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_knee', + 'right_ankle'), + + # Two joints. + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_ankle'), + + # Four joints. + self._get_distance_by_names(landmarks, 'left_hip', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_wrist'), + + # Five joints. + self._get_distance_by_names(landmarks, 'left_shoulder', + 'left_ankle'), + self._get_distance_by_names(landmarks, 'right_shoulder', + 'right_ankle'), + self._get_distance_by_names(landmarks, 'left_hip', 'left_wrist'), + self._get_distance_by_names(landmarks, 'right_hip', 'right_wrist'), + + # Cross body. 
+ self._get_distance_by_names(landmarks, 'left_elbow', + 'right_elbow'), + self._get_distance_by_names(landmarks, 'left_knee', 'right_knee'), + self._get_distance_by_names(landmarks, 'left_wrist', + 'right_wrist'), + self._get_distance_by_names(landmarks, 'left_ankle', + 'right_ankle'), + + # Body bent direction. + + # self._get_distance( + # self._get_average_by_names(landmarks, 'left_wrist', 'left_ankle'), + # landmarks[self._landmark_names.index('left_hip')]), + # self._get_distance( + # self._get_average_by_names(landmarks, 'right_wrist', 'right_ankle'), + # landmarks[self._landmark_names.index('right_hip')]), + ]) + + return embedding + + def _get_average_by_names(self, landmarks, name_from, name_to): + lmk_from = landmarks[self._landmark_names.index(name_from)] + lmk_to = landmarks[self._landmark_names.index(name_to)] + return (lmk_from + lmk_to) * 0.5 + + def _get_distance_by_names(self, landmarks, name_from, name_to): + lmk_from = landmarks[self._landmark_names.index(name_from)] + lmk_to = landmarks[self._landmark_names.index(name_to)] + return self._get_distance(lmk_from, lmk_to) + + def _get_distance(self, lmk_from, lmk_to): + return lmk_to - lmk_from diff --git a/mmtrack/models/trackers/ocsort_tracker.py b/mmtrack/models/trackers/ocsort_tracker.py new file mode 100644 index 000000000..08a4da73a --- /dev/null +++ b/mmtrack/models/trackers/ocsort_tracker.py @@ -0,0 +1,523 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import lap +import numpy as np +import torch +from torch import Tensor +from addict import Dict +from mmengine.structures import InstanceData +from mmdet.structures.bbox import bbox_overlaps + +from mmtrack.structures.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import OptConfigType +from .sort_tracker import SORTTracker + + +@MODELS.register_module() +class OCSORTTracker(SORTTracker): + """Tracker for OC-SORT. + + Args: + obj_score_thrs (float): Detection score threshold for matching objects. + Defaults to 0.3. + init_track_thr (float): Detection score threshold for initializing a + new tracklet. Defaults to 0.7. + weight_iou_with_det_scores (bool): Whether using detection scores to + weight IOU which is used for matching. Defaults to True. + match_iou_thr (float): IOU distance threshold for matching between two + frames. Defaults to 0.3. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. + vel_consist_weight (float): Weight of the velocity consistency term in + association (OCM term in the paper). + vel_delta_t (int): The difference of time step for calculating of the + velocity direction of tracklets. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
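# Illustrative sketch (assumed usage, not part of the file above): a tracker
# config built from the documented defaults. The surrounding model config
# (detector, motion model, etc.) is omitted; field names mirror __init__.
tracker = dict(
    type='OCSORTTracker',
    obj_score_thr=0.3,               # detection score needed to join association
    init_track_thr=0.7,              # detection score needed to start a tracklet
    weight_iou_with_det_scores=True, # scale IoU by detection confidence
    match_iou_thr=0.3,
    num_tentatives=3,                # frames needed to confirm a track
    vel_consist_weight=0.2,          # weight of the OCM velocity term
    vel_delta_t=3)                   # time gap for estimating velocity direction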
+ """ + + def __init__(self, + obj_score_thr: float = 0.3, + init_track_thr: float = 0.7, + weight_iou_with_det_scores=True, + match_iou_thr=0.3, + num_tentatives=3, + vel_consist_weight=0.2, + vel_delta_t=3, + **kwargs): + super().__init__(**kwargs) + self.obj_score_thr = obj_score_thr + self.init_track_thr = init_track_thr + + self.weight_iou_with_det_scores = weight_iou_with_det_scores + self.match_iou_thr = match_iou_thr + self.vel_consist_weight = vel_consist_weight + self.vel_delta_t = vel_delta_t + + self.num_tentatives = num_tentatives + + @property + def unconfirmed_ids(self): + """Unconfirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if track.tentative] + return ids + + def init_track(self, id, obj): + """Initialize a track.""" + super().init_track(id, obj) + if self.tracks[id].frame_ids[-1] == 0: + self.tracks[id].tentative = False + else: + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + # track.obs maintains the history associated detections to this track + self.tracks[id].obs = [] + bbox_id = self.memo_items.index('bboxes') + self.tracks[id].obs.append(obj[bbox_id]) + # a placefolder to save mean/covariance before losing tracking it + # parameters to save: mean, covariance, measurement + self.tracks[id].tracked = True + self.tracks[id].saved_attr = Dict() + self.tracks[id].velocity = torch.tensor( + (-1, -1)).to(obj[bbox_id].device) # placeholder + + def update_track(self, id, obj): + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + self.tracks[id].tracked = True + bbox_id = self.memo_items.index('bboxes') + self.tracks[id].obs.append(obj[bbox_id]) + + bbox1 = self.k_step_observation(self.tracks[id]) + bbox2 = obj[bbox_id] + self.tracks[id].velocity = self.vel_direction(bbox1, bbox2).to( + obj[bbox_id].device) + + def vel_direction(self, bbox1, bbox2): + """Estimate the direction vector between two boxes.""" + if bbox1.sum() < 0 or bbox2.sum() < 0: + return torch.tensor((-1, -1)) + cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 + cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 + speed = torch.tensor([cy2 - cy1, cx2 - cx1]) + norm = torch.sqrt((speed[0])**2 + (speed[1])**2) + 1e-6 + return speed / norm + + def vel_direction_batch(self, bboxes1, bboxes2): + """Estimate the direction vector given two batches of boxes.""" + cx1, cy1 = (bboxes1[:, 0] + bboxes1[:, 2]) / 2.0, (bboxes1[:, 1] + + bboxes1[:, 3]) / 2.0 + cx2, cy2 = (bboxes2[:, 0] + bboxes2[:, 2]) / 2.0, (bboxes2[:, 1] + + bboxes2[:, 3]) / 2.0 + speed_diff_y = cy2[None, :] - cy1[:, None] + speed_diff_x = cx2[None, :] - cx1[:, None] + speed = torch.cat((speed_diff_y[..., None], speed_diff_x[..., None]), + dim=-1) + norm = torch.sqrt((speed[:, :, 0])**2 + (speed[:, :, 1])**2) + 1e-6 + speed[:, :, 0] /= norm + speed[:, :, 1] /= norm + return speed + + def k_step_observation(self, track): + """return the observation k 
step away before.""" + obs_seqs = track.obs + num_obs = len(obs_seqs) + if num_obs == 0: + return torch.tensor((-1, -1, -1, -1)).to(track.obs[0].device) + elif num_obs > self.vel_delta_t: + if obs_seqs[num_obs - 1 - self.vel_delta_t] is not None: + return obs_seqs[num_obs - 1 - self.vel_delta_t] + else: + return self.last_obs(track) + else: + return self.last_obs(track) + + def ocm_assign_ids(self, + ids, + det_bboxes, + det_scores, + weight_iou_with_det_scores=False, + match_iou_thr=0.5): + """Apply Observation-Centric Momentum (OCM) to assign ids. + + OCM adds movement direction consistency into the association cost + matrix. This term requires no additional assumption but from the + same linear motion assumption as the canonical Kalman Filter in SORT. + + Args: + ids (list[int]): Tracking ids. + det_bboxes (Tensor): of shape (N, 4) + det_scores: (Tensor): of shape (N, ) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(int): The assigning ids. + + OC-SORT uses velocity consistency besides IoU for association + """ + # get track_bboxes + track_bboxes = np.zeros((0, 4)) + for id in ids: + track_bboxes = np.concatenate( + (track_bboxes, self.tracks[id].mean[:4][None]), axis=0) + track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes.device) + track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes) + + # compute distance + ious = bbox_overlaps(track_bboxes, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores[None] + dists = (1 - ious).cpu().numpy() + + if len(ids) > 0 and len(det_bboxes) > 0: + track_velocities = torch.stack( + [self.tracks[id].velocity for id in ids]).to(det_bboxes.device) + k_step_observations = torch.stack([ + self.k_step_observation(self.tracks[id]) for id in ids + ]).to(det_bboxes.device) + # valid1: if the track has previous observations to estimate speed + # valid2: if the associated observation k steps ago is a detection + valid1 = track_velocities.sum(dim=1) != -2 + valid2 = k_step_observations.sum(dim=1) != -4 + valid = valid1 & valid2 + + vel_to_match = self.vel_direction_batch(k_step_observations[:, :4], + det_bboxes) + track_velocities = track_velocities[:, None, :].repeat( + 1, det_bboxes.shape[0], 1) + + angle_cos = (vel_to_match * track_velocities).sum(dim=-1) + angle_cos = torch.clamp(angle_cos, min=-1, max=1) + angle = torch.acos(angle_cos) # [0, pi] + norm_angle = (angle - np.pi / 2.) / np.pi # [-0.5, 0.5] + valid_matrix = valid[:, None].int().repeat(1, det_bboxes.shape[0]) + # set non-valid entries 0 + valid_norm_angle = norm_angle * valid_matrix + + dists += valid_norm_angle.cpu().numpy() * self.vel_consist_weight + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(ids)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def last_obs(self, track): + """extract the last associated observation.""" + for bbox in track.obs[::-1]: + if bbox is not None: + return bbox + + def ocr_assign_ids(self, + track_obs, + det_bboxes, + det_scores, + weight_iou_with_det_scores=False, + match_iou_thr=0.5): + """Association for Observation-Centric Recovery. + + As try to recover tracks from being lost whose estimated velocity is + out- to-date, we use IoU-only matching strategy. 
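# Illustrative sketch (standalone, simplified from ocm/ocr_assign_ids): the
# lap.lapjv call with a cost cap. Pairs whose cost exceeds 1 - match_iou_thr,
# i.e. whose IoU falls below the threshold, are left unassigned (-1).
import lap
import numpy as np

ious = np.array([[0.8, 0.10],
                 [0.2, 0.05]])       # rows: tracks, cols: detections
match_iou_thr = 0.3
cost = 1.0 - ious
_, row2col, col2row = lap.lapjv(cost, extend_cost=True,
                                cost_limit=1 - match_iou_thr)
# row2col -> [0, -1]: track 0 takes detection 0, track 1 stays unmatched
# col2row -> [0, -1]: detection 1 is left over and may start a new track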
+ + Args: + track_obs (Tensor): the list of historical associated + detections of tracks + det_bboxes (Tensor): of shape (N, 4), unmatched detections + det_scores (Tensor): of shape (N, ) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(int): The assigning ids. + """ + # compute distance + ious = bbox_overlaps(track_obs, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores[None] + + dists = (1 - ious).cpu().numpy() + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(track_obs)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def online_smooth(self, track, obj): + """Once a track is recovered from being lost, online smooth its + parameters to fix the error accumulated during being lost. + + NOTE: you can use different virtual trajectory generation + strategies, we adopt the naive linear interpolation as default + """ + last_match_bbox = self.last_obs(track) + new_match_bbox = obj + unmatch_len = 0 + for bbox in track.obs[::-1]: + if bbox is None: + unmatch_len += 1 + else: + break + bbox_shift_per_step = (new_match_bbox - last_match_bbox) / ( + unmatch_len + 1) + track.mean = track.saved_attr.mean + track.covariance = track.saved_attr.covariance + for i in range(unmatch_len): + virtual_bbox = last_match_bbox + (i + 1) * bbox_shift_per_step + virtual_bbox = bbox_xyxy_to_cxcyah(virtual_bbox[None, :]) + virtual_bbox = virtual_bbox.squeeze(0).cpu().numpy() + track.mean, track.covariance = self.kf.update( + track.mean, track.covariance, virtual_bbox) + + def track(self, + model: torch.nn.Module, + img: Tensor, + feats: List[Tensor], + data_sample: TrackDataSample, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + NOTE: this implementation is slightly different from the original + OC-SORT implementation (https://github.com/noahcao/OC_SORT)that we + do association between detections and tentative/non-tentative tracks + independently while the original implementation combines them together. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
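# Illustrative sketch (standalone): the "virtual trajectory" in online_smooth
# above is plain linear interpolation between the last matched box and the
# re-matched box, one step per lost frame; each step is fed to kf.update().
import numpy as np

last_match = np.array([100., 100., 140., 180.])  # xyxy before the track was lost
new_match = np.array([130., 100., 170., 180.])   # xyxy at re-association
unmatch_len = 2                                  # frames spent lost

step = (new_match - last_match) / (unmatch_len + 1)
virtual_boxes = [last_match + (i + 1) * step for i in range(unmatch_len)]
# -> [110, 100, 150, 180] and [120, 100, 160, 180]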
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + + if not hasattr(self, 'kf'): + self.kf = model.motion + + if self.empty or bboxes.size(0) == 0: + valid_inds = scores > self.init_track_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + num_new_tracks = bboxes.size(0) + ids = torch.arange(self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + else: + # 0. init + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # get the detection bboxes for the first association + det_inds = scores > self.obj_score_thr + det_bboxes = bboxes[det_inds] + det_labels = labels[det_inds] + det_scores = scores[det_inds] + det_ids = ids[det_inds] + + # 1. predict by Kalman Filter + for id in self.confirmed_ids: + # track is lost in previous frame + if self.tracks[id].frame_ids[-1] != frame_id - 1: + self.tracks[id].mean[7] = 0 + if self.tracks[id].tracked: + self.tracks[id].saved_attr.mean = self.tracks[id].mean + self.tracks[id].saved_attr.covariance = self.tracks[ + id].covariance + (self.tracks[id].mean, + self.tracks[id].covariance) = self.kf.predict( + self.tracks[id].mean, self.tracks[id].covariance) + + # 2. match detections and tracks' predicted locations + match_track_inds, raw_match_det_inds = self.ocm_assign_ids( + self.confirmed_ids, + det_bboxes, + det_scores, + self.weight_iou_with_det_scores, + self.match_iou_thr) + # '-1' mean a detection box is not matched with tracklets in + # previous frame + valid = raw_match_det_inds > -1 + det_ids[valid] = torch.tensor( + self.confirmed_ids)[raw_match_det_inds[valid]].to(labels) + + match_det_bboxes = det_bboxes[valid] + match_det_labels = det_labels[valid] + match_det_scores = det_scores[valid] + match_det_ids = det_ids[valid] + assert (match_det_ids > -1).all() + + # unmatched tracks and detections + unmatch_det_bboxes = det_bboxes[~valid] + unmatch_det_labels = det_labels[~valid] + unmatch_det_scores = det_scores[~valid] + unmatch_det_ids = det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + # 3. 
use unmatched detection bboxes from the first match to match + # the unconfirmed tracks + (tentative_match_track_inds, + tentative_match_det_inds) = self.ocm_assign_ids( + self.unconfirmed_ids, + unmatch_det_bboxes, + unmatch_det_scores, + self.weight_iou_with_det_scores, + self.match_iou_thr) + valid = tentative_match_det_inds > -1 + unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ + tentative_match_det_inds[valid]].to(labels) + + match_det_bboxes = torch.cat( + (match_det_bboxes, unmatch_det_bboxes[valid]), dim=0) + match_det_labels = torch.cat( + (match_det_labels, unmatch_det_labels[valid]), dim=0) + match_det_scores = torch.cat( + (match_det_scores, unmatch_det_scores[valid]), dim=0) + match_det_ids = torch.cat((match_det_ids, unmatch_det_ids[valid]), + dim=0) + assert (match_det_ids > -1).all() + + unmatch_det_bboxes = unmatch_det_bboxes[~valid] + unmatch_det_labels = unmatch_det_labels[~valid] + unmatch_det_scores = unmatch_det_scores[~valid] + unmatch_det_ids = unmatch_det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + all_track_ids = [id for id, _ in self.tracks.items()] + unmatched_track_inds = torch.tensor( + [ind for ind in all_track_ids if ind not in match_det_ids]) + + if len(unmatched_track_inds) > 0: + # 4. still some tracks not associated yet, perform OCR + last_observations = [] + for id in unmatched_track_inds: + last_box = self.last_obs(self.tracks[id.item()]) + last_observations.append(last_box) + last_observations = torch.stack(last_observations) + + remain_det_ids = torch.full((unmatch_det_bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + _, ocr_match_det_inds = self.ocr_assign_ids( + last_observations, + unmatch_det_bboxes, + unmatch_det_scores, + self.weight_iou_with_det_scores, + self.match_iou_thr) + + valid = ocr_match_det_inds > -1 + remain_det_ids[valid] = unmatched_track_inds.clone()[ + ocr_match_det_inds[valid]].to(labels) + + ocr_match_det_bboxes = unmatch_det_bboxes[valid] + ocr_match_det_labels = unmatch_det_labels[valid] + ocr_match_det_scores = unmatch_det_scores[valid] + ocr_match_det_ids = remain_det_ids[valid] + assert (ocr_match_det_ids > -1).all() + + ocr_unmatch_det_bboxes = unmatch_det_bboxes[~valid] + ocr_unmatch_det_labels = unmatch_det_labels[~valid] + ocr_unmatch_det_scores = unmatch_det_scores[~valid] + ocr_unmatch_det_ids = remain_det_ids[~valid] + assert (ocr_unmatch_det_ids == -1).all() + + unmatch_det_bboxes = ocr_unmatch_det_bboxes + unmatch_det_labels = ocr_unmatch_det_labels + unmatch_det_scores = ocr_unmatch_det_scores + unmatch_det_ids = ocr_unmatch_det_ids + match_det_bboxes = torch.cat( + (match_det_bboxes, ocr_match_det_bboxes), dim=0) + match_det_labels = torch.cat( + (match_det_labels, ocr_match_det_labels), dim=0) + match_det_scores = torch.cat( + (match_det_scores, ocr_match_det_scores), dim=0) + match_det_ids = torch.cat((match_det_ids, ocr_match_det_ids), + dim=0) + + # 5. 
summarize the track results + for i in range(len(match_det_ids)): + det_bbox = match_det_bboxes[i] + track_id = match_det_ids[i].item() + if not self.tracks[track_id].tracked: + # the track is lost before this step + self.online_smooth(self.tracks[track_id], det_bbox) + + for track_id in all_track_ids: + if track_id not in match_det_ids: + self.tracks[track_id].tracked = False + self.tracks[track_id].obs.append(None) + + bboxes = torch.cat((match_det_bboxes, unmatch_det_bboxes), dim=0) + labels = torch.cat((match_det_labels, unmatch_det_labels), dim=0) + scores = torch.cat((match_det_scores, unmatch_det_scores), dim=0) + ids = torch.cat((match_det_ids, unmatch_det_ids), dim=0) + + # 6. assign new ids + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum()).to(labels) + self.num_tracks += new_track_inds.sum() + + self.update(ids=ids, bboxes=bboxes, labels=labels, frame_ids=frame_id) + # return bboxes, labels, ids + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/trackers/quasi_dense_tao_tracker.py b/mmtrack/models/trackers/quasi_dense_tao_tracker.py index ff01842d6..95cccaef2 100644 --- a/mmtrack/models/trackers/quasi_dense_tao_tracker.py +++ b/mmtrack/models/trackers/quasi_dense_tao_tracker.py @@ -1,13 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + import torch -from mmdet.core import bbox_overlaps +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.core import embed_similarity -from ..builder import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from ..task_modules.track import embed_similarity from .base_tracker import BaseTracker -@TRACKERS.register_module() +@MODELS.register_module() class QuasiDenseTAOTracker(BaseTracker): """Tracker for Quasi-Dense Tracking Method with TAO Dataset. @@ -37,17 +42,17 @@ class QuasiDenseTAOTracker(BaseTracker): """ def __init__(self, - init_score_thr=0.0001, - obj_score_thr=0.0001, - match_score_thr=0.5, - memo_frames=10, - memo_momentum=0.8, - momentum_obj_score=0.5, - obj_score_diff_thr=1.0, - distractor_nms_thr=0.3, - distractor_score_thr=0.5, - match_metric='bisoftmax', - match_with_cosine=True, + init_score_thr: float = 0.0001, + obj_score_thr: float = 0.0001, + match_score_thr: float = 0.5, + memo_frames: int = 10, + memo_momentum: float = 0.8, + momentum_obj_score: float = 0.5, + obj_score_diff_thr: float = 1.0, + distractor_nms_thr: float = 0.3, + distractor_score_thr: float = 0.5, + match_metric: str = 'bisoftmax', + match_with_cosine: bool = True, **kwargs): super().__init__(**kwargs) self.init_score_thr = init_score_thr @@ -72,7 +77,8 @@ def reset(self): self.num_tracks = 0 self.tracks = dict() - def update(self, ids, bboxes, labels, embeds, frame_id): + def update(self, ids: Tensor, bboxes: Tensor, labels: Tensor, + embeds: Tensor, scores: Tensor, frame_id: int) -> None: """Tracking forward function. Args: @@ -80,19 +86,22 @@ def update(self, ids, bboxes, labels, embeds, frame_id): bboxes (Tensor): of shape (N, 5). embeds (Tensor): of shape (N, 256). labels (Tensor): of shape (N, ). + scores (Tensor): of shape (N, ). 
frame_id (int): The id of current frame, 0-index. """ tracklet_inds = ids > -1 # update memo - for id, bbox, embed, label in zip(ids[tracklet_inds], - bboxes[tracklet_inds], - embeds[tracklet_inds], - labels[tracklet_inds]): + for id, bbox, embed, label, score in zip(ids[tracklet_inds], + bboxes[tracklet_inds], + embeds[tracklet_inds], + labels[tracklet_inds], + scores[tracklet_inds]): id = int(id) if id in self.tracks: self.tracks[id]['bboxes'].append(bbox) self.tracks[id]['labels'].append(label) + self.tracks[id]['scores'].append(score) self.tracks[id]['embeds'] = ( 1 - self.memo_momentum ) * self.tracks[id]['embeds'] + self.memo_momentum * embed @@ -101,6 +110,7 @@ def update(self, ids, bboxes, labels, embeds, frame_id): self.tracks[id] = dict( bboxes=[bbox], labels=[label], + scores=[score], embeds=embed, frame_ids=[frame_id]) @@ -113,77 +123,100 @@ def update(self, ids, bboxes, labels, embeds, frame_id): self.tracks.pop(invalid_id) @property - def memo(self): + def memo(self) -> Tuple[Tensor, ...]: """Get tracks memory.""" memo_ids = [] memo_bboxes = [] memo_labels = [] + memo_scores = [] memo_embeds = [] for k, v in self.tracks.items(): memo_ids.append(k) memo_bboxes.append(v['bboxes'][-1][None, :]) memo_labels.append(v['labels'][-1].view(1, 1)) + memo_scores.append(v['scores'][-1].view(1, 1)) memo_embeds.append(v['embeds'][None, :]) memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1) memo_bboxes = torch.cat(memo_bboxes, dim=0) memo_embeds = torch.cat(memo_embeds, dim=0) memo_labels = torch.cat(memo_labels, dim=0).squeeze(1) - return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze(0) + memo_scores = torch.cat(memo_scores, dim=0).squeeze(1) + return memo_bboxes, memo_labels, memo_scores, memo_embeds, memo_ids.\ + squeeze(0) def track(self, - img_metas, - feats, - model, - bboxes, - labels, - frame_id, - temperature=-1, - **kwargs): + model: torch.nn.Module, + img: torch.Tensor, + feats: List[torch.Tensor], + data_sample: TrackDataSample, + temperature: int = -1, + rescale=True, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - feats (tuple): Backbone features of the input image. - model (nn.Module): The forward model. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - frame_id (int): The id of current frame, 0-index. + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + QDTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. temperature (int): similarity temperature. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. Returns: - list: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
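# Illustrative sketch (standalone): the two score rules applied in the
# matching loop below. A match is rejected when the detection score and the
# track's remembered score differ by more than obj_score_diff_thr, and an
# accepted match blends the two scores with momentum_obj_score.
obj_score_diff_thr = 1.0
momentum_obj_score = 0.5

det_score, memo_score = 0.62, 0.55
if abs(det_score - memo_score) < obj_score_diff_thr:
    det_score = momentum_obj_score * det_score + \
        (1 - momentum_obj_score) * memo_score      # -> 0.585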
""" + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() # return zero bboxes if there is no track targets if bboxes.shape[0] == 0: ids = torch.zeros_like(labels) - return bboxes, labels, ids + pred_track_instances = data_sample.pred_det_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + # get track feats - track_bboxes = bboxes[:, :-1] * torch.tensor( - img_metas[0]['scale_factor']).to(bboxes.device) - track_feats = model.track_head.extract_bbox_feats( - feats, [track_bboxes]) + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + track_feats = model.track_head.predict(feats, [rescaled_bboxes]) + # all objects is valid here valid_inds = labels > -1 # inter-class nms low_inds = torch.nonzero( - bboxes[:, -1] < self.distractor_score_thr, - as_tuple=False).squeeze(1) + scores < self.distractor_score_thr, as_tuple=False).squeeze(1) cat_same = labels[low_inds].view(-1, 1) == labels.view(1, -1) - ious = bbox_overlaps(bboxes[low_inds, :-1], bboxes[:, :-1]) + ious = bbox_overlaps(bboxes[low_inds], bboxes) ious *= cat_same.to(ious.device) for i, ind in enumerate(low_inds): if (ious[i, :ind] > self.distractor_nms_thr).any(): valid_inds[ind] = False bboxes = bboxes[valid_inds] + scores = scores[valid_inds] labels = labels[valid_inds] embeds = track_feats[valid_inds] # match if buffer is not empty if bboxes.size(0) > 0 and not self.empty: - memo_bboxes, memo_labels, memo_embeds, memo_ids = self.memo + memo_bboxes, memo_labels, memo_scores, memo_embeds, memo_ids \ + = self.memo if self.match_metric == 'bisoftmax': sims = embed_similarity( @@ -198,47 +231,54 @@ def track(self, cos_scores = embed_similarity( embeds, memo_embeds, method='cosine') cos_scores *= cat_same.to(cos_scores.device) - scores = (d2t_scores + t2d_scores) / 2 + match_scores = (d2t_scores + t2d_scores) / 2 if self.match_with_cosine: - scores = (scores + cos_scores) / 2 + match_scores = (match_scores + cos_scores) / 2 elif self.match_metric == 'cosine': cos_scores = embed_similarity( embeds, memo_embeds, method='cosine') cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) - scores = cos_scores * cat_same.float().to(cos_scores.device) + match_scores = cos_scores * cat_same.float().to( + cos_scores.device) else: raise NotImplementedError() # keep the object score consistency for detection of the same track obj_score_diffs = torch.abs( - bboxes[:, -1].view(-1, 1).expand_as(scores) - - memo_bboxes[:, -1].view(1, -1).expand_as(scores)) + scores.view(-1, 1).expand_as(match_scores) - + memo_scores.view(1, -1).expand_as(match_scores)) num_objs = bboxes.size(0) ids = torch.full((num_objs, ), -1, dtype=torch.long) for i in range(num_objs): - if bboxes[i, -1] < self.obj_score_thr: + if scores[i] < self.obj_score_thr: continue - conf, memo_ind = torch.max(scores[i, :], dim=0) + conf, memo_ind = torch.max(match_scores[i, :], dim=0) obj_score_diff = obj_score_diffs[i, memo_ind] # update track and object score for matched detection if (conf > self.match_score_thr) and (obj_score_diff < self.obj_score_diff_thr): ids[i] = memo_ids[memo_ind] - scores[:i, memo_ind] = 0 - scores[i + 1:, memo_ind] = 0 + match_scores[:i, memo_ind] = 0 + 
match_scores[i + 1:, memo_ind] = 0 m = self.momentum_obj_score - bboxes[i, -1] = m * bboxes[i, -1] + ( - 1 - m) * memo_bboxes[memo_ind, -1] + scores[i] = m * scores[i] + (1 - m) * memo_scores[memo_ind] else: ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) # init tracklets - new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu() + new_inds = (ids == -1) & (scores > self.init_score_thr).cpu() num_news = new_inds.sum() ids[new_inds] = torch.arange( self.num_tracks, self.num_tracks + num_news, dtype=torch.long) self.num_tracks += num_news - self.update(ids, bboxes, labels, embeds, frame_id) + self.update(ids, bboxes, labels, embeds, scores, frame_id) + + tracklet_inds = ids > -1 + # update pred_track_instances + pred_track_instances.bboxes = bboxes[tracklet_inds] + pred_track_instances.labels = labels[tracklet_inds] + pred_track_instances.scores = scores[tracklet_inds] + pred_track_instances.instances_id = ids[tracklet_inds] - return bboxes, labels, ids + return pred_track_instances diff --git a/mmtrack/models/trackers/quasi_dense_tracker.py b/mmtrack/models/trackers/quasi_dense_tracker.py index b9d84977c..4baeb69ed 100644 --- a/mmtrack/models/trackers/quasi_dense_tracker.py +++ b/mmtrack/models/trackers/quasi_dense_tracker.py @@ -1,13 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + import torch import torch.nn.functional as F -from mmdet.core import bbox_overlaps +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +from torch import Tensor -from ..builder import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample from .base_tracker import BaseTracker -@TRACKERS.register_module() +@MODELS.register_module() class QuasiDenseTracker(BaseTracker): """Tracker for Quasi-Dense Tracking. @@ -35,17 +40,17 @@ class QuasiDenseTracker(BaseTracker): """ def __init__(self, - init_score_thr=0.8, - obj_score_thr=0.5, - match_score_thr=0.5, - memo_tracklet_frames=10, - memo_backdrop_frames=1, - memo_momentum=0.8, - nms_conf_thr=0.5, - nms_backdrop_iou_thr=0.3, - nms_class_iou_thr=0.7, - with_cats=True, - match_metric='bisoftmax', + init_score_thr: float = 0.8, + obj_score_thr: float = 0.5, + match_score_thr: float = 0.5, + memo_tracklet_frames: int = 10, + memo_backdrop_frames: int = 1, + memo_momentum: float = 0.8, + nms_conf_thr: float = 0.5, + nms_backdrop_iou_thr: float = 0.3, + nms_class_iou_thr: float = 0.7, + with_cats: bool = True, + match_metric: str = 'bisoftmax', **kwargs): super().__init__(**kwargs) assert 0 <= memo_momentum <= 1.0 @@ -74,7 +79,8 @@ def reset(self): self.tracks = dict() self.backdrops = [] - def update(self, ids, bboxes, embeds, labels, frame_id): + def update(self, ids: Tensor, bboxes: Tensor, embeds: Tensor, + labels: Tensor, scores: Tensor, frame_id: int) -> None: """Tracking forward function. Args: @@ -82,14 +88,16 @@ def update(self, ids, bboxes, embeds, labels, frame_id): bboxes (Tensor): of shape (N, 5). embeds (Tensor): of shape (N, 256). labels (Tensor): of shape (N, ). + scores (Tensor): of shape (N, ). frame_id (int): The id of current frame, 0-index. 
""" tracklet_inds = ids > -1 - for id, bbox, embed, label in zip(ids[tracklet_inds], - bboxes[tracklet_inds], - embeds[tracklet_inds], - labels[tracklet_inds]): + for id, bbox, embed, label, score in zip(ids[tracklet_inds], + bboxes[tracklet_inds], + embeds[tracklet_inds], + labels[tracklet_inds], + scores[tracklet_inds]): id = int(id) # update the tracked ones and initialize new tracks if id in self.tracks.keys(): @@ -101,6 +109,7 @@ def update(self, ids, bboxes, embeds, labels, frame_id): ) * self.tracks[id]['embed'] + self.memo_momentum * embed self.tracks[id]['last_frame'] = frame_id self.tracks[id]['label'] = label + self.tracks[id]['score'] = score self.tracks[id]['velocity'] = ( self.tracks[id]['velocity'] * self.tracks[id]['acc_frame'] + velocity) / ( @@ -111,12 +120,13 @@ def update(self, ids, bboxes, embeds, labels, frame_id): bbox=bbox, embed=embed, label=label, + score=score, last_frame=frame_id, velocity=torch.zeros_like(bbox), acc_frame=0) # backdrop update according to IoU backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1) - ious = bbox_overlaps(bboxes[backdrop_inds, :4], bboxes[:, :4]) + ious = bbox_overlaps(bboxes[backdrop_inds], bboxes) for i, ind in enumerate(backdrop_inds): if (ious[i, :ind] > self.nms_backdrop_iou_thr).any(): backdrop_inds[i] = -1 @@ -141,7 +151,7 @@ def update(self, ids, bboxes, embeds, labels, frame_id): self.backdrops.pop() @property - def memo(self): + def memo(self) -> Tuple[Tensor, ...]: """Get tracks memory.""" memo_embeds = [] memo_ids = [] @@ -176,47 +186,74 @@ def memo(self): return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze( 0), memo_vs - def track(self, img_metas, feats, model, bboxes, labels, frame_id): + def track(self, + model: torch.nn.Module, + img: torch.Tensor, + feats: List[torch.Tensor], + data_sample: TrackDataSample, + rescale=True, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - feats (tuple): Backbone features of the input image. - model (nn.Module): The forward model. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - frame_id (int): The id of current frame, 0-index. + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + QDTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. Returns: - list: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
""" + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + # return zero bboxes if there is no track targets if bboxes.shape[0] == 0: ids = torch.zeros_like(labels) - return bboxes, labels, ids + pred_track_instances = data_sample.pred_det_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + # get track feats - track_bboxes = bboxes[:, :-1] * torch.tensor( - img_metas[0]['scale_factor']).to(bboxes.device) - track_feats = model.track_head.extract_bbox_feats( - feats, [track_bboxes]) + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + track_feats = model.track_head.predict(feats, [rescaled_bboxes]) # sort according to the object_score - _, inds = bboxes[:, -1].sort(descending=True) - bboxes = bboxes[inds, :] + _, inds = scores.sort(descending=True) + bboxes = bboxes[inds] + scores = scores[inds] labels = labels[inds] embeds = track_feats[inds, :] # duplicate removal for potential backdrops and cross classes valids = bboxes.new_ones((bboxes.size(0))) - ious = bbox_overlaps(bboxes[:, :-1], bboxes[:, :-1]) + ious = bbox_overlaps(bboxes, bboxes) for i in range(1, bboxes.size(0)): - thr = self.nms_backdrop_iou_thr if bboxes[ - i, -1] < self.obj_score_thr else self.nms_class_iou_thr + thr = self.nms_backdrop_iou_thr if scores[ + i] < self.obj_score_thr else self.nms_class_iou_thr if (ious[i, :i] > thr).any(): valids[i] = 0 valids = valids == 1 - bboxes = bboxes[valids, :] + bboxes = bboxes[valids] + scores = scores[valids] labels = labels[valids] embeds = embeds[valids, :] @@ -232,12 +269,12 @@ def track(self, img_metas, feats, model, bboxes, labels, frame_id): feats = torch.mm(embeds, memo_embeds.t()) d2t_scores = feats.softmax(dim=1) t2d_scores = feats.softmax(dim=0) - scores = (d2t_scores + t2d_scores) / 2 + match_scores = (d2t_scores + t2d_scores) / 2 elif self.match_metric == 'softmax': feats = torch.mm(embeds, memo_embeds.t()) - scores = feats.softmax(dim=1) + match_scores = feats.softmax(dim=1) elif self.match_metric == 'cosine': - scores = torch.mm( + match_scores = torch.mm( F.normalize(embeds, p=2, dim=1), F.normalize(memo_embeds, p=2, dim=1).t()) else: @@ -245,29 +282,35 @@ def track(self, img_metas, feats, model, bboxes, labels, frame_id): # track with the same category if self.with_cats: cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) - scores *= cat_same.float().to(scores.device) - # track according to scores + match_scores *= cat_same.float().to(match_scores.device) + # track according to match_scores for i in range(bboxes.size(0)): - conf, memo_ind = torch.max(scores[i, :], dim=0) + conf, memo_ind = torch.max(match_scores[i, :], dim=0) id = memo_ids[memo_ind] if conf > self.match_score_thr: if id > -1: # keep bboxes with high object score # and remove background bboxes - if bboxes[i, -1] > self.obj_score_thr: + if scores[i] > self.obj_score_thr: ids[i] = id - scores[:i, memo_ind] = 0 - scores[i + 1:, memo_ind] = 0 + match_scores[:i, memo_ind] = 0 + match_scores[i + 1:, memo_ind] = 0 else: if conf > self.nms_conf_thr: ids[i] = -2 # initialize new tracks - new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu() + new_inds = (ids == -1) & (scores > 
self.init_score_thr).cpu() num_news = new_inds.sum() ids[new_inds] = torch.arange( self.num_tracks, self.num_tracks + num_news, dtype=torch.long) self.num_tracks += num_news - self.update(ids, bboxes, embeds, labels, frame_id) + self.update(ids, bboxes, embeds, labels, scores, frame_id) + tracklet_inds = ids > -1 + # update pred_track_instances + pred_track_instances.bboxes = bboxes[tracklet_inds] + pred_track_instances.labels = labels[tracklet_inds] + pred_track_instances.scores = scores[tracklet_inds] + pred_track_instances.instances_id = ids[tracklet_inds] - return bboxes, labels, ids + return pred_track_instances diff --git a/mmtrack/models/trackers/sort_tracker.py b/mmtrack/models/trackers/sort_tracker.py index f5efd3abf..ede4eebf2 100644 --- a/mmtrack/models/trackers/sort_tracker.py +++ b/mmtrack/models/trackers/sort_tracker.py @@ -1,25 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + import numpy as np import torch -from mmcv.runner import force_fp32 -from mmdet.core import bbox_overlaps +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData from motmetrics.lap import linear_sum_assignment +from torch import Tensor -from mmtrack.core import imrenormalize -from mmtrack.core.bbox import bbox_xyxy_to_cxcyah -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.structures.bbox import bbox_xyxy_to_cxcyah +from mmtrack.utils import OptConfigType, imrenormalize from .base_tracker import BaseTracker -@TRACKERS.register_module() -class SortTracker(BaseTracker): - """Tracker for DeepSORT. +@MODELS.register_module() +class SORTTracker(BaseTracker): + """Tracker for SORT/DeepSORT. Args: obj_score_thr (float, optional): Threshold to filter the objects. Defaults to 0.3. reid (dict, optional): Configuration for the ReID model. - - num_samples (int, optional): Number of samples to calculate the feature embeddings of a track. Default to 10. - image_scale (tuple, optional): Input scale of the ReID model. @@ -32,34 +35,31 @@ class SortTracker(BaseTracker): Defaults to 0.7. num_tentatives (int, optional): Number of continuous frames to confirm a track. Defaults to 3. - init_cfg (dict or list[dict], optional): Initialization config dict. - Defaults to None. 
""" def __init__(self, - obj_score_thr=0.3, - reid=dict( + obj_score_thr: float = 0.3, + reid: dict = dict( num_samples=10, img_scale=(256, 128), img_norm_cfg=None, match_score_thr=2.0), - match_iou_thr=0.7, - num_tentatives=3, - init_cfg=None, + match_iou_thr: float = 0.7, + num_tentatives: int = 3, **kwargs): - super().__init__(init_cfg=init_cfg, **kwargs) + super().__init__(**kwargs) self.obj_score_thr = obj_score_thr self.reid = reid self.match_iou_thr = match_iou_thr self.num_tentatives = num_tentatives @property - def confirmed_ids(self): + def confirmed_ids(self) -> List: """Confirmed ids in the tracker.""" ids = [id for id, track in self.tracks.items() if not track.tentative] return ids - def init_track(self, id, obj): + def init_track(self, id: int, obj: Tuple[Tensor]) -> None: """Initialize a track.""" super().init_track(id, obj) self.tracks[id].tentative = True @@ -69,7 +69,7 @@ def init_track(self, id, obj): self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( bbox) - def update_track(self, id, obj): + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: """Update a track.""" super().update_track(id, obj) if self.tracks[id].tentative: @@ -81,7 +81,7 @@ def update_track(self, id, obj): self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( self.tracks[id].mean, self.tracks[id].covariance, bbox) - def pop_invalid_tracks(self, frame_id): + def pop_invalid_tracks(self, frame_id: int) -> None: """Pop out invalid tracks.""" invalid_ids = [] for k, v in self.tracks.items(): @@ -94,62 +94,81 @@ def pop_invalid_tracks(self, frame_id): for invalid_id in invalid_ids: self.tracks.pop(invalid_id) - @force_fp32(apply_to=('img', )) def track(self, - img, - img_metas, - model, - bboxes, - labels, - frame_id, - rescale=False, - **kwargs): + model: torch.nn.Module, + img: Tensor, + feats: List[Tensor], + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. model (nn.Module): MOT model. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - frame_id (int): The id of current frame, 0-index. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. rescale (bool, optional): If True, the bounding boxes should be rescaled to fit the original scale of the image. Defaults to False. Returns: - tuple: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
""" + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() if not hasattr(self, 'kf'): self.kf = model.motion if self.with_reid: if self.reid.get('img_norm_cfg', False): - reid_img = imrenormalize(img, img_metas[0]['img_norm_cfg'], + img_norm_cfg = dict( + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + to_bgr=data_preprocessor['rgb_to_bgr']) + reid_img = imrenormalize(img, img_norm_cfg, self.reid['img_norm_cfg']) else: reid_img = img.clone() - valid_inds = bboxes[:, -1] > self.obj_score_thr + valid_inds = scores > self.obj_score_thr bboxes = bboxes[valid_inds] labels = labels[valid_inds] + scores = scores[valid_inds] if self.empty or bboxes.size(0) == 0: num_new_tracks = bboxes.size(0) ids = torch.arange( self.num_tracks, self.num_tracks + num_new_tracks, - dtype=torch.long) + dtype=torch.long).to(bboxes.device) self.num_tracks += num_new_tracks if self.with_reid: - embeds = model.reid.simple_test( - self.crop_imgs(reid_img, img_metas, bboxes[:, :4].clone(), - rescale)) + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) else: - ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) # motion if model.with_motion: @@ -158,9 +177,10 @@ def track(self, active_ids = self.confirmed_ids if self.with_reid: - embeds = model.reid.simple_test( - self.crop_imgs(reid_img, img_metas, bboxes[:, :4].clone(), - rescale)) + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + # reid if len(active_ids) > 0: track_embeds = self.get( @@ -168,8 +188,15 @@ def track(self, active_ids, self.reid.get('num_samples', None), behavior='mean') - reid_dists = torch.cdist(track_embeds, - embeds).cpu().numpy() + reid_dists = torch.cdist(track_embeds, embeds) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + reid_dists = (reid_dists + cate_cost).cpu().numpy() valid_inds = [list(self.ids).index(_) for _ in active_ids] reid_dists[~np.isfinite(costs[valid_inds, :])] = np.nan @@ -189,9 +216,17 @@ def track(self, if len(active_ids) > 0: active_dets = torch.nonzero(ids == -1).squeeze(1) track_bboxes = self.get('bboxes', active_ids) - ious = bbox_overlaps( - track_bboxes, bboxes[active_dets][:, :-1]).cpu().numpy() - dists = 1 - ious + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + row, col = linear_sum_assignment(dists) for r, c in zip(row, col): dist = dists[r, c] @@ -202,14 +237,22 @@ def track(self, ids[new_track_inds] = torch.arange( self.num_tracks, self.num_tracks + new_track_inds.sum(), - dtype=torch.long) + dtype=torch.long).to(bboxes.device) self.num_tracks += new_track_inds.sum() self.update( 
ids=ids, - bboxes=bboxes[:, :4], - scores=bboxes[:, -1], + bboxes=bboxes, + scores=scores, labels=labels, embeds=embeds if self.with_reid else None, frame_ids=frame_id) - return bboxes, labels, ids + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/trackers/strongsort_tracker.py b/mmtrack/models/trackers/strongsort_tracker.py new file mode 100644 index 000000000..da9bc72ae --- /dev/null +++ b/mmtrack/models/trackers/strongsort_tracker.py @@ -0,0 +1,264 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +from motmetrics.lap import linear_sum_assignment +from torch import Tensor + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.structures.bbox import bbox_xyxy_to_cxcyah +from mmtrack.utils import OptConfigType, imrenormalize +from .sort_tracker import SORTTracker + + +def cosine_distance(x: Tensor, y: Tensor) -> np.ndarray: + """compute the cosine distance. + + Args: + x (Tensor): embeddings with shape (N,C). + y (Tensor): embeddings with shape (M,C). + + Returns: + ndarray: cosine distance with shape (N,M). + """ + x = x.cpu().numpy() + y = y.cpu().numpy() + x = x / np.linalg.norm(x, axis=1, keepdims=True) + y = y / np.linalg.norm(y, axis=1, keepdims=True) + dists = 1. - np.dot(x, y.T) + return dists + + +@MODELS.register_module() +class StrongSORTTracker(SORTTracker): + """Tracker for StrongSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.6. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to None. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). + - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. + - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 0.3. + - motion_weight (float, optional): the weight of the motion cost. + Defaults to 0.02. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 2. 
+ """ + + def __init__(self, + obj_score_thr: float = 0.6, + reid: dict = dict( + num_samples=None, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=0.3, + motion_weight=0.02), + match_iou_thr: float = 0.7, + num_tentatives: int = 2, + **kwargs): + super().__init__(obj_score_thr, reid, match_iou_thr, num_tentatives, + **kwargs) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + m = self.momentums[k] + self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v + else: + self.tracks[id][k].append(v) + + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + score = float(self.tracks[id].scores[-1].cpu()) + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox, score) + + def track(self, + model: torch.nn.Module, + img: Tensor, + feats: List[Tensor], + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = model.kalman + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor.get('mean', [0, 0, 0]), + std=data_preprocessor.get('std', [1, 1, 1]), + to_bgr=data_preprocessor.get('rgb_to_bgr', False)) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion + if model.with_cmc: + num_samples = 1 + self.tracks = model.cmc.track(self.last_img, img, self.tracks, + num_samples, frame_id) + + self.tracks, motion_dists = model.kalman.track( + self.tracks, bbox_xyxy_to_cxcyah(bboxes)) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + reid_dists = cosine_distance(track_embeds, embeds) + valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(motion_dists[ + valid_inds, :])] = np.nan + + weight_motion = self.reid.get('motion_weight') + match_dists = (1 - weight_motion) * reid_dists + \ + weight_motion * motion_dists[valid_inds] + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = ((1 - cate_match.int()) * 1e6).cpu().numpy() + match_dists = match_dists + cate_cost + + row, col = linear_sum_assignment(match_dists) + for r, c in zip(row, col): + dist = match_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + 
ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + self.last_img = img + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/trackers/tracktor_tracker.py b/mmtrack/models/trackers/tracktor_tracker.py index 4ce79aea5..989802733 100644 --- a/mmtrack/models/trackers/tracktor_tracker.py +++ b/mmtrack/models/trackers/tracktor_tracker.py @@ -1,21 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import List + import torch -from mmcv.runner import force_fp32 -from mmdet.core import bbox_overlaps, multiclass_nms +from mmdet.models.layers import multiclass_nms +from mmdet.structures.bbox import bbox_overlaps +from mmengine.structures import InstanceData +# TODO: unify the linear_assignment package for different trackers from scipy.optimize import linear_sum_assignment +from torch import Tensor, nn -from mmtrack.core import imrenormalize -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import OptConfigType, imrenormalize from .base_tracker import BaseTracker -@TRACKERS.register_module() +@MODELS.register_module() class TracktorTracker(BaseTracker): """Tracker for Tracktor. Args: obj_score_thr (float, optional): Threshold to filter the objects. - Defaults to 0.3. + Defaults to 0.5. reid (dict, optional): Configuration for the ReID model. 
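A minimal sketch (not from the repository) of the association step that closes the track() method above: ReID cosine distances and Kalman motion distances are fused with a configurable weight, cross-class pairs get a large additive penalty, and scipy's linear_sum_assignment picks the matches. Array names and thresholds here are illustrative.

import numpy as np
from scipy.optimize import linear_sum_assignment


def associate(reid_dists, motion_dists, track_labels, det_labels,
              motion_weight=0.02, match_score_thr=2.0):
    # reid_dists / motion_dists: (num_tracks, num_dets) numpy arrays;
    # track_labels / det_labels: LongTensors of class indices.
    dists = (1 - motion_weight) * reid_dists + motion_weight * motion_dists
    # forbid cross-class matches with a large additive penalty
    cate_match = det_labels[None, :] == track_labels[:, None]
    dists = dists + ((1 - cate_match.int()) * 1e6).cpu().numpy()

    matches = {}  # detection index -> track index
    for r, c in zip(*linear_sum_assignment(dists)):
        if np.isfinite(dists[r, c]) and dists[r, c] <= match_score_thr:
            matches[c] = r
    return matches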
- obj_score_thr (float, optional): Threshold to filter the @@ -41,99 +47,133 @@ class TracktorTracker(BaseTracker): """ def __init__(self, - obj_score_thr=0.5, - regression=dict( + obj_score_thr: float = 0.5, + regression: dict = dict( obj_score_thr=0.5, nms=dict(type='nms', iou_threshold=0.6), match_iou_thr=0.3), - reid=dict( + reid: dict = dict( num_samples=10, img_scale=(256, 128), img_norm_cfg=None, match_score_thr=2.0, match_iou_thr=0.2), - init_cfg=None, **kwargs): - super().__init__(init_cfg=init_cfg, **kwargs) + super().__init__(**kwargs) self.obj_score_thr = obj_score_thr self.regression = regression self.reid = reid - def regress_tracks(self, x, img_metas, detector, frame_id, rescale=False): + def regress_tracks(self, + x: List[Tensor], + metainfo: dict, + detector: nn.Module, + frame_id: int, + rescale: bool = False): """Regress the tracks to current frame.""" memo = self.memo bboxes = memo.bboxes[memo.frame_ids == frame_id - 1] ids = memo.ids[memo.frame_ids == frame_id - 1] + if rescale: - bboxes *= torch.tensor(img_metas[0]['scale_factor']).to( - bboxes.device) - track_bboxes, track_scores = detector.roi_head.simple_test_bboxes( - x, img_metas, [bboxes], None, rescale=rescale) - track_bboxes, track_labels, valid_inds = multiclass_nms( - track_bboxes[0], - track_scores[0], + factor_x, factor_y = metainfo['scale_factor'] + bboxes *= torch.tensor([factor_x, factor_y, factor_x, + factor_y]).to(bboxes.device) + + if bboxes.size(0) == 0: + return bboxes.new_zeros((0, 4)), bboxes.new_zeros(0), \ + ids.new_zeros(0), ids.new_zeros(0), + + proposals = InstanceData(**dict(bboxes=bboxes)) + det_results = detector.roi_head.predict_bbox(x, [metainfo], + [proposals], None, + rescale) + track_bboxes = det_results[0].bboxes + track_scores = det_results[0].scores + _track_bboxes, track_labels, valid_inds = multiclass_nms( + track_bboxes, + track_scores, 0, self.regression['nms'], return_inds=True) ids = ids[valid_inds] - valid_inds = track_bboxes[:, -1] > self.regression['obj_score_thr'] - return track_bboxes[valid_inds], track_labels[valid_inds], ids[ - valid_inds] + track_bboxes = _track_bboxes[:, :-1].clone() + track_scores = _track_bboxes[:, -1].clone() + valid_inds = track_scores > self.regression['obj_score_thr'] + return track_bboxes[valid_inds], track_scores[ + valid_inds], track_labels[valid_inds], ids[valid_inds] - @force_fp32(apply_to=('img', 'feats')) def track(self, - img, - img_metas, - model, - feats, - bboxes, - labels, - frame_id, - rescale=False, - **kwargs): + model: nn.Module, + img: Tensor, + feats: List[Tensor], + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: """Tracking forward function. Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. model (nn.Module): MOT model. - feats (tuple): Backbone features of the input image. - bboxes (Tensor): of shape (N, 5). - labels (Tensor): of shape (N, ). - frame_id (int): The id of current frame, 0-index. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + ByteTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. 
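As a hedged sketch of the regress_tracks prologue above (hypothetical helper, not from the repository): only tracks updated in the previous frame are propagated, and their boxes are mapped back to the network input scale when rescale is True.

import torch


def previous_frame_proposals(memo, frame_id, scale_factor=None):
    # keep tracks last seen in the previous frame (memo mirrors the
    # BaseTracker.memo namespace: bboxes, ids, frame_ids)
    keep = memo.frame_ids == frame_id - 1
    bboxes, ids = memo.bboxes[keep], memo.ids[keep]
    if scale_factor is not None:  # map back to the resized input scale
        factor_x, factor_y = scale_factor
        bboxes = bboxes * bboxes.new_tensor(
            [factor_x, factor_y, factor_x, factor_y])
    return bboxes, ids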
+ data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. rescale (bool, optional): If True, the bounding boxes should be rescaled to fit the original scale of the image. Defaults to False. Returns: - tuple: Tracking results. + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_det_instances.bboxes + labels = data_sample.pred_det_instances.labels + scores = data_sample.pred_det_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if self.with_reid: if self.reid.get('img_norm_cfg', False): - reid_img = imrenormalize(img, img_metas[0]['img_norm_cfg'], + img_norm_cfg = dict( + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + to_bgr=data_preprocessor['rgb_to_bgr']) + reid_img = imrenormalize(img, img_norm_cfg, self.reid['img_norm_cfg']) else: reid_img = img.clone() - valid_inds = bboxes[:, -1] > self.obj_score_thr + valid_inds = scores > self.obj_score_thr bboxes = bboxes[valid_inds] labels = labels[valid_inds] + scores = scores[valid_inds] if self.empty: num_new_tracks = bboxes.size(0) ids = torch.arange( self.num_tracks, self.num_tracks + num_new_tracks, - dtype=torch.long) + dtype=torch.long).to(bboxes.device) self.num_tracks += num_new_tracks if self.with_reid: - embeds = model.reid.simple_test( - self.crop_imgs(reid_img, img_metas, bboxes[:, :4].clone(), - rescale)) + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) else: # motion if model.with_cmc: @@ -148,26 +188,35 @@ def track(self, self.tracks = model.linear_motion.track(self.tracks, frame_id) # propagate tracks - prop_bboxes, prop_labels, prop_ids = self.regress_tracks( - feats, img_metas, model.detector, frame_id, rescale) + prop_bboxes, prop_scores, prop_labels, prop_ids = \ + self.regress_tracks(feats, metainfo, + model.detector, frame_id, rescale) # filter bboxes with propagated tracks ious = bbox_overlaps(bboxes[:, :4], prop_bboxes[:, :4]) valid_inds = (ious < self.regression['match_iou_thr']).all(dim=1) bboxes = bboxes[valid_inds] labels = labels[valid_inds] - ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) + scores = scores[valid_inds] + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) if self.with_reid: - prop_embeds = model.reid.simple_test( - self.crop_imgs(reid_img, img_metas, - prop_bboxes[:, :4].clone(), rescale)) + prop_crops = self.crop_imgs(reid_img, metainfo, + prop_bboxes.clone(), rescale) + if prop_crops.size(0) > 0: + prop_embeds = model.reid(prop_crops, mode='tensor') + else: + prop_embeds = prop_crops.new_zeros( + (0, model.reid.head.out_channels)) if bboxes.size(0) > 0: - embeds = model.reid.simple_test( - self.crop_imgs(reid_img, img_metas, - bboxes[:, :4].clone(), rescale)) + embeds = model.reid( + self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale), + mode='tensor') else: - embeds = prop_embeds.new_zeros((0, prop_embeds.size(1))) + embeds = prop_embeds.\ + new_zeros((0, model.reid.head.out_channels)) # reid active_ids = [int(_) 
for _ in self.ids if _ not in prop_ids] if len(active_ids) > 0 and bboxes.size(0) > 0: @@ -180,8 +229,7 @@ def track(self, embeds).cpu().numpy() track_bboxes = self.get('bboxes', active_ids) - ious = bbox_overlaps(track_bboxes, - bboxes[:, :4]).cpu().numpy() + ious = bbox_overlaps(track_bboxes, bboxes).cpu().numpy() iou_masks = ious < self.reid['match_iou_thr'] reid_dists[iou_masks] = 1e6 @@ -195,15 +243,11 @@ def track(self, ids[new_track_inds] = torch.arange( self.num_tracks, self.num_tracks + new_track_inds.sum(), - dtype=torch.long) + dtype=torch.long).to(bboxes.device) self.num_tracks += new_track_inds.sum() - if bboxes.shape[1] == 4: - bboxes = bboxes.new_zeros((0, 5)) - if prop_bboxes.shape[1] == 4: - prop_bboxes = prop_bboxes.new_zeros((0, 5)) - bboxes = torch.cat((prop_bboxes, bboxes), dim=0) + scores = torch.cat((prop_scores, scores), dim=0) labels = torch.cat((prop_labels, labels), dim=0) ids = torch.cat((prop_ids, ids), dim=0) if self.with_reid: @@ -211,10 +255,18 @@ def track(self, self.update( ids=ids, - bboxes=bboxes[:, :4], - scores=bboxes[:, -1], + bboxes=bboxes, + scores=scores, labels=labels, embeds=embeds if self.with_reid else None, frame_ids=frame_id) self.last_img = img - return bboxes, labels, ids + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmtrack/models/vid/base.py b/mmtrack/models/vid/base.py index 71a63ccbc..86d426728 100644 --- a/mmtrack/models/vid/base.py +++ b/mmtrack/models/vid/base.py @@ -1,29 +1,30 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from collections import OrderedDict +from typing import Dict, List, Tuple, Union -import mmcv -import numpy as np -import torch -import torch.distributed as dist -from mmcv.runner import BaseModule, auto_fp16 +from mmengine.model import BaseModel +from torch import Tensor -from mmtrack.utils import get_root_logger +from mmtrack.utils import (ForwardResults, OptConfigType, OptMultiConfig, + OptSampleList, SampleList) -class BaseVideoDetector(BaseModule, metaclass=ABCMeta): +class BaseVideoDetector(BaseModel, metaclass=ABCMeta): """Base class for video object detector. Args: - init_cfg (dict or list[dict], optional): Initialization config dict. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Initialization config dict. 
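In the Tracktor branch above, new detections already covered by a regression-propagated track are discarded via an IoU test. A minimal sketch of that filter; bbox_overlaps is the mmdet helper imported in this file, everything else is illustrative.

import torch
from mmdet.structures.bbox import bbox_overlaps


def keep_uncovered_dets(det_bboxes, prop_bboxes, match_iou_thr=0.3):
    # bool mask over detections whose IoU with every propagated track
    # stays below the threshold
    if prop_bboxes.numel() == 0:
        return det_bboxes.new_ones(det_bboxes.size(0), dtype=torch.bool)
    ious = bbox_overlaps(det_bboxes, prop_bboxes)  # (num_dets, num_props)
    return (ious < match_iou_thr).all(dim=1)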
""" - def __init__(self, init_cfg): - super(BaseVideoDetector, self).__init__(init_cfg) - self.logger = get_root_logger() - self.fp16_enabled = False + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(data_preprocessor, init_cfg) - def freeze_module(self, module): + def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None: """Freeze module during training.""" if isinstance(module, str): modules = [module] @@ -39,336 +40,95 @@ def freeze_module(self, module): param.requires_grad = False @property - def with_detector(self): + def with_detector(self) -> bool: """bool: whether the framework has a detector""" return hasattr(self, 'detector') and self.detector is not None @property - def with_motion(self): + def with_motion(self) -> bool: """bool: whether the framework has a motion model""" return hasattr(self, 'motion') and self.motion is not None @property - def with_aggregator(self): + def with_aggregator(self) -> bool: """bool: whether the framework has a aggregator""" return hasattr(self, 'aggregator') and self.aggregator is not None - @abstractmethod - def forward_train(self, - imgs, - img_metas, - ref_img=None, - ref_img_metas=None, - **kwargs): - """ - Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_img (Tensor): of shape (N, R, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - R denotes there is #R reference images for each input image. - - ref_img_metas (list[list[dict]]): The first list only has one - element. The second list contains reference image information - dict where each dict has: 'img_shape', 'scale_factor', 'flip', - and may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - """ - pass - - @abstractmethod - def simple_test(self, img, img_metas, **kwargs): - pass - - def aug_test(self, imgs, img_metas, **kwargs): - """Test function with test time augmentation.""" - pass - - def forward_test(self, - imgs, - img_metas, - ref_img=None, - ref_img_metas=None, - **kwargs): - """ - Args: - imgs (List[Tensor]): the outer list indicates test-time - augmentations and inner Tensor should have a shape NxCxHxW, - which contains all images in the batch. - - img_metas (List[List[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch. - - ref_img (list[Tensor] | None): The list only contains one Tensor - of shape (1, N, C, H, W) encoding input reference images. - Typically these should be mean centered and std scaled. N - denotes the number for reference images. There may be no - reference images in some cases. - - ref_img_metas (list[list[list[dict]]] | None): The first and - second list only has one element. The third list contains - image information dict where each dict has: 'img_shape', - 'scale_factor', 'flip', and may also contain 'filename', - 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
For details on - the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. There - may be no reference images in some cases. - """ - if isinstance(imgs, torch.Tensor): - imgs = [imgs] - elif not isinstance(imgs, list): - raise TypeError( - f'imgs must be a list or tensor, but got {type(imgs)}') - - assert isinstance(img_metas, list) - if isinstance(img_metas[0], dict): - img_metas = [img_metas] - elif not isinstance(img_metas[0], list): - raise TypeError( - 'img_metas must be a List[List[dict]] or List[dict]') - - num_augs = len(imgs) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(imgs)}) ' - f'!= num of image meta ({len(img_metas)})') - - if num_augs == 1: - # proposals (List[List[Tensor]]): the outer list indicates - # test-time augs (multiscale, flip, etc.) and the inner list - # indicates images in a batch. - # The Tensor should have a shape Px4, where P is the number of - # proposals. - if 'proposals' in kwargs: - kwargs['proposals'] = kwargs['proposals'][0] - return self.simple_test( - imgs[0], - img_metas[0], - ref_img=ref_img, - ref_img_metas=ref_img_metas, - **kwargs) - else: - assert imgs[0].size(0) == 1, 'aug test does not support ' \ - 'inference with batch size ' \ - f'{imgs[0].size(0)}' - # TODO: support test augmentation for predefined proposals - assert 'proposals' not in kwargs - return self.aug_test( - imgs, - img_metas, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - **kwargs) - - @auto_fp16(apply_to=('img', 'ref_img')) def forward(self, - img, - img_metas, - ref_img=None, - ref_img_metas=None, - return_loss=True, - **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + mode: str = 'predict', + **kwargs) -> ForwardResults: + """The unified entry for a forward process in both training and test. - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``resturn_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train( - img, - img_metas, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - **kwargs) - else: - return self.forward_test( - img, - img_metas, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - **kwargs) - - def _parse_losses(self, losses): - """Parse the raw outputs (losses) of the network. - - Args: - losses (dict): Raw output of the network, which usually contain - losses and other necessary information. - - Returns: - tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ - which may be a weighted sum of all losses, log_vars contains \ - all the variables to be sent to the logger. 
- """ - log_vars = OrderedDict() - for loss_name, loss_value in losses.items(): - if isinstance(loss_value, torch.Tensor): - log_vars[loss_name] = loss_value.mean() - elif isinstance(loss_value, list): - log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) - else: - raise TypeError( - f'{loss_name} is not a tensor or list of tensors') - - loss = sum(_value for _key, _value in log_vars.items() - if 'loss' in _key) - - log_vars['loss'] = loss - for loss_name, loss_value in log_vars.items(): - # reduce loss when distributed training - if dist.is_available() and dist.is_initialized(): - loss_value = loss_value.data.clone() - dist.all_reduce(loss_value.div_(dist.get_world_size())) - log_vars[loss_name] = loss_value.item() - - return loss, log_vars + The method should accept three modes: "tensor", "predict" and "loss": - def train_step(self, data, optimizer): - """The iteration step during training. + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`TrackDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. - This method defines an iteration step during training, except for the - back propagation and optimizer updating, which are done in an optimizer - hook. Note that in some complicated cases or models, the whole process - including back propagation and optimizer updating is also defined in - this method, such as GAN. + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. Args: - data (dict): The output of dataloader. - optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of - runner is passed to ``train_step()``. This argument is unused - and reserved. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to 'predict'. Returns: - dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \ - ``num_samples``. + The return type depends on ``mode``. - - ``loss`` is a tensor for back propagation, which can be a \ - weighted sum of multiple losses. - - ``log_vars`` contains all the variables to be sent to the - logger. - - ``num_samples`` indicates the batch size (when the model is \ - DDP, it means the batch size on each GPU), which is used for \ - averaging the logs. + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`TrackDataSample`. + - If ``mode="loss"``, return a dict of tensor. """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) - - return outputs - - def val_step(self, data, optimizer): - """The iteration step during validation. - - This method shares the same signature as :func:`train_step`, but used - during val epochs. Note that the evaluation after training epochs is - not implemented with this method, but an evaluation hook. 
- """ - losses = self(**data) - loss, log_vars = self._parse_losses(losses) + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'tensor': + return self._forward(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) + @abstractmethod + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass - return outputs + @abstractmethod + def predict(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass - def show_result(self, - img, - result, - score_thr=0.3, - bbox_color='green', - text_color='green', - thickness=1, - font_scale=0.5, - win_name='', - show=False, - wait_time=0, - out_file=None): - """Draw `result` over `img`. + def _forward(self, + inputs: Dict[str, Tensor], + data_samples: OptSampleList = None, + **kwargs): + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. - Args: - img (str or Tensor): The image to be displayed. - result (dict): The results to draw over `img` det_bboxes or - (det_bboxes, det_masks). The value of key 'det_bboxes' - is list with length num_classes, and each element in list - is ndarray with shape(n, 5) - in [tl_x, tl_y, br_x, br_y, score] format. - score_thr (float, optional): Minimum score of bboxes to be shown. - Default: 0.3. - bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. - text_color (str or tuple or :obj:`Color`): Color of texts. - thickness (int): Thickness of lines. - font_scale (float): Font scales of texts. - win_name (str): The window name. - wait_time (int): Value of waitKey param. - Default: 0. - show (bool): Whether to show the image. - Default: False. - out_file (str or None): The filename to write the image. - Default: None. + Args: + inputs (Dict[str, Tensor]): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`TrackDataSample`], optional): The + Data Samples. It usually includes information such as + `gt_instance`. Returns: - img (Tensor): Only if not `show` or `out_file` + tuple[list]: A tuple of features from ``head`` forward. 
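The three-mode forward contract above follows MMEngine's BaseModel convention. A runnable toy sketch of the dispatch, with a dummy class standing in for the real BaseVideoDetector:

from typing import Dict, List, Optional

import torch
from torch import Tensor


class ToyVideoDetector:
    """Dummy stand-in illustrating the 'loss' / 'predict' / 'tensor' modes."""

    def forward(self,
                inputs: Dict[str, Tensor],
                data_samples: Optional[List] = None,
                mode: str = 'predict'):
        if mode == 'loss':
            return {'loss_dummy': inputs['img'].mean()}
        elif mode == 'predict':
            return list(data_samples or [])
        elif mode == 'tensor':
            return (inputs['img'], )
        raise RuntimeError(f'Invalid mode "{mode}". '
                           'Only supports loss, predict and tensor mode')


toy = ToyVideoDetector()
print(toy.forward({'img': torch.rand(1, 1, 3, 8, 8)}, mode='loss'))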
""" - # TODO: make it support tracking - img = mmcv.imread(img) - img = img.copy() - assert isinstance(result, dict) - bbox_results = result.get('det_bboxes', None) - mask_results = result.get('det_masks', None) - if isinstance(mask_results, tuple): - mask_results = mask_results[0] # ms rcnn - bboxes = np.vstack(bbox_results) - labels = [ - np.full(bbox.shape[0], i, dtype=np.int32) - for i, bbox in enumerate(bbox_results) - ] - labels = np.concatenate(labels) - # draw segmentation masks - if mask_results is not None and len(labels) > 0: # non empty - masks = mmcv.concat_list(mask_results) - inds = np.where(bboxes[:, -1] > score_thr)[0] - np.random.seed(42) - color_masks = [ - np.random.randint(0, 256, (1, 3), dtype=np.uint8) - for _ in range(max(labels) + 1) - ] - for i in inds: - i = int(i) - color_mask = color_masks[labels[i]] - mask = masks[i].astype(bool) - img[mask] = img[mask] * 0.5 + color_mask * 0.5 - # if out_file specified, do not show image in window - if out_file is not None: - show = False - # draw bounding boxes - mmcv.imshow_det_bboxes( - img, - bboxes, - labels, - class_names=self.CLASSES, - score_thr=score_thr, - bbox_color=bbox_color, - text_color=text_color, - thickness=thickness, - font_scale=font_scale, - win_name=win_name, - show=show, - wait_time=wait_time, - out_file=out_file) - - if not (show or out_file): - return img + raise NotImplementedError( + "_forward function (namely 'tensor' mode) is not supported now") diff --git a/mmtrack/models/vid/dff.py b/mmtrack/models/vid/dff.py index f26494c3d..15241916d 100644 --- a/mmtrack/models/vid/dff.py +++ b/mmtrack/models/vid/dff.py @@ -1,13 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. -import warnings + +import copy +from typing import List, Optional, Tuple, Union import torch from addict import Dict -from mmdet.core import bbox2result -from mmdet.models import build_detector +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.core.motion import flow_warp_feats -from ..builder import MODELS, build_motion +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList +from ..task_modules.motion import flow_warp_feats from .base import BaseVideoDetector @@ -20,135 +23,64 @@ class DFF(BaseVideoDetector): """ def __init__(self, - detector, - motion, - pretrains=None, - init_cfg=None, - frozen_modules=None, - train_cfg=None, - test_cfg=None): - super(DFF, self).__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - motion_pretrain = pretrains.get('motion', None) - if motion_pretrain: - motion.init_cfg = dict( - type='Pretrained', checkpoint=motion_pretrain) - else: - motion.init_cfg = None - detector_pretrain = pretrains.get('detector', None) - if detector_pretrain: - detector.init_cfg = dict( - type='Pretrained', checkpoint=detector_pretrain) - else: - detector.init_cfg = None - self.detector = build_detector(detector) - self.motion = build_motion(motion) + detector: dict, + motion: dict, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + frozen_modules: Optional[Union[List[str], Tuple[str], + str]] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + self.detector = MODELS.build(detector) + self.motion = MODELS.build(motion) self.train_cfg = train_cfg self.test_cfg = test_cfg + self.preprocess_cfg = data_preprocessor if frozen_modules 
is not None: self.freeze_module(frozen_modules) - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - ref_img, - ref_img_metas, - ref_gt_bboxes, - ref_gt_labels, - gt_instance_ids=None, - gt_bboxes_ignore=None, - gt_masks=None, - proposals=None, - ref_gt_instance_ids=None, - ref_gt_bboxes_ignore=None, - ref_gt_masks=None, - ref_proposals=None, - **kwargs): + def loss(self, inputs: dict, data_samples: SampleList, **kwargs) -> dict: """ Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - - gt_labels (list[Tensor]): class indices corresponding to each box. - - ref_img (Tensor): of shape (N, 1, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - 1 denotes there is only one reference image for each input - image. - - ref_img_metas (list[list[dict]]): The first list only has one - element. The second list contains reference image information - dict where each dict has: 'img_shape', 'scale_factor', 'flip', - and may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The - Tensor contains ground truth bboxes for each reference image - with shape (num_all_ref_gts, 5) in - [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id - start from 0, and denotes the id of reference image for each - key image. - - ref_gt_labels (list[Tensor]): The list only has one Tensor. The - Tensor contains class indices corresponding to each reference - box with shape (num_all_ref_gts, 2) in - [ref_img_id, class_indice]. - - gt_instance_ids (None | list[Tensor]): specify the instance id for - each ground truth bbox. - - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - - gt_masks (None | Tensor) : true segmentation masks for each box - used if the architecture supports a segmentation task. - - proposals (None | Tensor) : override rpn proposals with custom - proposals. Use when `with_rpn` is False. - - ref_gt_instance_ids (None | list[Tensor]): specify the instance id - for each ground truth bboxes of reference images. - - ref_gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes of reference images can be ignored when computing the - loss. - - ref_gt_masks (None | Tensor) : True segmentation masks for each - box of reference image used if the architecture supports a - segmentation task. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instance``. + + Return: + dict: A dictionary of loss components. 
+ """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Dff video detectors only support 1 batch size per gpu for now.' + img = img[0] - ref_proposals (None | Tensor) : override rpn proposals with custom - proposals of reference images. Use when `with_rpn` is False. + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(0) == 1, \ + 'Dff video detectors only support 1 batch size per gpu for now.' + ref_img = ref_img[0] - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - assert len(img) == 1, \ + assert len(data_samples) == 1, \ 'Dff video detectors only support 1 batch size per gpu for now.' - is_video_data = img_metas[0]['is_video_data'] + metainfo = data_samples[0].metainfo - flow_img = torch.cat((img, ref_img[:, 0]), dim=1) - flow = self.motion(flow_img, img_metas) - ref_x = self.detector.extract_feat(ref_img[:, 0]) + flow_img = torch.cat((img, ref_img), dim=1) + flow = self.motion(flow_img, metainfo, self.preprocess_cfg) + ref_x = self.detector.extract_feat(ref_img) x = [] for i in range(len(ref_x)): x_single = flow_warp_feats(ref_x[i], flow) - if not is_video_data: + if not metainfo['is_video_data']: x_single = 0 * x_single + ref_x[i] x.append(x_single) @@ -160,50 +92,61 @@ def forward_train(self, if self.detector.with_rpn: proposal_cfg = self.detector.train_cfg.get( 'rpn_proposal', self.detector.test_cfg.rpn) - rpn_losses, proposal_list = \ - self.detector.rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) + rpn_data_samples = copy.deepcopy(data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = \ + self.detector.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg, + **kwargs) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) losses.update(rpn_losses) else: - proposal_list = proposals - - roi_losses = self.detector.roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, gt_masks, **kwargs) + rpn_results_list = [] + for i in range(len(data_samples)): + results = InstanceData() + results.bboxes = data_samples[i].proposals + rpn_results_list.append(results) + + roi_losses = self.detector.roi_head.loss(x, rpn_results_list, + data_samples, **kwargs) losses.update(roi_losses) # Single stage detector elif hasattr(self.detector, 'bbox_head'): - bbox_losses = self.detector.bbox_head.forward_train( - x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore) + bbox_losses = self.detector.bbox_head.loss(x, data_samples, + **kwargs) losses.update(bbox_losses) else: raise TypeError('detector must has roi_head or bbox_head.') return losses - def extract_feats(self, img, img_metas): + def extract_feats(self, img: Tensor, metainfo: dict) -> Tensor: """Extract features for `img` during testing. Args: - img (Tensor): of shape (1, C, H, W) encoding input image. + img (Tensor): of shape (T, C, H, W) encoding input image. Typically these should be mean centered and std scaled. 
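flow_warp_feats above warps reference-frame features to the key frame using the predicted flow. The sketch below shows the general idea with torch.nn.functional.grid_sample; it is a generic illustration under the stated shape assumptions, not the exact flow_warp_feats implementation.

import torch
import torch.nn.functional as F


def warp_feats_with_flow(feat: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
    """feat: (N, C, h, w) reference features; flow: (N, 2, H, W) image-level
    flow from the key frame to the reference frame (sketch only)."""
    n, _, h, w = feat.shape
    # bring the flow to the feature resolution and rescale its magnitude
    scale_w, scale_h = w / flow.size(3), h / flow.size(2)
    flow = F.interpolate(flow, size=(h, w), mode='bilinear', align_corners=False)
    flow = torch.stack((flow[:, 0] * scale_w, flow[:, 1] * scale_h), dim=1)

    # base sampling grid in pixel coordinates, shifted by the flow
    ys, xs = torch.meshgrid(
        torch.arange(h, device=feat.device, dtype=feat.dtype),
        torch.arange(w, device=feat.device, dtype=feat.dtype),
        indexing='ij')
    grid_x = xs[None] + flow[:, 0]
    grid_y = ys[None] + flow[:, 1]
    # normalize to [-1, 1] for grid_sample
    grid = torch.stack((2 * grid_x / max(w - 1, 1) - 1,
                        2 * grid_y / max(h - 1, 1) - 1), dim=-1)
    return F.grid_sample(feat, grid, align_corners=True)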
- - img_metas (list[dict]): list of image information dict where each + The T denotes the number of key images and usually is 1 in + DFF method. + metainfo (dict): image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. + `mmtrack/datasets/transforms/formatting.py:PackTrackInputs`. Returns: - list[Tensor]: Multi level feature maps of `img`. + tuple[Tensor]: Multi level feature maps of `img`. """ key_frame_interval = self.test_cfg.get('key_frame_interval', 10) - frame_id = img_metas[0].get('frame_id', -1) + frame_id = metainfo.get('frame_id', -1) assert frame_id >= 0 is_key_frame = False if frame_id % key_frame_interval else True @@ -214,81 +157,75 @@ def extract_feats(self, img, img_metas): self.memo.feats = x else: flow_img = torch.cat((img, self.memo.img), dim=1) - flow = self.motion(flow_img, img_metas) + flow = self.motion(flow_img, metainfo, self.preprocess_cfg) x = [] for i in range(len(self.memo.feats)): x_single = flow_warp_feats(self.memo.feats[i], flow) x.append(x_single) + x = tuple(x) return x - def simple_test(self, - img, - img_metas, - ref_img=None, - ref_img_metas=None, - proposals=None, - rescale=False): + def predict(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True) -> SampleList: """Test without augmentation. Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_img (None): Not used in DFF. Only for unifying API interface. + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + In test mode, T = 1 and there is only ``img`` and no + ``ref_img``. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. - ref_img_metas (None): Not used in DFF. Only for unifying API - interface. + Returns: + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_det_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'Dff video detectors only support 1 batch size per gpu for now.' + img = img[0] - proposals (None | Tensor): Override rpn proposals with custom - proposals. Use when `with_rpn` is False. Defaults to None. + assert len(data_samples) == 1, \ + 'Dff video detectors only support 1 batch size per gpu for now.' - rescale (bool): If False, then returned bboxes and masks will fit - the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. 
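At test time, DFF's extract_feats above computes backbone features only on key frames and re-uses them (flow-warped) for the frames in between. A compact sketch of that caching pattern; extract, compute_flow and warp stand in for detector.extract_feat, self.motion and flow_warp_feats.

class KeyFrameFeatureCache:
    """Sketch of DFF-style key-frame feature caching (illustrative only)."""

    def __init__(self, key_frame_interval: int = 10):
        self.interval = key_frame_interval
        self.key_img = None
        self.key_feats = None

    def __call__(self, img, frame_id, extract, compute_flow, warp):
        if frame_id % self.interval == 0:  # key frame: refresh the cache
            self.key_img = img
            self.key_feats = extract(img)
            return self.key_feats
        # non-key frame: warp the cached key-frame features to this frame
        flow = compute_flow(img, self.key_img)
        return tuple(warp(f, flow) for f in self.key_feats)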
+ metainfo = data_samples[0].metainfo + x = self.extract_feats(img, metainfo) - Returns: - dict[str : list(ndarray)]: The detection results. - """ - x = self.extract_feats(img, img_metas) + track_data_sample = copy.deepcopy(data_samples[0]) # Two stage detector if hasattr(self.detector, 'roi_head'): - if proposals is None: - proposal_list = self.detector.rpn_head.simple_test_rpn( - x, img_metas) + if not hasattr(data_samples[0], 'proposals'): + rpn_results_list = self.detector.rpn_head.predict( + x, data_samples, rescale=False) else: - proposal_list = proposals - - outs = self.detector.roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) + rpn_results_list = [] + for i in range(len(data_samples)): + results = InstanceData() + results.bboxes = data_samples[i].proposals + rpn_results_list.append(results) + + results_list = self.detector.roi_head.predict( + x, rpn_results_list, data_samples, rescale=rescale) + track_data_sample.pred_det_instances = results_list[0] # Single stage detector elif hasattr(self.detector, 'bbox_head'): - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - # skip post-processing when exporting to ONNX - if torch.onnx.is_in_onnx_export(): - return bbox_list - - outs = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in bbox_list - ] + results_list = self.detector.bbox_head.predict( + x, data_samples, rescale=rescale) + track_data_sample.pred_det_instances = results_list[0] else: raise TypeError('detector must has roi_head or bbox_head.') - results = dict() - results['det_bboxes'] = outs[0] - if len(outs) == 2: - results['det_masks'] = outs[1] - return results - - def aug_test(self, imgs, img_metas, **kwargs): - """Test function with test time augmentation.""" - raise NotImplementedError + return [track_data_sample] diff --git a/mmtrack/models/vid/fgfa.py b/mmtrack/models/vid/fgfa.py index 58cd96d42..56ee9269c 100644 --- a/mmtrack/models/vid/fgfa.py +++ b/mmtrack/models/vid/fgfa.py @@ -1,13 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import warnings + +import copy +from typing import List, Optional, Tuple, Union import torch from addict import Dict -from mmdet.core import bbox2result -from mmdet.models import build_detector +from mmengine.structures import InstanceData +from torch import Tensor -from mmtrack.core import flow_warp_feats -from ..builder import MODELS, build_aggregator, build_motion +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList +from ..task_modules.motion import flow_warp_feats from .base import BaseVideoDetector @@ -20,140 +23,73 @@ class FGFA(BaseVideoDetector): """ def __init__(self, - detector, - motion, - aggregator, - pretrains=None, - init_cfg=None, - frozen_modules=None, - train_cfg=None, - test_cfg=None): - super(FGFA, self).__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - motion_pretrain = pretrains.get('motion', None) - if motion_pretrain: - motion.init_cfg = dict( - type='Pretrained', checkpoint=motion_pretrain) - else: - motion.init_cfg = None - detector_pretrain = pretrains.get('detector', None) - if detector_pretrain: - detector.init_cfg = dict( - type='Pretrained', checkpoint=detector_pretrain) - else: - detector.init_cfg = None - self.detector = build_detector(detector) - self.motion = build_motion(motion) - self.aggregator = build_aggregator(aggregator) + detector: dict, + motion: dict, + aggregator: dict, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + frozen_modules: Optional[Union[List[str], Tuple[str], + str]] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + self.detector = MODELS.build(detector) + self.motion = MODELS.build(motion) + self.aggregator = MODELS.build(aggregator) self.train_cfg = train_cfg self.test_cfg = test_cfg + self.preprocess_cfg = data_preprocessor if frozen_modules is not None: self.freeze_module(frozen_modules) - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - ref_img, - ref_img_metas, - ref_gt_bboxes, - ref_gt_labels, - gt_instance_ids=None, - gt_bboxes_ignore=None, - gt_masks=None, - proposals=None, - ref_gt_instance_ids=None, - ref_gt_bboxes_ignore=None, - ref_gt_masks=None, - ref_proposals=None, - **kwargs): + def loss(self, inputs: dict, data_samples: SampleList, **kwargs) -> dict: """ Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - - gt_labels (list[Tensor]): class indices corresponding to each box. - - ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - 2 denotes there is two reference images for each input image. - - ref_img_metas (list[list[dict]]): The first list only has one - element. The second list contains reference image information - dict where each dict has: 'img_shape', 'scale_factor', 'flip', - and may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. 
For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The - Tensor contains ground truth bboxes for each reference image - with shape (num_all_ref_gts, 5) in - [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id - start from 0, and denotes the id of reference image for each - key image. - - ref_gt_labels (list[Tensor]): The list only has one Tensor. The - Tensor contains class indices corresponding to each reference - box with shape (num_all_ref_gts, 2) in - [ref_img_id, class_indice]. - - gt_instance_ids (None | list[Tensor]): specify the instance id for - each ground truth bbox. - - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - - gt_masks (None | Tensor) : true segmentation masks for each box - used if the architecture supports a segmentation task. - - proposals (None | Tensor) : override rpn proposals with custom - proposals. Use when `with_rpn` is False. - - ref_gt_instance_ids (None | list[Tensor]): specify the instance id - for each ground truth bboxes of reference images. - - ref_gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes of reference images can be ignored when computing the - loss. - - ref_gt_masks (None | Tensor) : True segmentation masks for each - box of reference image used if the architecture supports a - segmentation task. - - ref_proposals (None | Tensor) : override rpn proposals with custom - proposals of reference images. Use when `with_rpn` is False. + inputs (dict[Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size and must be 1 in FGFA method. + The T denotes the number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. Returns: dict[str, Tensor]: a dictionary of loss components """ - assert len(img) == 1, \ - 'fgfa video detectors only support 1 batch size per gpu for now.' - - flow_imgs = torch.cat((img, ref_img[:, 0]), dim=1) - for i in range(1, ref_img.shape[1]): - flow_img = torch.cat((img, ref_img[:, i]), dim=1) - flow_imgs = torch.cat((flow_imgs, flow_img), dim=0) - flows = self.motion(flow_imgs, img_metas) + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'FGFA video detectors only support 1 batch size per gpu for now.' + assert img.size(1) == 1, \ + 'FGFA video detector only has 1 key image per batch.' + img = img[0] + + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(0) == 1, \ + 'FGFA video detectors only support 1 batch size per gpu for now.' + ref_img = ref_img[0] + + assert len(data_samples) == 1, \ + 'FGFA video detectors only support 1 batch size per gpu for now.' 
+ metainfo = data_samples[0].metainfo + + num_ref_imgs = ref_img.size(0) + flow_imgs = torch.cat((img.repeat(num_ref_imgs, 1, 1, 1), ref_img), + dim=1) + flows = self.motion(flow_imgs, metainfo, self.preprocess_cfg) + + img_x = self.detector.extract_feat(img) + ref_img_x = self.detector.extract_feat(ref_img) + assert len(img_x) == len(ref_img_x) - all_imgs = torch.cat((img, ref_img[0]), dim=0) - all_x = self.detector.extract_feat(all_imgs) x = [] - for i in range(len(all_x)): - ref_x_single = flow_warp_feats(all_x[i][1:], flows) - agg_x_single = self.aggregator(all_x[i][[0]], ref_x_single) + for i in range(len(img_x)): + ref_x_single = flow_warp_feats(ref_img_x[i], flows) + agg_x_single = self.aggregator(img_x[i], ref_x_single) x.append(agg_x_single) losses = dict() @@ -164,72 +100,75 @@ def forward_train(self, if self.detector.with_rpn: proposal_cfg = self.detector.train_cfg.get( 'rpn_proposal', self.detector.test_cfg.rpn) - rpn_losses, proposal_list = \ - self.detector.rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) + rpn_data_samples = copy.deepcopy(data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = \ + self.detector.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg, + **kwargs) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) losses.update(rpn_losses) else: - proposal_list = proposals - - roi_losses = self.detector.roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, gt_masks, **kwargs) + rpn_results_list = [] + for i in range(len(data_samples)): + results = InstanceData() + results.bboxes = data_samples[i].proposals + rpn_results_list.append(results) + + roi_losses = self.detector.roi_head.loss(x, rpn_results_list, + data_samples, **kwargs) losses.update(roi_losses) # Single stage detector elif hasattr(self.detector, 'bbox_head'): - bbox_losses = self.detector.bbox_head.forward_train( - x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore) + bbox_losses = self.detector.bbox_head.loss(x, data_samples, + **kwargs) losses.update(bbox_losses) else: raise TypeError('detector must has roi_head or bbox_head.') return losses - def extract_feats(self, img, img_metas, ref_img, ref_img_metas): + def extract_feats(self, img: Tensor, ref_img: Union[Tensor, None], + metainfo: dict) -> List[Tensor]: """Extract features for `img` during testing. Args: - img (Tensor): of shape (1, C, H, W) encoding input image. + img (Tensor): of shape (T, C, H, W) encoding input image. Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image information dict where each + The T denotes the number of key images and usually is 1 in + FGFA method. + ref_img (Tensor | None): of shape (T, C, H, W) encoding + reference image. Typically these should be mean centered + and std scaled. The T denotes the number of reference images. + There may be no reference images in some cases. + metainfo (dict): image information dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
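The aggregator call above adaptively merges the flow-warped reference features into the key-frame feature. A simplified sketch of that idea using per-position cosine similarity as the weighting; the real FGFA aggregator uses a learned embedding network, so this is only an approximation of its behaviour.

import torch
import torch.nn.functional as F


def aggregate_feats(key_feat: torch.Tensor,
                    ref_feats: torch.Tensor) -> torch.Tensor:
    """key_feat: (1, C, H, W); ref_feats: (R, C, H, W) flow-warped features.
    Returns a (1, C, H, W) aggregated map (illustrative sketch)."""
    key_norm = F.normalize(key_feat, dim=1)
    ref_norm = F.normalize(ref_feats, dim=1)
    # per-position cosine similarity between key and each reference: (R, 1, H, W)
    sim = (key_norm * ref_norm).sum(dim=1, keepdim=True)
    weights = sim.softmax(dim=0)  # normalize over the reference dimension
    return (weights * ref_feats).sum(dim=0, keepdim=True)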
For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_img (Tensor | None): of shape (1, N, C, H, W) encoding input - reference images. Typically these should be mean centered and - std scaled. N denotes the number of reference images. There - may be no reference images in some cases. - - ref_img_metas (list[list[dict]] | None): The first list only has - one element. The second list contains image information dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. There - may be no reference images in some cases. + `mmtrack/datasets/transforms/formatting.py:PackTrackInputs`. Returns: list[Tensor]: Multi level feature maps of `img`. """ - frame_id = img_metas[0].get('frame_id', -1) + frame_id = metainfo.get('frame_id', -1) assert frame_id >= 0 - num_left_ref_imgs = img_metas[0].get('num_left_ref_imgs', -1) - frame_stride = img_metas[0].get('frame_stride', -1) + num_left_ref_imgs = metainfo.get('num_left_ref_imgs', -1) + frame_stride = metainfo.get('frame_stride', -1) # test with adaptive stride if frame_stride < 1: if frame_id == 0: self.memo = Dict() - self.memo.img = ref_img[0] - ref_x = self.detector.extract_feat(ref_img[0]) + self.memo.img = ref_img + ref_x = self.detector.extract_feat(ref_img) # 'tuple' object (e.g. the output of FPN) does not support # item assignment self.memo.feats = [] @@ -240,8 +179,8 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): else: if frame_id == 0: self.memo = Dict() - self.memo.img = ref_img[0] - ref_x = self.detector.extract_feat(ref_img[0]) + self.memo.img = ref_img + ref_x = self.detector.extract_feat(ref_img) # 'tuple' object (e.g. the output of FPN) does not support # item assignment self.memo.feats = [] @@ -253,13 +192,12 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): elif frame_id % frame_stride == 0: assert ref_img is not None x = [] - ref_x = self.detector.extract_feat(ref_img[0]) + ref_x = self.detector.extract_feat(ref_img) for i in range(len(ref_x)): self.memo.feats[i] = torch.cat( (self.memo.feats[i], ref_x[i]), dim=0)[1:] x.append(self.memo.feats[i][[num_left_ref_imgs]]) - self.memo.img = torch.cat((self.memo.img, ref_img[0]), - dim=0)[1:] + self.memo.img = torch.cat((self.memo.img, ref_img), dim=0)[1:] else: assert ref_img is None x = self.detector.extract_feat(img) @@ -267,7 +205,7 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): flow_imgs = torch.cat( (img.repeat(self.memo.img.shape[0], 1, 1, 1), self.memo.img), dim=1) - flows = self.motion(flow_imgs, img_metas) + flows = self.motion(flow_imgs, metainfo, self.preprocess_cfg) agg_x = [] for i in range(len(x)): @@ -280,88 +218,79 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): agg_x.append(agg_x_single) return agg_x - def simple_test(self, - img, - img_metas, - ref_img=None, - ref_img_metas=None, - proposals=None, - rescale=False): + def predict(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True) -> SampleList: """Test without augmentation. Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - Typically these should be mean centered and std scaled. 
- - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_img (list[Tensor] | None): The list only contains one Tensor - of shape (1, N, C, H, W) encoding input reference images. - Typically these should be mean centered and std scaled. N - denotes the number for reference images. There may be no - reference images in some cases. - - ref_img_metas (list[list[list[dict]]] | None): The first and - second list only has one element. The third list contains - image information dict where each dict has: 'img_shape', - 'scale_factor', 'flip', and may also contain 'filename', - 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on - the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. There - may be no reference images in some cases. - - proposals (None | Tensor): Override rpn proposals with custom - proposals. Use when `with_rpn` is False. Defaults to None. - - rescale (bool): If False, then returned bboxes and masks will fit - the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. + inputs (dict[Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size and must be 1 in FGFA method. + The T denotes the number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor, Optional): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. Returns: - dict[str : list(ndarray)]: The detection results. + list[obj:`TrackDataSample`]: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. """ - if ref_img is not None: + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'FGFA video detectors only support 1 batch size per gpu for now.' + assert img.size(1) == 1, \ + 'FGFA video detector only has 1 key image per batch.' + img = img[0] + + if 'ref_img' in inputs: + ref_img = inputs['ref_img'] + assert ref_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(0) == 1, 'FGFA video detectors only support' \ + ' 1 batch size per gpu for now.' ref_img = ref_img[0] - if ref_img_metas is not None: - ref_img_metas = ref_img_metas[0] - x = self.extract_feats(img, img_metas, ref_img, ref_img_metas) + else: + ref_img = None + + assert len(data_samples) == 1, \ + 'FGFA video detectors only support 1 batch size per gpu for now.' 
+ metainfo = data_samples[0].metainfo + + x = self.extract_feats(img, ref_img, metainfo) + + track_data_sample = copy.deepcopy(data_samples[0]) # Two stage detector if hasattr(self.detector, 'roi_head'): - if proposals is None: - proposal_list = self.detector.rpn_head.simple_test_rpn( - x, img_metas) + if not hasattr(data_samples[0], 'proposals'): + rpn_results_list = self.detector.rpn_head.predict( + x, data_samples, rescale=False) else: - proposal_list = proposals - - outs = self.detector.roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) + rpn_results_list = [] + for i in range(len(data_samples)): + results = InstanceData() + results.bboxes = data_samples[i].proposals + rpn_results_list.append(results) + + results_list = self.detector.roi_head.predict( + x, rpn_results_list, data_samples, rescale=rescale) + track_data_sample.pred_det_instances = results_list[0] # Single stage detector elif hasattr(self.detector, 'bbox_head'): - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - # skip post-processing when exporting to ONNX - if torch.onnx.is_in_onnx_export(): - return bbox_list - - outs = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in bbox_list - ] + results_list = self.detector.bbox_head.predict( + x, data_samples, rescale=rescale) + + track_data_sample.pred_det_instances = results_list[0] else: raise TypeError('detector must has roi_head or bbox_head.') - results = dict() - results['det_bboxes'] = outs[0] - if len(outs) == 2: - results['det_masks'] = outs[1] - return results - - def aug_test(self, imgs, img_metas, **kwargs): - """Test function with test time augmentation.""" - raise NotImplementedError + return [track_data_sample] diff --git a/mmtrack/models/vid/selsa.py b/mmtrack/models/vid/selsa.py index 74e0b6b3f..8c545f3ef 100644 --- a/mmtrack/models/vid/selsa.py +++ b/mmtrack/models/vid/selsa.py @@ -1,11 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
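The same proposal-handling branch recurs in DFF and FGFA above and in SELSA below: use precomputed proposals when the data samples carry them, otherwise run the RPN head. A hedged sketch of that shared pattern as a hypothetical helper (not part of the diff):

from mmengine.structures import InstanceData


def get_rpn_results(rpn_head, x, data_samples):
    """Wrap precomputed proposals as InstanceData, or fall back to the RPN."""
    if hasattr(data_samples[0], 'proposals'):
        return [
            InstanceData(bboxes=sample.proposals) for sample in data_samples
        ]
    return rpn_head.predict(x, data_samples, rescale=False)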
-import warnings +from copy import deepcopy +from typing import List, Optional, Tuple, Union import torch from addict import Dict -from mmdet.models import build_detector +from mmengine.structures import InstanceData +from torch import Tensor -from ..builder import MODELS +from mmtrack.registry import MODELS +from mmtrack.utils import (ConfigType, OptConfigType, SampleList, + convert_data_sample_type) from .base import BaseVideoDetector @@ -18,23 +22,15 @@ class SELSA(BaseVideoDetector): """ def __init__(self, - detector, - pretrains=None, - init_cfg=None, - frozen_modules=None, - train_cfg=None, - test_cfg=None): - super(SELSA, self).__init__(init_cfg) - if isinstance(pretrains, dict): - warnings.warn('DeprecationWarning: pretrains is deprecated, ' - 'please use "init_cfg" instead') - detector_pretrain = pretrains.get('detector', None) - if detector_pretrain: - detector.init_cfg = dict( - type='Pretrained', checkpoint=detector_pretrain) - else: - detector.init_cfg = None - self.detector = build_detector(detector) + detector: ConfigType, + frozen_modules: Optional[Union[List[str], Tuple[str], + str]] = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super(SELSA, self).__init__(data_preprocessor, init_cfg) + self.detector = MODELS.build(detector) assert hasattr(self.detector, 'roi_head'), \ 'selsa video detector only supports two stage detector' self.train_cfg = train_cfg @@ -43,96 +39,41 @@ def __init__(self, if frozen_modules is not None: self.freeze_module(frozen_modules) - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - ref_img, - ref_img_metas, - ref_gt_bboxes, - ref_gt_labels, - gt_instance_ids=None, - gt_bboxes_ignore=None, - gt_masks=None, - proposals=None, - ref_gt_instance_ids=None, - ref_gt_bboxes_ignore=None, - ref_gt_masks=None, - ref_proposals=None, - **kwargs): - """ - Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - - gt_labels (list[Tensor]): class indices corresponding to each box. - - ref_img (Tensor): of shape (N, 2, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - 2 denotes there is two reference images for each input image. - - ref_img_metas (list[list[dict]]): The first list only has one - element. The second list contains reference image information - dict where each dict has: 'img_shape', 'scale_factor', 'flip', - and may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_gt_bboxes (list[Tensor]): The list only has one Tensor. The - Tensor contains ground truth bboxes for each reference image - with shape (num_all_ref_gts, 5) in - [ref_img_id, tl_x, tl_y, br_x, br_y] format. The ref_img_id - start from 0, and denotes the id of reference image for each - key image. - - ref_gt_labels (list[Tensor]): The list only has one Tensor. 
The - Tensor contains class indices corresponding to each reference - box with shape (num_all_ref_gts, 2) in - [ref_img_id, class_indice]. - - gt_instance_ids (None | list[Tensor]): specify the instance id for - each ground truth bbox. - - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - - gt_masks (None | Tensor) : true segmentation masks for each box - used if the architecture supports a segmentation task. - - proposals (None | Tensor) : override rpn proposals with custom - proposals. Use when `with_rpn` is False. - - ref_gt_instance_ids (None | list[Tensor]): specify the instance id - for each ground truth bboxes of reference images. - - ref_gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes of reference images can be ignored when computing the - loss. - - ref_gt_masks (None | Tensor) : True segmentation masks for each - box of reference image used if the architecture supports a - segmentation task. + def loss(self, inputs: dict, data_samples: SampleList, **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. - ref_proposals (None | Tensor) : override rpn proposals with custom - proposals of reference images. Use when `with_rpn` is False. + Args: + inputs (dict[Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size and must be 1 in SELSA method. + The T denotes the number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. Returns: dict[str, Tensor]: a dictionary of loss components """ - assert len(img) == 1, \ - 'selsa video detector only supports 1 batch size per gpu for now.' - - all_imgs = torch.cat((img, ref_img[0]), dim=0) + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'SELSA video detectors only support 1 batch size per gpu for now.' + assert img.size(1) == 1, \ + 'SELSA video detector only has 1 key image per batch.' + img = img[0] + + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(0) == 1, \ + 'SELSA video detectors only support 1 batch size per gpu for now.' + ref_img = ref_img[0] + + assert len(data_samples) == 1, \ + 'SELSA video detectors only support 1 batch size per gpu for now.' 
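# A short shape sketch (assumed sizes; not part of the patch) of the contract
# enforced above: with N == 1, the key and reference frames are squeezed to
# (T, C, H, W) and concatenated for a single backbone pass, as done just below.
import torch

key_frames = torch.rand(1, 3, 128, 128)  # T=1 key frame, i.e. `img[0]`
ref_frames = torch.rand(2, 3, 128, 128)  # T=2 reference frames
assert torch.cat((key_frames, ref_frames), dim=0).shape == (3, 3, 128, 128)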
+ + all_imgs = torch.cat((img, ref_img), dim=0) all_x = self.detector.extract_feat(all_imgs) x = [] ref_x = [] @@ -141,75 +82,83 @@ def forward_train(self, ref_x.append(all_x[i][1:]) losses = dict() + ref_data_samples, _ = convert_data_sample_type( + data_samples[0], num_ref_imgs=len(ref_img)) # RPN forward and loss if self.detector.with_rpn: proposal_cfg = self.detector.train_cfg.get( 'rpn_proposal', self.detector.test_cfg.rpn) - rpn_losses, proposal_list = self.detector.rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) + rpn_data_samples = deepcopy(data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = torch.zeros_like( + data_sample.gt_instances.labels) + (rpn_losses, + proposal_list) = self.detector.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) losses.update(rpn_losses) - - ref_proposals_list = self.detector.rpn_head.simple_test_rpn( - ref_x, ref_img_metas[0]) + ref_proposals_list = self.detector.rpn_head.predict( + ref_x, ref_data_samples) else: - proposal_list = proposals - ref_proposals_list = ref_proposals + proposal_list, ref_proposals_list = [], [] + for i in range(len(data_samples)): + proposal, ref_proposals = InstanceData(), InstanceData() + proposal.bboxes = data_samples[i].proposals + proposal_list.append(proposal) + ref_proposals.bboxes = data_samples[i].ref_proposals + ref_proposals_list.append(ref_proposals) + + roi_losses = self.detector.roi_head.loss(x, ref_x, proposal_list, + ref_proposals_list, + data_samples, **kwargs) - roi_losses = self.detector.roi_head.forward_train( - x, ref_x, img_metas, proposal_list, ref_proposals_list, gt_bboxes, - gt_labels, gt_bboxes_ignore, gt_masks, **kwargs) losses.update(roi_losses) return losses - def extract_feats(self, img, img_metas, ref_img, ref_img_metas): + def extract_feats(self, img: Tensor, img_metas: dict, + ref_img: Optional[Tensor], + ref_img_metas: Optional[dict]) -> Tuple: """Extract features for `img` during testing. Args: img (Tensor): of shape (1, C, H, W) encoding input image. Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. + img_metas (dict): list of image information dict where each + dict may has: 'img_id', 'img_path', + 'ori_shape', 'img_shape', 'scale_factor','flip', + 'flip_direction', 'frame_id', 'is_video_data', 'video_id', + 'video_length', 'instances'. ref_img (Tensor | None): of shape (1, N, C, H, W) encoding input reference images. Typically these should be mean centered and std scaled. N denotes the number of reference images. There may be no reference images in some cases. - ref_img_metas (list[list[dict]] | None): The first list only has - one element. The second list contains image information dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. There - may be no reference images in some cases. 
+ ref_img_metas (list[dict] | None): The list contains image + information dict where each dict may has: 'img_id', 'img_path', + 'ori_shape', 'img_shape', 'scale_factor','flip', + 'flip_direction', 'frame_id', 'is_video_data', 'video_id', + 'video_length', 'instances'. Returns: tuple(x, img_metas, ref_x, ref_img_metas): x is the multi level feature maps of `img`, ref_x is the multi level feature maps of `ref_img`. """ - frame_id = img_metas[0].get('frame_id', -1) + frame_id = img_metas.get('frame_id', -1) assert frame_id >= 0 - num_left_ref_imgs = img_metas[0].get('num_left_ref_imgs', -1) - frame_stride = img_metas[0].get('frame_stride', -1) + num_left_ref_imgs = img_metas.get('num_left_ref_imgs', -1) + frame_stride = img_metas.get('frame_stride', -1) # test with adaptive stride if frame_stride < 1: if frame_id == 0: self.memo = Dict() - self.memo.img_metas = ref_img_metas[0] - ref_x = self.detector.extract_feat(ref_img[0]) + self.memo.img_metas = ref_img_metas + ref_x = self.detector.extract_feat(ref_img) # 'tuple' object (e.g. the output of FPN) does not support # item assignment self.memo.feats = [] @@ -221,13 +170,13 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): for i in range(len(x)): ref_x[i] = torch.cat((ref_x[i], x[i]), dim=0) ref_img_metas = self.memo.img_metas.copy() - ref_img_metas.extend(img_metas) + ref_img_metas.append(img_metas) # test with fixed stride else: if frame_id == 0: self.memo = Dict() self.memo.img_metas = ref_img_metas[0] - ref_x = self.detector.extract_feat(ref_img[0]) + ref_x = self.detector.extract_feat(ref_img) # 'tuple' object (e.g. the output of FPN) does not support # item assignment self.memo.feats = [] @@ -239,7 +188,7 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): elif frame_id % frame_stride == 0: assert ref_img is not None x = [] - ref_x = self.detector.extract_feat(ref_img[0]) + ref_x = self.detector.extract_feat(ref_img) for i in range(len(ref_x)): self.memo.feats[i] = torch.cat( (self.memo.feats[i], ref_x[i]), dim=0)[1:] @@ -254,85 +203,99 @@ def extract_feats(self, img, img_metas, ref_img, ref_img_metas): for i in range(len(x)): ref_x[i][num_left_ref_imgs] = x[i] ref_img_metas = self.memo.img_metas.copy() - ref_img_metas[num_left_ref_imgs] = img_metas[0] + ref_img_metas[num_left_ref_imgs] = img_metas return x, img_metas, ref_x, ref_img_metas - def simple_test(self, - img, - img_metas, - ref_img=None, - ref_img_metas=None, - proposals=None, - ref_proposals=None, - rescale=False): + def predict(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True) -> SampleList: """Test without augmentation. Args: - img (Tensor): of shape (1, C, H, W) encoding input image. - Typically these should be mean centered and std scaled. + inputs (dict[Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size and must be 1 in SELSA method. + The T denotes the number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor, Optional): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. 
- img_metas (list[dict]): list of image information dict where each - dict has: 'img_shape', 'scale_factor', 'flip', and may also - contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_img (list[Tensor] | None): The list only contains one Tensor - of shape (1, N, C, H, W) encoding input reference images. - Typically these should be mean centered and std scaled. N - denotes the number for reference images. There may be no - reference images in some cases. - - ref_img_metas (list[list[list[dict]]] | None): The first and - second list only has one element. The third list contains - image information dict where each dict has: 'img_shape', - 'scale_factor', 'flip', and may also contain 'filename', - 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on - the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. There - may be no reference images in some cases. + Returns: + list[obj:`TrackDataSample`]: Tracking results of the + input images. Each TrackDataSample usually contains + ``pred_det_instances`` or ``pred_track_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(0) == 1, \ + 'SELSA video detectors only support 1 batch size per gpu for now.' + assert img.size(1) == 1, \ + 'SELSA video detector only has 1 key image per batch.' + img = img[0] + + if 'ref_img' in inputs: + ref_img = inputs['ref_img'] + assert ref_img.dim( + ) == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(0) == 1, 'SELSA video detectors only support' \ + ' 1 batch size per gpu for now.' + ref_img = ref_img[0] + else: + ref_img = None - proposals (None | Tensor): Override rpn proposals with custom - proposals. Use when `with_rpn` is False. Defaults to None. + assert len(data_samples) == 1, \ + 'SELSA video detectors only support 1 batch size per gpu for now.' - rescale (bool): If False, then returned bboxes and masks will fit - the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. + data_sample = data_samples[0] + img_metas = data_sample.metainfo - Returns: - dict[str : list(ndarray)]: The detection results. 
- """ if ref_img is not None: - ref_img = ref_img[0] - if ref_img_metas is not None: - ref_img_metas = ref_img_metas[0] + _, ref_img_metas = convert_data_sample_type( + data_sample, num_ref_imgs=len(ref_img)) + else: + ref_img_metas = None + x, img_metas, ref_x, ref_img_metas = self.extract_feats( img, img_metas, ref_img, ref_img_metas) - if proposals is None: - proposal_list = self.detector.rpn_head.simple_test_rpn( - x, img_metas) - ref_proposals_list = self.detector.rpn_head.simple_test_rpn( - ref_x, ref_img_metas) + ref_data_samples = [ + deepcopy(data_sample) for _ in range(len(ref_img_metas)) + ] + for i in range(len(ref_img_metas)): + ref_data_samples[i].set_metainfo(ref_img_metas[i]) + + if data_samples[0].get('proposals', None) is None: + proposal_list = self.detector.rpn_head.predict(x, data_samples) + ref_proposals_list = self.detector.rpn_head.predict( + ref_x, ref_data_samples) else: - proposal_list = proposals - ref_proposals_list = ref_proposals + assert hasattr(data_samples[0], 'ref_proposals') + proposal_list = data_samples[0].proposals + ref_proposals_list = data_samples[0].ref_proposals - outs = self.detector.roi_head.simple_test( + results_list = self.detector.roi_head.predict( x, ref_x, proposal_list, ref_proposals_list, - img_metas, + data_samples, rescale=rescale) - results = dict() - results['det_bboxes'] = outs[0] - if len(outs) == 2: - results['det_masks'] = outs[1] - return results + track_data_sample = deepcopy(data_samples[0]) + track_data_sample.pred_det_instances = results_list[0] + return [track_data_sample] - def aug_test(self, imgs, img_metas, **kwargs): + def aug_test(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True, + **kwargs): """Test function with test time augmentation.""" raise NotImplementedError diff --git a/mmtrack/models/vis/__init__.py b/mmtrack/models/vis/__init__.py index e57077250..742ee213b 100644 --- a/mmtrack/models/vis/__init__.py +++ b/mmtrack/models/vis/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .mask2former import Mask2Former from .masktrack_rcnn import MaskTrackRCNN -__all__ = ['MaskTrackRCNN'] +__all__ = ['MaskTrackRCNN', 'Mask2Former'] diff --git a/mmtrack/models/vis/mask2former.py b/mmtrack/models/vis/mask2former.py new file mode 100644 index 000000000..cb1d23c56 --- /dev/null +++ b/mmtrack/models/vis/mask2former.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Union + +from torch import Tensor + +from mmtrack.models.mot import BaseMultiObjectTracker +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList + + +@MODELS.register_module() +class Mask2Former(BaseMultiObjectTracker): + r"""Implementation of `Masked-attention Mask + Transformer for Universal Image Segmentation + `_. + + Args: + backbone (dict): Configuration of backbone. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + Defaults to None. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. 
+ """ + + def __init__(self, + backbone: Optional[dict] = None, + track_head: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(BaseMultiObjectTracker, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + if backbone is not None: + self.backbone = MODELS.build(backbone) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + self.num_classes = self.track_head.num_classes + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Overload in order to load mmdet pretrained ckpt.""" + for key in list(state_dict): + if key.startswith('panoptic_head'): + state_dict[key.replace('panoptic', + 'track')] = state_dict.pop(key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> Union[dict, tuple]: + """ + Args: + inputs (Tensor): Input images of shape (N, T, C, H, W). + These should usually be mean centered and std scaled. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # shape (N * T, C, H, W) + img = img.flatten(0, 1) + + x = self.backbone(img) + losses = self.track_head.loss(x, data_samples) + + return losses + + def predict(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with + postprocessing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically, these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + In test mode, T = 1 and there is only ``img`` and no + ``ref_img``. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_track_instances``. + """ + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # the "T" is 1 + img = img.squeeze(1) + feats = self.backbone(img) + pred_track_ins_list = self.track_head.predict(feats, data_samples, + rescale) + + results = [] + for idx, pred_track_ins in enumerate(pred_track_ins_list): + track_data_sample = data_samples[idx] + track_data_sample.pred_track_instances = pred_track_ins + results.append(track_data_sample) + + return results diff --git a/mmtrack/models/vis/masktrack_rcnn.py b/mmtrack/models/vis/masktrack_rcnn.py index 96bd5a235..c00f87022 100644 --- a/mmtrack/models/vis/masktrack_rcnn.py +++ b/mmtrack/models/vis/masktrack_rcnn.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
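# A standalone sketch of the checkpoint-key remapping performed in
# `Mask2Former._load_from_state_dict` above: `panoptic_head.*` keys from an
# mmdet panoptic checkpoint are renamed to `track_head.*` before the regular
# loading path runs (toy state dict for illustration):
import torch

state_dict = {'panoptic_head.query_embed.weight': torch.zeros(2, 4)}
for key in list(state_dict):
    if key.startswith('panoptic_head'):
        state_dict[key.replace('panoptic', 'track')] = state_dict.pop(key)
assert 'track_head.query_embed.weight' in state_dict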
-import torch -from mmdet.models import build_detector, build_head +import copy +from typing import Dict, Optional + +from torch import Tensor -from mmtrack.core import outs2results, results2outs from mmtrack.models.mot import BaseMultiObjectTracker -from ..builder import MODELS, build_tracker +from mmtrack.registry import MODELS +from mmtrack.utils import OptConfigType, OptMultiConfig, SampleList @MODELS.register_module() @@ -18,105 +20,62 @@ class MaskTrackRCNN(BaseMultiObjectTracker): detector (dict): Configuration of detector. Defaults to None. track_head (dict): Configuration of track head. Defaults to None. tracker (dict): Configuration of tracker. Defaults to None. - init_cfg (dict): Configuration of initialization. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. """ def __init__(self, - detector=None, - track_head=None, - tracker=None, - init_cfg=None): - super().__init__(init_cfg) + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + if detector is not None: - self.detector = build_detector(detector) + self.detector = MODELS.build(detector) assert hasattr(self.detector, 'roi_head'), \ 'MaskTrack R-CNN only supports two stage detectors.' if track_head is not None: - self.track_head = build_head(track_head) + self.track_head = MODELS.build(track_head) if tracker is not None: - self.tracker = build_tracker(tracker) - - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - ref_img, - ref_img_metas, - ref_gt_bboxes, - ref_gt_labels, - gt_instance_ids=None, - gt_bboxes_ignore=None, - gt_masks=None, - proposals=None, - ref_gt_instance_ids=None, - ref_gt_bboxes_ignore=None, - ref_gt_masks=None, - ref_proposals=None, - **kwargs): - """ - Args: - img (Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - - gt_labels (list[Tensor]): class indices corresponding to each box. - - ref_img (Tensor): of shape (N, C, H, W) encoding input reference - images. Typically these should be mean centered and std scaled. - - ref_img_metas (list[dict]): list of reference image info dict - where each dict has: 'img_shape', 'scale_factor', 'flip', and - may also contain 'filename', 'ori_shape', 'pad_shape', and - 'img_norm_cfg'. For details on the values of these keys see - `mmtrack/datasets/pipelines/formatting.py:VideoCollect`. - - ref_gt_bboxes (list[Tensor]): Ground truth bboxes for each - reference image with shape (num_gts, 4) in - [tl_x, tl_y, br_x, br_y] format. - - ref_gt_labels (list[Tensor]): class indices corresponding to each - box. - - gt_instance_ids (None | list[Tensor]): specify the instance id for - each ground truth bbox. 
- - gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. + self.tracker = MODELS.build(tracker) - gt_masks (None | list[Tensor]) : true segmentation masks for each - box used if the architecture supports a segmentation task. + def loss(self, inputs: Dict[str, Tensor], data_samples: SampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. - proposals (None | list[Tensor]) : override rpn proposals with - custom proposals. Use when `with_rpn` is False. - - ref_gt_instance_ids (None | list[Tensor]): specify the instance id - for each ground truth bbox of reference images. - - ref_gt_bboxes_ignore (None | list[Tensor]): specify which bounding - boxes of reference images can be ignored when computing the - loss. - - ref_gt_masks (None | list[Tensor]) : true segmentation masks for - each box of reference images used if the architecture supports - a segmentation task. - - ref_proposals (None | list[Tensor]) : override rpn proposals with - custom proposals of reference images. Use when `with_rpn` is - False. + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size.The T denotes the number of + key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. Returns: - dict[str, Tensor]: a dictionary of loss components + dict: A dictionary of loss components. """ + # modify the inputs shape to fit mmdet + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'MaskTrackRCNN can only have 1 key frame and 1 reference frame.' + img = img[:, 0] + + ref_img = inputs['ref_img'] + assert ref_img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert ref_img.size(1) == 1, \ + 'MaskTrackRCNN can only have 1 key frame and 1 reference frame.' + ref_img = ref_img[:, 0] + x = self.detector.extract_feat(img) ref_x = self.detector.extract_feat(ref_img) @@ -126,89 +85,96 @@ def forward_train(self, if self.detector.with_rpn: proposal_cfg = self.detector.train_cfg.get( 'rpn_proposal', self.detector.test_cfg.rpn) - losses_rpn, proposal_list = self.detector.rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) - losses.update(losses_rpn) + rpn_data_samples = copy.deepcopy(data_samples) + rpn_losses, rpn_results_list = self.detector.rpn_head.\ + loss_and_predict(x, + rpn_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) else: - proposal_list = proposals - - losses_detect = self.detector.roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, gt_masks, **kwargs) + # TODO: Not support currently, should have a check at Fast R-CNN + assert data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. 
+ rpn_results_list = [ + data_sample.proposals for data_sample in data_samples + ] + + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + data_samples, **kwargs) losses.update(losses_detect) - losses_track = self.track_head.forward_train( - x, ref_x, img_metas, proposal_list, gt_bboxes, ref_gt_bboxes, - gt_labels, gt_instance_ids, ref_gt_instance_ids, gt_bboxes_ignore, - **kwargs) + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + data_samples, **kwargs) losses.update(losses_track) return losses - def simple_test(self, img, img_metas, rescale=False, **kwargs): - """Test without augmentations. + def predict(self, + inputs: dict, + data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Test without augmentation. Args: - img (Tensor): of shape (1, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - rescale (bool, optional): If False, then returned bboxes and masks + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + In test mode, T = 1 and there is only ``img`` and no + ``ref_img``. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as ``gt_instances`` and 'metainfo'. + rescale (bool, Optional): If False, then returned bboxes and masks will fit the scale of img, otherwise, returned bboxes and masks - will fit the scale of original image shape. Defaults to False. + will fit the scale of original image shape. Defaults to True. Returns: - dict[str : list(ndarray)]: The tracking results. + SampleList: Tracking results of the input images. + Each TrackDataSample usually contains ``pred_track_instances``. """ - frame_id = img_metas[0].get('frame_id', -1) + img = inputs['img'] + assert img.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert img.size(1) == 1, \ + 'MaskTrackRCNN can only have 1 key frame.' + img = img[:, 0] + + assert len(data_samples) == 1, \ + 'MaskTrackRCNN only support 1 batch size per gpu for now.' + + metainfo = data_samples[0].metainfo + frame_id = metainfo.get('frame_id', -1) if frame_id == 0: self.tracker.reset() x = self.detector.extract_feat(img) - proposal_list = self.detector.rpn_head.simple_test_rpn(x, img_metas) - - det_results = self.detector.roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) + rpn_results_list = self.detector.rpn_head.predict(x, data_samples) + det_results = self.detector.roi_head.predict( + x, rpn_results_list, data_samples, rescale=rescale) assert len(det_results) == 1, 'Batch inference is not supported.' - assert len(det_results[0]) == 2, 'There are no mask results.' 
- bbox_results = det_results[0][0] - mask_results = det_results[0][1] - num_classes = len(bbox_results) - - outs_det = results2outs( - bbox_results=bbox_results, - mask_results=mask_results, - mask_shape=img_metas[0]['ori_shape'][:2]) - det_bboxes = torch.tensor(outs_det['bboxes']).to(img) - det_labels = torch.tensor(outs_det['labels']).to(img).long() - det_masks = torch.tensor(outs_det['masks']).to(img).bool() - - (track_bboxes, track_labels, track_masks, - track_ids) = self.tracker.track( - img=img, - img_metas=img_metas, - model=self, - feats=x, - bboxes=det_bboxes, - labels=det_labels, - masks=det_masks, - frame_id=frame_id, - rescale=rescale, - **kwargs) - - track_results = outs2results( - bboxes=track_bboxes, - labels=track_labels, - masks=track_masks, - ids=track_ids, - num_classes=num_classes) - return dict( - track_bboxes=track_results['bbox_results'], - track_masks=track_results['mask_results']) + assert 'masks' in det_results[0], 'There are no mask results.' + track_data_sample = data_samples[0] + track_data_sample.pred_det_instances = \ + det_results[0].clone() + + pred_track_instances = self.tracker.track( + model=self, + img=img, + feats=x, + data_sample=track_data_sample, + rescale=rescale, + **kwargs) + track_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmtrack/registry.py b/mmtrack/registry.py new file mode 100644 index 000000000..92cff40f4 --- /dev/null +++ b/mmtrack/registry.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMTracking provides 17 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
+"""
+
+from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS
+from mmengine.registry import DATASETS as MMENGINE_DATASETS
+from mmengine.registry import HOOKS as MMENGINE_HOOKS
+from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS
+from mmengine.registry import LOOPS as MMENGINE_LOOPS
+from mmengine.registry import METRICS as MMENGINE_METRICS
+from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS
+from mmengine.registry import MODELS as MMENGINE_MODELS
+from mmengine.registry import \
+    OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS
+from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS
+from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS
+from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS
+from mmengine.registry import \
+    RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS
+from mmengine.registry import RUNNERS as MMENGINE_RUNNERS
+from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS
+from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS
+from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS
+from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS
+from mmengine.registry import \
+    WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS
+from mmengine.registry import Registry
+
+# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner`
+RUNNERS = Registry('runner', parent=MMENGINE_RUNNERS)
+# manage runner constructors that define how to initialize runners
+RUNNER_CONSTRUCTORS = Registry(
+    'runner constructor', parent=MMENGINE_RUNNER_CONSTRUCTORS)
+# manage all kinds of loops like `EpochBasedTrainLoop`
+LOOPS = Registry('loop', parent=MMENGINE_LOOPS)
+# manage all kinds of hooks like `CheckpointHook`
+HOOKS = Registry('hook', parent=MMENGINE_HOOKS)
+
+# manage data-related modules
+DATASETS = Registry('dataset', parent=MMENGINE_DATASETS)
+DATA_SAMPLERS = Registry('data sampler', parent=MMENGINE_DATA_SAMPLERS)
+TRANSFORMS = Registry('transform', parent=MMENGINE_TRANSFORMS)
+
+# manage all kinds of modules inheriting `nn.Module`
+MODELS = Registry('model', parent=MMENGINE_MODELS)
+# manage all kinds of model wrappers like 'MMDistributedDataParallel'
+MODEL_WRAPPERS = Registry('model_wrapper', parent=MMENGINE_MODEL_WRAPPERS)
+# manage all kinds of weight initialization modules like `Uniform`
+WEIGHT_INITIALIZERS = Registry(
+    'weight initializer', parent=MMENGINE_WEIGHT_INITIALIZERS)
+
+# manage all kinds of optimizers like `SGD` and `Adam`
+OPTIMIZERS = Registry('optimizer', parent=MMENGINE_OPTIMIZERS)
+# manage optimizer wrapper
+OPTIM_WRAPPERS = Registry('optim_wrapper', parent=MMENGINE_OPTIM_WRAPPERS)
+# manage constructors that customize the optimization hyperparameters.
+OPTIM_WRAPPER_CONSTRUCTORS = Registry(
+    'optimizer constructor', parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS)
+# manage all kinds of parameter schedulers like `MultiStepLR`
+PARAM_SCHEDULERS = Registry(
+    'parameter scheduler', parent=MMENGINE_PARAM_SCHEDULERS)
+# manage all kinds of metrics
+METRICS = Registry('metric', parent=MMENGINE_METRICS)
+
+# manage task-specific modules like anchor generators and box coders
+TASK_UTILS = Registry('task util', parent=MMENGINE_TASK_UTILS)
+
+# manage visualizer
+VISUALIZERS = Registry('visualizer', parent=MMENGINE_VISUALIZERS)
+# manage visualizer backend
+VISBACKENDS = Registry('vis_backend', parent=MMENGINE_VISBACKENDS)
+
+# manage all kinds of log processors
+LOG_PROCESSORS = Registry('log processor', parent=MMENGINE_LOG_PROCESSORS)
diff --git a/mmtrack/structures/__init__.py b/mmtrack/structures/__init__.py
new file mode 100644
index 000000000..6874c0a68
--- /dev/null
+++ b/mmtrack/structures/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .reid_data_sample import ReIDDataSample
+from .track_data_sample import TrackDataSample
+
+__all__ = ['TrackDataSample', 'ReIDDataSample']
diff --git a/mmtrack/core/bbox/__init__.py b/mmtrack/structures/bbox/__init__.py
similarity index 54%
rename from mmtrack/core/bbox/__init__.py
rename to mmtrack/structures/bbox/__init__.py
index a3ab99d97..494d5bf11 100644
--- a/mmtrack/core/bbox/__init__.py
+++ b/mmtrack/structures/bbox/__init__.py
@@ -1,9 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .iou_calculators import calculate_region_overlap
 from .transforms import (bbox_cxcyah_to_xyxy, bbox_cxcywh_to_x1y1wh,
-                         bbox_xyxy_to_cxcyah, bbox_xyxy_to_x1y1wh, quad2bbox)
+                         bbox_rel_cxcywh_to_xywh, bbox_xywh_to_rel_cxcywh,
+                         bbox_xyxy_to_cxcyah, bbox_xyxy_to_x1y1wh,
+                         quad2bbox_cxcywh)

 __all__ = [
-    'quad2bbox', 'bbox_cxcywh_to_x1y1wh', 'bbox_xyxy_to_x1y1wh',
-    'calculate_region_overlap', 'bbox_xyxy_to_cxcyah', 'bbox_cxcyah_to_xyxy'
+    'quad2bbox_cxcywh', 'bbox_cxcywh_to_x1y1wh', 'bbox_xyxy_to_x1y1wh',
+    'calculate_region_overlap', 'bbox_xyxy_to_cxcyah', 'bbox_cxcyah_to_xyxy',
+    'bbox_xywh_to_rel_cxcywh', 'bbox_rel_cxcywh_to_xywh'
 ]
diff --git a/mmtrack/core/bbox/iou_calculators/__init__.py b/mmtrack/structures/bbox/iou_calculators/__init__.py
similarity index 100%
rename from mmtrack/core/bbox/iou_calculators/__init__.py
rename to mmtrack/structures/bbox/iou_calculators/__init__.py
diff --git a/mmtrack/core/bbox/iou_calculators/region_iou_calculator.py b/mmtrack/structures/bbox/iou_calculators/region_iou_calculator.py
similarity index 100%
rename from mmtrack/core/bbox/iou_calculators/region_iou_calculator.py
rename to mmtrack/structures/bbox/iou_calculators/region_iou_calculator.py
diff --git a/mmtrack/core/bbox/transforms.py b/mmtrack/structures/bbox/transforms.py
similarity index 53%
rename from mmtrack/core/bbox/transforms.py
rename to mmtrack/structures/bbox/transforms.py
index 4a01702af..506f56fba 100644
--- a/mmtrack/core/bbox/transforms.py
+++ b/mmtrack/structures/bbox/transforms.py
@@ -1,17 +1,20 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
 import torch
-from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh
+from mmdet.structures.bbox.transforms import bbox_xyxy_to_cxcywh
+from torch import Tensor


-def quad2bbox(quad):
+def quad2bbox_cxcywh(quad: torch.Tensor):
     """Convert quadrilateral to axis aligned box in [cx, cy, w, h] format.

     Args:
-        quad (Tensor): of shape (N, 8), (8, ), (N, 4) or (4, ).
The + quad (torch.Tensor): of shape (N, 8), (8, ), (N, 4) or (4, ). The coordinates are in [x1, y1, x2, y2, x3, y3, x4, y4] or [tl_x, tl_y, br_x, br_y] format. Returns: - Tensor: in [cx, cy, w, h] format. + torch.Tensor: in [cx, cy, w, h] format. """ if len(quad.shape) == 1: quad = quad.unsqueeze(0) @@ -38,35 +41,35 @@ def quad2bbox(quad): return bbox -def bbox_cxcywh_to_x1y1wh(bbox): +def bbox_cxcywh_to_x1y1wh(bbox: torch.Tensor) -> torch.Tensor: """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, w, h). Args: - bbox (Tensor): Shape (n, 4) or (4, ) for bboxes. + bbox (torch.Tensor): Shape (n, 4) or (4, ) for bboxes. Returns: - Tensor: Converted bboxes. + torch.Tensor: Converted bboxes. """ cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), w, h] return torch.cat(bbox_new, dim=-1) -def bbox_xyxy_to_x1y1wh(bbox): +def bbox_xyxy_to_x1y1wh(bbox: torch.Tensor) -> torch.Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to (x1, y1, w, h). Args: - bbox (Tensor): Shape (n, 4) or (4, ) for bboxes. + bbox (torch.Tensor): Shape (n, 4) or (4, ) for bboxes. Returns: - Tensor: Converted bboxes. + torch.Tensor: Converted bboxes. """ x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1) bbox_new = [x1, y1, (x2 - x1), (y2 - y1)] return torch.cat(bbox_new, dim=-1) -def bbox_xyxy_to_cxcyah(bboxes): +def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h). Args: @@ -83,7 +86,7 @@ def bbox_xyxy_to_cxcyah(bboxes): return xyah -def bbox_cxcyah_to_xyxy(bboxes): +def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor: """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2). Args: @@ -96,3 +99,52 @@ def bbox_cxcyah_to_xyxy(bboxes): w = ratio * h x1y1x2y2 = [cx - w / 2.0, cy - h / 2.0, cx + w / 2.0, cy + h / 2.0] return torch.cat(x1y1x2y2, dim=-1) + + +def bbox_xywh_to_rel_cxcywh(bboxes: Tensor, + size_norm: Optional[Tensor] = None) -> Tensor: + """Convert standard rectangular parametrization of the bounding box. + + [x, y, w, h] to relative parametrization [cx/sw, cy/sh, log(w), log(h)] + , where [cx, cy] is the center coordinate. + + Args: + bboxes (Tensor): of shape (N, 4) in [x, y, w, h] format. + size_norm (Tensor, optional): It contains values of [sw, sh] and it's + of shape (N, 2). + + Returns: + Tensor: The converted bbox. + """ + + c = bboxes[..., :2] + 0.5 * bboxes[..., 2:] + if size_norm is None: + c_rel = c / bboxes[..., 2:] + else: + c_rel = c / size_norm + + sz_rel = torch.log(bboxes[..., 2:]) + return torch.cat((c_rel, sz_rel), dim=-1) + + +def bbox_rel_cxcywh_to_xywh(bboxes: Tensor, + size_norm: Optional[Tensor] = None) -> Tensor: + """Inverts the effect of `bbox_xywh_to_rel_cxcywh`. + + Args: + bboxes (Tensor): of shape (N, 4) in [cx/sw, cy/sh, log(w), log(h)] + format. + size_norm (Tensor, optional): It contains values of [sw, sh] and it's + of shape (N, 2). + + Returns: + Tensor: The converted bbox. + """ + + sz = torch.exp(bboxes[..., 2:]) + if size_norm is None: + c = bboxes[..., :2] * sz + else: + c = bboxes[..., :2] * size_norm + tl = c - 0.5 * sz + return torch.cat((tl, sz), dim=-1) diff --git a/mmtrack/structures/reid_data_sample.py b/mmtrack/structures/reid_data_sample.py new file mode 100644 index 000000000..69958eece --- /dev/null +++ b/mmtrack/structures/reid_data_sample.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
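# A quick round-trip check (illustrative) of the two bbox helpers added above:
# converting to the relative parametrization and back with the same
# `size_norm` recovers the original [x, y, w, h] boxes.
import torch

from mmtrack.structures.bbox import (bbox_rel_cxcywh_to_xywh,
                                     bbox_xywh_to_rel_cxcywh)

boxes_xywh = torch.tensor([[10., 20., 30., 40.]])  # [x, y, w, h]
size_norm = torch.tensor([[128., 128.]])           # [sw, sh]
rel = bbox_xywh_to_rel_cxcywh(boxes_xywh, size_norm)
assert torch.allclose(bbox_rel_cxcywh_to_xywh(rel, size_norm), boxes_xywh)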
+from numbers import Number +from typing import Sequence, Union + +import mmengine +import numpy as np +import torch +from mmengine.structures import BaseDataElement, LabelData + + +def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, int], + num_classes: int = None) -> LabelData: + """Convert label of various python types to :obj:`mmengine.LabelData`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + num_classes (int, optional): The number of classes. If not None, set + it to the metainfo. Defaults to None. + + Returns: + :obj:`mmengine.LabelData`: The foramtted label data. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.utils.is_str(value): + value = torch.tensor(value) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + + metainfo = {} + if num_classes is not None: + metainfo['num_classes'] = num_classes + if value.max() >= num_classes: + raise ValueError(f'The label data ({value}) should not ' + f'exceed num_classes ({num_classes}).') + label = LabelData(label=value, metainfo=metainfo) + return label + + +class ReIDDataSample(BaseDataElement): + """A data structure interface of ReID task. + + It's used as interfaces between different components. + + Meta field: + img_shape (Tuple): The shape of the corresponding input image. + Used for visualization. + ori_shape (Tuple): The original shape of the corresponding image. + Used for visualization. + num_classes (int): The number of all categories. + Used for label format conversion. + + Data field: + gt_label (LabelData): The ground truth label. + pred_label (LabelData): The predicted label. + scores (torch.Tensor): The outputs of model. + """ + + @property + def gt_label(self): + return self._gt_label + + @gt_label.setter + def gt_label(self, value: LabelData): + self.set_field(value, '_gt_label', dtype=LabelData) + + @gt_label.deleter + def gt_label(self): + del self._gt_label + + def set_gt_label( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ReIDDataSample': + """Set label of ``gt_label``.""" + label = format_label(value, self.get('num_classes')) + if 'gt_label' in self: # setting for the second time + self.gt_label.label = label.label + else: # setting for the first time + self.gt_label = label + return self + + def set_gt_score(self, value: torch.Tensor) -> 'ReIDDataSample': + """Set score of ``gt_label``.""" + assert isinstance(value, torch.Tensor), \ + f'The value should be a torch.Tensor but got {type(value)}.' + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + if 'num_classes' in self: + assert value.size(0) == self.num_classes, \ + f"The length of value ({value.size(0)}) doesn't "\ + f'match the num_classes ({self.num_classes}).' 
+ metainfo = {'num_classes': self.num_classes} + else: + metainfo = {'num_classes': value.size(0)} + + if 'gt_label' in self: # setting for the second time + self.gt_label.score = value + else: # setting for the first time + self.gt_label = LabelData(score=value, metainfo=metainfo) + return self + + @property + def pred_feature(self): + return self._pred_feature + + @pred_feature.setter + def pred_feature(self, value: torch.Tensor): + self.set_field(value, '_pred_feature', dtype=torch.Tensor) + + @pred_feature.deleter + def pred_feature(self): + del self._pred_feature diff --git a/mmtrack/structures/track_data_sample.py b/mmtrack/structures/track_data_sample.py new file mode 100644 index 000000000..dc19299a2 --- /dev/null +++ b/mmtrack/structures/track_data_sample.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.structures import BaseDataElement, InstanceData + + +class TrackDataSample(BaseDataElement): + """A data structure interface of MMTracking. They are used as interfaces + between different components. + + The attributes in ``TrackDataSample`` are divided into several parts: + + - ``gt_instances``(InstanceData): Ground truth of instance annotations + in key frames. + - ``ignored_instances``(InstanceData): Instances to be ignored during + training/testing in key frames. + - ``proposals``(InstanceData): Region proposals used in two-stage + detectors in key frames. + - ``ref_gt_instances``(InstanceData): Ground truth of instance + annotations in reference frames. + - ``ref_ignored_instances``(InstanceData): Instances to be ignored + during training/testing in reference frames. + - ``ref_proposals``(InstanceData): Region proposals used in two-stage + detectors in reference frames. + - ``pred_det_instances``(InstanceData): Detection instances of model + predictions in key frames. + - ``pred_track_instances``(InstanceData): Tracking instances of model + predictions in key frames. + """ + + # Typically used in key frames. + @property + def gt_instances(self) -> InstanceData: + return self._gt_instances + + @gt_instances.setter + def gt_instances(self, value: InstanceData): + self.set_field(value, '_gt_instances', dtype=InstanceData) + + @gt_instances.deleter + def gt_instances(self): + del self._gt_instances + + # Typically used in key frames. + @property + def ignored_instances(self) -> InstanceData: + return self._ignored_instances + + @ignored_instances.setter + def ignored_instances(self, value: InstanceData): + self.set_field(value, '_ignored_instances', dtype=InstanceData) + + @ignored_instances.deleter + def ignored_instances(self): + del self._ignored_instances + + # Typically used in key frames. 
+ @property + def proposals(self) -> InstanceData: + return self._proposals + + @proposals.setter + def proposals(self, value: InstanceData): + self.set_field(value, '_proposals', dtype=InstanceData) + + @proposals.deleter + def proposals(self): + del self._proposals + + # Typically denotes the detection results of key frame + @property + def pred_det_instances(self) -> InstanceData: + return self._pred_det_instances + + @pred_det_instances.setter + def pred_det_instances(self, value: InstanceData): + self.set_field(value, '_pred_det_instances', dtype=InstanceData) + + @pred_det_instances.deleter + def pred_det_instances(self): + del self._pred_det_instances + + # Typically denotes the tracking results of key frame + @property + def pred_track_instances(self) -> InstanceData: + return self._pred_track_instances + + @pred_track_instances.setter + def pred_track_instances(self, value: InstanceData): + self.set_field(value, '_pred_track_instances', dtype=InstanceData) + + @pred_track_instances.deleter + def pred_track_instances(self): + del self._pred_track_instances diff --git a/mmtrack/testing/__init__.py b/mmtrack/testing/__init__.py new file mode 100644 index 000000000..cb309497d --- /dev/null +++ b/mmtrack/testing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._utils import demo_mm_inputs, get_model_cfg, random_boxes + +__all__ = ['demo_mm_inputs', 'get_model_cfg', 'random_boxes'] diff --git a/mmtrack/testing/_utils.py b/mmtrack/testing/_utils.py new file mode 100644 index 000000000..9e4d65ebf --- /dev/null +++ b/mmtrack/testing/_utils.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from os.path import dirname, exists, join + +import numpy as np +import torch +from mmdet.utils.util_random import ensure_rng +from mmengine.dataset import pseudo_collate +from mmengine.structures import InstanceData + +from mmtrack.structures import TrackDataSample + + +def random_boxes(num=1, scale=1, rng=None): + """Simple version of ``kwimage.Boxes.random`` + + Returns: + Tensor: shape (n, 4) in x1, y1, x2, y2 format. 
+ + References: + https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 # noqa: E501 + + Example: + >>> num = 3 + >>> scale = 512 + >>> rng = 0 + >>> boxes = random_boxes(num, scale, rng) + >>> print(boxes) + tensor([[280.9925, 278.9802, 308.6148, 366.1769], + [216.9113, 330.6978, 224.0446, 456.5878], + [405.3632, 196.3221, 493.3953, 270.7942]]) + """ + rng = ensure_rng(rng) + + tlbr = rng.rand(num, 4).astype(np.float32) + + tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2]) + tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3]) + br_x = np.maximum(tlbr[:, 0], tlbr[:, 2]) + br_y = np.maximum(tlbr[:, 1], tlbr[:, 3]) + + tlbr[:, 0] = tl_x * scale + tlbr[:, 1] = tl_y * scale + tlbr[:, 2] = br_x * scale + tlbr[:, 3] = br_y * scale + + boxes = torch.from_numpy(tlbr) + return boxes + + +def _get_config_directory(): + """Find the predefined video detector or tracker config directory.""" + try: + # Assume we are running in the source mmtracking repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmtrack + repo_dpath = dirname(dirname(mmtrack.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + from mmengine import Config + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_model_cfg(fname): + """Grab configs necessary to create a video detector or tracker. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _rand_bboxes(rng, num_boxes, w, h): + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes + + +def _rand_masks(rng, num_boxes, bboxes, img_w, img_h): + from mmdet.structures.mask import BitmapMasks + masks = np.zeros((num_boxes, img_h, img_w)) + for i, bbox in enumerate(bboxes): + bbox = bbox.astype(np.int32) + mask = (rng.rand(1, bbox[3] - bbox[1], bbox[2] - bbox[0]) > + 0.3).astype(np.int) + masks[i:i + 1, bbox[1]:bbox[3], bbox[0]:bbox[2]] = mask + return BitmapMasks(masks, height=img_h, width=img_w) + + +def demo_mm_inputs(batch_size=1, + frame_id=0, + num_key_imgs=1, + num_ref_imgs=1, + image_shapes=[(3, 128, 128)], + num_items=None, + num_classes=10, + ref_prefix='ref', + num_template_imgs=None, + num_search_imgs=None, + with_mask=False, + with_semantic=False): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Default to 2. + frame_id (int): the frame id. + num_key_imgs (int): the number of key images. This input is used in + all methods except for training in SOT. + num_ref_imgs (int): the number of reference images. This input is + used in all methods except for training in SOT. + image_shapes (List[tuple], Optional): image shape. + Default to (3, 128, 128) + num_items (None | List[int]): specifies the number + of boxes in each batch item. Default to None. + num_classes (int): number of different labels a + box might have. 
Default to 10. + ref_prefix (str): the prefix of reference images (or search images + in SOT). + with_mask (bool): Whether to return mask annotation. + Defaults to False. + with_semantic (bool): whether to return semantic. + Default to False. + num_template_imgs (int): the number of template images. This input is + only used in training in SOT. + num_search_imgs (int): the number of search images. This input is + only used in training in SOT. + """ + # Compatible the names of one image group in SOT. `ref_prefix` means the + # prefix of search images in SOT. + assert (num_template_imgs is None) == (num_search_imgs is None) + if num_template_imgs is not None: + num_key_imgs, num_ref_imgs = num_template_imgs, num_search_imgs + + rng = np.random.RandomState(0) + + # Make sure the length of image_shapes is equal to ``batch_size`` + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + # Make sure the length of each element in image_shapes is equal to + # the number of types of image shapes since key_img and ref_image may have + # different shapes. + # After these transforms, as for ``image_shapes``, the + # length of the outer list is equal to ``batch_size`` and the length of the + # inner list is equal to the type of image shapes. + num_img_group = int((num_key_imgs > 0) + (num_ref_imgs > 0)) + if isinstance(image_shapes[0], list): + assert len(image_shapes[0]) == num_img_group and isinstance( + image_shapes[0][0], tuple) + else: + assert isinstance(image_shapes[0], tuple) + image_shapes = [[shape] * num_img_group for shape in image_shapes] + + if isinstance(num_items, list): + assert len(num_items) == batch_size + + packed_inputs = [] + for idx in range(batch_size): + image_shape_group = image_shapes[idx] + c, h, w = image_shape_group[0] + + mm_inputs = dict(inputs=dict()) + if num_key_imgs > 0: + key_img = rng.randint( + 0, + 255, + size=(num_key_imgs, *image_shape_group[0]), + dtype=np.uint8) + mm_inputs['inputs']['img'] = torch.from_numpy(key_img) + if num_ref_imgs > 0: + index = int(num_key_imgs > 0) + ref_img = rng.randint( + 0, + 255, + size=(num_ref_imgs, *image_shape_group[index]), + dtype=np.uint8) + mm_inputs['inputs'][f'{ref_prefix}_img'] = torch.from_numpy( + ref_img) + + img_meta = { + 'img_id': idx, + 'img_shape': image_shape_group[0][-2:], + 'ori_shape': image_shape_group[0][-2:], + 'filename': '.png', + 'scale_factor': np.array([1.1, 1.2]), + 'flip': False, + 'flip_direction': None, + 'is_video_data': True, + 'frame_id': frame_id + } + if num_ref_imgs > 0: + search_img_meta = dict() + for key, value in img_meta.items(): + search_img_meta[f'{ref_prefix}_{key}'] = [ + value + ] * num_ref_imgs if num_ref_imgs > 1 else value + search_shape = image_shape_group[int(num_key_imgs > 0)][-2:] + search_img_meta[f'{ref_prefix}_img_shape'] = [ + search_shape + ] * num_ref_imgs if num_ref_imgs > 1 else search_shape + search_img_meta[f'{ref_prefix}_ori_shape'] = [ + search_shape + ] * num_ref_imgs if num_ref_imgs > 1 else search_shape + img_meta.update(search_img_meta) + + data_sample = TrackDataSample() + data_sample.set_metainfo(img_meta) + + # gt_instances + gt_instances = InstanceData() + if num_items is None: + num_boxes = rng.randint(1, 10) + else: + num_boxes = num_items[idx] + + bboxes = _rand_bboxes(rng, num_boxes, w, h) + labels = rng.randint(0, num_classes, size=num_boxes) + instances_id = rng.randint(100, num_classes + 100, size=num_boxes) + gt_instances.bboxes = torch.FloatTensor(bboxes) + 
gt_instances.labels = torch.LongTensor(labels) + gt_instances.instances_id = torch.LongTensor(instances_id) + + if with_mask: + masks = _rand_masks(rng, num_boxes, bboxes, w, h) + gt_instances.masks = masks + + data_sample.gt_instances = gt_instances + # ignore_instances + ignore_instances = InstanceData() + bboxes = _rand_bboxes(rng, num_boxes, w, h) + ignore_instances.bboxes = bboxes + data_sample.ignored_instances = ignore_instances + + if num_ref_imgs > 0: + ref_gt_instances = copy.deepcopy(gt_instances) + setattr(data_sample, f'{ref_prefix}_gt_instances', + ref_gt_instances) + ref_ignored_instances = copy.deepcopy(ignore_instances) + setattr(data_sample, f'{ref_prefix}_ignored_instances', + ref_ignored_instances) + + mm_inputs['data_samples'] = data_sample + + # TODO: gt_ignore + + packed_inputs.append(mm_inputs) + data = pseudo_collate(packed_inputs) + return data diff --git a/mmtrack/utils/__init__.py b/mmtrack/utils/__init__.py index 0e80ec135..c532e863f 100644 --- a/mmtrack/utils/__init__.py +++ b/mmtrack/utils/__init__.py @@ -1,5 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .benchmark import (DataLoaderBenchmark, DatasetBenchmark, + InferenceBenchmark) from .collect_env import collect_env -from .logger import get_root_logger +from .image import crop_image, gauss_blur, imrenormalize +from .misc import (convert_data_sample_type, format_video_level_show, + max_last2d, stack_batch) +from .mot_error_visualization import imshow_mot_errors +from .plot_sot_curve import (plot_norm_precision_curve, plot_precision_curve, + plot_success_curve) +from .setup_env import register_all_modules +from .typing import (ConfigType, ForwardResults, InstanceList, MultiConfig, + OptConfigType, OptInstanceList, OptMultiConfig, + OptSampleList, SampleList) -__all__ = ['collect_env', 'get_root_logger'] +__all__ = [ + 'collect_env', 'register_all_modules', 'DataLoaderBenchmark', + 'DatasetBenchmark', 'InferenceBenchmark', 'crop_image', 'imrenormalize', + 'stack_batch', 'ConfigType', 'ForwardResults', 'InstanceList', + 'MultiConfig', 'OptConfigType', 'OptInstanceList', 'OptMultiConfig', + 'OptSampleList', 'SampleList', 'convert_data_sample_type', + 'imshow_mot_errors', 'max_last2d', 'gauss_blur', 'format_video_level_show', + 'plot_success_curve', 'plot_norm_precision_curve', 'plot_precision_curve' +] diff --git a/mmtrack/utils/benchmark.py b/mmtrack/utils/benchmark.py new file mode 100644 index 000000000..d2d7c3de6 --- /dev/null +++ b/mmtrack/utils/benchmark.py @@ -0,0 +1,515 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
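For reference, a minimal sketch of how the packing helpers above (`demo_mm_inputs` together with `pseudo_collate`) can be driven from a unit test. This is not part of the diff, and the import path is an assumption; point it at whatever module defines the helpers.

```python
# Hypothetical import path -- use the module that defines the helpers above.
from tests.utils import demo_mm_inputs

# Two batch items, each with one key frame and one reference frame.
packed_data = demo_mm_inputs(
    batch_size=2,
    num_key_imgs=1,
    num_ref_imgs=1,
    image_shapes=[(3, 128, 128), (3, 128, 128)],
    num_items=[3, 5])

# ``pseudo_collate`` keeps per-sample entries as lists instead of stacking.
inputs = packed_data['inputs']              # dict with 'img' and 'ref_img'
data_samples = packed_data['data_samples']  # list[TrackDataSample]

assert len(data_samples) == 2
assert inputs['img'][0].shape == (1, 3, 128, 128)
assert len(data_samples[0].gt_instances.bboxes) == 3  # num_items[0]
```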
+import copy +import time +from functools import partial +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from mmcv.cnn import fuse_conv_bn +from mmengine import MMLogger +from mmengine.config import Config +from mmengine.device import get_max_cuda_memory +from mmengine.dist import get_world_size +from mmengine.runner import Runner, load_checkpoint +from mmengine.utils.dl_utils import set_multi_processing +from torch.nn.parallel import DistributedDataParallel + +from mmtrack.datasets import BaseSOTDataset +from mmtrack.registry import DATASETS, MODELS + +try: + import psutil +except ImportError: + psutil = None + + +def custom_round(value: Union[int, float], + factor: Union[int, float], + precision: int = 2) -> float: + """Custom round function.""" + return round(value / factor, precision) + + +gb_round = partial(custom_round, factor=1024**3) + + +def print_log(msg: str, logger: Optional[MMLogger] = None) -> None: + """Print a log message.""" + if logger is None: + print(msg, flush=True) + else: + logger.info(msg) + + +def print_process_memory(p: 'psutil.Process', + logger: Optional[MMLogger] = None) -> None: + """print process memory info.""" + mem_used = gb_round(psutil.virtual_memory().used) + memory_full_info = p.memory_full_info() + uss_mem = gb_round(memory_full_info.uss) + pss_mem = gb_round(memory_full_info.pss) + for children in p.children(): + child_mem_info = children.memory_full_info() + uss_mem += gb_round(child_mem_info.uss) + pss_mem += gb_round(child_mem_info.pss) + process_count = 1 + len(p.children()) + print_log( + f'(GB) mem_used: {mem_used:.2f} | uss: {uss_mem:.2f} | ' + f'pss: {pss_mem:.2f} | total_proc: {process_count}', logger) + + +class BaseBenchmark: + """The benchmark base class. + + The ``run`` method is an external calling interface, and it will + call the ``run_once`` method ``repeat_num`` times for benchmarking. + Finally, call the ``average_multiple_runs`` method to further process + the results of multiple runs. + + Args: + max_iter (int): maximum iterations of benchmark. + log_interval (int): interval of logging. + num_warmup (int): Number of Warmup. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + max_iter: int, + log_interval: int, + num_warmup: int, + logger: Optional[MMLogger] = None): + self.max_iter = max_iter + self.log_interval = log_interval + self.num_warmup = num_warmup + self.logger = logger + + def run(self, repeat_num: int = 1) -> dict: + """benchmark entry method. + + Args: + repeat_num (int): Number of repeat benchmark. + Defaults to 1. + """ + assert repeat_num >= 1 + + results = [] + for _ in range(repeat_num): + results.append(self.run_once()) + + results = self.average_multiple_runs(results) + return results + + def run_once(self) -> dict: + """Executes the benchmark once.""" + raise NotImplementedError() + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + raise NotImplementedError() + + +class InferenceBenchmark(BaseBenchmark): + """The inference benchmark class. It will be statistical inference FPS, + CUDA memory and CPU memory information. + + Args: + cfg (mmengine.Config): config. + checkpoint (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + distributed (bool): distributed testing flag. + is_fuse_conv_bn (bool): Whether to fuse conv and bn, this will + slightly increase the inference speed. + max_iter (int): maximum iterations of benchmark. 
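To make the ``run`` / ``run_once`` / ``average_multiple_runs`` contract of ``BaseBenchmark`` concrete, here is an illustrative toy subclass (not part of the diff); the class name and the timed workload are made up.

```python
import time
from typing import List

from mmtrack.utils.benchmark import BaseBenchmark


class SleepBenchmark(BaseBenchmark):
    """Toy benchmark that times a dummy workload (illustrative only)."""

    def run_once(self) -> dict:
        start = time.perf_counter()
        for _ in range(self.max_iter):
            time.sleep(0.001)  # stand-in for real work
        elapsed = time.perf_counter() - start
        return {'fps': self.max_iter / elapsed}

    def average_multiple_runs(self, results: List[dict]) -> dict:
        fps_list = [round(r['fps'], 1) for r in results]
        return {'avg_fps': sum(fps_list) / len(fps_list), 'fps_list': fps_list}


# ``run`` calls ``run_once`` three times and aggregates the per-run results.
bench = SleepBenchmark(max_iter=100, log_interval=50, num_warmup=5)
print(bench.run(repeat_num=3))
```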
Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + checkpoint: str, + distributed: bool, + is_fuse_conv_bn: bool, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert get_world_size( + ) == 1, 'Inference benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + + self._process = psutil.Process() + env_cfg = self.cfg.get('env_cfg') + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + self.model = self._init_model(checkpoint, is_fuse_conv_bn) + + # Because multiple processes will occupy additional CPU resources, + # FPS statistics will be more unstable when num_workers is not 0. + # It is reasonable to set num_workers to 0. + dataloader_cfg = cfg.test_dataloader + dataloader_cfg['num_workers'] = 0 + dataloader_cfg['batch_size'] = 1 + dataloader_cfg['persistent_workers'] = False + self.data_loader = Runner.build_dataloader(dataloader_cfg) + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def _init_model(self, checkpoint: str, is_fuse_conv_bn: bool) -> nn.Module: + """Initialize the model.""" + model = MODELS.build(self.cfg.model) + if checkpoint is not None: + load_checkpoint(model, checkpoint, map_location='cpu') + if is_fuse_conv_bn: + model = fuse_conv_bn(model) + + model = model.cuda() + + if self.distributed: + model = DistributedDataParallel( + model, + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=False) + + model.eval() + return model + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + for i, data in enumerate(self.data_loader): + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + torch.cuda.synchronize() + start_time = time.perf_counter() + + with torch.no_grad(): + self.model.test_step(data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + cuda_memory = get_max_cuda_memory() + + print_log( + f'Done image [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per image: {1000 / fps:.1f} ms/img, ' + f'cuda memory: {cuda_memory} MB', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + 
times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per image: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per image: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + print_log(f'cuda memory: {get_max_cuda_memory()} MB', self.logger) + print_process_memory(self._process, self.logger) + + return outputs + + +class DataLoaderBenchmark(BaseBenchmark): + """The dataloader benchmark class. It will be statistical inference FPS and + CPU memory information. + + Args: + cfg (mmengine.Config): config. + distributed (bool): distributed testing flag. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + distributed: bool, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataloader benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + self._process = psutil.Process() + + mp_cfg = self.cfg.get('env_cfg', {}).get('mp_cfg') + if mp_cfg is not None: + set_multi_processing(distributed=self.distributed, **mp_cfg) + else: + set_multi_processing(distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + if dataset_type == 'train': + self.data_loader = Runner.build_dataloader(cfg.train_dataloader) + elif dataset_type == 'test': + self.data_loader = Runner.build_dataloader(cfg.test_dataloader) + else: + self.data_loader = Runner.build_dataloader(cfg.val_dataloader) + + self.batch_size = self.data_loader.batch_size + self.num_workers = self.data_loader.num_workers + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + # benchmark with 2000 image and take the average + start_time = time.perf_counter() + for i, data in enumerate(self.data_loader): + elapsed = time.perf_counter() - start_time + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done batch [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} batch/s, ' + f'times per batch: {1000 / fps:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - 
self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per batch: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + f'ms/batch, batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} batch/s, ' + f'times per batch: {1000 / fps_list_[0]:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + + print_process_memory(self._process, self.logger) + + return outputs + + +class DatasetBenchmark(BaseBenchmark): + """The dataset benchmark class. It will be statistical inference FPS, FPS + pre transform and CPU memory information. + + Args: + cfg (mmengine.Config): config. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataset benchmark does not allow distributed multi-GPU' + self.cfg = copy.deepcopy(cfg) + + if dataset_type == 'train': + dataloader_cfg = copy.deepcopy(cfg.train_dataloader) + elif dataset_type == 'test': + dataloader_cfg = copy.deepcopy(cfg.test_dataloader) + else: + dataloader_cfg = copy.deepcopy(cfg.val_dataloader) + + dataset_cfg = dataloader_cfg.pop('dataset') + dataset = DATASETS.build(dataset_cfg) + if hasattr(dataset, 'full_init'): + dataset.full_init() + self.dataset = dataset + self.dataset_type = dataset_type + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + if self.dataset_type == 'test' and isinstance(self.dataset, + BaseSOTDataset): + total_index = [] + for video_ind in range(self.dataset.num_videos): + total_index.extend([ + (video_ind, frame_ind) for frame_ind in range( + self.dataset.get_len_per_video(video_ind)) + ]) + else: + total_index = list(range(len(self.dataset))) + + start_time = time.perf_counter() + for i, idx in enumerate(total_index): + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + get_data_info_start_time = time.perf_counter() + valid_idx = idx[0] if isinstance(idx, tuple) else idx + self.dataset.get_data_info(valid_idx) + get_data_info_elapsed = time.perf_counter( + ) - get_data_info_start_time + + if (i + 1) % self.log_interval == 0: + 
print_log(f'get_data_info - {get_data_info_elapsed * 1000} ms', + self.logger) + + self.dataset[idx] + elapsed = time.perf_counter() - start_time - get_data_info_elapsed + + if i >= self.num_warmup: + # print_log(f'{elapsed}', self.logger) + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done img [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per img: {1000 / fps:.1f} ms/img', self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per img: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per img: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + return outputs diff --git a/mmtrack/utils/collect_env.py b/mmtrack/utils/collect_env.py index 46dce373b..29bba3383 100644 --- a/mmtrack/utils/collect_env.py +++ b/mmtrack/utils/collect_env.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.utils import collect_env as collect_base_env -from mmcv.utils import get_git_hash +from mmengine.utils import get_git_hash import mmtrack diff --git a/mmtrack/utils/image.py b/mmtrack/utils/image.py new file mode 100644 index 000000000..bd3a0365d --- /dev/null +++ b/mmtrack/utils/image.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Union + +import cv2 +import mmcv +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + + +def crop_image(image, crop_region, crop_size, padding=(0, 0, 0)): + """Crop image based on `crop_region` and `crop_size`. + + Args: + image (ndarray): of shape (H, W, 3). + crop_region (ndarray): of shape (4, ) in [x1, y1, x2, y2] format. + crop_size (int): Crop size. + padding (tuple | ndarray): of shape (3, ) denoting the padding values. + + Returns: + ndarray: Cropped image of shape (crop_size, crop_size, 3). + """ + a = crop_size / (crop_region[2] - crop_region[0]) + b = crop_size / (crop_region[3] - crop_region[1]) + c = -a * crop_region[0] + d = -b * crop_region[1] + mapping = np.array([[a, 0, c], [0, b, d]]).astype(np.float32) + crop_image = cv2.warpAffine( + image, + mapping, (crop_size, crop_size), + borderMode=cv2.BORDER_CONSTANT, + borderValue=padding) + return crop_image + + +def imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image. + + Args: + img (Tensor | ndarray): Input image. If the input is a Tensor, the + shape is (1, C, H, W). If the input is a ndarray, the shape + is (H, W, C). + img_norm_cfg (dict): Original configuration for the normalization. + new_img_norm_cfg (dict): New configuration for the normalization. 
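The three benchmark classes above are typically driven from a full mmtrack config. A rough sketch follows; the config and checkpoint paths are placeholders, and a CUDA device plus ``psutil`` are assumed to be available.

```python
from mmengine import MMLogger
from mmengine.config import Config

from mmtrack.utils import (DataLoaderBenchmark, InferenceBenchmark,
                           register_all_modules)

register_all_modules()
cfg = Config.fromfile('path/to/config.py')        # placeholder path
logger = MMLogger.get_instance('mmtrack_benchmark')

# FPS and memory statistics of pure model inference (single process only).
infer_bench = InferenceBenchmark(
    cfg,
    checkpoint='path/to/checkpoint.pth',          # placeholder path
    distributed=False,
    is_fuse_conv_bn=False,
    max_iter=200,
    logger=logger)
print(infer_bench.run(repeat_num=1))

# Throughput of the test dataloader alone.
loader_bench = DataLoaderBenchmark(
    cfg, distributed=False, dataset_type='test', max_iter=200, logger=logger)
print(loader_bench.run(repeat_num=2))
```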
+ + Returns: + Tensor | ndarray: Output image with the same type and shape of + the input. + """ + if isinstance(img, torch.Tensor): + assert img.ndim == 4 and img.shape[0] == 1 + new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) + new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) + new_img = new_img.transpose(2, 0, 1)[None] + return torch.from_numpy(new_img).to(img) + else: + return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) + + +def _imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image.""" + img_norm_cfg = img_norm_cfg.copy() + new_img_norm_cfg = new_img_norm_cfg.copy() + for k, v in img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + img_norm_cfg[k] = np.array(v, dtype=img.dtype) + # reverse cfg + if 'bgr_to_rgb' in img_norm_cfg: + img_norm_cfg['rgb_to_bgr'] = img_norm_cfg['bgr_to_rgb'] + img_norm_cfg.pop('bgr_to_rgb') + for k, v in new_img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + new_img_norm_cfg[k] = np.array(v, dtype=img.dtype) + img = mmcv.imdenormalize(img, **img_norm_cfg) + img = mmcv.imnormalize(img, **new_img_norm_cfg) + return img + + +def gauss_blur(image: Tensor, kernel_size: Sequence, + sigma: Sequence) -> Tensor: + """The gauss blur transform. + + Args: + image (Tensor): of shape (n, c, h, w) + kernel_size (Tensor): The argument kernel size for gauss blur. + sigma (Sequence): The argument sigma for gauss blur. + + Returns: + Tensor: The blurred image. + """ + assert len(kernel_size) == len(sigma) == 2 + x_coord = [ + torch.arange(-size, size + 1, dtype=torch.float32) + for size in kernel_size + ] + filter = [ + torch.exp(-(x**2) / (2 * s**2)).to(image.device) + for x, s in zip(x_coord, sigma) + ] + filter[0] = filter[0].view(1, 1, -1, 1) / filter[0].sum() + filter[1] = filter[1].view(1, 1, 1, -1) / filter[1].sum() + + size = image.shape[2:] + img_1 = F.conv2d( + image.view(-1, 1, size[0], size[1]), + filter[0], + padding=(kernel_size[0], 0)) + + img_2 = F.conv2d( + img_1, filter[1], + padding=(0, kernel_size[1])).view(1, -1, size[0], size[1]) + + return img_2 diff --git a/mmtrack/utils/logger.py b/mmtrack/utils/logger.py deleted file mode 100644 index ee906f586..000000000 --- a/mmtrack/utils/logger.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -from mmcv.utils import get_logger - - -def get_root_logger(log_file=None, log_level=logging.INFO): - """Get root logger. - - Args: - log_file (str): File path of log. Defaults to None. - log_level (int): The level of logger. Defaults to logging.INFO. - - Returns: - :obj:`logging.Logger`: The obtained logger - """ - return get_logger('mmtrack', log_file, log_level) diff --git a/mmtrack/utils/misc.py b/mmtrack/utils/misc.py new file mode 100644 index 000000000..b43eb7764 --- /dev/null +++ b/mmtrack/utils/misc.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + +from ..structures import TrackDataSample + + +def stack_batch(tensors: List[torch.Tensor], + pad_size_divisor: int = 0, + pad_value: Union[int, float] = 0) -> torch.Tensor: + """Stack multiple tensors to form a batch and pad the images to the max + shape use the right bottom padding mode in these images. 
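A short sketch exercising the image helpers above on random data; the shapes and normalization values are arbitrary examples, not values from the codebase.

```python
import numpy as np
import torch

from mmtrack.utils import crop_image, gauss_blur, imrenormalize

# Crop a square patch around an [x1, y1, x2, y2] region and resize it to
# crop_size x crop_size.
img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
patch = crop_image(img, np.array([100, 80, 300, 240]), crop_size=255)
assert patch.shape == (255, 255, 3)

# Re-normalize an (H, W, C) float image from one normalization config to
# another (mean/std values below are arbitrary).
x = np.random.rand(64, 64, 3).astype(np.float32)
old_cfg = dict(mean=[0., 0., 0.], std=[1., 1., 1.])
new_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
y = imrenormalize(x, old_cfg, new_cfg)
assert y.shape == x.shape

# Separable Gaussian blur on an (n, c, h, w) tensor.
blurred = gauss_blur(torch.rand(1, 3, 64, 64), kernel_size=(2, 2), sigma=(1., 1.))
assert blurred.shape == (1, 3, 64, 64)
```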
If + ``pad_size_divisor > 0``, add padding to ensure the common height and width + is divisible by ``pad_size_divisor``. + + Args: + tensors (List[Tensor]): The input multiple tensors. each is a + TCHW 4D-tensor. T denotes the number of key/reference frames. + pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding + to ensure the common height and width is divisible by + ``pad_size_divisor``. This depends on the model, and many + models need a divisibility of 32. Defaults to 0 + pad_value (int, float): The padding value. Defaults to 0 + + Returns: + Tensor: The NTCHW 5D-tensor. N denotes the batch size. + """ + assert isinstance(tensors, list), \ + f'Expected input type to be list, but got {type(tensors)}' + assert len(set([tensor.ndim for tensor in tensors])) == 1, \ + f'Expected the dimensions of all tensors must be the same, ' \ + f'but got {[tensor.ndim for tensor in tensors]}' + assert tensors[0].ndim == 4, f'Expected tensor dimension to be 4, ' \ + f'but got {tensors[0].ndim}' + assert len(set([tensor.shape[0] for tensor in tensors])) == 1, \ + f'Expected the channels of all tensors must be the same, ' \ + f'but got {[tensor.shape[0] for tensor in tensors]}' + + tensor_sizes = [(tensor.shape[-2], tensor.shape[-1]) for tensor in tensors] + max_size = np.stack(tensor_sizes).max(0) + + if pad_size_divisor > 1: + # the last two dims are H,W, both subject to divisibility requirement + max_size = ( + max_size + + (pad_size_divisor - 1)) // pad_size_divisor * pad_size_divisor + + padded_samples = [] + for tensor in tensors: + padding_size = [ + 0, max_size[-1] - tensor.shape[-1], 0, + max_size[-2] - tensor.shape[-2] + ] + if sum(padding_size) == 0: + padded_samples.append(tensor) + else: + padded_samples.append(F.pad(tensor, padding_size, value=pad_value)) + + return torch.stack(padded_samples, dim=0) + + +def convert_data_sample_type( + data_sample: TrackDataSample, + num_ref_imgs: int = 1) -> Tuple[List[TrackDataSample], List[dict]]: + """Convert the type of ``data_sample`` from dict[list] to list[dict]. + + Note: This function is mainly used to be compatible with the + interface of MMDetection. It make sure that the information of + each reference image can be independently packed into + ``data_sample`` in which all the keys are without prefix "ref_". + + Args: + data_sample (TrackDataSample): Data sample input. + num_ref_imgs (int, optional): The numbe of reference images in the + ``data_sample``. Defaults to 1. + + Returns: + Tuple[List[TrackDataSample], List[dict]]: The first element is the + list of object of TrackDataSample. The second element is the + list of meta information of reference images. + """ + ref_data_samples, ref_metainfos = [], [] + for _ in range(num_ref_imgs): + ref_data_samples.append(deepcopy(data_sample)) + ref_metainfos.append(deepcopy(data_sample.metainfo)) + + for key, value in data_sample.metainfo.items(): + if key.startswith('ref_'): + new_key = key[4:] + if num_ref_imgs == 1: + value = [value] + assert len(value) == num_ref_imgs + for i, v in enumerate(value): + ref_metainfos[i][new_key] = v + ref_data_samples[i].set_metainfo(dict(new_key=v)) + # pop the redundant original reference key. + ref_metainfos[i].pop(key) + ref_data_samples[i].pop(key) + + return ref_data_samples, ref_metainfos + + +def max_last2d(input: Tensor) -> Tuple[Tensor, Tensor]: + """Computes the value and position of maximum in the last two dimensions. + + Args: + input (Tensor): of shape (..., H, W) + + Returns: + max_val (Tensor): The maximum value. 
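A small sketch of the padding behaviour of ``stack_batch`` on two clips with different spatial sizes (shapes chosen only for illustration).

```python
import torch

from mmtrack.utils import stack_batch

# Two samples in TCHW layout (one frame each) with different H and W.
a = torch.rand(1, 3, 120, 160)
b = torch.rand(1, 3, 128, 150)

# Right/bottom padding up to the per-batch max shape, then rounded up so
# that H and W are divisible by 32.
batch = stack_batch([a, b], pad_size_divisor=32, pad_value=0)
assert batch.shape == (2, 1, 3, 128, 160)
```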
+ argmax (Tensor): The position of maximum in [row, col] format. + """ + + max_val_row, argmax_row = torch.max(input, dim=-2) + max_val, argmax_col = torch.max(max_val_row, dim=-1) + argmax_row = argmax_row.view(argmax_col.numel(), + -1)[torch.arange(argmax_col.numel()), + argmax_col.view(-1)] + argmax_row = argmax_row.reshape(argmax_col.shape) + argmax = torch.cat((argmax_row.unsqueeze(-1), argmax_col.unsqueeze(-1)), + -1) + return max_val, argmax + + +def format_video_level_show( + video_names: List, + eval_results: List[np.ndarray], + sort_by_first_metric: bool = True, + show_indices: Optional[Tuple[int, List]] = None) -> List[List]: + """Format video-level performance show. + + Args: + video_names (List): The names of the videos. + eval_results (List[np.ndarray]): The evaluation results. + sort_by_first_metric (bool, optional): Whether to sort the results by + the first metric. Defaults to True. + show_indices (Optional[Tuple[int, List]], optional): The video indices + to be shown. Defaults to None, i.e., all videos. + + Returns: + List[List]: The formatted video-level evaluation results. For example: + [[`video-2`, 48.2, 49.2, 51.9], + [`video-1`, 46.2, 48.2, 50.2]] + """ + all_video_names_str = np.array(video_names, dtype=str) + eval_show_results = eval_results + + if sort_by_first_metric: + # sort from largest to smallest + sorted_index = np.argsort(-eval_results[0]) + all_video_names_str = all_video_names_str[sorted_index] + sorted_eval_results = [] + for eval_res in eval_results: + sorted_eval_results.append(eval_res[sorted_index]) + eval_show_results = np.stack(sorted_eval_results).T + + if show_indices is not None: + if isinstance(show_indices, int): + if show_indices < 0: + show_indices = np.arange(show_indices, 0) + else: + show_indices = np.arange(show_indices) + elif isinstance(show_indices, Sequence): + show_indices = np.array(show_indices, dtype=np.int64) + else: + raise NotImplementedError( + f'{type(show_indices)} is not supported. ' + 'Please use type of int or list') + eval_show_results = eval_show_results[show_indices, :] + + eval_show_results = eval_show_results.tolist() + for res_line, video_name in zip(eval_show_results, all_video_names_str): + res_line.insert(0, video_name) + + return eval_show_results diff --git a/mmtrack/core/utils/visualization.py b/mmtrack/utils/mot_error_visualization.py similarity index 51% rename from mmtrack/core/utils/visualization.py rename to mmtrack/utils/mot_error_visualization.py index 87a39bb92..eb23bccea 100644 --- a/mmtrack/core/utils/visualization.py +++ b/mmtrack/utils/mot_error_visualization.py @@ -1,232 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os.path as osp -import random +from typing import Union import cv2 -import matplotlib import matplotlib.pyplot as plt import mmcv import numpy as np import seaborn as sns from matplotlib.patches import Rectangle -from mmcv.utils import mkdir_or_exist +from mmengine.utils import mkdir_or_exist -def random_color(seed): - """Random a color according to the input seed.""" - random.seed(seed) - colors = sns.color_palette() - color = random.choice(colors) - return color - - -def imshow_tracks(*args, backend='cv2', **kwargs): - """Show the tracks on the input image.""" - if backend == 'cv2': - return _cv2_show_tracks(*args, **kwargs) - elif backend == 'plt': - return _plt_show_tracks(*args, **kwargs) - else: - raise NotImplementedError() - - -def _cv2_show_tracks(img, - bboxes, - labels, - ids, - masks=None, - classes=None, - score_thr=0.0, - thickness=2, - font_scale=0.4, - show=False, - wait_time=0, - out_file=None): - """Show the tracks with opencv.""" - assert bboxes.ndim == 2 - assert labels.ndim == 1 - assert ids.ndim == 1 - assert bboxes.shape[0] == labels.shape[0] - assert bboxes.shape[1] == 5 - if isinstance(img, str): - img = mmcv.imread(img) - - img_shape = img.shape - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - - inds = np.where(bboxes[:, -1] > score_thr)[0] - bboxes = bboxes[inds] - labels = labels[inds] - ids = ids[inds] - if masks is not None: - assert masks.ndim == 3 - masks = masks[inds] - assert masks.shape[0] == bboxes.shape[0] - - text_width, text_height = 9, 13 - for i, (bbox, label, id) in enumerate(zip(bboxes, labels, ids)): - x1, y1, x2, y2 = bbox[:4].astype(np.int32) - score = float(bbox[-1]) - - # bbox - bbox_color = random_color(id) - bbox_color = [int(255 * _c) for _c in bbox_color][::-1] - cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness=thickness) - - # score - text = '{:.02f}'.format(score) - if classes is not None: - text += f'|{classes[label]}' - width = len(text) * text_width - img[y1:y1 + text_height, x1:x1 + width, :] = bbox_color - cv2.putText( - img, - text, (x1, y1 + text_height - 2), - cv2.FONT_HERSHEY_COMPLEX, - font_scale, - color=(0, 0, 0)) - - # id - text = str(id) - width = len(text) * text_width - img[y1 + text_height:y1 + 2 * text_height, - x1:x1 + width, :] = bbox_color - cv2.putText( - img, - str(id), (x1, y1 + 2 * text_height - 2), - cv2.FONT_HERSHEY_COMPLEX, - font_scale, - color=(0, 0, 0)) - - # mask - if masks is not None: - mask = masks[i].astype(bool) - mask_color = np.array(bbox_color, dtype=np.uint8).reshape(1, -1) - img[mask] = img[mask] * 0.5 + mask_color * 0.5 - - if show: - mmcv.imshow(img, wait_time=wait_time) - if out_file is not None: - mmcv.imwrite(img, out_file) - - return img - - -def _plt_show_tracks(img, - bboxes, - labels, - ids, - masks=None, - classes=None, - score_thr=0.0, - thickness=0.1, - font_scale=5, - show=False, - wait_time=0, - out_file=None): - """Show the tracks with matplotlib.""" - assert bboxes.ndim == 2 - assert labels.ndim == 1 - assert ids.ndim == 1 - assert bboxes.shape[0] == ids.shape[0] - assert bboxes.shape[1] == 5 - - if isinstance(img, str): - img = plt.imread(img) - else: - img = mmcv.bgr2rgb(img) - - img_shape = img.shape - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - - inds = np.where(bboxes[:, -1] > score_thr)[0] - bboxes = bboxes[inds] - labels = labels[inds] - ids = ids[inds] - if masks is not None: - assert masks.ndim == 
3 - masks = masks[inds] - assert masks.shape[0] == bboxes.shape[0] - - if not show: - matplotlib.use('Agg') - - plt.imshow(img) - plt.gca().set_axis_off() - plt.autoscale(False) - plt.subplots_adjust( - top=1, bottom=0, right=1, left=0, hspace=None, wspace=None) - plt.margins(0, 0) - plt.gca().xaxis.set_major_locator(plt.NullLocator()) - plt.gca().yaxis.set_major_locator(plt.NullLocator()) - plt.rcParams['figure.figsize'] = img_shape[1], img_shape[0] - - text_width, text_height = 12, 16 - for i, (bbox, label, id) in enumerate(zip(bboxes, labels, ids)): - x1, y1, x2, y2 = bbox[:4].astype(np.int32) - score = float(bbox[-1]) - w, h = int(x2 - x1), int(y2 - y1) - - # bbox - bbox_color = random_color(id) - plt.gca().add_patch( - Rectangle((x1, y1), - w, - h, - thickness, - edgecolor=bbox_color, - facecolor='none')) - - # score - text = '{:.02f}'.format(score) - if classes is not None: - text += f'|{classes[label]}' - width = len(text) * text_width - plt.gca().add_patch( - Rectangle((x1, y1), - width, - text_height, - thickness, - edgecolor=bbox_color, - facecolor=bbox_color)) - plt.text(x1, y1 + text_height, text, fontsize=5) - - # id - text = str(id) - width = len(text) * text_width - plt.gca().add_patch( - Rectangle((x1, y1 + text_height + 1), - width, - text_height, - thickness, - edgecolor=bbox_color, - facecolor=bbox_color)) - plt.text(x1, y1 + 2 * text_height + 2, text, fontsize=5) - - # mask - if masks is not None: - mask = masks[i].astype(bool) - bbox_color = [int(255 * _c) for _c in bbox_color] - mask_color = np.array(bbox_color, dtype=np.uint8).reshape(1, -1) - img[mask] = img[mask] * 0.5 + mask_color * 0.5 - # In order to show the mask. - plt.imshow(img) - - if out_file is not None: - plt.savefig(out_file, dpi=300, bbox_inches='tight', pad_inches=0.0) - - if show: - plt.draw() - plt.pause(wait_time / 1000.) - else: - plt.show() - plt.clf() - return img - - -def imshow_mot_errors(*args, backend='cv2', **kwargs): +def imshow_mot_errors(*args, backend: str = 'cv2', **kwargs): """Show the wrong tracks on the input image. Args: @@ -241,17 +26,17 @@ def imshow_mot_errors(*args, backend='cv2', **kwargs): raise NotImplementedError() -def _cv2_show_wrong_tracks(img, - bboxes, - ids, - error_types, - thickness=2, - font_scale=0.4, - text_width=10, - text_height=15, - show=False, - wait_time=100, - out_file=None): +def _cv2_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: int = 2, + font_scale: float = 0.4, + text_width: int = 10, + text_height: int = 15, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: """Show the wrong tracks with opencv. Args: @@ -347,17 +132,17 @@ def _cv2_show_wrong_tracks(img, return img -def _plt_show_wrong_tracks(img, - bboxes, - ids, - error_types, - thickness=0.1, - font_scale=3, - text_width=8, - text_height=13, - show=False, - wait_time=100, - out_file=None): +def _plt_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: float = 0.1, + font_scale: float = 3.0, + text_width: int = 8, + text_height: int = 13, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: """Show the wrong tracks with matplotlib. Args: @@ -369,7 +154,7 @@ def _plt_show_wrong_tracks(img, thickness (float, optional): Thickness of lines. Defaults to 0.1. font_scale (float, optional): Font scale to draw id and score. - Defaults to 3. + Defaults to 3.0. 
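A sketch of calling ``imshow_mot_errors`` on dummy data. The box layout ((k, 5) arrays in [x1, y1, x2, y2, score] format) and the meaning of the error types (0/1/2 for false positive / false negative / ID switch) are assumptions based on the MOT error analysis tooling, not guarantees from this diff.

```python
import numpy as np

from mmtrack.utils import imshow_mot_errors

img = np.full((480, 640, 3), 255, dtype=np.uint8)
# Assumed layout: one row per wrong track, [x1, y1, x2, y2, score].
bboxes = np.array([[100., 80., 200., 240., 0.9],
                   [300., 60., 380., 220., 0.8]])
ids = np.array([1, 2])
# Assumed encoding: 0 = false positive, 1 = false negative, 2 = ID switch.
error_types = np.array([0, 2])

imshow_mot_errors(
    img,
    bboxes,
    ids,
    error_types,
    backend='cv2',
    show=False,
    out_file='mot_errors.jpg')  # output path is illustrative
```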
text_width (int, optional): Width to draw id and score. Defaults to 8. text_height (int, optional): Height to draw id and score. diff --git a/mmtrack/utils/plot_sot_curve.py b/mmtrack/utils/plot_sot_curve.py new file mode 100644 index 000000000..f5f6bb621 --- /dev/null +++ b/mmtrack/utils/plot_sot_curve.py @@ -0,0 +1,217 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# The code is modified from https://github.com/visionml/pytracking/blob/master/pytracking/analysis/plot_results.py # noqa: E501 + +from typing import List, Optional + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +from mmengine.utils import mkdir_or_exist + +PALETTE = [(1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0), (1.0, 0.0, 1.0), + (0.0, 1.0, 1.0), (0.5, 0.5, 0.5), + (136.0 / 255.0, 0.0, 21.0 / 255.0), + (1.0, 127.0 / 255.0, 39.0 / 255.0), + (0.0, 162.0 / 255.0, 232.0 / 255.0), + (0.0, 0.5, 0.0), (1.0, 0.5, 0.2), (0.1, 0.4, 0.0), (0.6, 0.3, 0.9), + (0.4, 0.7, 0.1), (0.2, 0.1, 0.7), (0.7, 0.6, 0.2), + (1.0, 102.0 / 255.0, 102.0 / 255.0), + (153.0 / 255.0, 1.0, 153.0 / 255.0), + (102.0 / 255.0, 102.0 / 255.0, 1.0), + (1.0, 192.0 / 255.0, 203.0 / 255.0)] +LINE_STYLE = ['-'] * len(PALETTE) + + +def plot_sot_curve(y: np.ndarray, + x: np.ndarray, + scores: np.ndarray, + tracker_names: List, + plot_opts: dict, + plot_save_path: Optional[str] = None, + show: bool = False): + """Plot curves for SOT. + + Args: + y (np.ndarray): The content along the Y axis. It has shape (N, M), + where N is the number of trackers and M is the number of values + corresponding to the X. + x (np.ndarray): The content along the X axis. It has shape (M). + scores (np.ndarray): The content of viualized indicators. + tracker_names (List): The names of trackers. + plot_opts (dict): The options for plot. + plot_save_path (Optional[str], optional): The saved path of the figure. + Defaults to None. + show (bool, optional): Whether to show. Defaults to False. 
+ """ + x, scores = x.squeeze(), scores.squeeze() + assert y.ndim == 2 and x.ndim == 1 and scores.ndim == 1 + + # Plot settings + font_size = plot_opts.get('font_size', 12) + font_size_axis = plot_opts.get('font_size_axis', 13) + line_width = plot_opts.get('line_width', 2) + font_size_legend = plot_opts.get('font_size_legend', 13) + + plot_type = plot_opts['plot_type'] + legend_loc = plot_opts['legend_loc'] + + xlabel = plot_opts['xlabel'] + ylabel = plot_opts['ylabel'] + xlim = plot_opts['xlim'] + ylim = plot_opts['ylim'] + + title = plot_opts['title'] + + matplotlib.rcParams.update({'font.size': font_size}) + matplotlib.rcParams.update({'axes.titlesize': font_size_axis}) + matplotlib.rcParams.update({'axes.titleweight': 'black'}) + matplotlib.rcParams.update({'axes.labelsize': font_size_axis}) + + # Plot curves + fig, ax = plt.subplots() + + index_sort = np.argsort(scores) + plotted_lines = [] + legend_text = [] + + for id, id_sort in enumerate(index_sort): + line = ax.plot( + x.tolist(), + y[id_sort, :].tolist(), + linewidth=line_width, + color=PALETTE[len(index_sort) - id - 1], + linestyle=LINE_STYLE[len(index_sort) - id - 1]) + + plotted_lines.append(line[0]) + legend_text.append('{} [{:.1f}]'.format(tracker_names[id_sort], + scores[id_sort])) + + ax.legend( + plotted_lines[::-1], + legend_text[::-1], + loc=legend_loc, + fancybox=False, + edgecolor='black', + fontsize=font_size_legend, + framealpha=1.0) + ax.set(xlabel=xlabel, ylabel=ylabel, xlim=xlim, ylim=ylim, title=title) + ax.grid(True, linestyle='-.') + fig.tight_layout() + + if plot_save_path is not None: + mkdir_or_exist(plot_save_path) + fig.savefig( + '{}/{}_plot.pdf'.format(plot_save_path, plot_type), + dpi=300, + format='pdf', + transparent=True) + plt.draw() + if show: + plt.show() + + +def plot_success_curve(success: np.ndarray, + tracker_names: List, + plot_opts: Optional[dict] = None, + plot_save_path: Optional[str] = None, + show: bool = False): + """Plot curves of Success for SOT. + + Args: + success (np.ndarray): The content of viualized indicators. It has shape + (N, M), where N is the number of trackers and M is the number of + ``Success`` corresponding to the X. + tracker_names (List): The names of trackers. + plot_opts (Optional[dict], optional): The options for plot. + Defaults to None. + plot_save_path (Optional[str], optional): The saved path of the figure. + Defaults to None. + show (bool, optional): Whether to show. Defaults to False. + """ + assert len(tracker_names) == len(success) + success_plot_opts = { + 'plot_type': 'success', + 'legend_loc': 'lower left', + 'xlabel': 'Overlap threshold', + 'ylabel': 'Overlap Precision [%]', + 'xlim': (0, 1.0), + 'ylim': (0, 100), + 'title': 'Success plot' + } + if plot_opts is not None: + success_plot_opts.update(success_plot_opts) + success_scores = np.mean(success, axis=1) + + plot_sot_curve(success, np.arange(0, 1.05, 0.05), success_scores, + tracker_names, success_plot_opts, plot_save_path, show) + + +def plot_norm_precision_curve(norm_precision: np.ndarray, + tracker_names: List, + plot_opts: Optional[dict] = None, + plot_save_path: Optional[str] = None, + show: bool = False): + """Plot curves of Norm Precision for SOT. + + Args: + norm_precision (np.ndarray): The content of viualized indicators. It + has shape (N, M), where N is the number of trackers and M is the + number of ``Norm Precision`` corresponding to the X. + tracker_names (List): The names of trackers. + plot_opts (Optional[dict], optional): The options for plot. + Defaults to None. 
+ plot_save_path (Optional[str], optional): The saved path of the figure. + Defaults to None. + show (bool, optional): Whether to show. Defaults to False. + """ + assert len(tracker_names) == len(norm_precision) + norm_precision_plot_opts = { + 'plot_type': 'norm_precision', + 'legend_loc': 'lower right', + 'xlabel': 'Location error threshold', + 'ylabel': 'Distance Precision [%]', + 'xlim': (0, 0.5), + 'ylim': (0, 100), + 'title': 'Normalized Precision plot' + } + if plot_opts is not None: + norm_precision_plot_opts.update(norm_precision_plot_opts) + + plot_sot_curve(norm_precision, np.arange(0, 0.51, 0.01), + norm_precision[:, 20], tracker_names, + norm_precision_plot_opts, plot_save_path, show) + + +def plot_precision_curve(precision: np.ndarray, + tracker_names: List, + plot_opts: Optional[dict] = None, + plot_save_path: Optional[str] = None, + show: bool = False): + """Plot curves of Precision for SOT. + + Args: + precision (np.ndarray): The content of viualized indicators. It has + shape (N, M), where N is the number of trackers and M is the + number of ``Precision`` corresponding to the X. + tracker_names (List): The names of trackers. + plot_opts (Optional[dict], optional): The options for plot. + Defaults to None. + plot_save_path (Optional[str], optional): The saved path of the figure. + Defaults to None. + show (bool, optional): Whether to show. Defaults to False. + """ + assert len(tracker_names) == len(precision) + precision_plot_opts = { + 'plot_type': 'precision', + 'legend_loc': 'lower right', + 'xlabel': 'Location error threshold [pixels]', + 'ylabel': 'Distance Precision [%]', + 'xlim': (0, 50), + 'ylim': (0, 100), + 'title': 'Precision plot' + } + if plot_opts is not None: + precision_plot_opts.update(plot_opts) + + plot_sot_curve(precision, np.arange(0, 51, 1), precision[:, 20], + tracker_names, precision_plot_opts, plot_save_path, show) diff --git a/mmtrack/utils/setup_env.py b/mmtrack/utils/setup_env.py new file mode 100644 index 000000000..a1509ad67 --- /dev/null +++ b/mmtrack/utils/setup_env.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmdet.utils import register_all_modules as register_all_mmdet_modules +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True) -> None: + """Register all modules in mmtrack into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmtrack default scope. + When `init_default_scope=True`, the global default scope will be + set to `mmtrack`, and all registries will build modules from mmtrack's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. 
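A sketch of the SOT plotting helpers above using random stand-in scores for two trackers; success is expected at 21 overlap thresholds and precision at 51 pixel thresholds, matching the x-axes used by the wrappers.

```python
import numpy as np

from mmtrack.utils import plot_precision_curve, plot_success_curve

rng = np.random.RandomState(0)
# (num_trackers, num_thresholds): 21 overlap thresholds for success,
# 51 pixel thresholds for precision. Values are random placeholders.
success = rng.uniform(0, 100, size=(2, 21))
precision = rng.uniform(0, 100, size=(2, 51))
tracker_names = ['tracker_a', 'tracker_b']

plot_success_curve(success, tracker_names, plot_save_path='./plots')
plot_precision_curve(precision, tracker_names, plot_save_path='./plots')
```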
+ """ # noqa + import mmtrack.datasets # noqa: F401,F403 + import mmtrack.datasets.samplers # noqa: F401,F403 + import mmtrack.datasets.transforms # noqa: F401,F403 + import mmtrack.engine # noqa: F401,F403 + import mmtrack.evaluation # noqa: F401,F403 + import mmtrack.models # noqa: F401,F403 + import mmtrack.visualization # noqa: F401,F403 + + register_all_mmdet_modules(init_default_scope=False) + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmtrack') + if never_created: + DefaultScope.get_instance('mmtrack', scope_name='mmtrack') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmtrack': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmtrack", ' + '`register_all_modules` will force the current' + 'default scope to be "mmtrack". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmtrack-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmtrack') diff --git a/mmtrack/utils/typing.py b/mmtrack/utils/typing.py new file mode 100644 index 000000000..62136698a --- /dev/null +++ b/mmtrack/utils/typing.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Collecting some commonly used type hint in mmdetection.""" +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData + +from ..structures import TrackDataSample + +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] + +SampleList = List[TrackDataSample] +OptSampleList = Optional[SampleList] + +ForwardResults = Union[Dict[str, torch.Tensor], List[TrackDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/mmtrack/version.py b/mmtrack/version.py index e090d9f31..840dca669 100644 --- a/mmtrack/version.py +++ b/mmtrack/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. -__version__ = '0.13.0' +__version__ = '1.0.0rc1' def parse_version_info(version_str): diff --git a/mmtrack/visualization/__init__.py b/mmtrack/visualization/__init__.py new file mode 100644 index 000000000..a83ef77b4 --- /dev/null +++ b/mmtrack/visualization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .local_visualizer import DetLocalVisualizer, TrackLocalVisualizer + +__all__ = ['TrackLocalVisualizer', 'DetLocalVisualizer'] diff --git a/mmtrack/visualization/local_visualizer.py b/mmtrack/visualization/local_visualizer.py new file mode 100644 index 000000000..7a89d80d3 --- /dev/null +++ b/mmtrack/visualization/local_visualizer.py @@ -0,0 +1,235 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
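For reference, a typical call sequence for ``register_all_modules`` before building any mmtrack component from a config; the commented-out build call is purely illustrative.

```python
from mmengine import DefaultScope

from mmtrack.registry import MODELS  # noqa: F401
from mmtrack.utils import register_all_modules

# Register mmtrack (and mmdet) modules and make 'mmtrack' the default scope,
# so config types without an explicit scope prefix resolve correctly.
register_all_modules(init_default_scope=True)
assert DefaultScope.get_current_instance().scope_name == 'mmtrack'

# Afterwards, registered components can be built from plain config dicts,
# e.g. (type name and arguments are illustrative):
# model = MODELS.build(dict(type='SomeRegisteredTracker', ...))
```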
+from typing import Dict, Optional, Union + +import mmcv +import numpy as np +import seaborn as sns +from mmdet.structures.mask import bitmap_to_polygon +from mmdet.visualization import DetLocalVisualizer as MMDET_DetLocalVisualizer +from mmdet.visualization.palette import _get_adaptive_scales +from mmengine.dist import master_only +from mmengine.structures import InstanceData +from mmengine.visualization import Visualizer + +from mmtrack.registry import VISUALIZERS +from mmtrack.structures import TrackDataSample + + +def random_color(seed): + """Random a color according to the input seed.""" + np.random.seed(seed) + colors = sns.color_palette() + color = colors[np.random.choice(range(len(colors)))] + color = tuple([int(255 * c) for c in color]) + return color + + +@VISUALIZERS.register_module() +class TrackLocalVisualizer(Visualizer): + """MMTracking Local Visualizer for the MOT, VIS, SOT, VOS tasks. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + line_width (int, float): The linewidth of lines. + Defaults to 3. + alpha (int, float): The transparency of bboxes or mask. + Defaults to 0.8. + """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + line_width: Union[int, float] = 3, + alpha: float = 0.8): + super().__init__(name, image, vis_backends, save_dir) + self.line_width = line_width + self.alpha = alpha + # Set default value. When calling + # `TrackLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + self.dataset_meta = {} + + def _draw_instances(self, image: np.ndarray, + instances: InstanceData) -> np.ndarray: + """Draw instances of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + + Returns: + np.ndarray: the drawn image which channel is RGB. 
+ """ + self.set_image(image) + classes = self.dataset_meta.get('CLASSES', None) + + # get colors and texts + if hasattr(instances, 'instances_id'): + # for the MOT and VIS tasks + colors = [random_color(_id) for _id in instances.instances_id] + categories = [ + classes[label] if classes is not None else f'cls{label}' + for label in instances.labels + ] + if 'scores' in instances: + texts = [ + f'{category_name}\n{instance_id} | {score:.2f}' + for category_name, instance_id, score in zip( + categories, instances.instances_id, instances.scores) + ] + else: + texts = [ + f'{category_name}\n{instance_id}' + for category_name, instance_id in zip( + categories, instances.instances_id) + ] + else: + # for the SOT and VOS tasks + num_instances = max( + len(instances.get('bboxes', [])), + len(instances.get('masks', []))) + colors = [random_color(_) for _ in range(num_instances)] + if 'scores' in instances: + texts = [f'{score:.2f}' for score in instances.scores] + else: + texts = None + + # draw bboxes and texts + if 'bboxes' in instances: + # draw bboxes + bboxes = instances.bboxes.clone() + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width) + # draw texts + if texts is not None: + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas.cpu().numpy()) + for i, pos in enumerate(positions): + self.draw_texts( + texts[i], + pos, + colors='black', + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': [c / 255 for c in colors[i]], + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + # draw masks + if 'masks' in instances: + masks = instances.masks + polygons = [] + for i, mask in enumerate(masks): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) + + return self.get_image() + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: Optional['TrackDataSample'] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, + # TODO: Supported in mmengine's Viusalizer. + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. t is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`TrackDataSample`, optional): A data + sample that contain annotations and predictions. + Defaults to None. + draw_gt (bool): Whether to draw GT TrackDataSample. + Default to True. + draw_pred (bool): Whether to draw Prediction DTrackDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (int): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. 
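To illustrate the visualizer interface above, a rough sketch that draws a fake single-frame prediction. The box values, class names, and output path are placeholders, and ``pred_track_instances`` is assumed to be settable on ``TrackDataSample`` as elsewhere in the codebase.

```python
import numpy as np
import torch
from mmengine.structures import InstanceData

from mmtrack.structures import TrackDataSample
from mmtrack.visualization import TrackLocalVisualizer

# Fake MOT-style predictions for a single frame.
pred_track_instances = InstanceData()
pred_track_instances.bboxes = torch.tensor([[50., 40., 160., 220.],
                                            [200., 60., 320., 240.]])
pred_track_instances.labels = torch.tensor([0, 0])
pred_track_instances.instances_id = torch.tensor([1, 2])
pred_track_instances.scores = torch.tensor([0.9, 0.7])

data_sample = TrackDataSample()
data_sample.pred_track_instances = pred_track_instances

visualizer = TrackLocalVisualizer(name='demo_visualizer')
visualizer.dataset_meta = dict(CLASSES=('pedestrian', ))

image = np.full((480, 640, 3), 255, dtype=np.uint8)  # RGB frame
visualizer.add_datasample(
    'demo_frame',
    image,
    data_sample=data_sample,
    draw_gt=False,              # only predictions are attached here
    pred_score_thr=0.3,
    out_file='vis_frame.jpg')   # output path is illustrative
```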
Defaults to 0. + """ + + gt_img_data = None + pred_img_data = None + + if data_sample is not None: + data_sample = data_sample.cpu() + + if draw_gt and data_sample is not None: + assert 'gt_instances' in data_sample + gt_img_data = self._draw_instances(image, data_sample.gt_instances) + + if draw_pred and data_sample is not None: + assert 'pred_track_instances' in data_sample + pred_instances = data_sample.pred_track_instances + if 'scores' in pred_instances: + pred_instances = pred_instances[ + pred_instances.scores > pred_score_thr].cpu() + pred_img_data = self._draw_instances(image, pred_instances) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + else: + drawn_img = pred_img_data + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + else: + self.add_image(name, drawn_img, step) + + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + + +@VISUALIZERS.register_module() +class DetLocalVisualizer(MMDET_DetLocalVisualizer): + """MMTracking Local Visualizer for the VID task.""" + + def add_datasample(self, *args, **kwargs): + """Draw datasample and save to all backends.""" + if 'data_sample' in kwargs: + # assign `pred_det_instances` to `pred_instances` + # to hack the interface of `super().add_datasample()`. + if 'pred_det_instances' in kwargs['data_sample']: + kwargs['data_sample'].pred_instances = \ + kwargs['data_sample'].pred_det_instances + super().add_datasample(*args, **kwargs) diff --git a/model-index.yml b/model-index.yml index d3b21e679..bc31f785d 100644 --- a/model-index.yml +++ b/model-index.yml @@ -3,10 +3,13 @@ Import: - configs/mot/tracktor/metafile.yml - configs/mot/qdtrack/metafile.yml - configs/mot/bytetrack/metafile.yml + - configs/mot/strongsort/metafile.yml - configs/sot/siamese_rpn/metafile.yml + - configs/sot/prdimp/metafile.yml - configs/sot/stark/metafile.yml - configs/vid/dff/metafile.yml - configs/vid/fgfa/metafile.yml - configs/vid/selsa/metafile.yml - configs/vid/temporal_roi_align/metafile.yml - configs/vis/masktrack_rcnn/metafile.yml + - configs/vis/mask2former/metafile.yml diff --git a/mot.mp4 b/mot.mp4 new file mode 100644 index 000000000..885ac0fae Binary files /dev/null and b/mot.mp4 differ diff --git a/requirements.txt b/requirements.txt index 5f50cbdc0..04c77efa6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -r requirements/build.txt -r requirements/runtime.txt --r requirements/tests.txt +-r requirements/tests.txt \ No newline at end of file diff --git a/requirements/build.txt b/requirements/build.txt index 66e007a98..7933e570e 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,3 +1,2 @@ cython -numba==0.53.0 numpy diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index aacc47297..0a9d68e7b 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,3 +1,4 @@ -mmcls>=0.16.0 -mmcv-full>=1.3.17,<1.6.0 -mmdet>=2.19.1,<3.0.0 +mmcls>=1.0.0rc0 +mmcv>=2.0.0rc1 +mmdet>=3.0.0rc0 +mmengine>=0.1.0 diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt index de6632cf0..f8a23eb1f 100644 --- a/requirements/readthedocs.txt +++ b/requirements/readthedocs.txt @@ -1,5 +1,6 @@ -mmcls -mmcv -mmdet +mmcls>=1.0.0rc0 +mmcv>=2.0.0rc1 +mmdet>=3.0.0rc0 +mmengine>=0.1.0 torch torchvision diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 67823dfbd..0e2445f7e 100644 --- 
a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,13 +1,14 @@ -attributee==0.1.5 -dotty_dict +attributee lap matplotlib -mmcls>=0.16.0 +mmcls>=1.0.0rc0 motmetrics packaging pandas<=1.3.5 -pycocotools<=2.0.2 +pycocotools +scikit-learn scipy<=1.7.3 seaborn +tabulate terminaltables tqdm diff --git a/requirements/tests.txt b/requirements/tests.txt index 974d41591..46911183a 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -5,6 +5,7 @@ interrogate isort==4.3.21 # Note: used for kwarray.group_items, this may be ported to mmcv in the future. kwarray +parameterized pytest ubelt xdoctest>=0.10.0 diff --git a/setup.cfg b/setup.cfg index 44e1e1d16..9db0e6639 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ line_length = 79 multi_line_output = 0 extra_standard_library = setuptools known_first_party = mmtrack -known_third_party = PIL,addict,cv2,dotty_dict,lap,matplotlib,mmcls,mmcv,mmdet,motmetrics,numpy,packaging,pandas,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,script_utils,seaborn,tao,terminaltables,torch,tqdm +known_third_party = PIL,addict,cv2,lap,matplotlib,mmcls,mmcv,mmdet,motmetrics,numpy,packaging,pandas,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,script_utils,seaborn,tao,terminaltables,torch,tqdm no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY @@ -13,5 +13,5 @@ BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true [codespell] -ignore-words-list = mot +ignore-words-list = mot,gool skip = *.json diff --git a/tests/data/demo_MOT15_data/train/TUD-Campus/gt/gt.txt b/tests/data/demo_MOT15_data/train/TUD-Campus/gt/gt.txt deleted file mode 100644 index 182d09a16..000000000 --- a/tests/data/demo_MOT15_data/train/TUD-Campus/gt/gt.txt +++ /dev/null @@ -1,359 +0,0 @@ -1,1,399,182,121,229,1,-1,-1,-1 -1,2,282,201,92,184,1,-1,-1,-1 -1,3,63,153,82,288,1,-1,-1,-1 -1,4,192,206,62,137,1,-1,-1,-1 -1,5,125,209,74,157,1,-1,-1,-1 -1,6,162,208,55,145,1,-1,-1,-1 -2,1,399,181,139,235,1,-1,-1,-1 -2,2,269,202,87,182,1,-1,-1,-1 -2,3,71,151,100,284,1,-1,-1,-1 -2,4,200,206,55,137,1,-1,-1,-1 -2,5,127,210,77,157,1,-1,-1,-1 -2,6,157,206,71,143,1,-1,-1,-1 -3,1,419,182,106,227,1,-1,-1,-1 -3,2,271,196,76,190,1,-1,-1,-1 -3,3,70,155,111,286,1,-1,-1,-1 -3,4,209,204,47,139,1,-1,-1,-1 -3,5,136,206,64,160,1,-1,-1,-1 -3,6,162,204,71,142,1,-1,-1,-1 -4,1,428,181,111,237,1,-1,-1,-1 -4,2,262,196,76,185,1,-1,-1,-1 -4,3,80,160,106,280,1,-1,-1,-1 -4,4,218,206,41,138,1,-1,-1,-1 -4,5,131,208,75,154,1,-1,-1,-1 -4,6,164,212,73,131,1,-1,-1,-1 -5,1,439,179,95,238,1,-1,-1,-1 -5,2,264,197,66,187,1,-1,-1,-1 -5,3,84,165,104,267,1,-1,-1,-1 -5,4,227,208,39,139,1,-1,-1,-1 -5,5,136,208,74.364,153.95,1,-1,-1,-1 -5,6,180,211,53,139,1,-1,-1,-1 -6,1,454,179,87,238,1,-1,-1,-1 -6,2,251,194,68,187,1,-1,-1,-1 -6,3,89,164,115,270,1,-1,-1,-1 -6,4,228,208,47,136,1,-1,-1,-1 -6,5,141,209,73.727,153.91,1,-1,-1,-1 -6,6,183,214,53,136,1,-1,-1,-1 -7,1,453,177,81,239,1,-1,-1,-1 -7,2,245,190,69,196,1,-1,-1,-1 -7,3,103,165,110,272,1,-1,-1,-1 -7,4,234,208,48,135.5,1,-1,-1,-1 -7,5,146,209,73.091,153.86,1,-1,-1,-1 -7,6,184,211,56,145,1,-1,-1,-1 -8,1,471,178,76,241,1,-1,-1,-1 -8,2,236,188,70,197,1,-1,-1,-1 -8,3,117,165,101,276,1,-1,-1,-1 -8,4,239,208,49,135,1,-1,-1,-1 -8,5,151,209,72.454,153.82,1,-1,-1,-1 -8,6,190,211,55,144,1,-1,-1,-1 -9,1,464,173,101,244,1,-1,-1,-1 -9,2,232,190,74,195,1,-1,-1,-1 -9,3,125,158,113,283,1,-1,-1,-1 -9,4,245,209,50,134.5,1,-1,-1,-1 -9,5,156,209,71.818,153.77,1,-1,-1,-1 -9,6,193,214,56,138,1,-1,-1,-1 -10,1,479,168,81,251,1,-1,-1,-1 
-10,2,224,190,78,194,1,-1,-1,-1 -10,3,134,159,97,283,1,-1,-1,-1 -10,4,251,209,51,134,1,-1,-1,-1 -10,5,161,210,71.182,153.73,1,-1,-1,-1 -11,1,483,169,88,250,1,-1,-1,-1 -11,2,220,194,77,191,1,-1,-1,-1 -11,3,139,154,92,286,1,-1,-1,-1 -11,4,256,209,52,133.5,1,-1,-1,-1 -11,5,166,210,70.546,153.68,1,-1,-1,-1 -12,1,497,167,99,249,1,-1,-1,-1 -12,2,210,195,70,185,1,-1,-1,-1 -12,3,139,155,100,285,1,-1,-1,-1 -12,4,262,209,53,133,1,-1,-1,-1 -12,5,171,210,69.909,153.64,1,-1,-1,-1 -13,1,502,172,100,246,1,-1,-1,-1 -13,2,210,195,73,185,1,-1,-1,-1 -13,3,160,151,90,289,1,-1,-1,-1 -13,4,264,209,55,129,1,-1,-1,-1 -13,5,176,210,69.273,153.59,1,-1,-1,-1 -14,1,499,172,108,241,1,-1,-1,-1 -14,2,203,194,72.2,187.8,1,-1,-1,-1 -14,3,163,149,88,293,1,-1,-1,-1 -14,4,261,208,58,132,1,-1,-1,-1 -14,5,181,211,68.636,153.55,1,-1,-1,-1 -15,1,506,178,109,239,1,-1,-1,-1 -15,2,196,194,71.4,190.6,1,-1,-1,-1 -15,3,179,149,91,291,1,-1,-1,-1 -15,4,268,206,65,149,1,-1,-1,-1 -15,5,186,211,68,153.5,1,-1,-1,-1 -16,1,514,177,117,239,1,-1,-1,-1 -16,2,190,193,70.6,193.4,1,-1,-1,-1 -16,3,182,150,92,292,1,-1,-1,-1 -16,4,279,205,50,139,1,-1,-1,-1 -16,5,191,211,67.364,153.45,1,-1,-1,-1 -17,1,520,176,114,247,1,-1,-1,-1 -17,2,183,193,69.8,196.2,1,-1,-1,-1 -17,3,200,148,93,296,1,-1,-1,-1 -17,4,287,201,44,148,1,-1,-1,-1 -17,5,196,212,66.727,153.41,1,-1,-1,-1 -18,1,522,165,111,263,1,-1,-1,-1 -18,2,176,192,69,199,1,-1,-1,-1 -18,3,196,149,111,299,1,-1,-1,-1 -18,4,293,208,49,139,1,-1,-1,-1 -18,5,201,212,66.091,153.36,1,-1,-1,-1 -19,1,534,168,103,253,1,-1,-1,-1 -19,2,174,185,68,199,1,-1,-1,-1 -19,3,206,157,104,287,1,-1,-1,-1 -19,4,296,213,57,132,1,-1,-1,-1 -19,5,206,212,65.454,153.32,1,-1,-1,-1 -20,1,547,182,88,240,1,-1,-1,-1 -20,2,165,187,62,199,1,-1,-1,-1 -20,3,204,159,118,285,1,-1,-1,-1 -20,4,296,205,60,137,1,-1,-1,-1 -20,5,211,212,64.818,153.27,1,-1,-1,-1 -21,1,565,176,74,247,1,-1,-1,-1 -21,2,159,194,60,191,1,-1,-1,-1 -21,3,215,162,122,282,1,-1,-1,-1 -21,4,301,209,57,135,1,-1,-1,-1 -21,5,216,213,64.182,153.23,1,-1,-1,-1 -22,1,575,170,87,255,1,-1,-1,-1 -22,2,150,188,68,200,1,-1,-1,-1 -22,3,222,163,108,286,1,-1,-1,-1 -22,4,307,208,61,140,1,-1,-1,-1 -22,5,221,213,63.545,153.18,1,-1,-1,-1 -23,1,582,168,81,262,1,-1,-1,-1 -23,2,139,186,69,199,1,-1,-1,-1 -23,3,219,164,119,282,1,-1,-1,-1 -23,4,307,205,68,144,1,-1,-1,-1 -23,5,226,213,62.909,153.14,1,-1,-1,-1 -24,1,585,165,94,269,1,-1,-1,-1 -24,2,131,188,75,200,1,-1,-1,-1 -24,3,243,162,120,289,1,-1,-1,-1 -24,4,310,205,71,142,1,-1,-1,-1 -24,5,231,213,62.273,153.09,1,-1,-1,-1 -24,7,-28,183,76,235,1,-1,-1,-1 -25,2,121,191,87,189,1,-1,-1,-1 -25,3,254,165,97,281,1,-1,-1,-1 -25,4,321,211,55,133,1,-1,-1,-1 -25,5,236,214,61.636,153.05,1,-1,-1,-1 -25,7,-15,179,63,240,1,-1,-1,-1 -26,2,113,190,79,195,1,-1,-1,-1 -26,3,259,155,97,294,1,-1,-1,-1 -26,4,322,208,58,136,1,-1,-1,-1 -26,5,241,214,61,153,1,-1,-1,-1 -26,7,-20,180,83,235,1,-1,-1,-1 -27,2,109,194,88,192,1,-1,-1,-1 -27,3,274,158,88,296,1,-1,-1,-1 -27,4,328,208,57.739,136.26,1,-1,-1,-1 -27,5,242,222,74,150,1,-1,-1,-1 -27,7,-30,182,91,233,1,-1,-1,-1 -28,2,99,196,96,193,1,-1,-1,-1 -28,3,285,153,90,295,1,-1,-1,-1 -28,4,333,208,57.478,136.52,1,-1,-1,-1 -28,5,257,224,55,147,1,-1,-1,-1 -28,7,-21,177,82,236,1,-1,-1,-1 -29,2,88,194,93,188,1,-1,-1,-1 -29,3,287,162,106,283,1,-1,-1,-1 -29,4,339,208,57.217,136.78,1,-1,-1,-1 -29,5,261,218,60,154,1,-1,-1,-1 -29,7,-10,173,74,239.5,1,-1,-1,-1 -30,2,88,188,84,193,1,-1,-1,-1 -30,3,307,157,93,292,1,-1,-1,-1 -30,4,344,209,56.956,137.04,1,-1,-1,-1 -30,5,259,215,65,153,1,-1,-1,-1 -30,7,2,168,66,243,1,-1,-1,-1 
-31,2,90,201,84,184,1,-1,-1,-1 -31,3,319,162,95,286,1,-1,-1,-1 -31,4,350,209,56.696,137.3,1,-1,-1,-1 -31,5,264,208,55,162,1,-1,-1,-1 -31,7,3,176,82,235,1,-1,-1,-1 -32,2,84,181,81,205,1,-1,-1,-1 -32,3,319,157,112,287,1,-1,-1,-1 -32,4,355,209,56.435,137.57,1,-1,-1,-1 -32,5,270,215,61,159,1,-1,-1,-1 -32,7,1,174,93,240,1,-1,-1,-1 -33,2,72,188,85,200,1,-1,-1,-1 -33,3,322,162,115,283,1,-1,-1,-1 -33,4,361,209,56.174,137.83,1,-1,-1,-1 -33,5,277,214,52,158,1,-1,-1,-1 -33,7,8,187,100,224,1,-1,-1,-1 -34,2,70,181,74,196,1,-1,-1,-1 -34,3,324,162,122,289,1,-1,-1,-1 -34,4,367,209,55.913,138.09,1,-1,-1,-1 -34,5,278,215,57,155,1,-1,-1,-1 -34,7,22,181,96,227,1,-1,-1,-1 -35,2,62,182,75.2,197,1,-1,-1,-1 -35,3,338,154,121,295,1,-1,-1,-1 -35,4,372,209,55.652,138.35,1,-1,-1,-1 -35,5,288,219,54,151,1,-1,-1,-1 -35,7,33,182,90,235,1,-1,-1,-1 -36,2,54,184,76.4,198,1,-1,-1,-1 -36,3,352,160,111,291,1,-1,-1,-1 -36,4,378,209,55.391,138.61,1,-1,-1,-1 -36,5,299,218,49,153,1,-1,-1,-1 -36,7,31,188,103,221,1,-1,-1,-1 -37,2,47,185,77.6,199,1,-1,-1,-1 -37,3,361,167,106,282,1,-1,-1,-1 -37,4,383,209,55.13,138.87,1,-1,-1,-1 -37,5,301,222,55,153,1,-1,-1,-1 -37,7,43,178,93,238,1,-1,-1,-1 -38,2,39,187,78.8,200,1,-1,-1,-1 -38,3,371,167,107,286,1,-1,-1,-1 -38,4,389,210,54.87,139.13,1,-1,-1,-1 -38,5,303,217,60,157,1,-1,-1,-1 -38,7,49,182,99,230,1,-1,-1,-1 -39,2,31,188,80,201,1,-1,-1,-1 -39,3,385,169,91,281,1,-1,-1,-1 -39,4,394,210,54.609,139.39,1,-1,-1,-1 -39,5,305,213,61,155,1,-1,-1,-1 -39,7,59,178,81,234,1,-1,-1,-1 -40,2,21,179,86,212,1,-1,-1,-1 -40,3,386,155,108,300,1,-1,-1,-1 -40,4,400,210,54.348,139.65,1,-1,-1,-1 -40,5,307,217,64,153,1,-1,-1,-1 -40,7,59,174,92,237,1,-1,-1,-1 -41,2,12,183,83,208,1,-1,-1,-1 -41,3,400,158,96,296,1,-1,-1,-1 -41,4,405,210,54.087,139.91,1,-1,-1,-1 -41,5,314,219,63,155,1,-1,-1,-1 -41,7,71,176,79,235,1,-1,-1,-1 -42,2,10,181,85,207,1,-1,-1,-1 -42,3,404,157,107,300,1,-1,-1,-1 -42,4,411,210,53.826,140.17,1,-1,-1,-1 -42,5,319,220,62,156,1,-1,-1,-1 -42,7,77,177,91,239,1,-1,-1,-1 -43,2,2,186,84,202,1,-1,-1,-1 -43,3,426,159,97,296,1,-1,-1,-1 -43,4,417,210,53.565,140.43,1,-1,-1,-1 -43,5,321,214,65,157,1,-1,-1,-1 -43,7,97,172,80,249,1,-1,-1,-1 -44,2,-3,186,79,209,1,-1,-1,-1 -44,3,430,154,98,303,1,-1,-1,-1 -44,4,422,210,53.304,140.7,1,-1,-1,-1 -44,5,325,214,66,158,1,-1,-1,-1 -44,7,102,171,86,246,1,-1,-1,-1 -45,2,-8,186,74,216,1,-1,-1,-1 -45,3,436,153,110,307,1,-1,-1,-1 -45,4,428,210,53.044,140.96,1,-1,-1,-1 -45,5,329,217,66,153,1,-1,-1,-1 -45,7,97,174,103,240,1,-1,-1,-1 -46,2,-14,186,65,220,1,-1,-1,-1 -46,3,439,158,126,304,1,-1,-1,-1 -46,4,433,211,52.783,141.22,1,-1,-1,-1 -46,5,340,219,54,148,1,-1,-1,-1 -46,7,108,178,114,238,1,-1,-1,-1 -47,2,-24,182,69,221,1,-1,-1,-1 -47,3,449,164,119,294,1,-1,-1,-1 -47,4,439,211,52.522,141.48,1,-1,-1,-1 -47,5,334,211,64,159,1,-1,-1,-1 -47,7,121,177,101,231,1,-1,-1,-1 -47,8,312,204,63,155,1,-1,-1,-1 -48,2,-28,176,76,227,1,-1,-1,-1 -48,3,460,162,125,295,1,-1,-1,-1 -48,4,444,211,52.261,141.74,1,-1,-1,-1 -48,5,342,215,58,150,1,-1,-1,-1 -48,7,127,185,100,231,1,-1,-1,-1 -48,8,318,198,57,161,1,-1,-1,-1 -49,3,478,164,102,291,1,-1,-1,-1 -49,4,450,211,52,142,1,-1,-1,-1 -49,5,345,215,60,157,1,-1,-1,-1 -49,7,132,182,88,236,1,-1,-1,-1 -49,8,312,193,82,171,1,-1,-1,-1 -50,3,481,157,108,300,1,-1,-1,-1 -50,4,450,209,56,142,1,-1,-1,-1 -50,5,356,214,52,154,1,-1,-1,-1 -50,7,140,183,94,235,1,-1,-1,-1 -50,8,328,199,65,162,1,-1,-1,-1 -51,3,494,159,100,303,1,-1,-1,-1 -51,4,455,206,56,143,1,-1,-1,-1 -51,5,352,209,64,165,1,-1,-1,-1 -51,7,154,181,89,238,1,-1,-1,-1 -51,8,328,199,69,165,1,-1,-1,-1 
-52,3,497,158,104,301,1,-1,-1,-1 -52,4,460,209,53,139,1,-1,-1,-1 -52,5,358,215,65,150,1,-1,-1,-1 -52,7,165,181,81,238,1,-1,-1,-1 -52,8,330,199,67,166,1,-1,-1,-1 -53,3,505,150,103,307,1,-1,-1,-1 -53,4,467,213,60,140,1,-1,-1,-1 -53,5,365,214,65,158,1,-1,-1,-1 -53,7,173,178,79,241,1,-1,-1,-1 -53,8,333,195,73,169,1,-1,-1,-1 -54,3,508,148,100,310,1,-1,-1,-1 -54,4,473,217,61,131,1,-1,-1,-1 -54,5,365,214,70,158,1,-1,-1,-1 -54,7,185,176,81,249,1,-1,-1,-1 -54,8,342,199,67,166,1,-1,-1,-1 -55,3,514,151,105,306,1,-1,-1,-1 -55,4,482,215,63,130,1,-1,-1,-1 -55,5,367,217,75,151,1,-1,-1,-1 -55,7,196,177,75,242,1,-1,-1,-1 -55,8,349,196,63,162,1,-1,-1,-1 -56,3,529,151,99,308,1,-1,-1,-1 -56,4,482,209,64,144,1,-1,-1,-1 -56,5,379,219,62,151,1,-1,-1,-1 -56,7,201,174,88,240,1,-1,-1,-1 -56,8,352,193,63,173,1,-1,-1,-1 -57,3,538,150,102,310,1,-1,-1,-1 -57,4,486,214,61,131,1,-1,-1,-1 -57,5,382,223,59,151,1,-1,-1,-1 -57,7,208,173,94,249,1,-1,-1,-1 -57,8,366,193,53,172,1,-1,-1,-1 -58,3,539,157,101,305,1,-1,-1,-1 -58,4,491,211,65,140,1,-1,-1,-1 -58,5,386,218,56,152,1,-1,-1,-1 -58,7,213,182,102,239,1,-1,-1,-1 -58,8,366,196,50,175,1,-1,-1,-1 -59,3,553,152,113,304,1,-1,-1,-1 -59,4,499,210,61,142,1,-1,-1,-1 -59,5,393,218,53,152,1,-1,-1,-1 -59,7,223,182,103,237,1,-1,-1,-1 -59,8,371,198,53,166,1,-1,-1,-1 -60,3,553,157,123,300,1,-1,-1,-1 -60,4,506,215,59,134,1,-1,-1,-1 -60,5,397,215,57,152,1,-1,-1,-1 -60,7,231,187,107,236,1,-1,-1,-1 -60,8,372,199,58,163,1,-1,-1,-1 -61,3,565,160,113,308,1,-1,-1,-1 -61,4,517,217,51,131,1,-1,-1,-1 -61,5,402,213,57,154,1,-1,-1,-1 -61,7,236,187,106,235,1,-1,-1,-1 -61,8,372,199,59,169,1,-1,-1,-1 -62,3,572,167,110,295,1,-1,-1,-1 -62,4,514,213,59,138,1,-1,-1,-1 -62,5,411,211,57,160,1,-1,-1,-1 -62,7,251,185,101,242,1,-1,-1,-1 -62,8,375,201,77,161,1,-1,-1,-1 -63,3,575,173,97,290,1,-1,-1,-1 -63,4,518,205,60,144,1,-1,-1,-1 -63,5,409,214,67,160,1,-1,-1,-1 -63,7,260,183,103,244,1,-1,-1,-1 -63,8,377,199,69,169,1,-1,-1,-1 -64,4,528,210,56,142,1,-1,-1,-1 -64,5,418,215,56,152,1,-1,-1,-1 -64,7,273,183,85,240,1,-1,-1,-1 -64,8,378,204,69,165,1,-1,-1,-1 -65,4,534,214,58,139,1,-1,-1,-1 -65,5,423,214,68,158,1,-1,-1,-1 -65,7,280,179,90,242,1,-1,-1,-1 -65,8,383,204,73,161,1,-1,-1,-1 -66,4,537,206,62,143,1,-1,-1,-1 -66,5,421,217,73,154,1,-1,-1,-1 -66,7,297,179,84,246,1,-1,-1,-1 -66,8,383,204,73,162,1,-1,-1,-1 -67,4,542,209,74,147,1,-1,-1,-1 -67,5,434,214,58,158,1,-1,-1,-1 -67,7,306,179,80,239,1,-1,-1,-1 -67,8,391,196,66,175,1,-1,-1,-1 -68,4,547,211,61,140,1,-1,-1,-1 -68,5,434,220,67,151,1,-1,-1,-1 -68,7,315,173,90,253,1,-1,-1,-1 -68,8,403,201,63,167,1,-1,-1,-1 -69,4,557,215,62,138,1,-1,-1,-1 -69,5,441,217,61,155,1,-1,-1,-1 -69,7,325,178,91,250,1,-1,-1,-1 -69,8,399,201,70,177,1,-1,-1,-1 -70,4,561,210,59,142,1,-1,-1,-1 -70,5,446,217,64,155,1,-1,-1,-1 -70,7,326,182,100,235,1,-1,-1,-1 -70,8,403,199,65,169,1,-1,-1,-1 -71,4,561,220,63,133,1,-1,-1,-1 -71,5,449,219,61,153,1,-1,-1,-1 -71,7,335,183,107,243,1,-1,-1,-1 -71,8,416,204,58,164,1,-1,-1,-1 diff --git a/tests/data/demo_MOT15_data/train/TUD-Campus/seqinfo.ini b/tests/data/demo_MOT15_data/train/TUD-Campus/seqinfo.ini deleted file mode 100644 index 2becdbf43..000000000 --- a/tests/data/demo_MOT15_data/train/TUD-Campus/seqinfo.ini +++ /dev/null @@ -1,3 +0,0 @@ -[Sequence] -name=TUD-Campus -seqLength=71 diff --git a/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/gt/gt.txt b/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/gt/gt.txt deleted file mode 100644 index c3bd759ab..000000000 --- a/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/gt/gt.txt +++ /dev/null @@ -1,1156 +0,0 
@@ -1,1,88,99,61.08,218.56,1,4.4852,5.5016,0 -1,2,181,95,75.808,227.01,1,4.4091,4.4283,0 -1,3,184,96,35.446,154.5,1,12.621,10.628,0 -1,4,357,82,72.924,246.18,1,4.3869,2.7804,0 -1,5,458,89,64.796,236.59,1,4.7801,2.1475,0 -1,6,513,111,36.501,122.15,1,16.592,8.209,0 -1,7,576,84,61.632,195.56,1,9.2619,3.381,0 -2,1,84,99,61.08,218.56,1,4.4717,5.5332,0 -2,2,184,95,75.63,226.62,1,4.4935,4.4599,0 -2,3,184,96,35.532,154.5,1,12.621,10.628,0 -2,4,360,82,72.831,245.79,1,4.4652,2.802,0 -2,5,460,89,64.89,236.19,1,4.7833,2.1314,0 -2,6,512,111,36.48,122.15,1,16.59,8.2237,0 -2,7,574,84,61.596,195.37,1,9.2594,3.4022,0 -3,1,80,100,61.08,218.56,1,4.3797,5.5027,0 -3,2,188,95,75.412,226.14,1,4.5047,4.4288,0 -3,3,184,96,35.636,154.51,1,12.621,10.628,0 -3,4,362,82,72.718,245.31,1,4.4692,2.7865,0 -3,5,462,89,65.005,235.71,1,4.861,2.1575,0 -3,6,511,111,36.453,122.14,1,16.589,8.2383,0 -3,7,572,84,61.552,195.14,1,9.257,3.4233,0 -4,1,75,100,61.08,218.56,1,4.3626,5.542,0 -4,2,192,95,75.173,225.62,1,4.5926,4.4526,0 -4,3,184,96,35.751,154.52,1,12.621,10.628,0 -4,4,365,82,72.593,244.78,1,4.5481,2.8082,0 -4,5,465,89,65.131,235.18,1,4.8657,2.1333,0 -4,6,510,111,36.425,122.13,1,16.588,8.2529,0 -4,7,569,84,61.504,194.89,1,9.3727,3.5155,0 -5,1,69,101,61.08,218.56,1,4.2642,5.527,0 -5,2,196,95,74.925,225.07,1,4.6036,4.4212,0 -5,3,184,96,35.87,154.53,1,12.621,10.628,0 -5,4,368,82,72.464,244.24,1,4.554,2.7847,0 -5,5,468,89,65.262,234.63,1,4.9457,2.1514,0 -5,6,508,111,36.395,122.13,1,16.585,8.2822,0 -5,7,566,84,61.455,194.63,1,9.3689,3.5473,0 -6,1,64,101,61.08,218.56,1,4.2469,5.5661,0 -6,2,200,95,74.675,224.53,1,4.6921,4.445,0 -6,3,184,96,35.99,154.53,1,12.621,10.628,0 -6,4,371,82,72.332,243.69,1,4.6335,2.8064,0 -6,5,471,89,65.394,234.07,1,4.9504,2.127,0 -6,6,507,111,36.365,122.12,1,16.584,8.2968,0 -6,7,564,84,61.404,194.37,1,9.3664,3.5685,0 -6,8,616,108,23.994,126.99,1,16.481,6.6959,0 -7,1,59,101,61.08,218.56,1,4.2294,5.6051,0 -7,2,204,95,74.425,223.98,1,4.7812,4.4689,0 -7,3,184,96,36.11,154.54,1,12.624,10.615,0 -7,4,374,82,72.2,243.13,1,4.6393,2.7828,0 -7,5,473,90,65.525,233.52,1,4.9535,2.1108,0 -7,6,506,112,36.335,122.11,1,16.365,8.1919,0 -7,7,561,84,61.352,194.1,1,9.3627,3.6003,0 -7,8,615,108,25.355,127.03,1,16.267,6.5908,0 -8,1,53,102,61.08,218.56,1,4.1314,5.5897,0 -8,2,208,95,74.175,223.43,1,4.7921,4.4372,0 -8,3,184,96,36.23,154.55,1,12.624,10.615,0 -8,4,377,82,72.068,242.59,1,4.7194,2.8046,0 -8,5,476,90,65.655,232.97,1,5.034,2.1287,0 -8,6,504,112,36.305,122.11,1,16.362,8.2209,0 -8,7,558,85,61.3,193.84,1,9.3589,3.6321,0 -8,8,613,108,27.016,127.08,1,16.266,6.6053,0 -9,1,48,102,61.08,218.56,1,4.1136,5.6285,0 -9,2,212,95,73.925,222.88,1,4.8791,4.469,0 -9,3,183,96,36.35,154.56,1,12.621,10.628,0 -9,4,380,82,71.936,242.03,1,4.7232,2.7887,0 -9,5,479,90,65.786,232.41,1,5.0386,2.1042,0 -9,6,503,112,36.275,122.1,1,16.361,8.2354,0 -9,7,555,85,61.248,193.58,1,9.355,3.6639,0 -9,8,611,108,28.839,127.13,1,16.265,6.6197,0 -10,1,43,103,61.08,218.56,1,4.0197,5.6051,0 -10,2,216,95,73.675,222.33,1,4.8899,4.4371,0 -10,3,183,96,36.47,154.57,1,12.621,10.628,0 -10,4,383,82,71.805,241.49,1,4.804,2.8105,0 -10,5,482,90,65.918,231.85,1,5.1198,2.1221,0 -10,6,502,112,36.245,122.1,1,16.36,8.25,0 -10,7,553,85,61.196,193.31,1,9.3524,3.685,0 -10,8,609,108,30.725,127.19,1,16.264,6.6341,0 -11,1,37,103,61.08,218.56,1,3.998,5.6514,0 -11,2,220,95,73.425,221.78,1,4.9803,4.461,0 -11,3,183,96,36.59,154.57,1,12.621,10.628,0 -11,4,386,82,71.675,240.94,1,4.8855,2.8325,0 -11,5,485,90,66.05,231.3,1,5.1258,2.0892,0 -11,6,500,112,36.216,122.09,1,16.357,8.279,0 
-11,7,550,85,61.145,193.05,1,9.3486,3.7168,0 -11,8,607,107,32.63,127.24,1,16.477,6.7542,0 -12,1,32,103,61.238,218.55,1,3.9798,5.69,0 -12,2,224,95,73.176,221.23,1,4.9909,4.4288,0 -12,3,183,96,36.71,154.58,1,12.621,10.628,0 -12,4,389,82,71.545,240.38,1,4.8911,2.8085,0 -12,5,487,90,66.182,230.75,1,5.2062,2.1153,0 -12,6,499,112,36.188,122.08,1,16.356,8.2934,0 -12,7,547,85,61.095,192.79,1,9.4654,3.8112,0 -12,8,606,107,34.535,127.3,1,16.477,6.7542,0 -13,1,26,104,61.782,218.53,1,3.8826,5.6739,0 -13,2,228,95,72.928,220.68,1,5.082,4.4526,0 -13,3,183,96,36.83,154.59,1,12.621,10.628,0 -13,4,393,82,71.414,239.84,1,4.9751,2.8224,0 -13,5,490,90,66.314,230.19,1,5.2107,2.0905,0 -13,6,498,112,36.16,122.08,1,16.355,8.308,0 -13,7,545,85,61.045,192.53,1,9.4628,3.8325,0 -13,8,604,107,36.421,127.35,1,16.476,6.7687,0 -14,1,20,104,63.155,218.48,1,3.8641,5.7122,0 -14,2,232,95,72.68,220.13,1,5.0925,4.4202,0 -14,3,183,96,36.95,154.6,1,12.621,10.628,0 -14,4,396,82,71.282,239.28,1,4.9806,2.7982,0 -14,5,493,90,66.445,229.63,1,5.2932,2.1084,0 -14,6,496,112,36.132,122.07,1,16.352,8.337,0 -14,7,542,85,60.995,192.26,1,9.4588,3.8644,0 -14,8,602,107,38.244,127.4,1,16.475,6.7833,0 -15,1,14,105,65.7,218.39,1,3.7712,5.6883,0 -15,2,237,95,72.432,219.59,1,5.1868,4.4359,0 -15,3,183,96,37.07,154.61,1,12.621,10.628,0 -15,4,399,82,71.15,238.74,1,5.0634,2.8202,0 -15,5,496,90,66.575,229.08,1,5.2976,2.0834,0 -15,6,495,112,36.104,122.07,1,16.351,8.3515,0 -15,7,539,85,60.945,192,1,9.4548,3.8963,0 -15,8,600,107,39.905,127.45,1,16.473,6.8124,0 -16,1,7,105,69.165,218.26,1,3.7524,5.7263,0 -16,2,241,95,72.184,219.04,1,5.1971,4.4033,0 -16,3,183,96,37.19,154.61,1,12.621,10.628,0 -16,4,402,82,71.018,238.19,1,5.0689,2.7958,0 -16,5,499,90,66.706,228.53,1,5.3808,2.1013,0 -16,6,493,112,36.076,122.06,1,16.348,8.3805,0 -16,7,536,85,60.895,191.74,1,9.5729,3.9922,0 -16,8,598,107,41.266,127.49,1,16.472,6.8269,0 -17,1,0,105,72.378,218.15,1,3.7336,5.7644,0 -17,2,245,95,71.935,218.49,1,5.2869,4.4352,0 -17,3,183,96,37.309,154.62,1,12.621,10.628,0 -17,4,405,83,70.886,237.64,1,5.0743,2.7714,0 -17,5,501,90,66.838,227.97,1,5.4633,2.1278,0 -17,6,492,113,36.051,122.07,1,16.132,8.2756,0 -17,7,534,86,60.845,191.47,1,9.448,3.9494,0 -17,8,597,108,42.219,127.52,1,16.258,6.7208,0 -18,1,1,106,74.057,218.12,1,3.6675,5.6872,0 -18,2,249,95,71.685,217.94,1,5.3801,4.4591,0 -18,3,183,96,37.427,154.63,1,12.621,10.628,0 -18,4,408,83,70.755,237.09,1,5.0798,2.747,0 -18,5,504,90,66.97,227.41,1,5.4676,2.1025,0 -18,6,491,113,36.037,122.1,1,16.13,8.2899,0 -18,7,531,86,60.795,191.21,1,9.4439,3.9812,0 -18,8,595,108,42.763,127.53,1,16.256,6.7496,0 -19,1,-4,106,73.244,218.22,1,3.6448,5.7326,0 -19,2,253,95,71.435,217.39,1,5.3902,4.4261,0 -19,3,183,96,37.545,154.64,1,12.621,10.628,0 -19,4,411,83,70.625,236.54,1,5.1631,2.7687,0 -19,5,507,90,67.102,226.86,1,5.5523,2.1205,0 -19,6,489,113,36.041,122.18,1,16.128,8.3187,0 -19,7,528,86,60.745,190.95,1,9.562,4.0777,0 -19,8,594,108,43.004,127.54,1,16.255,6.7641,0 -20,1,-7,106,69.735,218.48,1,3.6257,5.7704,0 -20,2,257,95,71.185,216.84,1,5.4841,4.4499,0 -20,3,183,96,37.663,154.65,1,12.621,10.628,0 -20,4,414,83,70.495,235.99,1,5.2472,2.7906,0 -20,5,510,90,67.234,226.31,1,5.5565,2.0951,0 -20,6,487,113,36.071,122.33,1,16.125,8.3475,0 -20,7,526,86,60.695,190.69,1,9.5592,4.099,0 -20,8,592,109,43.075,127.56,1,16.042,6.6878,0 -21,1,-9,106,64.375,218.86,1,3.6104,5.8007,0 -21,2,261,95,70.935,216.29,1,5.494,4.4167,0 -21,3,183,96,37.781,154.65,1,12.621,10.628,0 -21,4,417,83,70.364,235.44,1,5.2525,2.7659,0 -21,5,513,90,67.365,225.75,1,5.6419,2.1131,0 
-21,6,485,112,36.127,122.53,1,16.337,8.4965,0 -21,7,523,86,60.645,190.43,1,9.555,4.131,0 -21,8,591,109,43.068,127.59,1,16.041,6.7021,0 -22,1,0,106,58.857,219.25,1,3.5605,5.6935,0 -22,2,265,95,70.685,215.74,1,5.5886,4.4405,0 -22,3,183,96,37.9,154.66,1,12.621,10.628,0 -22,4,420,83,70.232,234.89,1,5.3372,2.7878,0 -22,5,515,90,67.495,225.19,1,5.6447,2.096,0 -22,6,483,112,36.201,122.79,1,16.335,8.5255,0 -22,7,520,86,60.595,190.16,1,9.5509,4.163,0 -22,8,590,109,43.022,127.65,1,16.04,6.7164,0 -23,2,269,95,70.435,215.19,1,5.5984,4.407,0 -23,3,183,96,38.02,154.67,1,12.624,10.615,0 -23,4,423,83,70.1,234.34,1,5.3425,2.7629,0 -23,5,518,90,67.626,224.64,1,5.7308,2.1141,0 -23,6,481,112,36.286,123.07,1,16.117,8.4337,0 -23,7,517,86,60.545,189.9,1,9.6702,4.2612,0 -23,8,590,109,42.956,127.74,1,16.04,6.7164,0 -24,2,273,95,70.185,214.64,1,5.6937,4.4308,0 -24,3,183,96,38.14,154.68,1,12.624,10.615,0 -24,4,426,83,69.968,233.79,1,5.4262,2.7932,0 -24,5,521,90,67.758,224.09,1,5.7349,2.0883,0 -24,6,479,112,36.375,123.36,1,16.114,8.4625,0 -24,7,515,86,60.495,189.64,1,9.6674,4.2826,0 -24,8,590,109,42.875,127.84,1,16.04,6.7164,0 -25,2,277,95,69.935,214.1,1,5.701,4.4055,0 -25,3,183,96,38.26,154.69,1,12.624,10.615,0 -25,4,429,83,69.836,233.25,1,5.4314,2.7682,0 -25,5,524,90,67.89,223.53,1,5.8217,2.1064,0 -25,6,477,112,36.465,123.66,1,16.111,8.4912,0 -25,7,512,86,60.445,189.38,1,9.6631,4.3148,0 -25,8,590,109,42.786,127.95,1,16.04,6.7164,0 -26,2,282,95,69.685,213.55,1,5.7994,4.4208,0 -26,3,183,96,38.38,154.69,1,12.624,10.615,0 -26,4,432,83,69.705,232.69,1,5.5176,2.7902,0 -26,5,527,90,68.022,222.97,1,5.9107,2.1159,0 -26,6,475,111,36.556,123.95,1,16.324,8.6416,0 -26,7,509,86,60.395,189.11,1,9.6587,4.3469,0 -26,8,589,109,42.695,128.06,1,15.832,6.6268,0 -27,2,286,95,69.435,213,1,5.8089,4.3868,0 -27,3,183,96,38.5,154.7,1,12.624,10.615,0 -27,4,435,83,69.575,232.15,1,5.5227,2.765,0 -27,5,529,90,68.154,222.42,1,5.9134,2.0985,0 -27,6,473,111,36.648,124.24,1,16.106,8.5488,0 -27,7,507,87,60.345,188.85,1,9.6558,4.3683,0 -27,8,589,109,42.602,128.17,1,15.832,6.6268,0 -28,2,290,95,69.185,212.45,1,5.9057,4.4105,0 -28,3,183,96,38.62,154.71,1,12.624,10.615,0 -28,4,438,83,69.444,231.59,1,5.6096,2.787,0 -28,5,532,90,68.285,221.87,1,6.0017,2.1167,0 -28,6,471,111,36.74,124.54,1,16.103,8.5775,0 -28,7,504,87,60.295,188.59,1,9.6515,4.4004,0 -28,8,589,109,42.51,128.29,1,15.832,6.6268,0 -29,2,294,95,68.935,211.9,1,6.0033,4.4344,0 -29,3,183,96,38.74,154.72,1,12.624,10.615,0 -29,4,441,83,69.312,231.04,1,5.6146,2.7616,0 -29,5,535,90,68.415,221.31,1,6.0057,2.0904,0 -29,6,469,111,36.832,124.83,1,16.1,8.6063,0 -29,7,501,87,60.245,188.32,1,9.6471,4.4325,0 -29,8,589,109,42.417,128.4,1,15.832,6.6268,0 -30,2,298,95,68.685,211.35,1,6.0126,4.3999,0 -30,3,183,96,38.86,154.73,1,12.624,10.615,0 -30,4,444,83,69.18,230.5,1,5.7023,2.7837,0 -30,5,538,91,68.546,220.75,1,6.0097,2.0641,0 -30,6,466,110,36.924,125.12,1,16.096,8.6494,0 -30,7,498,87,60.195,188.06,1,9.6426,4.4646,0 -30,8,589,109,42.323,128.51,1,15.832,6.6268,0 -31,2,302,95,68.436,210.8,1,6.111,4.4237,0 -31,3,183,96,38.98,154.73,1,12.624,10.615,0 -31,4,447,83,69.048,229.94,1,5.7906,2.8059,0 -31,5,541,91,68.678,220.2,1,6.0136,2.0378,0 -31,6,464,110,37.015,125.42,1,16.093,8.6782,0 -31,7,496,87,60.145,187.8,1,9.7647,4.5544,0 -31,8,589,109,42.23,128.63,1,15.832,6.6268,0 -32,2,306,95,68.188,210.25,1,6.1201,4.3891,0 -32,3,183,96,39.1,154.74,1,12.624,10.615,0 -32,4,450,84,68.916,229.4,1,5.7122,2.7326,0 -32,5,543,91,68.81,219.65,1,6.1013,2.0645,0 -32,6,462,110,37.105,125.71,1,16.091,8.707,0 
-32,7,493,87,60.095,187.54,1,9.7602,4.5867,0 -32,8,589,109,42.137,128.74,1,15.832,6.6268,0 -33,2,310,95,67.94,209.7,1,6.217,4.4215,0 -33,3,183,96,39.22,154.75,1,12.624,10.615,0 -33,4,453,84,68.785,228.84,1,5.8005,2.7545,0 -33,5,546,91,68.942,219.09,1,6.1052,2.038,0 -33,6,460,110,37.195,126.01,1,15.876,8.6142,0 -33,7,490,87,60.045,187.27,1,9.7557,4.6189,0 -33,8,589,109,42.043,128.86,1,15.832,6.6268,0 -34,2,314,95,67.692,209.16,1,6.2261,4.3866,0 -34,3,182,96,39.34,154.76,1,12.621,10.628,0 -34,4,456,84,68.655,228.29,1,5.8054,2.7288,0 -34,5,549,91,69.074,218.54,1,6.195,2.0559,0 -34,6,458,109,37.285,126.3,1,16.085,8.7645,0 -34,7,488,87,59.995,187.01,1,9.7511,4.6512,0 -34,8,589,109,41.95,128.97,1,15.83,6.641,0 -35,2,318,95,67.444,208.61,1,6.326,4.4104,0 -35,3,182,96,39.46,154.77,1,12.621,10.628,0 -35,4,459,84,68.525,227.75,1,5.8945,2.7508,0 -35,5,552,91,69.205,217.98,1,6.2857,2.0739,0 -35,6,456,109,37.375,126.59,1,16.082,8.7933,0 -35,7,485,87,59.945,186.75,1,9.8731,4.7534,0 -35,8,589,109,41.858,129.09,1,15.626,6.5386,0 -36,2,322,95,67.195,208.06,1,6.3349,4.3753,0 -36,3,182,96,39.58,154.77,1,12.621,10.628,0 -36,4,463,84,68.394,227.2,1,5.9009,2.7163,0 -36,5,555,91,69.335,217.43,1,6.2895,2.0471,0 -36,6,454,109,37.465,126.89,1,16.079,8.8221,0 -36,7,482,87,59.895,186.49,1,9.8685,4.7859,0 -36,8,589,109,41.765,129.2,1,15.626,6.5386,0 -37,2,327,95,66.945,207.51,1,6.4379,4.3901,0 -37,3,182,96,39.7,154.78,1,12.621,10.628,0 -37,4,466,84,68.262,226.65,1,5.9907,2.7382,0 -37,5,557,91,69.466,216.87,1,6.3797,2.0741,0 -37,6,451,109,37.554,127.18,1,15.863,8.7425,0 -37,7,479,88,59.845,186.22,1,9.7372,4.7478,0 -37,8,589,109,41.672,129.31,1,15.626,6.5386,0 -38,2,331,95,66.695,206.96,1,6.5395,4.4139,0 -38,3,182,96,39.82,154.79,1,12.621,10.628,0 -38,4,469,84,68.13,226.1,1,5.9954,2.7121,0 -38,5,560,91,69.57,216.32,1,6.3834,2.0471,0 -38,6,449,108,37.642,127.47,1,16.072,8.894,0 -38,7,477,88,59.795,185.96,1,9.8607,4.8399,0 -38,8,589,109,41.58,129.43,1,15.626,6.5386,0 -39,2,335,95,66.445,206.41,1,6.5482,4.3783,0 -39,3,182,96,39.94,154.8,1,12.621,10.628,0 -39,4,472,84,67.998,225.55,1,6.0845,2.7428,0 -39,5,563,91,69.578,215.77,1,6.4757,2.0652,0 -39,6,447,108,37.725,127.74,1,16.069,8.9228,0 -39,7,474,88,59.745,185.7,1,9.8559,4.8723,0 -39,8,588,109,41.487,129.54,1,15.625,6.5526,0 -40,2,339,95,66.195,205.86,1,6.6506,4.402,0 -40,3,182,96,40.06,154.81,1,12.624,10.615,0 -40,4,475,84,67.866,225,1,6.0892,2.7165,0 -40,5,566,91,69.344,215.23,1,6.4794,2.038,0 -40,6,445,108,37.8,128,1,15.854,8.8281,0 -40,7,471,88,59.695,185.44,1,9.8512,4.9047,0 -40,8,588,108,41.393,129.66,1,15.829,6.6552,0 -41,2,343,95,65.945,205.31,1,6.657,4.3751,0 -41,3,182,96,40.18,154.81,1,12.624,10.615,0 -41,4,478,84,67.735,224.45,1,6.1806,2.7385,0 -41,5,569,91,68.659,214.71,1,6.5725,2.0561,0 -41,6,444,108,37.862,128.22,1,15.852,8.8424,0 -41,7,469,88,59.645,185.17,1,9.848,4.9262,0 -41,8,588,108,41.3,129.77,1,15.829,6.6552,0 -42,2,347,95,65.695,204.76,1,6.7604,4.3989,0 -42,3,182,96,40.3,154.82,1,12.624,10.615,0 -42,4,481,84,67.605,223.9,1,6.2728,2.7607,0 -42,5,571,91,67.362,214.2,1,6.5737,2.047,0 -42,6,442,108,37.913,128.41,1,15.849,8.8709,0 -42,7,466,88,59.595,184.91,1,9.9713,5.0309,0 -42,8,588,108,41.208,129.88,1,15.829,6.6552,0 -43,2,351,95,65.445,204.21,1,6.7688,4.3628,0 -43,3,182,96,40.42,154.83,1,12.624,10.615,0 -43,4,484,84,67.475,223.35,1,6.2774,2.7341,0 -43,5,574,91,65.454,213.72,1,6.6665,2.0743,0 -43,6,440,107,37.954,128.58,1,16.059,9.0236,0 -43,7,463,88,59.545,184.65,1,9.9665,5.0634,0 -43,8,588,108,41.115,130,1,15.625,6.5526,0 
-44,2,355,95,65.195,203.67,1,6.873,4.3864,0 -44,3,182,96,40.54,154.84,1,12.624,10.615,0 -44,4,487,84,67.344,222.8,1,6.3705,2.7563,0 -44,5,577,91,63.096,213.26,1,6.6689,2.0559,0 -44,6,439,107,37.99,128.74,1,16.057,9.038,0 -44,7,461,88,59.495,184.38,1,9.9632,5.0851,0 -44,8,588,108,41.023,130.11,1,15.625,6.5526,0 -45,2,359,95,64.945,203.12,1,6.8813,4.3501,0 -45,3,182,96,40.66,154.85,1,12.624,10.615,0 -45,4,490,84,67.212,222.25,1,6.375,2.7296,0 -45,5,580,91,60.496,212.81,1,6.7625,2.0834,0 -45,6,437,107,38.025,128.89,1,16.056,9.0524,0 -45,7,458,88,59.445,184.12,1,9.9583,5.1177,0 -45,8,588,108,40.93,130.22,1,15.625,6.5526,0 -46,2,363,95,64.695,202.57,1,6.9863,4.3737,0 -46,3,182,96,40.78,154.85,1,12.624,10.615,0 -46,4,493,84,67.08,221.71,1,6.4689,2.7518,0 -46,5,582,91,57.8,212.36,1,6.7625,2.0834,0 -46,6,436,107,38.059,129.04,1,15.842,8.9423,0 -46,7,455,88,59.395,183.86,1,10.083,5.2241,0 -46,8,588,108,40.837,130.34,1,15.625,6.5526,0 -47,2,367,95,64.445,202.02,1,6.9945,4.3371,0 -47,3,182,96,40.9,154.86,1,12.624,10.615,0 -47,4,496,84,66.948,221.16,1,6.4733,2.7249,0 -47,5,585,91,55.075,211.91,1,6.8571,2.1111,0 -47,6,434,107,38.093,129.2,1,15.839,8.9708,0 -47,7,452,89,59.345,183.6,1,9.9484,5.1827,0 -47,8,588,108,40.743,130.45,1,15.625,6.5526,0 -48,2,372,95,64.195,201.47,1,7.1024,4.3514,0 -48,3,182,96,41.02,154.87,1,12.624,10.615,0 -48,4,499,85,66.816,220.6,1,6.4777,2.698,0 -48,5,588,91,52.35,211.46,1,6.8595,2.0925,0 -48,6,433,107,38.127,129.35,1,15.837,8.9851,0 -48,7,450,89,59.294,183.34,1,9.945,5.2044,0 -48,8,588,108,40.65,130.57,1,15.625,6.5526,0 -49,2,376,95,63.945,200.92,1,7.2074,4.3844,0 -49,3,182,96,41.14,154.88,1,12.624,10.615,0 -49,4,502,85,66.685,220.06,1,6.4821,2.6711,0 -49,5,590,91,49.625,211.01,1,6.8595,2.0925,0 -49,6,431,107,38.16,129.5,1,15.834,9.0136,0 -49,7,447,89,59.242,183.07,1,9.94,5.2369,0 -49,8,588,108,40.557,130.68,1,15.625,6.5526,0 -50,2,380,95,63.695,200.37,1,7.2153,4.3472,0 -50,3,182,96,41.26,154.89,1,12.624,10.615,0 -50,4,505,85,66.555,219.5,1,6.5768,2.6929,0 -50,5,593,91,46.9,210.56,1,6.955,2.1204,0 -50,6,430,107,38.193,129.65,1,15.833,9.0279,0 -50,7,444,89,59.19,182.81,1,10.065,5.3441,0 -50,8,588,108,40.463,130.8,1,15.625,6.5526,0 -51,2,384,95,63.446,199.82,1,7.3232,4.3709,0 -51,3,182,96,41.38,154.89,1,12.624,10.615,0 -51,4,508,85,66.425,218.96,1,6.6723,2.715,0 -51,5,596,91,44.176,210.11,1,6.9573,2.1017,0 -51,6,428,107,38.227,129.8,1,15.83,9.0565,0 -51,7,442,89,59.138,182.55,1,10.061,5.3659,0 -51,8,587,108,40.37,130.91,1,15.624,6.5667,0 -52,2,388,95,63.198,199.27,1,7.3309,4.3335,0 -52,3,182,96,41.5,154.9,1,12.624,10.615,0 -52,4,511,85,66.294,218.41,1,6.6766,2.6877,0 -52,5,599,91,41.452,209.66,1,7.0526,2.1391,0 -52,6,427,107,38.26,129.95,1,15.828,9.0707,0 -52,7,439,89,59.086,182.28,1,10.056,5.3986,0 -52,8,587,108,40.278,131.02,1,15.422,6.4656,0 -53,2,392,95,62.95,198.72,1,7.4398,4.3571,0 -53,3,182,96,41.62,154.91,1,12.624,10.615,0 -53,4,514,85,66.162,217.85,1,6.773,2.7098,0 -53,5,601,91,38.728,209.21,1,7.0537,2.1297,0 -53,6,425,107,38.293,130.1,1,15.616,8.9753,0 -53,7,436,89,59.035,182.02,1,10.051,5.4313,0 -53,8,587,108,40.185,131.14,1,15.422,6.4656,0 -54,2,396,95,62.702,198.18,1,7.4474,4.3194,0 -54,3,182,96,41.74,154.92,1,12.624,10.615,0 -54,4,517,85,66.03,217.31,1,6.7772,2.6824,0 -54,5,604,91,36.004,208.76,1,7.1511,2.158,0 -54,6,424,107,38.327,130.26,1,15.614,8.9895,0 -54,7,433,89,58.985,181.76,1,10.177,5.5404,0 -54,8,587,108,40.093,131.25,1,15.422,6.4656,0 -55,2,399,95,62.454,197.63,1,7.5553,4.3525,0 -55,3,182,96,41.86,154.93,1,12.624,10.615,0 
-55,4,520,85,65.898,216.75,1,6.8731,2.7136,0 -55,5,607,91,33.28,208.32,1,7.1522,2.1485,0 -55,6,422,107,38.361,130.41,1,15.611,9.0178,0 -55,7,431,89,58.935,181.5,1,10.174,5.5623,0 -55,8,587,108,40.003,131.37,1,15.422,6.4656,0 -56,2,403,95,62.205,197.08,1,7.5628,4.3145,0 -56,3,182,96,41.98,154.93,1,12.624,10.615,0 -56,4,523,85,65.766,216.21,1,6.8773,2.686,0 -56,5,610,91,30.555,207.87,1,7.2506,2.177,0 -56,6,421,106,38.395,130.56,1,15.819,9.1564,0 -56,7,428,89,58.885,181.24,1,10.168,5.5952,0 -56,8,587,108,39.926,131.48,1,15.421,6.4795,0 -57,2,407,95,61.955,196.53,1,7.6717,4.3476,0 -57,3,182,96,42.1,154.94,1,12.627,10.602,0 -57,4,526,85,65.635,215.66,1,6.9755,2.7082,0 -57,5,612,91,27.83,207.42,1,7.2506,2.177,0 -57,6,419,106,38.429,130.71,1,15.816,9.185,0 -57,7,425,90,58.835,180.97,1,10.163,5.6281,0 -57,8,587,107,39.88,131.59,1,15.623,6.5808,0 -58,2,410,95,61.705,195.98,1,7.7817,4.381,0 -58,3,182,96,42.22,154.95,1,12.627,10.602,0 -58,4,529,85,65.505,215.11,1,6.9796,2.6804,0 -58,5,615,91,25.132,206.97,1,7.3499,2.2058,0 -58,6,418,106,38.463,130.86,1,15.814,9.1993,0 -58,7,423,90,58.785,180.71,1,10.16,5.65,0 -58,8,586,107,39.893,131.71,1,15.622,6.5949,0 -59,2,414,95,61.455,195.43,1,7.789,4.3426,0 -59,3,182,96,42.34,154.96,1,12.627,10.602,0 -59,4,533,85,65.374,214.56,1,7.08,2.6932,0 -59,5,618,91,22.525,206.54,1,7.3522,2.1866,0 -59,6,416,106,38.497,131.01,1,15.601,9.1027,0 -59,7,420,90,58.735,180.45,1,10.154,5.6829,0 -59,8,586,108,39.984,131.82,1,15.42,6.4935,0 -60,2,417,95,61.205,194.88,1,7.9001,4.376,0 -60,3,181,96,42.46,154.97,1,12.624,10.615,0 -60,4,536,85,65.242,214.01,1,7.0841,2.6652,0 -60,5,620,91,20.15,206.15,1,7.3533,2.1771,0 -60,6,415,106,38.53,131.16,1,15.6,9.1169,0 -60,7,417,90,58.685,180.18,1,10.149,5.7157,0 -60,8,585,108,40.156,131.94,1,15.42,6.4935,0 -61,2,421,95,60.955,194.33,1,7.9073,4.3373,0 -61,3,181,96,42.579,154.97,1,12.624,10.615,0 -61,4,539,85,65.11,213.46,1,7.1841,2.6874,0 -61,5,622,91,18.204,205.83,1,7.4525,2.2155,0 -61,6,413,106,38.563,131.32,1,15.597,9.1452,0 -61,7,414,90,58.635,179.92,1,10.276,5.8274,0 -61,8,584,108,40.385,132.05,1,15.221,6.4074,0 -62,2,425,95,60.705,193.78,1,8.0212,4.361,0 -62,3,181,96,42.697,154.98,1,12.624,10.615,0 -62,4,542,85,64.978,212.91,1,7.285,2.7097,0 -62,5,623,91,16.842,205.61,1,7.4525,2.2155,0 -62,6,412,106,38.597,131.47,1,15.595,9.1594,0 -62,7,412,90,58.585,179.66,1,10.273,5.8494,0 -62,8,582,108,40.646,132.16,1,15.219,6.4351,0 -63,2,428,95,60.455,193.24,1,8.0266,4.3318,0 -63,3,181,96,42.815,154.99,1,12.624,10.615,0 -63,4,545,86,64.846,212.36,1,7.192,2.6309,0 -63,6,410,106,38.63,131.62,1,15.592,9.1877,0 -63,7,409,90,58.535,179.4,1,10.267,5.8825,0 -63,8,581,108,40.92,132.28,1,15.217,6.449,0 -64,2,432,95,60.205,192.69,1,8.1415,4.3556,0 -64,3,181,96,42.933,155,1,12.45,10.488,0 -64,4,548,86,64.715,211.81,1,7.2929,2.6529,0 -64,6,409,106,38.663,131.77,1,15.59,9.2019,0 -64,7,406,90,58.485,179.13,1,10.262,5.9155,0 -64,8,580,108,41.198,132.39,1,15.216,6.4628,0 -65,2,435,95,59.955,192.14,1,8.145,4.3359,0 -65,3,181,96,43.051,155.01,1,12.45,10.488,0 -65,4,551,86,64.585,211.26,1,7.2968,2.6245,0 -65,6,407,106,38.697,131.92,1,15.587,9.2302,0 -65,7,404,90,58.435,178.87,1,10.393,6.0181,0 -65,8,579,108,41.475,132.51,1,15.215,6.4767,0 -66,2,439,96,59.705,191.59,1,8.152,4.2966,0 -66,3,181,96,43.17,155.01,1,12.45,10.488,0 -66,4,554,86,64.455,210.71,1,7.3986,2.6465,0 -66,6,406,106,38.731,132.07,1,15.379,9.1199,0 -66,7,401,91,58.385,178.61,1,10.252,5.9706,0 -66,8,577,108,41.752,132.62,1,15.213,6.5043,0 -67,2,442,96,59.455,191.04,1,8.1572,4.2671,0 
-67,3,181,96,43.29,155.02,1,12.45,10.488,0 -67,4,557,86,64.324,210.16,1,7.4024,2.6179,0 -67,6,404,106,38.765,132.22,1,15.376,9.148,0 -67,7,398,91,58.335,178.34,1,10.247,6.0036,0 -67,8,576,109,42.03,132.74,1,15.017,6.4051,0 -68,2,446,96,59.205,190.49,1,8.2731,4.2905,0 -68,3,181,96,43.41,155.03,1,12.45,10.488,0 -68,4,560,86,64.192,209.62,1,7.5053,2.6399,0 -68,6,403,105,38.799,132.38,1,15.581,9.2869,0 -68,7,395,91,58.285,178.08,1,10.241,6.0366,0 -68,8,575,109,42.307,132.85,1,15.016,6.4188,0 -69,2,450,96,58.955,189.94,1,8.3902,4.314,0 -69,3,181,96,43.53,155.04,1,12.45,10.488,0 -69,4,563,86,64.06,209.06,1,7.509,2.6111,0 -69,6,401,105,38.833,132.53,1,15.577,9.3152,0 -69,7,393,91,58.235,177.82,1,10.372,6.14,0 -69,8,573,109,42.585,132.96,1,15.014,6.4462,0 -70,2,453,96,58.706,189.39,1,8.3953,4.2841,0 -70,3,181,96,43.65,155.04,1,12.45,10.488,0 -70,4,566,86,63.904,208.52,1,7.6116,2.6428,0 -70,6,400,105,38.867,132.68,1,15.576,9.3294,0 -70,7,390,91,58.185,177.56,1,10.367,6.1733,0 -70,8,572,109,42.862,133.08,1,14.82,6.3617,0 -71,2,457,96,58.458,188.84,1,8.5135,4.3076,0 -71,3,181,96,43.77,155.05,1,12.45,10.488,0 -71,4,569,86,63.666,207.98,1,7.7165,2.6651,0 -71,6,399,105,38.9,132.83,1,15.574,9.3436,0 -71,7,387,91,58.135,177.29,1,10.361,6.2065,0 -71,8,571,109,43.138,133.19,1,14.819,6.3753,0 -72,2,460,96,58.21,188.29,1,8.5184,4.2775,0 -72,3,181,96,43.889,155.06,1,12.45,10.488,0 -72,4,572,86,63.221,207.46,1,7.7202,2.6359,0 -72,6,397,105,38.933,132.98,1,15.571,9.3719,0 -72,7,385,91,58.085,177.03,1,10.357,6.2287,0 -72,8,570,109,43.415,133.31,1,14.818,6.3889,0 -73,2,464,96,57.962,187.75,1,8.6362,4.3112,0 -73,3,181,96,44.002,155.07,1,12.453,10.475,0 -73,4,575,86,62.392,206.98,1,7.8261,2.6582,0 -73,6,396,105,38.967,133.13,1,15.362,9.2604,0 -73,7,382,91,58.035,176.77,1,10.488,6.3452,0 -73,8,568,109,43.693,133.42,1,14.816,6.4161,0 -74,2,467,96,57.714,187.2,1,8.641,4.2808,0 -74,3,181,96,44.103,155.07,1,12.453,10.475,0 -74,4,578,86,61.039,206.56,1,7.8285,2.6387,0 -74,6,395,105,39,133.28,1,15.361,9.2745,0 -74,7,379,91,57.985,176.51,1,10.48,6.3898,0 -74,8,567,110,43.97,133.53,1,14.624,6.3324,0 -74,9,604,108,34.383,160.6,1,10.673,3.9121,0 -75,2,471,96,57.465,186.65,1,8.7616,4.3044,0 -75,3,181,96,44.18,155.08,1,12.453,10.475,0 -75,4,580,86,59.162,206.19,1,7.8297,2.6289,0 -75,6,395,105,39.033,133.44,1,15.361,9.2745,0 -75,7,376,91,57.935,176.25,1,10.474,6.4233,0 -75,8,566,110,44.248,133.65,1,14.624,6.3324,0 -75,9,601,109,36.779,160.24,1,10.536,3.8681,0 -76,2,475,96,57.215,186.1,1,8.768,4.2637,0 -76,3,181,96,44.225,155.07,1,12.453,10.475,0 -76,4,583,86,56.901,205.85,1,7.9354,2.661,0 -76,6,395,105,39.067,133.59,1,15.361,9.2745,0 -76,7,374,92,57.885,175.98,1,10.47,6.4455,0 -76,8,565,110,44.525,133.76,1,14.623,6.3459,0 -76,9,596,109,39.702,159.8,1,10.666,3.9802,0 -77,2,478,96,56.965,185.55,1,8.8881,4.2974,0 -77,3,182,96,44.239,155.07,1,12.456,10.462,0 -77,4,586,86,54.433,205.54,1,7.9378,2.6413,0 -77,6,394,105,39.101,133.74,1,15.359,9.2885,0 -77,7,371,92,57.835,175.72,1,10.464,6.479,0 -77,8,563,110,44.802,133.88,1,14.621,6.3729,0 -77,9,592,110,42.91,159.33,1,10.529,3.9357,0 -78,2,482,96,56.715,185,1,8.8944,4.2564,0 -78,3,182,96,44.23,155.06,1,12.456,10.462,0 -78,4,588,86,51.883,205.24,1,7.9378,2.6413,0 -78,6,394,105,39.135,133.89,1,15.359,9.2885,0 -78,7,368,92,57.785,175.46,1,10.458,6.5124,0 -78,8,562,110,45.078,133.99,1,14.62,6.3864,0 -78,9,587,110,46.196,158.83,1,10.66,4.0368,0 -79,2,485,97,56.465,184.45,1,8.8991,4.2256,0 -79,3,182,96,44.209,155.05,1,12.456,10.462,0 -79,4,591,86,49.309,204.94,1,8.0446,2.6737,0 
-79,6,394,105,39.169,134.04,1,15.155,9.1645,0 -79,7,366,92,57.735,175.19,1,10.454,6.5347,0 -79,8,561,110,45.353,134.1,1,14.431,6.3034,0 -79,9,582,111,49.393,158.34,1,10.521,4.0145,0 -80,2,489,97,56.215,183.9,1,9.0219,4.2489,0 -80,3,183,96,44.182,155.04,1,12.459,10.449,0 -80,4,593,86,46.714,204.63,1,8.0458,2.6638,0 -80,6,394,105,39.203,134.19,1,15.155,9.1645,0 -80,7,363,92,57.685,174.93,1,10.586,6.6541,0 -80,8,559,110,45.621,134.22,1,14.429,6.3301,0 -80,9,577,111,52.285,157.88,1,10.651,4.1161,0 -81,2,493,97,55.965,183.35,1,9.0265,4.2178,0 -81,3,183,96,44.154,155.03,1,12.459,10.449,0 -81,4,596,86,44.047,204.31,1,8.0481,2.6439,0 -81,6,394,105,39.237,134.34,1,15.155,9.1645,0 -81,7,360,92,57.635,174.67,1,10.58,6.6878,0 -81,8,558,111,45.87,134.33,1,14.243,6.2481,0 -81,9,573,112,54.61,157.46,1,10.513,4.082,0 -82,2,496,97,55.715,182.81,1,9.1491,4.2515,0 -82,3,184,96,44.125,155.01,1,12.462,10.436,0 -82,4,599,86,41.201,203.93,1,8.1549,2.6864,0 -82,6,394,105,39.27,134.5,1,15.155,9.1645,0 -82,7,357,92,57.585,174.41,1,10.574,6.7214,0 -82,8,557,111,46.083,134.45,1,14.243,6.2481,0 -82,9,569,112,56.162,157.1,1,10.51,4.1157,0 -83,2,500,97,55.465,182.26,1,9.1551,4.2098,0 -83,3,184,96,44.095,155,1,12.462,10.436,0 -83,4,602,86,38.021,203.44,1,8.1572,2.6665,0 -83,6,394,105,39.303,134.65,1,15.155,9.1645,0 -83,7,355,92,57.535,174.15,1,10.57,6.7439,0 -83,8,555,111,46.248,134.56,1,14.241,6.2747,0 -83,9,565,112,56.941,156.8,1,10.639,4.2291,0 -84,2,503,97,55.215,181.71,1,9.279,4.2436,0 -84,3,185,96,44.065,154.99,1,12.638,10.55,0 -84,4,606,86,34.389,202.82,1,8.2663,2.6993,0 -84,6,394,105,39.337,134.8,1,15.155,9.1645,0 -84,7,352,92,57.485,173.88,1,10.704,6.8655,0 -84,8,554,111,46.364,134.68,1,14.239,6.2879,0 -84,9,561,112,57.154,156.55,1,10.634,4.2743,0 -85,2,507,97,54.965,181.16,1,9.2849,4.2016,0 -85,3,185,96,44.035,154.98,1,12.638,10.55,0 -85,4,610,86,30.35,202.06,1,8.2686,2.6791,0 -85,6,394,105,39.371,134.95,1,15.155,9.1645,0 -85,7,349,92,57.435,173.62,1,10.697,6.8994,0 -85,8,553,111,46.446,134.79,1,14.238,6.3012,0 -85,9,558,112,57.061,156.32,1,10.63,4.3082,0 -86,2,510,97,54.715,180.61,1,9.41,4.2354,0 -86,3,186,96,44.005,154.97,1,12.64,10.537,0 -86,4,614,86,26.137,201.23,1,8.3788,2.7121,0 -86,6,394,105,39.405,135.1,1,14.955,9.0423,0 -86,7,347,93,57.385,173.36,1,10.553,6.8336,0 -86,8,551,111,46.508,134.9,1,14.236,6.3278,0 -86,9,555,113,56.848,156.1,1,10.492,4.2727,0 -87,2,514,97,54.465,180.06,1,9.4158,4.1931,0 -87,3,186,96,43.976,154.96,1,12.638,10.55,0 -87,4,618,86,22.143,200.43,1,8.4901,2.7455,0 -87,6,393,105,39.439,135.25,1,14.953,9.0561,0 -87,7,344,93,57.335,173.09,1,10.547,6.8673,0 -87,8,550,111,46.563,135.02,1,14.053,6.2463,0 -87,9,551,113,56.6,155.89,1,10.621,4.3871,0 -88,2,518,97,54.215,179.51,1,9.5438,4.2163,0 -88,3,186,95,43.948,154.94,1,12.813,10.678,0 -88,4,621,86,18.815,199.75,1,8.6016,2.7894,0 -88,6,393,105,39.473,135.4,1,14.953,9.0561,0 -88,7,341,93,57.285,172.83,1,10.681,6.9898,0 -88,8,548,111,46.617,135.13,1,14.05,6.2726,0 -88,9,548,113,56.352,155.67,1,10.617,4.421,0 -89,2,521,97,53.966,178.96,1,9.6702,4.2612,0 -89,3,187,95,43.92,154.93,1,12.816,10.665,0 -89,4,624,86,16.471,199.28,1,8.6038,2.7689,0 -89,6,393,105,39.507,135.56,1,14.953,9.0561,0 -89,7,339,93,57.235,172.57,1,10.676,7.0124,0 -89,8,547,111,46.67,135.25,1,14.049,6.2858,0 -89,9,545,113,56.104,155.46,1,10.613,4.4547,0 -90,2,525,97,53.718,178.41,1,9.6759,4.2183,0 -90,3,187,95,43.892,154.92,1,12.816,10.665,0 -90,6,393,105,39.54,135.71,1,14.953,9.0561,0 -90,7,337,93,57.185,172.31,1,10.672,7.035,0 -90,8,545,111,46.722,135.36,1,14.046,6.3121,0 
-90,9,541,113,55.855,155.24,1,10.607,4.5111,0 -91,2,528,97,53.47,177.87,1,9.8051,4.2525,0 -91,3,188,95,43.864,154.91,1,12.819,10.652,0 -91,6,393,105,39.573,135.86,1,14.953,9.0561,0 -91,7,335,93,57.134,172.04,1,10.668,7.0576,0 -91,8,544,111,46.775,135.47,1,14.045,6.3253,0 -91,9,538,113,55.605,155.03,1,10.603,4.5448,0 -92,2,532,98,53.222,177.32,1,9.6856,4.1431,0 -92,3,188,95,43.835,154.9,1,12.819,10.652,0 -92,6,393,105,39.607,136.01,1,14.755,8.9357,0 -92,7,334,93,57.082,171.78,1,10.808,7.1595,0 -92,8,542,111,46.827,135.59,1,14.043,6.3516,0 -92,9,535,113,55.355,154.81,1,10.735,4.6503,0 -93,2,536,98,52.974,176.77,1,9.8161,4.1661,0 -93,3,189,95,43.805,154.88,1,12.822,10.639,0 -93,6,393,106,39.64,136.16,1,14.561,8.817,0 -93,7,333,93,57.03,171.52,1,10.806,7.1709,0 -93,8,541,111,46.88,135.7,1,14.041,6.3648,0 -93,9,532,113,55.106,154.59,1,10.731,4.6842,0 -94,2,539,98,52.725,176.22,1,9.8202,4.1336,0 -94,3,189,95,43.775,154.87,1,12.822,10.639,0 -94,6,393,106,39.673,136.31,1,14.561,8.817,0 -94,7,331,93,56.978,171.25,1,10.801,7.1937,0 -94,8,539,111,46.933,135.81,1,14.039,6.3911,0 -94,9,528,113,54.858,154.38,1,10.725,4.7295,0 -95,2,543,98,52.475,175.67,1,9.9521,4.1565,0 -95,3,190,95,43.745,154.86,1,12.824,10.626,0 -95,6,393,106,39.707,136.46,1,14.561,8.817,0 -95,7,330,93,56.926,170.99,1,10.943,7.2972,0 -95,8,538,111,46.985,135.93,1,14.037,6.4043,0 -95,9,525,113,54.61,154.16,1,10.721,4.7634,0 -96,2,546,98,52.225,175.12,1,9.9561,4.1238,0 -96,3,190,95,43.715,154.85,1,12.824,10.626,0 -96,6,393,106,39.741,136.62,1,14.561,8.817,0 -96,7,329,93,56.875,170.73,1,10.941,7.3087,0 -96,8,536,111,47.037,136.04,1,13.855,6.3357,0 -96,9,522,113,54.362,153.95,1,10.855,4.8709,0 -97,2,550,98,51.975,174.57,1,10.088,4.1577,0 -97,3,190,95,43.685,154.84,1,12.824,10.626,0 -97,6,392,106,39.775,136.77,1,14.559,8.8306,0 -97,7,328,93,56.825,170.47,1,10.939,7.3201,0 -97,8,535,111,47.09,136.16,1,13.854,6.3488,0 -97,9,518,113,54.114,153.73,1,10.849,4.9164,0 -98,2,553,98,51.725,174.02,1,10.092,4.1248,0 -98,3,191,95,43.656,154.83,1,12.827,10.613,0 -98,6,392,106,39.809,136.92,1,14.559,8.8306,0 -98,7,326,93,56.775,170.21,1,10.934,7.3431,0 -98,8,533,111,47.143,136.27,1,13.851,6.3749,0 -98,9,515,113,53.865,153.51,1,10.843,4.9619,0 -99,2,557,98,51.475,173.47,1,10.227,4.1477,0 -99,3,191,95,43.628,154.81,1,12.827,10.613,0 -99,6,392,106,39.843,137.07,1,14.367,8.7136,0 -99,7,325,93,56.725,169.94,1,11.078,7.4483,0 -99,8,532,111,47.195,136.38,1,13.85,6.388,0 -99,9,512,114,53.615,153.3,1,10.701,4.9215,0 -100,2,561,98,51.225,172.92,1,10.363,4.1709,0 -100,3,192,95,43.6,154.8,1,12.83,10.6,0 -100,6,392,106,39.877,137.22,1,14.367,8.7136,0 -100,7,324,93,56.675,169.68,1,11.076,7.4599,0 -100,8,530,111,47.248,136.5,1,13.848,6.4141,0 -100,9,508,114,53.365,153.08,1,10.695,4.9667,0 -101,2,564,98,50.975,172.38,1,10.367,4.1375,0 -101,3,192,95,43.572,154.79,1,12.83,10.6,0 -101,6,392,106,39.91,137.37,1,14.367,8.7136,0 -101,7,322,93,56.625,169.42,1,11.071,7.483,0 -101,8,529,111,47.3,136.61,1,13.846,6.4271,0 -101,9,505,114,53.115,152.87,1,10.829,5.0755,0 -102,2,568,98,50.725,171.83,1,10.505,4.1606,0 -102,3,193,94,43.544,154.78,1,13.011,10.716,0 -102,6,392,106,39.943,137.52,1,14.367,8.7136,0 -102,7,321,93,56.575,169.16,1,11.069,7.4945,0 -102,8,527,111,47.353,136.73,1,13.844,6.4532,0 -102,9,502,114,52.866,152.65,1,10.825,5.1095,0 -103,2,571,98,50.475,171.28,1,10.508,4.1269,0 -103,3,193,94,43.515,154.77,1,13.011,10.716,0 -103,6,392,106,39.977,137.67,1,14.367,8.7136,0 -103,7,320,93,56.525,168.89,1,11.215,7.6016,0 -103,8,526,111,47.407,136.84,1,13.842,6.4663,0 
-103,9,498,114,52.618,152.44,1,10.819,5.1549,0 -104,2,575,98,50.225,170.73,1,10.648,4.15,0 -104,3,194,94,43.485,154.76,1,13.013,10.703,0 -104,6,392,106,40.01,137.83,1,14.369,8.7001,0 -104,7,319,93,56.475,168.63,1,11.213,7.6132,0 -104,8,524,111,47.46,136.96,1,13.84,6.4924,0 -104,9,495,114,52.37,152.22,1,10.814,5.1889,0 -105,2,578,99,49.951,170.18,1,10.516,4.0595,0 -105,3,194,94,43.455,154.75,1,13.013,10.703,0 -105,6,392,106,40.043,137.98,1,14.369,8.7001,0 -105,7,317,93,56.425,168.37,1,11.208,7.6365,0 -105,8,523,111,47.513,137.07,1,13.661,6.4106,0 -105,9,492,114,52.122,152,1,10.81,5.2229,0 -106,2,582,99,49.594,169.65,1,10.655,4.0821,0 -106,3,194,94,43.425,154.73,1,13.013,10.703,0 -106,6,392,106,40.077,138.12,1,14.18,8.5849,0 -106,7,316,93,56.375,168.1,1,11.206,7.6481,0 -106,8,521,111,47.565,137.18,1,13.659,6.4365,0 -106,9,489,114,51.874,151.79,1,10.943,5.3455,0 -107,2,585,99,49.027,169.14,1,10.659,4.0481,0 -107,3,195,94,43.395,154.72,1,13.016,10.69,0 -107,6,392,106,40.111,138.27,1,14.18,8.5849,0 -107,7,315,93,56.325,167.84,1,11.353,7.757,0 -107,8,520,111,47.617,137.3,1,13.657,6.4494,0 -107,9,485,114,51.625,151.57,1,10.937,5.3911,0 -108,2,589,99,48.071,168.69,1,10.799,4.0707,0 -108,3,195,94,43.365,154.71,1,13.016,10.69,0 -108,6,392,106,40.145,138.4,1,14.18,8.5849,0 -108,7,313,93,56.275,167.58,1,11.349,7.7804,0 -108,8,518,111,47.67,137.41,1,13.655,6.4753,0 -108,9,482,114,51.377,151.36,1,10.932,5.4253,0 -109,2,592,99,46.586,168.31,1,10.802,4.0479,0 -109,3,196,94,43.336,154.7,1,13.019,10.677,0 -109,6,392,106,40.179,138.51,1,14.18,8.5849,0 -109,7,312,93,56.225,167.32,1,11.347,7.7922,0 -109,8,517,111,47.723,137.53,1,13.653,6.4883,0 -109,9,479,114,51.138,151.14,1,10.928,5.4595,0 -110,2,595,99,44.571,167.99,1,10.942,4.0935,0 -110,3,196,94,43.308,154.69,1,13.019,10.677,0 -110,6,392,106,40.213,138.61,1,14.18,8.5849,0 -110,7,311,93,56.175,167.06,1,11.345,7.8039,0 -110,8,515,111,47.775,137.64,1,13.65,6.5141,0 -110,9,475,114,50.92,150.93,1,11.063,5.5844,0 -111,2,598,99,42.167,167.73,1,10.944,4.0705,0 -111,3,197,94,43.28,154.68,1,13.021,10.664,0 -111,6,393,106,40.247,138.7,1,14.182,8.5716,0 -111,7,310,93,56.125,166.79,1,11.494,7.9146,0 -111,8,514,111,47.827,137.75,1,13.649,6.5271,0 -111,9,472,114,50.742,150.73,1,11.058,5.6188,0 -112,2,601,99,39.553,167.5,1,10.945,4.0591,0 -112,3,197,94,43.252,154.66,1,13.021,10.664,0 -112,6,393,106,40.28,138.78,1,14.182,8.5716,0 -112,7,308,93,56.075,166.53,1,11.489,7.9383,0 -112,8,512,111,47.88,137.87,1,13.646,6.553,0 -112,9,469,114,50.617,150.53,1,11.053,5.6532,0 -113,2,603,99,36.855,167.28,1,10.946,4.0476,0 -113,3,197,94,43.224,154.65,1,13.021,10.664,0 -113,6,394,107,40.313,138.86,1,13.998,8.4449,0 -113,7,307,93,56.025,166.27,1,11.487,7.9501,0 -113,8,511,111,47.933,137.98,1,13.645,6.5659,0 -113,9,466,114,50.545,150.35,1,11.049,5.6876,0 -114,2,606,99,34.132,167.06,1,10.948,4.0246,0 -114,3,198,94,43.195,154.64,1,13.024,10.65,0 -114,6,394,107,40.347,138.94,1,13.998,8.4449,0 -114,7,306,93,55.975,166,1,11.483,7.9738,0 -114,8,509,111,47.985,138.1,1,13.467,6.4969,0 -114,9,463,114,50.513,150.17,1,11.044,5.722,0 -115,2,609,99,31.41,166.84,1,11.089,4.0818,0 -115,3,198,94,43.165,154.63,1,13.024,10.65,0 -115,6,395,107,40.38,139.02,1,13.816,8.3202,0 -115,7,304,93,55.925,165.74,1,11.632,8.0983,0 -115,8,508,111,48.038,138.21,1,13.467,6.4969,0 -115,9,459,114,50.502,149.99,1,11.181,5.8494,0 -116,2,611,99,28.714,166.63,1,11.09,4.0702,0 -116,3,198,93,43.135,154.62,1,13.205,10.781,0 -116,6,395,107,40.413,139.09,1,13.816,8.3202,0 -116,7,303,93,55.875,165.48,1,11.63,8.1103,0 
-116,8,506,111,48.09,138.32,1,13.465,6.5225,0 -116,9,456,114,50.5,149.82,1,11.176,5.884,0 -117,2,614,99,26.11,166.42,1,11.092,4.0471,0 -117,3,198,93,43.105,154.61,1,13.205,10.781,0 -117,6,396,107,40.447,139.17,1,13.818,8.3071,0 -117,7,302,93,55.825,165.22,1,11.628,8.1222,0 -117,8,505,111,48.143,138.44,1,13.463,6.5354,0 -117,9,453,114,50.5,149.65,1,11.171,5.9186,0 -118,2,616,99,23.737,166.23,1,11.092,4.0471,0 -118,3,198,93,43.075,154.59,1,13.205,10.781,0 -118,6,396,107,40.481,139.25,1,13.818,8.3071,0 -118,7,301,93,55.775,164.95,1,11.781,8.2367,0 -118,8,503,111,48.197,138.55,1,13.46,6.561,0 -118,9,450,114,50.5,149.47,1,11.166,5.9532,0 -119,2,618,99,21.792,166.07,1,11.094,4.0355,0 -119,3,198,93,43.045,154.58,1,13.205,10.781,0 -119,6,397,107,40.515,139.33,1,13.819,8.2939,0 -119,7,299,93,55.725,164.69,1,11.777,8.2608,0 -119,8,502,111,48.25,138.67,1,13.459,6.5739,0 -119,9,447,114,50.5,149.3,1,11.161,5.9878,0 -120,2,620,99,20.431,165.96,1,11.237,4.0815,0 -120,3,198,93,43.016,154.57,1,13.205,10.781,0 -120,6,397,108,40.549,139.41,1,13.639,8.1842,0 -120,7,298,93,55.675,164.43,1,11.774,8.2728,0 -120,8,500,111,48.303,138.78,1,13.456,6.5995,0 -120,9,444,114,50.5,149.12,1,11.156,6.0223,0 -121,3,198,93,42.988,154.56,1,13.205,10.781,0 -121,6,398,108,40.583,139.49,1,13.64,8.1712,0 -121,7,297,93,55.625,164.16,1,11.772,8.2848,0 -121,8,499,111,48.355,138.9,1,13.455,6.6123,0 -121,9,441,114,50.5,148.95,1,11.296,6.141,0 -122,3,198,93,42.96,154.55,1,13.205,10.781,0 -122,6,398,108,40.617,139.56,1,13.64,8.1712,0 -122,7,295,93,55.575,163.9,1,11.926,8.4134,0 -122,8,497,111,48.407,139.01,1,13.28,6.5433,0 -122,9,437,114,50.5,148.78,1,11.289,6.1874,0 -123,3,198,93,42.932,154.54,1,13.205,10.781,0 -123,6,398,108,40.65,139.64,1,13.64,8.1712,0 -123,7,294,93,55.525,163.64,1,11.924,8.4256,0 -123,8,496,111,48.46,139.12,1,13.278,6.556,0 -123,9,434,114,50.5,148.6,1,11.284,6.2222,0 -124,3,198,93,42.904,154.53,1,13.205,10.781,0 -124,6,399,108,40.683,139.72,1,13.642,8.1582,0 -124,7,293,93,55.475,163.38,1,11.921,8.4377,0 -124,8,494,111,48.513,139.24,1,13.275,6.5815,0 -124,9,431,114,50.5,148.43,1,11.279,6.257,0 -125,3,198,93,42.875,154.51,1,13.205,10.781,0 -125,6,399,108,40.717,139.8,1,13.642,8.1582,0 -125,7,292,94,55.425,163.12,1,11.761,8.345,0 -125,8,493,111,48.565,139.35,1,13.274,6.5942,0 -125,9,428,114,50.494,148.26,1,11.273,6.2918,0 -126,3,198,93,42.845,154.5,1,13.205,10.781,0 -126,6,400,108,40.751,139.88,1,13.644,8.1452,0 -126,7,290,94,55.375,162.85,1,11.914,8.4741,0 -126,8,491,111,48.617,139.47,1,13.271,6.6197,0 -126,9,425,114,50.47,148.08,1,11.268,6.3265,0 -127,3,198,93,42.815,154.49,1,13.205,10.781,0 -127,6,400,109,40.785,139.96,1,13.466,8.0373,0 -127,7,289,94,55.323,162.59,1,11.912,8.4862,0 -127,8,490,111,48.666,139.59,1,13.27,6.6324,0 -127,9,422,114,50.397,147.91,1,11.41,6.4482,0 -128,3,198,93,42.785,154.48,1,13.205,10.781,0 -128,6,401,109,40.819,140.04,1,13.292,7.9182,0 -128,7,288,94,55.262,162.34,1,11.91,8.4984,0 -128,8,488,111,48.709,139.72,1,13.267,6.6578,0 -128,9,419,114,50.234,147.74,1,11.405,6.4832,0 -129,3,198,93,42.755,154.47,1,13.205,10.781,0 -129,6,401,109,40.853,140.12,1,13.292,7.9182,0 -129,7,287,94,55.182,162.11,1,11.908,8.5105,0 -129,8,487,111,48.741,139.88,1,13.265,6.6706,0 -129,9,416,114,49.949,147.57,1,11.398,6.5298,0 -130,3,198,93,42.726,154.46,1,13.205,10.781,0 -130,6,402,109,40.887,140.19,1,13.294,7.9054,0 -130,7,285,94,55.064,161.91,1,12.063,8.6418,0 -130,8,485,111,48.758,140.08,1,13.092,6.6014,0 -130,9,414,114,49.541,147.4,1,11.394,6.5531,0 -131,3,198,93,42.698,154.44,1,13.205,10.781,0 
-131,6,402,109,40.92,140.27,1,13.294,7.9054,0 -131,7,284,93,54.896,161.76,1,12.223,8.7626,0 -131,8,484,111,48.76,140.3,1,13.091,6.6141,0 -131,9,411,114,49.044,147.23,1,11.389,6.5881,0 -132,3,198,93,42.67,154.43,1,13.205,10.781,0 -132,6,403,109,40.953,140.35,1,13.296,7.8926,0 -132,7,284,93,54.678,161.67,1,12.223,8.7626,0 -132,8,483,111,48.751,140.56,1,13.089,6.6267,0 -132,9,409,114,48.498,147.06,1,11.385,6.6115,0 -133,3,198,93,42.642,154.42,1,13.205,10.781,0 -133,6,403,109,40.987,140.43,1,13.296,7.8926,0 -133,7,283,93,54.424,161.61,1,12.221,8.7749,0 -133,8,482,111,48.736,140.82,1,13.088,6.6393,0 -133,9,407,114,47.932,146.88,1,11.528,6.7359,0 -134,3,198,93,42.614,154.41,1,13.205,10.781,0 -134,6,404,110,41.02,140.51,1,13.124,7.7754,0 -134,7,282,93,54.15,161.57,1,12.219,8.7873,0 -134,8,480,111,48.719,141.1,1,12.917,6.5708,0 -134,9,404,113,47.36,146.72,1,11.674,6.862,0 -134,10,0,119,29.818,172.01,1,6.4463,8.3377,0 -135,3,198,93,42.585,154.4,1,13.205,10.781,0 -135,6,404,110,41.053,140.59,1,13.124,7.7754,0 -135,7,281,93,53.868,161.54,1,12.214,8.812,0 -135,8,479,111,48.7,141.37,1,12.915,6.5834,0 -135,9,402,113,46.788,146.54,1,11.671,6.8857,0 -135,10,0,118,32.385,173.01,1,6.4544,8.3186,0 -136,3,198,92,42.555,154.38,1,13.389,10.914,0 -136,6,405,110,41.087,140.66,1,13.126,7.7627,0 -136,7,281,93,53.585,161.51,1,12.214,8.812,0 -136,8,478,111,48.681,141.65,1,12.914,6.5959,0 -136,9,400,113,46.216,146.37,1,11.667,6.9094,0 -136,10,0,117,35.485,174.21,1,6.4585,8.3091,0 -137,3,198,92,42.525,154.37,1,13.389,10.914,0 -137,6,405,110,41.121,140.74,1,13.126,7.7627,0 -137,7,280,93,53.302,161.48,1,12.212,8.8243,0 -137,8,477,111,48.663,141.92,1,12.912,6.6084,0 -137,9,397,113,45.645,146.2,1,11.659,6.9567,0 -137,10,0,115,38.777,175.46,1,6.5707,8.3793,0 -138,3,198,92,42.495,154.36,1,13.389,10.914,0 -138,6,406,110,41.155,140.82,1,13.128,7.75,0 -138,7,279,93,53.018,161.45,1,12.21,8.8367,0 -138,8,476,111,48.645,142.19,1,12.745,6.5281,0 -138,9,395,113,45.075,146.03,1,11.656,6.9803,0 -138,10,0,114,41.906,176.59,1,6.5747,8.3697,0 -139,3,198,92,42.465,154.35,1,13.389,10.914,0 -139,6,406,110,41.189,140.9,1,13.128,7.75,0 -139,7,279,93,52.734,161.43,1,12.21,8.8367,0 -139,8,474,111,48.627,142.47,1,12.742,6.5529,0 -139,9,393,113,44.505,145.86,1,11.805,7.0972,0 -139,10,0,113,44.543,177.42,1,6.5827,8.3505,0 -140,3,198,92,42.435,154.34,1,13.389,10.914,0 -140,6,407,111,41.223,140.98,1,12.959,7.6347,0 -140,7,278,92,52.45,161.4,1,12.372,8.9598,0 -140,8,473,111,48.609,142.74,1,12.741,6.5653,0 -140,9,390,113,43.935,145.69,1,11.797,7.1449,0 -140,10,0,112,46.463,177.82,1,6.6921,8.4308,0 -141,3,197,92,42.406,154.33,1,13.386,10.928,0 -141,6,407,111,41.257,141.06,1,12.79,7.5334,0 -141,7,277,92,52.166,161.37,1,12.37,8.9723,0 -141,8,472,111,48.59,143.01,1,12.576,6.4858,0 -141,9,388,113,43.365,145.52,1,11.794,7.1687,0 -141,10,8,112,47.665,177.81,1,6.7239,8.3536,0 -142,3,197,92,42.378,154.31,1,13.386,10.928,0 -142,6,408,111,41.29,141.13,1,12.792,7.5209,0 -142,7,277,92,51.882,161.34,1,12.367,8.9848,0 -142,8,471,110,48.571,143.29,1,12.738,6.5902,0 -142,9,386,113,42.794,145.35,1,11.79,7.1925,0 -142,10,11,112,48.341,177.48,1,6.7397,8.315,0 -143,3,197,92,42.35,154.3,1,13.386,10.928,0 -143,6,408,111,41.323,141.21,1,12.792,7.5209,0 -143,7,276,92,51.598,161.31,1,12.365,8.9972,0 -143,8,469,110,48.553,143.56,1,12.735,6.615,0 -143,9,384,112,42.222,145.18,1,11.941,7.3117,0 -143,10,15,112,48.732,176.98,1,6.862,8.3662,0 -144,3,197,92,42.322,154.29,1,13.386,10.928,0 -144,6,409,111,41.357,141.29,1,12.794,7.5085,0 -144,7,275,92,51.314,161.28,1,12.363,9.0097,0 
-144,8,468,110,48.535,143.84,1,12.733,6.6274,0 -144,9,381,112,41.65,145.01,1,11.934,7.3597,0 -144,10,19,112,49.01,176.41,1,6.8775,8.3273,0 -145,3,197,92,42.294,154.28,1,13.386,10.928,0 -145,6,409,111,41.39,141.37,1,12.794,7.5085,0 -145,7,275,92,51.03,161.25,1,12.363,9.0097,0 -145,8,467,110,48.517,144.11,1,12.568,6.5475,0 -145,9,379,112,41.078,144.84,1,12.087,7.4809,0 -145,10,23,112,49.255,175.83,1,7.0009,8.3788,0 -146,3,197,92,42.265,154.27,1,13.386,10.928,0 -146,6,410,111,41.423,141.45,1,12.796,7.496,0 -146,7,274,92,50.746,161.22,1,12.361,9.0221,0 -146,8,466,110,48.499,144.38,1,12.567,6.5598,0 -146,9,377,112,40.506,144.66,1,12.083,7.5051,0 -146,10,27,112,49.5,175.25,1,7.0163,8.3397,0 -147,3,197,92,42.235,154.26,1,13.386,10.928,0 -147,6,410,112,41.457,141.54,1,12.63,7.3964,0 -147,7,273,92,50.462,161.19,1,12.358,9.0346,0 -147,8,464,110,48.48,144.66,1,12.563,6.5844,0 -147,9,374,112,39.935,144.5,1,12.075,7.5536,0 -147,10,31,112,49.745,174.66,1,7.1408,8.3913,0 -148,3,197,92,42.205,154.25,1,13.386,10.928,0 -148,6,411,112,41.491,141.66,1,12.632,7.384,0 -148,7,273,92,50.178,161.16,1,12.358,9.0346,0 -148,8,463,110,48.461,144.93,1,12.562,6.5968,0 -148,9,372,112,39.371,144.32,1,12.072,7.5777,0 -148,10,35,112,49.991,174.08,1,7.156,8.352,0 -149,3,197,92,42.175,154.23,1,13.386,10.928,0 -149,6,411,112,41.525,141.82,1,12.632,7.384,0 -149,7,272,91,49.894,161.14,1,12.521,9.1727,0 -149,8,462,110,48.443,145.21,1,12.399,6.5174,0 -149,9,370,112,38.83,144.15,1,12.068,7.602,0 -149,10,39,112,50.237,173.5,1,7.2853,8.3939,0 -150,3,197,92,42.145,154.22,1,13.386,10.928,0 -150,6,411,112,41.559,142.04,1,12.468,7.2859,0 -150,7,271,91,49.61,161.11,1,12.518,9.1853,0 -150,8,461,110,48.425,145.48,1,12.398,6.5297,0 -150,9,367,112,38.343,143.98,1,12.221,7.738,0 -150,10,43,113,50.483,172.91,1,7.3003,8.3543,0 -151,3,197,92,42.115,154.21,1,13.386,10.928,0 -151,6,411,112,41.593,142.32,1,12.468,7.2859,0 -151,7,270,91,49.326,161.08,1,12.516,9.1979,0 -151,8,460,110,48.407,145.75,1,12.396,6.5419,0 -151,9,365,112,37.959,143.79,1,12.216,7.7746,0 -151,10,47,113,50.729,172.33,1,7.3152,8.3147,0 -152,3,197,92,42.086,154.2,1,13.386,10.928,0 -152,6,410,112,41.627,142.65,1,12.466,7.2982,0 -152,7,270,91,49.042,161.05,1,12.516,9.1979,0 -152,8,458,110,48.389,146.03,1,12.234,6.4756,0 -152,9,363,111,37.714,143.6,1,12.373,7.9006,0 -152,10,51,113,50.975,171.75,1,7.4418,8.3664,0 -153,3,197,92,42.058,154.19,1,13.386,10.928,0 -153,6,410,112,41.66,143,1,12.305,7.2013,0 -153,7,269,91,48.758,161.02,1,12.514,9.2104,0 -153,8,457,110,48.37,146.3,1,12.232,6.4877,0 -153,9,362,111,37.608,143.39,1,12.371,7.9129,0 -153,10,56,113,51.22,171.16,1,7.4602,8.3166,0 -154,3,197,92,42.03,154.18,1,13.386,10.928,0 -154,6,409,112,41.693,143.37,1,12.303,7.2135,0 -154,7,268,91,48.475,160.99,1,12.681,9.3383,0 -154,8,456,110,48.351,146.58,1,12.231,6.4998,0 -154,9,360,111,37.605,143.17,1,12.367,7.9375,0 -154,10,60,113,51.465,170.58,1,7.5879,8.3684,0 -155,3,197,92,42.002,154.16,1,13.386,10.928,0 -155,6,409,112,41.727,143.73,1,12.303,7.2135,0 -155,7,268,91,48.192,160.96,1,12.681,9.3383,0 -155,8,455,110,48.333,146.85,1,12.229,6.5119,0 -155,9,359,111,37.656,142.95,1,12.53,8.0532,0 -155,10,64,113,51.71,170,1,7.6024,8.3283,0 -156,3,197,92,41.974,154.15,1,13.383,10.941,0 -156,6,408,112,41.76,144.09,1,12.142,7.1299,0 -156,7,267,91,47.908,160.93,1,12.676,9.3637,0 -156,8,453,110,48.315,147.12,1,12.069,6.4462,0 -156,9,357,111,37.73,142.73,1,12.526,8.078,0 -156,10,68,113,51.955,169.41,1,7.7312,8.3803,0 -157,3,197,91,41.945,154.14,1,13.57,11.076,0 
-157,6,408,112,41.793,144.46,1,12.142,7.1299,0 -157,7,266,91,47.624,160.9,1,12.674,9.3764,0 -157,8,452,110,48.297,147.4,1,12.067,6.4582,0 -157,9,356,111,37.81,142.51,1,12.524,8.0904,0 -157,10,72,113,52.201,168.83,1,7.865,8.4227,0 -158,3,197,91,41.915,154.13,1,13.57,11.076,0 -158,6,407,112,41.827,144.82,1,12.14,7.1421,0 -158,7,266,90,47.34,160.88,1,12.846,9.4936,0 -158,8,451,110,48.279,147.67,1,12.066,6.4703,0 -158,9,354,111,37.89,142.28,1,12.52,8.1152,0 -158,10,76,113,52.447,168.25,1,7.8791,8.382,0 -159,3,197,91,41.885,154.12,1,13.57,11.076,0 -159,6,407,112,41.861,145.19,1,11.982,7.0475,0 -159,7,265,90,47.056,160.85,1,12.844,9.5064,0 -159,8,450,110,48.26,147.95,1,12.064,6.4823,0 -159,9,353,111,37.97,142.06,1,12.518,8.1276,0 -159,10,80,113,52.693,167.66,1,8.0105,8.4347,0 -160,3,197,91,41.855,154.11,1,13.57,11.076,0 -160,6,406,112,41.895,145.55,1,11.981,7.0595,0 -160,7,264,90,46.772,160.82,1,12.841,9.5192,0 -160,8,448,110,48.24,148.22,1,11.906,6.4171,0 -160,9,351,111,38.05,141.84,1,12.682,8.2455,0 -160,10,84,114,52.939,167.08,1,7.9073,8.3008,0 -161,3,197,91,41.825,154.09,1,13.57,11.076,0 -161,6,406,112,41.929,145.91,1,11.981,7.0595,0 -161,7,264,90,46.488,160.79,1,12.841,9.5192,0 -161,8,447,110,48.221,148.5,1,11.904,6.4291,0 -161,9,350,111,38.13,141.62,1,12.68,8.258,0 -161,10,88,114,53.185,166.5,1,8.0384,8.3529,0 -162,3,197,91,41.795,154.08,1,13.57,11.076,0 -162,6,405,112,41.963,146.25,1,11.824,6.978,0 -162,7,263,90,46.204,160.76,1,12.839,9.532,0 -162,8,446,110,48.203,148.77,1,11.902,6.441,0 -162,9,348,111,38.21,141.4,1,12.676,8.283,0 -162,10,93,114,53.43,165.91,1,8.1743,8.3952,0 -163,3,197,91,41.766,154.07,1,13.57,11.076,0 -163,6,404,112,41.997,146.55,1,11.822,6.9899,0 -163,7,262,90,45.922,160.73,1,12.834,9.5576,0 -163,8,445,110,48.185,149.04,1,11.748,6.3646,0 -163,9,347,111,38.29,141.17,1,12.674,8.2955,0 -163,10,97,114,53.675,165.33,1,8.188,8.354,0 -164,3,196,91,41.738,154.06,1,13.567,11.09,0 -164,6,403,112,42.03,146.81,1,11.822,6.9899,0 -164,7,262,90,45.646,160.69,1,12.834,9.5576,0 -164,8,444,110,48.167,149.32,1,11.746,6.3764,0 -164,9,345,111,38.369,140.95,1,12.839,8.4282,0 -164,10,101,114,53.92,164.75,1,8.3217,8.4068,0 -165,3,196,91,41.71,154.05,1,13.567,11.09,0 -165,6,401,112,42.063,147.03,1,11.665,6.9212,0 -165,7,261,90,45.386,160.64,1,12.832,9.5704,0 -165,8,442,110,48.149,149.59,1,11.743,6.4001,0 -165,9,344,111,38.447,140.73,1,12.837,8.4408,0 -165,10,105,114,54.165,164.16,1,8.3385,8.3549,0 -166,3,196,91,41.682,154.04,1,13.567,11.09,0 -166,6,399,113,42.097,147.2,1,11.51,6.8533,0 -166,7,260,90,45.154,160.56,1,12.83,9.5832,0 -166,8,441,110,48.13,149.86,1,11.741,6.412,0 -166,9,342,110,38.525,140.51,1,13.004,8.5755,0 -166,10,109,114,54.411,163.58,1,8.4735,8.4078,0 -167,3,196,91,41.654,154.03,1,13.567,11.09,0 -167,6,397,113,42.131,147.36,1,11.506,6.8768,0 -167,7,260,90,44.962,160.44,1,12.83,9.5832,0 -167,8,440,110,48.111,150.14,1,11.588,6.3362,0 -167,9,341,110,38.603,140.28,1,13.002,8.5882,0 -167,10,113,114,54.657,163,1,8.4868,8.3661,0 -168,3,196,91,41.625,154.01,1,13.567,11.09,0 -168,6,395,113,42.165,147.52,1,11.502,6.9003,0 -168,7,260,89,44.808,160.28,1,13.004,9.703,0 -168,8,439,110,48.093,150.41,1,11.587,6.3479,0 -168,9,339,110,38.681,140.06,1,12.998,8.6136,0 -168,10,117,114,54.903,162.41,1,8.623,8.4192,0 -169,3,196,91,41.595,154,1,13.567,11.09,0 -169,6,393,113,42.199,147.67,1,11.499,6.9238,0 -169,7,260,90,44.682,160.09,1,12.83,9.5832,0 -169,8,437,110,48.075,150.69,1,11.583,6.3715,0 -169,9,338,110,38.76,139.84,1,13.17,8.7378,0 -169,10,121,115,55.149,161.83,1,8.6361,8.3771,0 
-170,3,196,91,41.565,153.99,1,13.756,11.227,0 -170,6,391,113,42.233,147.82,1,11.495,6.9473,0 -170,7,260,90,44.573,159.88,1,13.004,9.703,0 -170,8,436,110,48.057,150.96,1,11.582,6.3833,0 -170,9,336,110,38.84,139.62,1,13.165,8.7634,0 -170,10,125,115,55.395,161.25,1,8.6491,8.3351,0 -171,3,196,91,41.535,153.98,1,13.756,11.227,0 -171,6,389,113,42.267,147.97,1,11.491,6.9707,0 -171,7,260,90,44.47,159.68,1,13.004,9.703,0 -171,8,435,110,48.039,151.23,1,11.431,6.3081,0 -171,9,335,110,38.92,139.4,1,13.164,8.7763,0 -171,10,130,115,55.64,160.66,1,8.7898,8.3776,0 -172,3,196,91,41.505,153.97,1,13.756,11.227,0 -172,6,387,113,42.3,148.12,1,11.338,6.9028,0 -172,7,260,90,44.37,159.46,1,13.004,9.703,0 -172,8,434,110,48.02,151.51,1,11.429,6.3197,0 -172,9,333,110,39,139.17,1,13.159,8.8019,0 -172,10,134,115,55.885,160.08,1,8.8026,8.3353,0 -173,3,196,91,41.476,153.96,1,13.756,11.227,0 -173,6,385,113,42.333,148.27,1,11.334,6.9261,0 -173,7,260,90,44.27,159.25,1,13.004,9.703,0 -173,8,433,110,48.001,151.78,1,11.428,6.3314,0 -173,9,332,110,39.08,138.95,1,13.334,8.9285,0 -173,10,138,115,56.13,159.5,1,8.9445,8.3778,0 -174,3,196,91,41.448,153.94,1,13.756,11.227,0 -174,6,383,113,42.367,148.42,1,11.33,6.9494,0 -174,7,260,90,44.17,159.04,1,13.004,9.703,0 -174,8,431,110,47.983,152.06,1,11.275,6.2802,0 -174,9,330,110,39.16,138.73,1,13.33,8.9543,0 -174,10,142,115,56.375,158.91,1,9.0846,8.4314,0 -175,3,196,91,41.42,153.93,1,13.756,11.227,0 -175,6,381,113,42.4,148.57,1,11.326,6.9727,0 -175,7,260,90,44.071,158.83,1,13.182,9.8245,0 -175,8,430,110,47.965,152.33,1,11.273,6.2918,0 -175,9,329,110,39.239,138.51,1,13.328,8.9673,0 -175,10,146,115,56.618,158.33,1,9.0971,8.3885,0 -176,3,196,91,41.394,153.92,1,13.756,11.227,0 -176,6,379,113,42.432,148.72,1,11.322,6.996,0 -176,7,260,90,43.975,158.63,1,13.179,9.8375,0 -176,8,429,110,47.948,152.59,1,11.272,6.3034,0 -176,9,327,110,39.316,138.29,1,13.323,8.9932,0 -176,10,150,115,56.854,157.78,1,9.2387,8.4423,0 -177,3,196,91,41.369,153.91,1,13.756,11.227,0 -177,6,377,113,42.461,148.85,1,11.319,7.0193,0 -177,7,260,90,43.888,158.44,1,13.179,9.8375,0 -177,8,428,110,47.932,152.83,1,11.27,6.3149,0 -177,9,326,110,39.386,138.1,1,13.321,9.0061,0 -177,10,154,115,57.068,157.27,1,9.251,8.3992,0 -178,3,196,90,41.349,153.9,1,13.948,11.366,0 -178,6,376,113,42.486,148.96,1,11.316,7.031,0 -178,7,260,90,43.817,158.29,1,13.179,9.8375,0 -178,8,427,110,47.919,153.02,1,11.123,6.2411,0 -178,9,325,110,39.443,137.94,1,13.498,9.1352,0 -178,10,157,115,57.244,156.85,1,9.3909,8.4641,0 -179,3,196,90,41.334,153.9,1,13.948,11.366,0 -179,6,375,113,42.503,149.04,1,11.167,6.9514,0 -179,7,260,90,43.767,158.19,1,13.179,9.8375,0 -179,8,426,110,47.909,153.16,1,11.121,6.2526,0 -179,9,324,110,39.483,137.83,1,13.496,9.1483,0 -179,10,159,116,57.366,156.56,1,9.2663,8.3452,0 diff --git a/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/seqinfo.ini b/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/seqinfo.ini deleted file mode 100644 index 0f9d37129..000000000 --- a/tests/data/demo_MOT15_data/train/TUD-Stadtmitte/seqinfo.ini +++ /dev/null @@ -1,3 +0,0 @@ -[Sequence] -name=TUD-Stadtmitte -seqLength=179 diff --git a/tests/data/demo_MOT15_data/train/results/TUD-Campus.txt b/tests/data/demo_MOT15_data/train/results/TUD-Campus.txt deleted file mode 100644 index 64b0fe10e..000000000 --- a/tests/data/demo_MOT15_data/train/results/TUD-Campus.txt +++ /dev/null @@ -1,222 +0,0 @@ -1,3,113.84,274.5,57.307,130.05,-1,-1,-1,-1 -1,6,273.05,203.83,77.366,175.56,-1,-1,-1,-1 -1,10,416.68,205.54,91.04,206.59,-1,-1,-1,-1 
-1,13,175.02,195.54,60.972,138.36,-1,-1,-1,-1 -2,3,116.37,265.2,62.858,142.64,-1,-1,-1,-1 -2,6,267.86,202.71,77.704,176.33,-1,-1,-1,-1 -2,10,423.95,203.42,91.88,208.5,-1,-1,-1,-1 -2,13,177.14,202.51,58.209,132.09,-1,-1,-1,-1 -3,3,118.93,255.89,68.408,155.24,-1,-1,-1,-1 -3,6,262.73,201.65,78.033,177.08,-1,-1,-1,-1 -3,10,431.14,201.32,92.719,210.4,-1,-1,-1,-1 -3,13,179.21,209.5,55.445,125.82,-1,-1,-1,-1 -4,3,121.53,246.57,73.959,167.83,-1,-1,-1,-1 -4,6,257.67,200.61,78.354,177.81,-1,-1,-1,-1 -4,10,438.16,199.26,93.559,212.31,-1,-1,-1,-1 -4,13,181.25,216.56,52.681,119.55,-1,-1,-1,-1 -5,3,124.22,237.24,79.51,180.43,-1,-1,-1,-1 -5,6,252.68,199.54,78.667,178.52,-1,-1,-1,-1 -5,10,444.94,197.29,94.398,214.21,-1,-1,-1,-1 -5,13,183.24,223.72,49.917,113.27,-1,-1,-1,-1 -6,3,127,227.96,85.061,193.02,-1,-1,-1,-1 -6,6,247.74,198.46,78.972,179.21,-1,-1,-1,-1 -6,10,451.36,195.47,95.238,216.12,-1,-1,-1,-1 -6,13,185.16,230.95,47.154,107,-1,-1,-1,-1 -7,3,129.86,218.75,90.612,205.62,-1,-1,-1,-1 -7,6,242.86,197.44,79.267,179.88,-1,-1,-1,-1 -7,10,457.4,193.8,96.077,218.02,-1,-1,-1,-1 -7,13,187.05,238.21,44.39,100.73,-1,-1,-1,-1 -8,3,132.77,209.65,96.163,218.21,-1,-1,-1,-1 -8,6,237.99,196.55,79.555,180.53,-1,-1,-1,-1 -8,10,463.14,192.21,96.916,219.93,-1,-1,-1,-1 -9,3,135.64,200.65,101.71,230.81,-1,-1,-1,-1 -9,6,233.04,195.92,79.834,181.16,-1,-1,-1,-1 -9,10,468.66,190.65,97.756,221.83,-1,-1,-1,-1 -10,3,138.45,191.79,107.27,243.4,-1,-1,-1,-1 -10,6,227.9,195.56,80.105,181.78,-1,-1,-1,-1 -10,10,474.09,189.09,98.595,223.74,-1,-1,-1,-1 -11,3,141.22,183.1,112.82,256,-1,-1,-1,-1 -11,6,222.43,195.44,80.367,182.37,-1,-1,-1,-1 -11,10,479.5,187.49,99.435,225.64,-1,-1,-1,-1 -12,3,143.96,174.55,118.37,268.6,-1,-1,-1,-1 -12,6,216.54,195.56,80.621,182.95,-1,-1,-1,-1 -12,10,484.99,185.85,100.27,227.55,-1,-1,-1,-1 -13,3,146.68,166.1,123.92,281.19,-1,-1,-1,-1 -13,6,210.1,195.93,80.866,183.51,-1,-1,-1,-1 -13,10,490.67,184.15,101.11,229.45,-1,-1,-1,-1 -14,6,203.1,196.5,81.104,184.04,-1,-1,-1,-1 -14,10,496.59,182.38,101.95,231.36,-1,-1,-1,-1 -15,6,195.62,197.15,81.332,184.56,-1,-1,-1,-1 -15,10,502.81,180.54,102.79,233.26,-1,-1,-1,-1 -16,6,187.79,197.8,81.553,185.06,-1,-1,-1,-1 -16,7,276.17,205.24,60.444,137.16,-1,-1,-1,-1 -16,10,509.37,178.64,103.63,235.16,-1,-1,-1,-1 -17,6,179.76,198.42,81.764,185.54,-1,-1,-1,-1 -17,7,282.02,209.51,58.423,132.58,-1,-1,-1,-1 -17,10,516.23,176.69,104.47,237.07,-1,-1,-1,-1 -18,6,171.65,199.03,81.968,186.01,-1,-1,-1,-1 -18,7,287.93,213.81,56.403,127.99,-1,-1,-1,-1 -18,10,523.34,174.73,105.31,238.97,-1,-1,-1,-1 -19,6,163.55,199.67,82.163,186.45,-1,-1,-1,-1 -19,7,293.99,218.11,54.382,123.4,-1,-1,-1,-1 -19,10,530.57,172.78,106.15,240.88,-1,-1,-1,-1 -20,6,155.46,200.39,82.35,186.87,-1,-1,-1,-1 -20,7,300.27,222.37,52.361,118.82,-1,-1,-1,-1 -21,6,147.3,201.23,82.528,187.28,-1,-1,-1,-1 -21,7,306.86,226.57,50.34,114.23,-1,-1,-1,-1 -22,6,139.04,202.19,82.698,187.66,-1,-1,-1,-1 -22,7,313.74,230.72,48.32,109.65,-1,-1,-1,-1 -23,6,130.69,203.28,82.859,188.03,-1,-1,-1,-1 -23,7,320.84,234.83,46.299,105.06,-1,-1,-1,-1 -24,6,122.3,204.52,83.012,188.38,-1,-1,-1,-1 -24,7,328.08,238.93,44.278,100.48,-1,-1,-1,-1 -24,11,224.23,208.03,71.57,162.41,-1,-1,-1,-1 -25,6,113.93,205.85,83.157,188.71,-1,-1,-1,-1 -25,7,335.33,243.04,42.257,95.892,-1,-1,-1,-1 -25,11,230.36,214.02,68.455,155.34,-1,-1,-1,-1 -26,4,119.05,191.06,80.289,182.2,-1,-1,-1,-1 -26,7,342.57,247.15,40.236,91.306,-1,-1,-1,-1 -26,9,-15.182,261.28,64.106,145.47,-1,-1,-1,-1 -26,11,236.15,218.42,66.058,149.9,-1,-1,-1,-1 -27,4,109.54,188.88,82.744,187.77,-1,-1,-1,-1 
-27,7,349.78,251.25,38.216,86.721,-1,-1,-1,-1 -27,9,-16.51,246.39,71.474,162.19,-1,-1,-1,-1 -27,11,241.64,221.42,64.306,145.92,-1,-1,-1,-1 -28,4,100.03,186.72,85.2,193.34,-1,-1,-1,-1 -28,9,-17.899,231.48,78.843,178.91,-1,-1,-1,-1 -28,11,246.79,223.23,63.128,143.25,-1,-1,-1,-1 -29,4,90.482,184.67,87.656,198.91,-1,-1,-1,-1 -29,9,-19.351,216.58,86.212,195.63,-1,-1,-1,-1 -29,11,251.59,224.01,62.458,141.73,-1,-1,-1,-1 -30,4,80.854,182.84,90.111,204.49,-1,-1,-1,-1 -30,9,-20.845,201.69,93.58,212.35,-1,-1,-1,-1 -30,11,255.99,223.9,62.231,141.22,-1,-1,-1,-1 -31,4,71.177,181.25,92.567,210.06,-1,-1,-1,-1 -31,9,-22.364,186.8,100.95,229.07,-1,-1,-1,-1 -31,11,260.09,223.01,62.386,141.57,-1,-1,-1,-1 -32,4,61.563,179.93,95.023,215.63,-1,-1,-1,-1 -32,11,263.94,221.53,62.864,142.65,-1,-1,-1,-1 -33,4,52.124,178.89,97.479,221.2,-1,-1,-1,-1 -33,8,324.5,165.22,108.85,247.01,-1,-1,-1,-1 -33,11,267.58,219.61,63.611,144.35,-1,-1,-1,-1 -34,4,42.892,178.09,99.934,226.78,-1,-1,-1,-1 -34,8,336.28,176.03,105.56,239.53,-1,-1,-1,-1 -34,11,271.09,217.43,64.576,146.54,-1,-1,-1,-1 -35,4,33.841,177.49,102.39,232.35,-1,-1,-1,-1 -35,8,348.07,186.95,102.26,232.06,-1,-1,-1,-1 -35,11,274.48,215.13,65.708,149.11,-1,-1,-1,-1 -36,4,24.941,177.01,104.85,237.92,-1,-1,-1,-1 -36,8,359.8,198.05,98.967,224.58,-1,-1,-1,-1 -36,11,277.84,212.8,66.965,151.96,-1,-1,-1,-1 -37,4,16.117,176.59,107.3,243.49,-1,-1,-1,-1 -37,8,371.47,209.36,95.673,217.1,-1,-1,-1,-1 -37,11,281.24,210.52,68.301,154.99,-1,-1,-1,-1 -38,2,52.423,232.95,80.36,182.36,-1,-1,-1,-1 -38,8,383.09,220.95,92.379,209.63,-1,-1,-1,-1 -38,11,284.69,208.3,69.68,158.12,-1,-1,-1,-1 -39,2,56.427,217.08,87.228,197.94,-1,-1,-1,-1 -39,8,394.66,232.78,89.085,202.15,-1,-1,-1,-1 -39,11,288.21,206.16,71.063,161.26,-1,-1,-1,-1 -40,2,61.088,203.79,92.98,210.99,-1,-1,-1,-1 -40,8,406.23,244.74,85.791,194.68,-1,-1,-1,-1 -40,11,291.81,204.09,72.419,164.34,-1,-1,-1,-1 -41,2,66.422,192.94,97.686,221.67,-1,-1,-1,-1 -41,5,394.42,197.9,97.849,222.04,-1,-1,-1,-1 -41,11,295.46,202.07,73.718,167.28,-1,-1,-1,-1 -42,2,72.446,184.34,101.42,230.14,-1,-1,-1,-1 -42,5,400.86,193.91,102.46,232.51,-1,-1,-1,-1 -42,11,299.14,200,74.933,170.04,-1,-1,-1,-1 -43,2,79.136,177.84,104.25,236.56,-1,-1,-1,-1 -43,5,407.31,190.07,107.08,242.98,-1,-1,-1,-1 -43,11,302.83,197.86,76.041,172.55,-1,-1,-1,-1 -44,2,86.433,173.27,106.24,241.09,-1,-1,-1,-1 -44,5,413.79,186.43,111.69,253.45,-1,-1,-1,-1 -44,11,306.54,195.65,77.02,174.78,-1,-1,-1,-1 -45,2,94.236,170.49,107.48,243.89,-1,-1,-1,-1 -45,5,420.34,182.99,116.3,263.91,-1,-1,-1,-1 -45,11,310.3,193.42,77.854,176.67,-1,-1,-1,-1 -46,2,102.52,169.29,108.02,245.13,-1,-1,-1,-1 -46,5,426.95,179.62,120.92,274.38,-1,-1,-1,-1 -46,11,314.13,191.28,78.529,178.2,-1,-1,-1,-1 -47,2,111.21,169.48,107.95,244.96,-1,-1,-1,-1 -47,5,433.61,176.29,125.53,284.85,-1,-1,-1,-1 -47,11,318.04,189.41,79.034,179.34,-1,-1,-1,-1 -48,2,120.2,170.97,107.33,243.55,-1,-1,-1,-1 -48,5,442.57,183.44,125.53,284.85,-1,-1,-1,-1 -48,11,322.05,187.93,79.36,180.08,-1,-1,-1,-1 -49,1,459.32,237.96,50.475,114.54,-1,-1,-1,-1 -49,2,129.39,173.6,106.23,241.06,-1,-1,-1,-1 -49,11,326.2,186.99,79.503,180.41,-1,-1,-1,-1 -50,1,462.86,233.83,51.899,117.77,-1,-1,-1,-1 -50,2,138.73,177.21,104.73,237.65,-1,-1,-1,-1 -50,11,330.53,186.6,79.461,180.31,-1,-1,-1,-1 -51,1,466.5,230.01,53.199,120.72,-1,-1,-1,-1 -51,2,148.19,181.54,102.89,233.48,-1,-1,-1,-1 -51,11,335.03,186.77,79.236,179.8,-1,-1,-1,-1 -52,1,470.24,226.53,54.374,123.39,-1,-1,-1,-1 -52,2,157.75,186.54,100.79,228.71,-1,-1,-1,-1 -52,11,339.68,187.47,78.833,178.89,-1,-1,-1,-1 
-53,1,474.11,223.4,55.425,125.78,-1,-1,-1,-1 -53,2,167.48,192.06,98.496,223.51,-1,-1,-1,-1 -53,11,344.45,188.69,78.259,177.59,-1,-1,-1,-1 -54,1,478.14,220.61,56.352,127.88,-1,-1,-1,-1 -54,2,177.42,197.94,96.081,218.03,-1,-1,-1,-1 -54,11,349.28,190.37,77.525,175.92,-1,-1,-1,-1 -55,1,482.39,218.19,57.154,129.7,-1,-1,-1,-1 -55,2,187.55,204.01,93.617,212.44,-1,-1,-1,-1 -55,11,354.13,192.44,76.646,173.93,-1,-1,-1,-1 -55,12,533.9,309.3,68.825,156.18,-1,-1,-1,-1 -56,1,486.87,216.16,57.832,131.24,-1,-1,-1,-1 -56,2,197.87,210.13,91.174,206.9,-1,-1,-1,-1 -56,11,358.95,194.8,75.638,171.64,-1,-1,-1,-1 -56,12,533.51,279.28,82.049,186.19,-1,-1,-1,-1 -57,1,491.6,214.54,58.386,132.49,-1,-1,-1,-1 -57,2,208.33,216.16,88.824,201.56,-1,-1,-1,-1 -57,11,363.73,197.38,74.522,169.1,-1,-1,-1,-1 -57,12,533.16,249.29,95.273,216.2,-1,-1,-1,-1 -58,1,496.55,213.28,58.815,133.47,-1,-1,-1,-1 -58,2,218.83,221.94,86.637,196.6,-1,-1,-1,-1 -58,11,368.43,200.11,73.32,166.38,-1,-1,-1,-1 -58,12,532.85,219.33,108.5,246.2,-1,-1,-1,-1 -59,1,501.69,212.34,59.119,134.16,-1,-1,-1,-1 -59,2,229.42,227.21,84.684,192.17,-1,-1,-1,-1 -59,11,373.11,202.9,72.061,163.52,-1,-1,-1,-1 -59,12,532.56,189.41,121.72,276.21,-1,-1,-1,-1 -60,1,506.97,211.69,59.3,134.57,-1,-1,-1,-1 -60,2,240.03,231.7,83.037,188.43,-1,-1,-1,-1 -60,11,377.81,205.68,70.773,160.6,-1,-1,-1,-1 -60,12,536.94,180.92,125.53,284.85,-1,-1,-1,-1 -61,1,512.37,211.32,59.355,134.69,-1,-1,-1,-1 -61,2,250.51,235.22,81.767,185.55,-1,-1,-1,-1 -61,11,382.61,208.36,69.489,157.68,-1,-1,-1,-1 -61,12,543.18,181.11,125.53,284.85,-1,-1,-1,-1 -62,1,517.89,211.16,59.287,134.54,-1,-1,-1,-1 -62,2,260.69,237.62,80.944,183.68,-1,-1,-1,-1 -62,11,387.51,210.9,68.246,154.86,-1,-1,-1,-1 -63,1,523.53,211.17,59.094,134.1,-1,-1,-1,-1 -63,2,270.48,238.63,80.641,182.99,-1,-1,-1,-1 -63,11,392.47,213.29,67.081,152.22,-1,-1,-1,-1 -64,1,529.29,211.36,58.776,133.38,-1,-1,-1,-1 -64,2,279.79,238.08,80.928,183.64,-1,-1,-1,-1 -64,11,397.5,215.45,66.039,149.86,-1,-1,-1,-1 -65,1,535.17,211.62,58.334,132.37,-1,-1,-1,-1 -65,2,288.52,235.8,81.876,185.79,-1,-1,-1,-1 -65,11,402.55,217.33,65.163,147.87,-1,-1,-1,-1 -66,1,541.16,211.9,57.768,131.09,-1,-1,-1,-1 -66,2,296.67,231.74,83.556,189.61,-1,-1,-1,-1 -66,11,407.6,218.87,64.503,146.37,-1,-1,-1,-1 -67,1,547.25,212.24,57.077,129.52,-1,-1,-1,-1 -67,2,304.28,225.68,86.04,195.24,-1,-1,-1,-1 -67,11,412.69,220,64.11,145.48,-1,-1,-1,-1 -68,1,553.44,212.71,56.262,127.67,-1,-1,-1,-1 -68,2,311.34,217.45,89.398,202.86,-1,-1,-1,-1 -68,11,417.76,220.6,64.039,145.32,-1,-1,-1,-1 -69,1,559.73,213.37,55.323,125.54,-1,-1,-1,-1 -69,2,317.88,206.93,93.702,212.63,-1,-1,-1,-1 -69,11,422.74,220.5,64.348,146.02,-1,-1,-1,-1 -70,1,566.12,214.27,54.259,123.13,-1,-1,-1,-1 -70,2,323.85,194.07,99.022,224.7,-1,-1,-1,-1 -70,11,427.58,219.49,65.097,147.72,-1,-1,-1,-1 -71,1,572.59,215.45,53.07,120.43,-1,-1,-1,-1 -71,2,329.22,178.75,105.43,239.25,-1,-1,-1,-1 -71,11,432.2,217.39,66.352,150.57,-1,-1,-1,-1 diff --git a/tests/data/demo_MOT15_data/train/results/TUD-Stadtmitte.txt b/tests/data/demo_MOT15_data/train/results/TUD-Stadtmitte.txt deleted file mode 100644 index 3ad95599f..000000000 --- a/tests/data/demo_MOT15_data/train/results/TUD-Stadtmitte.txt +++ /dev/null @@ -1,749 +0,0 @@ -1,1,425.78,91.371,106.46,241.58,-1,-1,-1,-1 -1,3,330.85,77.998,104.84,237.9,-1,-1,-1,-1 -1,4,85.65,135.15,80.321,182.27,-1,-1,-1,-1 -1,5,167.02,73.423,106.01,240.55,-1,-1,-1,-1 -1,6,559.16,78.692,86.706,196.76,-1,-1,-1,-1 -2,1,429.87,98.817,102.38,232.33,-1,-1,-1,-1 -2,3,335.1,81,103.64,235.19,-1,-1,-1,-1 
-2,4,78.12,125.34,84.666,192.13,-1,-1,-1,-1 -2,5,172.08,78.856,103.68,235.27,-1,-1,-1,-1 -2,6,558.37,79.386,86.527,196.35,-1,-1,-1,-1 -3,1,433.01,101.84,100.26,227.53,-1,-1,-1,-1 -3,3,339.22,83.447,102.7,233.05,-1,-1,-1,-1 -3,4,70.836,116.64,88.525,200.89,-1,-1,-1,-1 -3,5,177.12,83.989,101.49,230.3,-1,-1,-1,-1 -3,6,557.55,80.091,86.348,195.95,-1,-1,-1,-1 -4,1,435.58,101.99,99.444,225.66,-1,-1,-1,-1 -4,3,343.16,85.406,101.98,231.41,-1,-1,-1,-1 -4,4,63.745,109.07,91.899,208.54,-1,-1,-1,-1 -4,5,182.19,88.799,99.481,225.75,-1,-1,-1,-1 -4,6,556.7,80.759,86.169,195.54,-1,-1,-1,-1 -5,1,437.88,100.46,99.431,225.63,-1,-1,-1,-1 -5,3,346.93,86.87,101.46,230.23,-1,-1,-1,-1 -5,4,56.774,102.66,94.788,215.1,-1,-1,-1,-1 -5,5,187.36,93.314,97.674,221.65,-1,-1,-1,-1 -5,6,555.78,81.345,85.99,195.13,-1,-1,-1,-1 -6,1,440.14,98.199,99.87,226.63,-1,-1,-1,-1 -6,3,350.54,87.831,101.11,229.44,-1,-1,-1,-1 -6,4,49.907,97.427,97.192,220.56,-1,-1,-1,-1 -6,5,192.63,97.494,96.079,218.03,-1,-1,-1,-1 -6,6,554.78,81.812,85.812,194.73,-1,-1,-1,-1 -7,1,442.5,95.795,100.51,228.09,-1,-1,-1,-1 -7,3,354.04,88.268,100.91,228.99,-1,-1,-1,-1 -7,4,43.151,93.419,99.111,224.91,-1,-1,-1,-1 -7,5,198,101.21,94.697,214.89,-1,-1,-1,-1 -7,6,553.68,82.118,85.633,194.32,-1,-1,-1,-1 -8,1,445.03,93.591,101.19,229.63,-1,-1,-1,-1 -8,3,357.44,88.237,100.84,228.84,-1,-1,-1,-1 -8,4,36.564,90.66,100.55,228.16,-1,-1,-1,-1 -8,5,203.43,104.33,93.524,212.23,-1,-1,-1,-1 -8,6,552.49,82.234,85.454,193.92,-1,-1,-1,-1 -9,1,447.79,91.768,101.8,231.02,-1,-1,-1,-1 -9,3,360.74,87.834,100.89,228.93,-1,-1,-1,-1 -9,4,30.22,89.167,101.49,230.32,-1,-1,-1,-1 -9,5,208.85,106.77,92.545,210.01,-1,-1,-1,-1 -9,6,551.28,82.135,85.275,193.51,-1,-1,-1,-1 -9,11,154.14,64.542,82.792,187.87,-1,-1,-1,-1 -10,1,450.79,90.403,102.29,232.13,-1,-1,-1,-1 -10,3,363.95,87.164,101.02,229.23,-1,-1,-1,-1 -10,4,24.182,88.961,101.96,231.37,-1,-1,-1,-1 -10,5,214.14,108.44,91.746,208.2,-1,-1,-1,-1 -10,6,550.14,81.763,85.096,193.1,-1,-1,-1,-1 -10,11,157.44,76.474,76.717,174.09,-1,-1,-1,-1 -11,1,454.07,89.477,102.64,232.91,-1,-1,-1,-1 -11,3,367.04,86.337,101.22,229.68,-1,-1,-1,-1 -11,4,18.498,89.982,101.94,231.32,-1,-1,-1,-1 -11,5,219.25,109.36,91.106,206.74,-1,-1,-1,-1 -11,6,549.14,81.068,84.917,192.7,-1,-1,-1,-1 -11,11,160.12,85.622,71.876,163.1,-1,-1,-1,-1 -12,1,457.64,88.91,102.84,233.36,-1,-1,-1,-1 -12,3,369.98,85.471,101.47,230.26,-1,-1,-1,-1 -12,4,13.194,92.161,101.43,230.18,-1,-1,-1,-1 -12,5,224.06,109.63,90.605,205.61,-1,-1,-1,-1 -12,6,548.33,80.01,84.738,192.29,-1,-1,-1,-1 -12,11,162.24,92.333,68.116,154.57,-1,-1,-1,-1 -13,1,461.41,88.635,102.91,233.53,-1,-1,-1,-1 -13,3,372.77,84.724,101.76,230.91,-1,-1,-1,-1 -13,4,8.2951,95.348,100.44,227.93,-1,-1,-1,-1 -13,5,228.55,109.37,90.221,204.74,-1,-1,-1,-1 -13,6,547.75,78.569,84.559,191.89,-1,-1,-1,-1 -13,11,163.88,96.898,65.293,148.17,-1,-1,-1,-1 -14,1,465.3,88.552,102.88,233.45,-1,-1,-1,-1 -14,3,375.41,84.194,102.06,231.61,-1,-1,-1,-1 -14,4,3.8143,99.469,98.966,224.58,-1,-1,-1,-1 -14,5,232.75,108.69,89.933,204.08,-1,-1,-1,-1 -14,6,547.44,76.752,84.38,191.48,-1,-1,-1,-1 -14,11,165.12,99.619,63.274,143.58,-1,-1,-1,-1 -15,1,469.26,88.544,102.77,233.2,-1,-1,-1,-1 -15,3,377.98,83.9,102.38,232.32,-1,-1,-1,-1 -15,4,-0.2482,104.42,97.005,220.13,-1,-1,-1,-1 -15,5,236.77,107.76,89.718,203.59,-1,-1,-1,-1 -15,6,547.42,74.626,84.202,191.07,-1,-1,-1,-1 -15,11,166.02,100.8,61.936,140.55,-1,-1,-1,-1 -16,1,473.28,88.486,102.61,232.84,-1,-1,-1,-1 -16,3,380.58,83.797,102.68,233.01,-1,-1,-1,-1 -16,4,-3.9267,110.16,94.559,214.58,-1,-1,-1,-1 
-16,5,240.66,106.73,89.557,203.23,-1,-1,-1,-1 -16,6,547.68,72.256,84.023,190.67,-1,-1,-1,-1 -16,11,166.61,100.84,61.167,138.8,-1,-1,-1,-1 -17,1,477.41,88.306,102.43,232.43,-1,-1,-1,-1 -17,3,383.22,83.824,102.97,233.66,-1,-1,-1,-1 -17,4,-7.2591,116.72,91.629,207.93,-1,-1,-1,-1 -17,5,244.43,105.7,89.433,202.95,-1,-1,-1,-1 -17,6,548.14,69.75,83.844,190.26,-1,-1,-1,-1 -17,11,166.93,100.07,60.863,138.11,-1,-1,-1,-1 -18,1,481.66,87.972,102.24,232,-1,-1,-1,-1 -18,3,385.96,83.926,103.22,234.24,-1,-1,-1,-1 -18,4,-10.217,124.08,88.213,200.18,-1,-1,-1,-1 -18,5,248.11,104.77,89.329,202.71,-1,-1,-1,-1 -18,6,548.71,67.217,83.665,189.86,-1,-1,-1,-1 -18,11,167.05,98.737,60.933,138.27,-1,-1,-1,-1 -19,1,485.91,87.493,102.06,231.6,-1,-1,-1,-1 -19,3,388.79,84.058,103.44,234.73,-1,-1,-1,-1 -19,4,-12.816,132.29,84.312,191.33,-1,-1,-1,-1 -19,5,251.73,103.97,89.232,202.49,-1,-1,-1,-1 -19,11,166.99,97.06,61.295,139.09,-1,-1,-1,-1 -20,1,490.02,86.905,101.91,231.26,-1,-1,-1,-1 -20,3,391.73,84.168,103.6,235.1,-1,-1,-1,-1 -20,4,-15.096,141.49,79.927,181.37,-1,-1,-1,-1 -20,5,255.33,103.34,89.128,202.26,-1,-1,-1,-1 -20,11,166.79,95.19,61.875,140.41,-1,-1,-1,-1 -21,1,493.91,86.241,101.78,230.97,-1,-1,-1,-1 -21,3,394.75,84.212,103.71,235.34,-1,-1,-1,-1 -21,4,-17.098,151.75,75.056,170.32,-1,-1,-1,-1 -21,5,258.93,102.9,89.011,201.99,-1,-1,-1,-1 -21,11,166.51,93.316,62.609,142.07,-1,-1,-1,-1 -22,1,497.58,85.544,101.69,230.75,-1,-1,-1,-1 -22,3,397.78,84.197,103.75,235.44,-1,-1,-1,-1 -22,4,-18.857,163.12,69.7,158.17,-1,-1,-1,-1 -22,5,262.49,102.71,88.872,201.68,-1,-1,-1,-1 -22,11,166.16,91.566,63.443,143.97,-1,-1,-1,-1 -23,1,501.02,84.877,101.61,230.59,-1,-1,-1,-1 -23,3,400.84,84.132,103.72,235.38,-1,-1,-1,-1 -23,4,-20.357,175.63,63.86,144.91,-1,-1,-1,-1 -23,5,266.02,102.74,88.708,201.3,-1,-1,-1,-1 -23,11,165.76,90,64.329,145.98,-1,-1,-1,-1 -24,1,504.22,84.315,101.56,230.47,-1,-1,-1,-1 -24,3,403.86,84.034,103.63,235.15,-1,-1,-1,-1 -24,4,-21.602,189.27,57.534,130.56,-1,-1,-1,-1 -24,5,269.55,102.97,88.515,200.86,-1,-1,-1,-1 -24,11,165.33,88.665,65.227,148.02,-1,-1,-1,-1 -25,1,507.13,83.943,101.52,230.37,-1,-1,-1,-1 -25,3,406.85,83.938,103.45,234.75,-1,-1,-1,-1 -25,5,273.08,103.36,88.294,200.36,-1,-1,-1,-1 -25,11,164.88,87.584,66.105,150.01,-1,-1,-1,-1 -26,1,509.79,83.796,101.48,230.27,-1,-1,-1,-1 -26,3,409.82,83.857,103.2,234.18,-1,-1,-1,-1 -26,5,276.64,103.84,88.045,199.8,-1,-1,-1,-1 -26,11,164.46,86.759,66.936,151.89,-1,-1,-1,-1 -27,1,512.25,83.831,101.42,230.15,-1,-1,-1,-1 -27,3,412.79,83.815,102.87,233.43,-1,-1,-1,-1 -27,5,280.26,104.34,87.772,199.18,-1,-1,-1,-1 -27,11,164.08,86.205,67.7,153.63,-1,-1,-1,-1 -28,1,514.63,83.961,101.35,229.98,-1,-1,-1,-1 -28,3,415.81,83.828,102.46,232.51,-1,-1,-1,-1 -28,5,283.94,104.76,87.48,198.51,-1,-1,-1,-1 -28,11,163.75,85.964,68.381,155.17,-1,-1,-1,-1 -29,1,517.06,84.126,101.24,229.73,-1,-1,-1,-1 -29,3,418.9,83.932,101.98,231.42,-1,-1,-1,-1 -29,5,287.71,105.08,87.172,197.82,-1,-1,-1,-1 -29,11,163.46,86.014,68.97,156.51,-1,-1,-1,-1 -30,1,519.62,84.297,101.09,229.4,-1,-1,-1,-1 -30,3,422.06,84.171,101.43,230.18,-1,-1,-1,-1 -30,5,291.55,105.31,86.855,197.1,-1,-1,-1,-1 -30,11,163.22,86.342,69.46,157.62,-1,-1,-1,-1 -31,1,522.35,84.456,100.89,228.95,-1,-1,-1,-1 -31,3,425.28,84.583,100.82,228.78,-1,-1,-1,-1 -31,5,295.47,105.45,86.536,196.37,-1,-1,-1,-1 -31,11,163.05,86.852,69.849,158.5,-1,-1,-1,-1 -32,1,525.22,84.6,100.65,228.39,-1,-1,-1,-1 -32,3,428.54,85.218,100.14,227.25,-1,-1,-1,-1 -32,5,299.45,105.55,86.221,195.66,-1,-1,-1,-1 -32,11,162.94,87.494,70.138,159.16,-1,-1,-1,-1 
-33,1,528.15,84.733,100.34,227.7,-1,-1,-1,-1 -33,3,431.75,86.059,99.414,225.6,-1,-1,-1,-1 -33,5,303.5,105.53,85.918,194.97,-1,-1,-1,-1 -33,11,162.91,88.27,70.329,159.59,-1,-1,-1,-1 -34,1,531.06,84.878,99.982,226.89,-1,-1,-1,-1 -34,3,434.89,87.004,98.643,223.85,-1,-1,-1,-1 -34,5,307.63,105.39,85.631,194.32,-1,-1,-1,-1 -34,11,162.92,89.195,70.429,159.82,-1,-1,-1,-1 -35,1,533.87,85.094,99.57,225.95,-1,-1,-1,-1 -35,3,437.91,88.01,97.84,222.02,-1,-1,-1,-1 -35,5,311.87,105.12,85.369,193.72,-1,-1,-1,-1 -35,11,162.96,90.252,70.445,159.86,-1,-1,-1,-1 -36,1,536.54,85.473,99.107,224.9,-1,-1,-1,-1 -36,3,440.85,89.03,97.016,220.15,-1,-1,-1,-1 -36,5,316.17,104.74,85.135,193.19,-1,-1,-1,-1 -36,11,163.02,91.419,70.386,159.72,-1,-1,-1,-1 -37,1,539,86.055,98.602,223.75,-1,-1,-1,-1 -37,3,443.69,90.029,96.183,218.26,-1,-1,-1,-1 -37,5,320.51,104.18,84.934,192.74,-1,-1,-1,-1 -37,11,163.07,92.642,70.262,159.44,-1,-1,-1,-1 -38,1,541.15,86.826,98.06,222.52,-1,-1,-1,-1 -38,3,446.38,91.004,95.356,216.39,-1,-1,-1,-1 -38,5,324.86,103.46,84.769,192.36,-1,-1,-1,-1 -38,11,163.09,93.875,70.083,159.03,-1,-1,-1,-1 -39,1,542.92,87.761,97.492,221.23,-1,-1,-1,-1 -39,3,448.91,91.901,94.551,214.56,-1,-1,-1,-1 -39,5,329.24,102.69,84.644,192.08,-1,-1,-1,-1 -39,11,163.14,95.037,69.86,158.53,-1,-1,-1,-1 -40,1,544.26,88.8,96.907,219.91,-1,-1,-1,-1 -40,3,451.32,92.677,93.785,212.82,-1,-1,-1,-1 -40,5,333.66,102.01,84.559,191.89,-1,-1,-1,-1 -40,11,163.2,96.048,69.605,157.95,-1,-1,-1,-1 -41,1,545.13,89.944,96.317,218.57,-1,-1,-1,-1 -41,3,453.58,93.321,93.076,211.22,-1,-1,-1,-1 -41,5,338.11,101.44,84.514,191.78,-1,-1,-1,-1 -41,11,163.31,96.877,69.327,157.32,-1,-1,-1,-1 -42,1,545.58,91.159,95.731,217.24,-1,-1,-1,-1 -42,3,455.66,93.867,92.445,209.78,-1,-1,-1,-1 -42,5,342.57,101.02,84.507,191.77,-1,-1,-1,-1 -42,11,163.44,97.508,69.039,156.67,-1,-1,-1,-1 -43,1,545.62,92.383,95.162,215.95,-1,-1,-1,-1 -43,3,457.47,94.398,91.913,208.58,-1,-1,-1,-1 -43,5,347.01,100.71,84.537,191.84,-1,-1,-1,-1 -43,11,163.58,97.945,68.749,156.01,-1,-1,-1,-1 -44,1,545.19,93.536,94.618,214.71,-1,-1,-1,-1 -44,3,458.98,94.946,91.503,207.65,-1,-1,-1,-1 -44,5,351.46,100.47,84.598,191.98,-1,-1,-1,-1 -44,11,163.69,98.215,68.467,155.37,-1,-1,-1,-1 -45,1,544.33,94.606,94.109,213.56,-1,-1,-1,-1 -45,3,460.25,95.449,91.239,207.05,-1,-1,-1,-1 -45,5,355.89,100.22,84.687,192.18,-1,-1,-1,-1 -45,11,163.83,98.263,68.202,154.77,-1,-1,-1,-1 -46,1,542.95,95.544,93.643,212.5,-1,-1,-1,-1 -46,3,461.28,95.828,91.147,206.84,-1,-1,-1,-1 -46,5,360.33,99.957,84.798,192.43,-1,-1,-1,-1 -46,11,164,98.093,67.961,154.22,-1,-1,-1,-1 -47,1,541.11,96.292,93.227,211.56,-1,-1,-1,-1 -47,3,462.05,95.994,91.254,207.08,-1,-1,-1,-1 -47,5,364.75,99.618,84.922,192.71,-1,-1,-1,-1 -47,11,164.22,97.741,67.751,153.74,-1,-1,-1,-1 -48,1,538.87,96.774,92.865,210.74,-1,-1,-1,-1 -48,3,462.59,95.84,91.59,207.84,-1,-1,-1,-1 -48,5,369.16,99.167,85.053,193.01,-1,-1,-1,-1 -48,11,164.48,97.283,67.577,153.35,-1,-1,-1,-1 -49,1,536.24,96.94,92.559,210.04,-1,-1,-1,-1 -49,3,462.88,95.254,92.185,209.19,-1,-1,-1,-1 -49,5,373.61,98.612,85.183,193.3,-1,-1,-1,-1 -49,11,164.77,96.828,67.443,153.04,-1,-1,-1,-1 -50,1,533.3,96.808,92.312,209.48,-1,-1,-1,-1 -50,3,462.94,94.13,93.07,211.2,-1,-1,-1,-1 -50,5,378.07,98.005,85.302,193.57,-1,-1,-1,-1 -50,11,165.07,96.501,67.353,152.84,-1,-1,-1,-1 -51,1,530.14,96.432,92.121,209.05,-1,-1,-1,-1 -51,3,462.75,92.362,94.28,213.95,-1,-1,-1,-1 -51,5,382.47,97.398,85.403,193.8,-1,-1,-1,-1 -51,11,165.34,96.351,67.309,152.74,-1,-1,-1,-1 -52,1,526.94,95.899,91.984,208.74,-1,-1,-1,-1 
-52,3,462.35,89.807,95.849,217.51,-1,-1,-1,-1 -52,5,386.71,96.853,85.477,193.97,-1,-1,-1,-1 -52,11,165.53,96.365,67.312,152.75,-1,-1,-1,-1 -53,1,523.89,95.285,91.895,208.54,-1,-1,-1,-1 -53,3,461.75,86.352,97.813,221.97,-1,-1,-1,-1 -53,5,390.72,96.409,85.515,194.06,-1,-1,-1,-1 -53,11,165.59,96.499,67.362,152.86,-1,-1,-1,-1 -54,1,521.19,94.597,91.848,208.43,-1,-1,-1,-1 -54,5,394.41,96.1,85.51,194.04,-1,-1,-1,-1 -54,11,165.51,96.758,67.458,153.08,-1,-1,-1,-1 -55,1,519.13,93.843,91.835,208.4,-1,-1,-1,-1 -55,5,397.72,95.895,85.456,193.92,-1,-1,-1,-1 -55,11,165.29,97.105,67.599,153.4,-1,-1,-1,-1 -56,1,517.92,93.036,91.846,208.42,-1,-1,-1,-1 -56,5,400.63,95.732,85.347,193.67,-1,-1,-1,-1 -56,11,164.99,97.407,67.782,153.81,-1,-1,-1,-1 -57,1,517.58,92.206,91.87,208.48,-1,-1,-1,-1 -57,5,403.19,95.539,85.179,193.29,-1,-1,-1,-1 -57,11,164.67,97.58,68.004,154.32,-1,-1,-1,-1 -58,1,518.06,91.393,91.896,208.54,-1,-1,-1,-1 -58,5,405.5,95.302,84.949,192.77,-1,-1,-1,-1 -58,11,164.34,97.598,68.26,154.9,-1,-1,-1,-1 -59,1,519.27,90.605,91.912,208.58,-1,-1,-1,-1 -59,5,407.68,95.068,84.657,192.11,-1,-1,-1,-1 -59,11,164,97.459,68.546,155.55,-1,-1,-1,-1 -60,1,521.08,89.857,91.908,208.57,-1,-1,-1,-1 -60,5,409.89,94.881,84.303,191.3,-1,-1,-1,-1 -60,11,163.67,97.191,68.857,156.25,-1,-1,-1,-1 -61,1,523.43,89.13,91.872,208.48,-1,-1,-1,-1 -61,5,412.25,94.747,83.891,190.37,-1,-1,-1,-1 -61,11,163.32,96.843,69.187,157,-1,-1,-1,-1 -62,1,526.18,88.444,91.793,208.3,-1,-1,-1,-1 -62,5,414.89,94.782,83.426,189.31,-1,-1,-1,-1 -62,11,162.99,96.36,69.532,157.78,-1,-1,-1,-1 -63,1,529.2,87.856,91.662,208.01,-1,-1,-1,-1 -63,5,417.88,95.002,82.915,188.16,-1,-1,-1,-1 -63,11,162.71,95.726,69.884,158.58,-1,-1,-1,-1 -64,1,532.35,87.382,91.471,207.57,-1,-1,-1,-1 -64,5,421.24,95.412,82.369,186.92,-1,-1,-1,-1 -64,11,162.49,94.932,70.239,159.39,-1,-1,-1,-1 -65,1,535.48,87.008,91.214,206.99,-1,-1,-1,-1 -65,5,424.94,96.019,81.797,185.62,-1,-1,-1,-1 -65,11,162.34,93.98,70.591,160.19,-1,-1,-1,-1 -66,1,538.6,86.726,90.886,206.25,-1,-1,-1,-1 -66,5,428.89,96.804,81.213,184.29,-1,-1,-1,-1 -66,11,162.27,92.9,70.933,160.96,-1,-1,-1,-1 -67,1,541.65,86.511,90.486,205.34,-1,-1,-1,-1 -67,2,368.38,84.482,80.264,182.14,-1,-1,-1,-1 -67,5,432.96,97.72,80.632,182.97,-1,-1,-1,-1 -67,11,162.28,91.728,71.261,161.71,-1,-1,-1,-1 -68,1,544.59,86.383,90.013,204.26,-1,-1,-1,-1 -68,2,368.26,86.399,79.339,180.04,-1,-1,-1,-1 -68,5,436.93,98.738,80.068,181.69,-1,-1,-1,-1 -68,11,162.37,90.521,71.568,162.41,-1,-1,-1,-1 -69,1,547.24,86.422,89.469,203.03,-1,-1,-1,-1 -69,2,367.96,87.645,78.712,178.62,-1,-1,-1,-1 -69,5,440.71,99.776,79.536,180.49,-1,-1,-1,-1 -69,11,162.52,89.368,71.851,163.05,-1,-1,-1,-1 -70,1,549.56,86.701,88.86,201.65,-1,-1,-1,-1 -70,2,367.51,88.465,78.276,177.63,-1,-1,-1,-1 -70,5,444.22,100.73,79.052,179.39,-1,-1,-1,-1 -70,11,162.71,88.317,72.105,163.62,-1,-1,-1,-1 -71,1,551.52,87.251,88.191,200.13,-1,-1,-1,-1 -71,2,366.96,89.072,77.954,176.9,-1,-1,-1,-1 -71,5,447.42,101.54,78.629,178.43,-1,-1,-1,-1 -71,11,162.94,87.4,72.326,164.12,-1,-1,-1,-1 -72,1,553.15,88.112,87.472,198.5,-1,-1,-1,-1 -72,2,366.33,89.625,77.693,176.3,-1,-1,-1,-1 -72,5,450.21,102.12,78.279,177.63,-1,-1,-1,-1 -72,11,163.18,86.629,72.51,164.54,-1,-1,-1,-1 -73,1,554.48,89.225,86.714,196.78,-1,-1,-1,-1 -73,2,365.67,90.213,77.456,175.77,-1,-1,-1,-1 -73,5,452.6,102.45,78.009,177.02,-1,-1,-1,-1 -73,11,163.42,85.994,72.655,164.87,-1,-1,-1,-1 -74,1,555.42,90.529,85.929,194.99,-1,-1,-1,-1 -74,2,364.99,90.845,77.225,175.24,-1,-1,-1,-1 -74,5,454.7,102.55,77.822,176.6,-1,-1,-1,-1 
-74,11,163.66,85.479,72.758,165.1,-1,-1,-1,-1 -75,1,555.99,91.897,85.131,193.18,-1,-1,-1,-1 -75,2,364.29,91.477,76.99,174.71,-1,-1,-1,-1 -75,5,456.58,102.45,77.716,176.36,-1,-1,-1,-1 -75,11,163.92,85.102,72.819,165.24,-1,-1,-1,-1 -76,1,556.2,93.232,84.335,191.38,-1,-1,-1,-1 -76,2,363.56,92.056,76.75,174.16,-1,-1,-1,-1 -76,5,458.35,102.18,77.678,176.27,-1,-1,-1,-1 -76,11,164.23,84.869,72.835,165.28,-1,-1,-1,-1 -77,1,556.03,94.479,83.556,189.61,-1,-1,-1,-1 -77,2,362.78,92.546,76.51,173.62,-1,-1,-1,-1 -77,5,460.11,101.79,77.687,176.29,-1,-1,-1,-1 -77,11,164.59,84.784,72.806,165.21,-1,-1,-1,-1 -78,1,555.56,95.595,82.81,187.92,-1,-1,-1,-1 -78,2,361.94,92.918,76.279,173.09,-1,-1,-1,-1 -78,5,461.94,101.39,77.707,176.34,-1,-1,-1,-1 -78,11,165.03,84.839,72.734,165.05,-1,-1,-1,-1 -79,1,554.77,96.534,82.111,186.33,-1,-1,-1,-1 -79,2,361.04,93.17,76.066,172.61,-1,-1,-1,-1 -79,5,463.91,101.08,77.69,176.3,-1,-1,-1,-1 -79,11,165.56,85.023,72.618,164.79,-1,-1,-1,-1 -80,1,553.66,97.272,81.473,184.88,-1,-1,-1,-1 -80,2,360.1,93.266,75.881,172.19,-1,-1,-1,-1 -80,5,466.05,101.01,77.568,176.02,-1,-1,-1,-1 -80,11,166.21,85.318,72.46,164.43,-1,-1,-1,-1 -81,1,552.21,97.822,80.909,183.6,-1,-1,-1,-1 -81,2,359.09,93.141,75.733,171.85,-1,-1,-1,-1 -81,5,468.4,101.36,77.251,175.3,-1,-1,-1,-1 -81,11,166.97,85.705,72.263,163.98,-1,-1,-1,-1 -82,1,550.37,98.252,80.429,182.51,-1,-1,-1,-1 -82,2,358,92.74,75.629,171.62,-1,-1,-1,-1 -82,5,471,102.35,76.627,173.89,-1,-1,-1,-1 -82,11,167.79,86.157,72.029,163.45,-1,-1,-1,-1 -83,1,548.07,98.533,80.041,181.63,-1,-1,-1,-1 -83,2,356.8,92.038,75.573,171.49,-1,-1,-1,-1 -83,5,473.95,104.28,75.555,171.45,-1,-1,-1,-1 -83,11,168.6,86.645,71.761,162.84,-1,-1,-1,-1 -84,1,545.27,98.668,79.749,180.97,-1,-1,-1,-1 -84,2,355.48,91.035,75.569,171.48,-1,-1,-1,-1 -84,5,477.32,107.56,73.859,167.6,-1,-1,-1,-1 -84,11,169.37,87.138,71.463,162.16,-1,-1,-1,-1 -85,1,542.03,98.653,79.556,180.53,-1,-1,-1,-1 -85,2,354.04,89.839,75.614,171.59,-1,-1,-1,-1 -85,5,481.21,112.71,71.329,161.86,-1,-1,-1,-1 -85,11,170.1,87.602,71.138,161.43,-1,-1,-1,-1 -86,1,538.59,98.508,79.461,180.32,-1,-1,-1,-1 -86,2,352.5,88.594,75.706,171.79,-1,-1,-1,-1 -86,11,170.79,88.005,70.791,160.64,-1,-1,-1,-1 -87,1,535.15,98.269,79.457,180.31,-1,-1,-1,-1 -87,2,350.89,87.523,75.836,172.09,-1,-1,-1,-1 -87,11,171.45,88.318,70.427,159.81,-1,-1,-1,-1 -88,1,531.87,97.949,79.537,180.49,-1,-1,-1,-1 -88,2,349.2,86.729,75.997,172.45,-1,-1,-1,-1 -88,11,172.08,88.517,70.05,158.96,-1,-1,-1,-1 -89,1,528.95,97.532,79.686,180.83,-1,-1,-1,-1 -89,2,347.45,86.229,76.176,172.86,-1,-1,-1,-1 -89,11,172.69,88.588,69.665,158.08,-1,-1,-1,-1 -90,1,526.5,96.947,79.891,181.29,-1,-1,-1,-1 -90,2,345.63,85.995,76.36,173.28,-1,-1,-1,-1 -90,11,173.31,88.53,69.278,157.21,-1,-1,-1,-1 -91,1,524.64,96.198,80.13,181.84,-1,-1,-1,-1 -91,2,343.72,85.992,76.536,173.68,-1,-1,-1,-1 -91,11,173.93,88.365,68.892,156.33,-1,-1,-1,-1 -92,1,523.4,95.388,80.385,182.41,-1,-1,-1,-1 -92,2,341.74,86.146,76.687,174.02,-1,-1,-1,-1 -92,11,174.53,88.14,68.514,155.47,-1,-1,-1,-1 -93,1,522.81,94.614,80.63,182.97,-1,-1,-1,-1 -93,2,339.66,86.411,76.799,174.27,-1,-1,-1,-1 -93,11,175.1,87.939,68.148,154.64,-1,-1,-1,-1 -94,1,522.9,93.942,80.842,183.45,-1,-1,-1,-1 -94,2,337.44,86.741,76.855,174.4,-1,-1,-1,-1 -94,11,175.64,87.812,67.798,153.85,-1,-1,-1,-1 -95,1,523.67,93.403,80.997,183.8,-1,-1,-1,-1 -95,2,335.06,87.126,76.843,174.37,-1,-1,-1,-1 -95,11,176.14,87.79,67.469,153.1,-1,-1,-1,-1 -96,1,525.2,92.996,81.072,183.97,-1,-1,-1,-1 -96,2,332.49,87.563,76.749,174.16,-1,-1,-1,-1 
-96,11,176.6,87.882,67.165,152.41,-1,-1,-1,-1 -97,1,527.53,92.736,81.046,183.91,-1,-1,-1,-1 -97,2,329.71,88.088,76.562,173.74,-1,-1,-1,-1 -97,11,177.01,88.083,66.89,151.79,-1,-1,-1,-1 -98,1,530.65,92.625,80.903,183.59,-1,-1,-1,-1 -98,2,326.71,88.751,76.273,173.08,-1,-1,-1,-1 -98,11,177.37,88.377,66.646,151.23,-1,-1,-1,-1 -99,1,534.41,92.646,80.63,182.97,-1,-1,-1,-1 -99,2,323.52,89.595,75.876,172.18,-1,-1,-1,-1 -99,11,177.66,88.75,66.437,150.76,-1,-1,-1,-1 -100,1,538.6,92.769,80.22,182.04,-1,-1,-1,-1 -100,2,320.27,90.594,75.367,171.02,-1,-1,-1,-1 -100,11,177.85,89.182,66.265,150.37,-1,-1,-1,-1 -100,12,499.13,176.65,41.191,93.471,-1,-1,-1,-1 -101,1,543.03,92.946,79.675,180.8,-1,-1,-1,-1 -101,2,317.05,91.783,74.747,169.62,-1,-1,-1,-1 -101,11,177.92,89.647,66.131,150.07,-1,-1,-1,-1 -101,12,496.56,163.59,46.581,105.7,-1,-1,-1,-1 -102,1,547.57,93.203,79.002,179.27,-1,-1,-1,-1 -102,2,314,93.148,74.018,167.96,-1,-1,-1,-1 -102,11,177.82,90.108,66.037,149.85,-1,-1,-1,-1 -102,12,494.36,152.17,51.254,116.31,-1,-1,-1,-1 -103,1,552.03,93.619,78.219,177.5,-1,-1,-1,-1 -103,2,311.24,94.679,73.187,166.08,-1,-1,-1,-1 -103,9,399.49,172.11,27.7,62.858,-1,-1,-1,-1 -103,11,177.56,90.563,65.983,149.73,-1,-1,-1,-1 -103,12,492.54,142.38,55.232,125.33,-1,-1,-1,-1 -104,1,556.39,94.202,77.35,175.52,-1,-1,-1,-1 -104,2,308.92,96.252,72.262,163.98,-1,-1,-1,-1 -104,9,397.76,168.04,30.041,68.17,-1,-1,-1,-1 -104,11,177.22,91.032,65.97,149.7,-1,-1,-1,-1 -104,12,491.13,134.13,58.548,132.86,-1,-1,-1,-1 -105,1,560.66,94.892,76.428,173.43,-1,-1,-1,-1 -105,2,307.07,97.79,71.257,161.7,-1,-1,-1,-1 -105,9,396.04,164.02,32.382,73.483,-1,-1,-1,-1 -105,11,176.87,91.521,65.996,149.76,-1,-1,-1,-1 -105,12,490.19,127.3,61.24,138.97,-1,-1,-1,-1 -106,1,564.9,95.656,75.492,171.31,-1,-1,-1,-1 -106,2,305.67,99.224,70.188,159.27,-1,-1,-1,-1 -106,9,394.37,160.03,34.723,78.796,-1,-1,-1,-1 -106,11,176.57,92.025,66.06,149.9,-1,-1,-1,-1 -106,12,489.74,121.73,63.353,143.76,-1,-1,-1,-1 -107,1,569.04,96.444,74.584,169.25,-1,-1,-1,-1 -107,2,304.64,100.53,69.073,156.74,-1,-1,-1,-1 -107,9,392.75,156.08,37.064,84.109,-1,-1,-1,-1 -107,11,176.35,92.516,66.161,150.13,-1,-1,-1,-1 -107,12,489.84,117.28,64.934,147.35,-1,-1,-1,-1 -108,1,573.11,97.191,73.749,167.35,-1,-1,-1,-1 -108,2,303.94,101.67,67.932,154.15,-1,-1,-1,-1 -108,9,391.19,152.16,39.405,89.421,-1,-1,-1,-1 -108,11,176.23,93.016,66.296,150.44,-1,-1,-1,-1 -108,12,490.45,113.82,66.037,149.85,-1,-1,-1,-1 -109,1,577.07,97.844,73.023,165.7,-1,-1,-1,-1 -109,2,303.53,102.68,66.788,151.56,-1,-1,-1,-1 -109,9,389.65,148.27,41.746,94.734,-1,-1,-1,-1 -109,11,176.26,93.517,66.462,150.82,-1,-1,-1,-1 -109,12,491.54,111.25,66.715,151.39,-1,-1,-1,-1 -110,1,580.91,98.323,72.433,164.37,-1,-1,-1,-1 -110,2,303.32,103.58,65.666,149.01,-1,-1,-1,-1 -110,9,388.14,144.4,44.088,100.05,-1,-1,-1,-1 -110,11,176.43,93.981,66.657,151.26,-1,-1,-1,-1 -110,12,493.01,109.44,67.02,152.08,-1,-1,-1,-1 -111,1,584.56,98.561,71.985,163.35,-1,-1,-1,-1 -111,2,303.24,104.48,64.59,146.57,-1,-1,-1,-1 -111,9,386.64,140.52,46.429,105.36,-1,-1,-1,-1 -111,11,176.75,94.326,66.875,151.75,-1,-1,-1,-1 -111,12,494.73,108.28,67.007,152.05,-1,-1,-1,-1 -112,1,587.99,98.571,71.654,162.6,-1,-1,-1,-1 -112,2,303.2,105.38,63.585,144.29,-1,-1,-1,-1 -112,10,425.04,113.09,66.686,151.33,-1,-1,-1,-1 -112,11,177.21,94.5,67.113,152.29,-1,-1,-1,-1 -112,12,496.52,107.63,66.729,151.42,-1,-1,-1,-1 -113,1,591.34,98.484,71.366,161.94,-1,-1,-1,-1 -113,2,303.07,106.22,62.676,142.22,-1,-1,-1,-1 -113,10,424.74,118.84,63.69,144.53,-1,-1,-1,-1 
-113,11,177.79,94.476,67.366,152.87,-1,-1,-1,-1 -113,12,498.19,107.43,66.236,150.3,-1,-1,-1,-1 -114,1,594.71,98.599,70.986,161.08,-1,-1,-1,-1 -114,2,302.78,106.92,61.886,140.43,-1,-1,-1,-1 -114,10,424.09,123.02,61.395,139.32,-1,-1,-1,-1 -114,11,178.45,94.244,67.63,153.47,-1,-1,-1,-1 -114,12,499.59,107.57,65.576,148.8,-1,-1,-1,-1 -115,1,598.25,99.411,70.293,159.51,-1,-1,-1,-1 -115,2,302.32,107.41,61.237,138.96,-1,-1,-1,-1 -115,10,423.1,125.84,59.71,135.5,-1,-1,-1,-1 -115,11,179.1,93.811,67.9,154.08,-1,-1,-1,-1 -115,12,500.66,107.99,64.794,147.03,-1,-1,-1,-1 -116,1,602.17,101.71,68.955,156.47,-1,-1,-1,-1 -116,2,301.73,107.62,60.748,137.85,-1,-1,-1,-1 -116,10,421.79,127.5,58.551,132.87,-1,-1,-1,-1 -116,11,179.71,93.198,68.169,154.69,-1,-1,-1,-1 -116,12,501.32,108.63,63.933,145.08,-1,-1,-1,-1 -117,1,606.72,106.57,66.502,150.91,-1,-1,-1,-1 -117,2,301.01,107.5,60.434,137.14,-1,-1,-1,-1 -117,10,420.2,128.23,57.841,131.25,-1,-1,-1,-1 -117,11,180.24,92.437,68.434,155.29,-1,-1,-1,-1 -117,12,501.58,109.45,63.03,143.03,-1,-1,-1,-1 -118,2,300.15,107.03,60.308,136.85,-1,-1,-1,-1 -118,10,418.35,128.22,57.51,130.5,-1,-1,-1,-1 -118,11,180.65,91.572,68.688,155.87,-1,-1,-1,-1 -118,12,501.48,110.37,62.119,140.96,-1,-1,-1,-1 -119,2,299.16,106.23,60.376,137.01,-1,-1,-1,-1 -119,10,416.33,127.64,57.494,130.47,-1,-1,-1,-1 -119,11,180.96,90.605,68.928,156.41,-1,-1,-1,-1 -119,12,501.05,111.35,61.231,138.95,-1,-1,-1,-1 -120,2,298.04,105.16,60.639,137.6,-1,-1,-1,-1 -120,10,414.26,126.69,57.733,131.01,-1,-1,-1,-1 -120,11,181.19,89.548,69.147,156.91,-1,-1,-1,-1 -120,12,500.36,112.37,60.39,137.04,-1,-1,-1,-1 -121,2,296.78,103.84,61.093,138.63,-1,-1,-1,-1 -121,10,412.17,125.43,58.175,132.01,-1,-1,-1,-1 -121,11,181.33,88.438,69.342,157.35,-1,-1,-1,-1 -121,12,499.4,113.42,59.619,135.29,-1,-1,-1,-1 -122,2,295.38,102.36,61.727,140.07,-1,-1,-1,-1 -122,10,410.11,123.95,58.772,133.37,-1,-1,-1,-1 -122,11,181.42,87.276,69.508,157.73,-1,-1,-1,-1 -122,12,498.22,114.47,58.935,133.74,-1,-1,-1,-1 -123,2,293.85,100.8,62.525,141.88,-1,-1,-1,-1 -123,10,408.14,122.35,59.482,134.98,-1,-1,-1,-1 -123,11,181.46,86.091,69.641,158.03,-1,-1,-1,-1 -123,12,496.86,115.5,58.35,132.41,-1,-1,-1,-1 -124,2,292.21,99.201,63.462,144.01,-1,-1,-1,-1 -124,10,406.25,120.64,60.265,136.75,-1,-1,-1,-1 -124,11,181.48,84.942,69.739,158.25,-1,-1,-1,-1 -124,12,495.37,116.51,57.873,131.33,-1,-1,-1,-1 -125,2,290.48,97.594,64.508,146.38,-1,-1,-1,-1 -125,10,404.46,118.87,61.089,138.62,-1,-1,-1,-1 -125,11,181.47,83.928,69.798,158.39,-1,-1,-1,-1 -125,12,493.78,117.49,57.509,130.5,-1,-1,-1,-1 -126,2,288.74,95.937,65.625,148.92,-1,-1,-1,-1 -126,10,402.76,117.12,61.925,140.52,-1,-1,-1,-1 -126,11,181.45,83.112,69.816,158.43,-1,-1,-1,-1 -126,12,492.12,118.41,57.26,129.94,-1,-1,-1,-1 -127,2,287.03,94.26,66.77,151.52,-1,-1,-1,-1 -127,10,401.15,115.46,62.747,142.39,-1,-1,-1,-1 -127,11,181.41,82.541,69.792,158.37,-1,-1,-1,-1 -127,12,490.37,119.22,57.124,129.63,-1,-1,-1,-1 -128,2,285.41,92.613,67.895,154.07,-1,-1,-1,-1 -128,10,399.58,113.94,63.533,144.17,-1,-1,-1,-1 -128,11,181.37,82.232,69.726,158.22,-1,-1,-1,-1 -128,12,488.52,119.88,57.097,129.57,-1,-1,-1,-1 -129,2,283.9,91.091,68.947,156.46,-1,-1,-1,-1 -129,10,398,112.57,64.266,145.83,-1,-1,-1,-1 -129,11,181.32,82.18,69.618,157.98,-1,-1,-1,-1 -129,12,486.63,120.34,57.17,129.73,-1,-1,-1,-1 -130,2,282.53,89.795,69.87,158.55,-1,-1,-1,-1 -130,10,396.36,111.34,64.93,147.34,-1,-1,-1,-1 -130,11,181.29,82.36,69.468,157.64,-1,-1,-1,-1 -130,12,484.75,120.56,57.334,130.11,-1,-1,-1,-1 -131,2,281.32,88.852,70.606,160.22,-1,-1,-1,-1 
-131,10,394.64,110.25,65.515,148.67,-1,-1,-1,-1 -131,11,181.28,82.738,69.279,157.21,-1,-1,-1,-1 -131,12,482.89,120.54,57.578,130.66,-1,-1,-1,-1 -132,2,280.28,88.385,71.1,161.34,-1,-1,-1,-1 -132,10,392.86,109.33,66.01,149.79,-1,-1,-1,-1 -132,11,181.3,83.272,69.052,156.69,-1,-1,-1,-1 -132,12,481.04,120.32,57.888,131.36,-1,-1,-1,-1 -133,2,279.39,88.524,71.299,161.79,-1,-1,-1,-1 -133,10,391.06,108.6,66.411,150.7,-1,-1,-1,-1 -133,11,181.34,83.977,68.792,156.1,-1,-1,-1,-1 -133,12,479.21,119.91,58.25,132.18,-1,-1,-1,-1 -134,2,278.67,89.373,71.155,161.47,-1,-1,-1,-1 -134,10,389.29,108.09,66.712,151.38,-1,-1,-1,-1 -134,11,181.38,84.851,68.503,155.45,-1,-1,-1,-1 -134,12,477.39,119.34,58.649,133.09,-1,-1,-1,-1 -135,2,278.08,91.019,70.631,160.28,-1,-1,-1,-1 -135,10,387.61,107.84,66.913,151.84,-1,-1,-1,-1 -135,11,181.4,85.871,68.19,154.74,-1,-1,-1,-1 -135,12,475.57,118.61,59.07,134.04,-1,-1,-1,-1 -136,2,277.62,93.498,69.705,158.18,-1,-1,-1,-1 -136,10,386.04,107.87,67.013,152.07,-1,-1,-1,-1 -136,11,181.39,86.997,67.857,153.98,-1,-1,-1,-1 -136,12,473.71,117.75,59.498,135.01,-1,-1,-1,-1 -137,2,277.29,96.798,68.371,155.15,-1,-1,-1,-1 -137,10,384.58,108.19,67.015,152.07,-1,-1,-1,-1 -137,11,181.32,88.174,67.512,153.2,-1,-1,-1,-1 -137,12,471.8,116.8,59.918,135.97,-1,-1,-1,-1 -138,2,277.08,100.88,66.648,151.24,-1,-1,-1,-1 -138,10,383.2,108.73,66.922,151.86,-1,-1,-1,-1 -138,11,181.18,89.33,67.161,152.4,-1,-1,-1,-1 -138,12,469.84,115.81,60.316,136.87,-1,-1,-1,-1 -139,2,276.99,105.68,64.585,146.56,-1,-1,-1,-1 -139,8,-27.108,101.78,79.487,180.38,-1,-1,-1,-1 -139,10,381.84,109.45,66.741,151.45,-1,-1,-1,-1 -139,11,180.96,90.379,66.811,151.61,-1,-1,-1,-1 -139,12,467.84,114.84,60.681,137.7,-1,-1,-1,-1 -140,2,276.96,111.01,62.266,141.3,-1,-1,-1,-1 -140,8,-21.33,103.29,78.943,179.14,-1,-1,-1,-1 -140,10,380.46,110.28,66.476,150.85,-1,-1,-1,-1 -140,11,180.64,91.281,66.469,150.83,-1,-1,-1,-1 -140,12,465.85,114.01,61.001,138.42,-1,-1,-1,-1 -141,2,276.88,116.59,59.82,135.74,-1,-1,-1,-1 -141,8,-15.54,104.98,78.346,177.79,-1,-1,-1,-1 -141,10,379.03,111.16,66.135,150.08,-1,-1,-1,-1 -141,11,180.21,92.007,66.142,150.09,-1,-1,-1,-1 -141,12,463.9,113.38,61.267,139.03,-1,-1,-1,-1 -142,2,276.61,122.01,57.427,130.32,-1,-1,-1,-1 -142,8,-9.727,106.83,77.703,176.33,-1,-1,-1,-1 -142,10,377.47,112.04,65.728,149.15,-1,-1,-1,-1 -142,11,179.66,92.578,65.838,149.4,-1,-1,-1,-1 -142,12,462.03,112.97,61.474,139.5,-1,-1,-1,-1 -143,2,276.01,126.71,55.329,125.55,-1,-1,-1,-1 -143,8,-3.8728,108.78,77.019,174.77,-1,-1,-1,-1 -143,10,375.75,112.82,65.263,148.1,-1,-1,-1,-1 -143,11,179.03,93.022,65.564,148.78,-1,-1,-1,-1 -143,12,460.2,112.78,61.615,139.82,-1,-1,-1,-1 -144,2,274.88,129.99,53.838,122.17,-1,-1,-1,-1 -144,8,2.0073,110.76,76.298,173.14,-1,-1,-1,-1 -144,10,373.87,113.53,64.749,146.93,-1,-1,-1,-1 -144,11,178.38,93.38,65.326,148.24,-1,-1,-1,-1 -144,12,458.41,112.75,61.689,139.99,-1,-1,-1,-1 -145,2,273.05,130.94,53.348,121.06,-1,-1,-1,-1 -145,8,7.8413,112.73,75.546,171.43,-1,-1,-1,-1 -145,10,371.79,114.2,64.198,145.68,-1,-1,-1,-1 -145,11,177.81,93.673,65.131,147.8,-1,-1,-1,-1 -145,12,456.62,112.81,61.698,140.01,-1,-1,-1,-1 -146,2,270.3,128.46,54.349,123.33,-1,-1,-1,-1 -146,8,13.564,114.74,74.768,169.67,-1,-1,-1,-1 -146,10,369.49,114.87,63.619,144.36,-1,-1,-1,-1 -146,11,177.39,93.894,64.984,147.46,-1,-1,-1,-1 -146,12,454.84,112.88,61.644,139.88,-1,-1,-1,-1 -147,2,266.38,121.22,57.437,130.34,-1,-1,-1,-1 -147,8,19.139,116.73,73.969,167.85,-1,-1,-1,-1 -147,10,366.97,115.58,63.023,143.01,-1,-1,-1,-1 -147,11,177.16,94.012,64.888,147.24,-1,-1,-1,-1 
-147,12,453.06,112.96,61.534,139.63,-1,-1,-1,-1 -148,2,261.02,107.61,63.329,143.71,-1,-1,-1,-1 -148,8,24.565,118.67,73.154,166,-1,-1,-1,-1 -148,10,364.28,116.31,62.42,141.64,-1,-1,-1,-1 -148,11,177.17,93.968,64.848,147.15,-1,-1,-1,-1 -148,12,451.26,113.04,61.377,139.28,-1,-1,-1,-1 -149,2,253.83,85.685,72.88,165.38,-1,-1,-1,-1 -149,8,29.841,120.52,72.328,164.13,-1,-1,-1,-1 -149,10,361.48,117.06,61.821,140.29,-1,-1,-1,-1 -149,11,177.4,93.756,64.865,147.19,-1,-1,-1,-1 -149,12,449.43,113.18,61.184,138.84,-1,-1,-1,-1 -150,8,34.933,122.31,71.496,162.24,-1,-1,-1,-1 -150,10,358.65,117.83,61.236,138.96,-1,-1,-1,-1 -150,11,177.81,93.422,64.939,147.36,-1,-1,-1,-1 -150,12,447.6,113.49,60.969,138.35,-1,-1,-1,-1 -151,8,39.846,124.06,70.664,160.35,-1,-1,-1,-1 -151,10,355.86,118.57,60.675,137.69,-1,-1,-1,-1 -151,11,178.31,92.984,65.069,147.65,-1,-1,-1,-1 -151,12,445.75,114.05,60.748,137.85,-1,-1,-1,-1 -152,8,44.606,125.75,69.835,158.47,-1,-1,-1,-1 -152,10,353.13,119.27,60.148,136.49,-1,-1,-1,-1 -152,11,178.85,92.446,65.252,148.07,-1,-1,-1,-1 -152,12,443.89,114.87,60.539,137.38,-1,-1,-1,-1 -153,8,49.262,127.39,69.017,156.61,-1,-1,-1,-1 -153,10,350.49,119.9,59.662,135.39,-1,-1,-1,-1 -153,11,179.37,91.805,65.482,148.59,-1,-1,-1,-1 -153,12,442.05,115.92,60.36,136.97,-1,-1,-1,-1 -154,8,53.861,128.98,68.213,154.79,-1,-1,-1,-1 -154,10,347.96,120.45,59.226,134.4,-1,-1,-1,-1 -154,11,179.86,91.055,65.754,149.21,-1,-1,-1,-1 -154,12,440.22,117.05,60.231,136.68,-1,-1,-1,-1 -155,8,58.411,130.53,67.429,153.01,-1,-1,-1,-1 -155,10,345.55,120.92,58.846,133.54,-1,-1,-1,-1 -155,11,180.3,90.185,66.059,149.9,-1,-1,-1,-1 -155,12,438.44,118.1,60.172,136.54,-1,-1,-1,-1 -156,8,62.974,132.05,66.669,151.29,-1,-1,-1,-1 -156,10,343.32,121.34,58.53,132.82,-1,-1,-1,-1 -156,11,180.68,89.244,66.385,150.64,-1,-1,-1,-1 -156,12,436.7,118.94,60.201,136.61,-1,-1,-1,-1 -157,8,67.594,133.56,65.94,149.63,-1,-1,-1,-1 -157,10,341.28,121.66,58.282,132.25,-1,-1,-1,-1 -157,11,181.01,88.281,66.721,151.4,-1,-1,-1,-1 -157,12,435.02,119.48,60.336,136.91,-1,-1,-1,-1 -158,8,72.286,135.02,65.245,148.06,-1,-1,-1,-1 -158,10,339.41,121.85,58.106,131.86,-1,-1,-1,-1 -158,11,181.29,87.341,67.053,152.16,-1,-1,-1,-1 -158,12,433.4,119.69,60.59,137.49,-1,-1,-1,-1 -159,8,77.009,136.37,64.591,146.57,-1,-1,-1,-1 -159,10,337.71,121.88,58.005,131.63,-1,-1,-1,-1 -159,11,181.53,86.459,67.366,152.87,-1,-1,-1,-1 -159,12,431.78,119.54,60.976,138.37,-1,-1,-1,-1 -160,8,81.693,137.57,63.981,145.19,-1,-1,-1,-1 -160,10,336.19,121.73,57.982,131.57,-1,-1,-1,-1 -160,11,181.77,85.669,67.644,153.5,-1,-1,-1,-1 -160,12,430.1,119,61.498,139.55,-1,-1,-1,-1 -161,8,86.286,138.6,63.422,143.92,-1,-1,-1,-1 -161,10,334.81,121.36,58.035,131.69,-1,-1,-1,-1 -161,11,182.01,84.995,67.873,154.02,-1,-1,-1,-1 -161,12,428.33,118.11,62.157,141.05,-1,-1,-1,-1 -162,8,90.719,139.46,62.919,142.78,-1,-1,-1,-1 -162,10,333.55,120.7,58.164,131.99,-1,-1,-1,-1 -162,11,182.28,84.456,68.036,154.39,-1,-1,-1,-1 -162,12,426.41,116.89,62.943,142.83,-1,-1,-1,-1 -163,8,94.986,140.13,62.476,141.77,-1,-1,-1,-1 -163,10,332.38,119.73,58.368,132.45,-1,-1,-1,-1 -163,11,182.57,84.066,68.121,154.58,-1,-1,-1,-1 -163,12,424.36,115.37,63.84,144.87,-1,-1,-1,-1 -164,8,99.075,140.6,62.098,140.91,-1,-1,-1,-1 -164,10,331.27,118.48,58.64,133.07,-1,-1,-1,-1 -164,11,182.87,83.827,68.116,154.57,-1,-1,-1,-1 -164,12,422.22,113.6,64.818,147.09,-1,-1,-1,-1 -165,8,102.95,140.86,61.791,140.22,-1,-1,-1,-1 -165,10,330.18,116.99,58.977,133.83,-1,-1,-1,-1 -165,11,183.21,83.786,68.013,154.34,-1,-1,-1,-1 -165,12,420.07,111.65,65.835,149.39,-1,-1,-1,-1 
-166,8,106.54,140.92,61.56,139.69,-1,-1,-1,-1 -166,10,329.11,115.36,59.37,134.72,-1,-1,-1,-1 -166,11,183.64,83.953,67.812,153.88,-1,-1,-1,-1 -166,12,417.96,109.61,66.832,151.66,-1,-1,-1,-1 -167,8,109.84,140.81,61.41,139.35,-1,-1,-1,-1 -167,10,328.04,113.68,59.811,135.73,-1,-1,-1,-1 -167,11,184.19,84.33,67.517,153.21,-1,-1,-1,-1 -167,12,415.96,107.63,67.736,153.71,-1,-1,-1,-1 -168,8,112.87,140.51,61.346,139.21,-1,-1,-1,-1 -168,10,327,112.04,60.29,136.81,-1,-1,-1,-1 -168,11,184.87,84.888,67.141,152.36,-1,-1,-1,-1 -168,12,414.11,105.99,68.451,155.33,-1,-1,-1,-1 -169,8,115.68,140.05,61.372,139.27,-1,-1,-1,-1 -169,10,326.03,110.48,60.794,137.95,-1,-1,-1,-1 -169,11,185.68,85.569,66.711,151.38,-1,-1,-1,-1 -169,12,412.46,104.97,68.859,156.26,-1,-1,-1,-1 -170,8,118.31,139.43,61.495,139.55,-1,-1,-1,-1 -170,10,325.18,109.06,61.309,139.12,-1,-1,-1,-1 -170,11,186.58,86.275,66.265,150.37,-1,-1,-1,-1 -170,12,411.06,104.91,68.816,156.16,-1,-1,-1,-1 -171,8,120.8,138.65,61.719,140.05,-1,-1,-1,-1 -171,10,324.48,107.79,61.819,140.28,-1,-1,-1,-1 -171,11,187.54,86.865,65.86,149.45,-1,-1,-1,-1 -171,12,409.97,106.23,68.152,154.65,-1,-1,-1,-1 -172,8,123.17,137.71,62.05,140.8,-1,-1,-1,-1 -172,10,323.92,106.73,62.308,141.39,-1,-1,-1,-1 -172,11,188.48,87.148,65.574,148.8,-1,-1,-1,-1 -172,12,409.31,109.41,66.664,151.28,-1,-1,-1,-1 -173,7,430.33,141.89,48.719,110.55,-1,-1,-1,-1 -173,8,125.5,136.61,62.492,141.81,-1,-1,-1,-1 -173,10,323.46,105.93,62.755,142.41,-1,-1,-1,-1 -173,11,189.37,86.928,65.506,148.65,-1,-1,-1,-1 -174,7,425.77,134.94,52.855,119.94,-1,-1,-1,-1 -174,8,127.81,135.36,63.05,143.07,-1,-1,-1,-1 -174,10,323.1,105.43,63.141,143.28,-1,-1,-1,-1 -174,11,190.17,85.936,65.788,149.29,-1,-1,-1,-1 -175,7,421.19,127.97,56.992,129.33,-1,-1,-1,-1 -175,8,130.09,133.96,63.73,144.62,-1,-1,-1,-1 -175,10,322.85,105.28,63.441,143.96,-1,-1,-1,-1 -175,11,190.8,83.813,66.582,151.09,-1,-1,-1,-1 -176,7,416.57,120.95,61.128,138.71,-1,-1,-1,-1 -176,8,132.35,132.37,64.537,146.45,-1,-1,-1,-1 -176,10,322.7,105.51,63.632,144.39,-1,-1,-1,-1 -176,11,191.2,80.103,68.09,154.51,-1,-1,-1,-1 -177,7,411.88,113.97,65.264,148.1,-1,-1,-1,-1 -177,8,134.57,130.56,65.475,148.58,-1,-1,-1,-1 -177,10,322.65,106.16,63.687,144.52,-1,-1,-1,-1 -177,11,191.25,74.239,70.561,160.12,-1,-1,-1,-1 -178,7,407.12,107.07,69.4,157.49,-1,-1,-1,-1 -178,8,136.75,128.46,66.551,151.02,-1,-1,-1,-1 -178,10,322.69,107.27,63.578,144.27,-1,-1,-1,-1 -178,11,190.75,65.546,74.291,168.58,-1,-1,-1,-1 -179,7,402.35,100.18,73.536,166.87,-1,-1,-1,-1 -179,8,138.9,126.05,67.769,153.78,-1,-1,-1,-1 -179,10,322.82,108.83,63.275,143.58,-1,-1,-1,-1 -179,11,189.49,53.203,79.64,180.72,-1,-1,-1,-1 diff --git a/tests/data/demo_cocovid_data/ann.json b/tests/data/demo_cocovid_data/ann.json deleted file mode 100644 index f8e3bba8f..000000000 --- a/tests/data/demo_cocovid_data/ann.json +++ /dev/null @@ -1,286 +0,0 @@ -{ - "categories": [ - { - "id": 1, - "name": "car" - }, - { - "id": 2, - "name": "person" - } - ], - "videos": [ - { - "id": 1, - "name": "dummy_video" - } - ], - "images": [ - { - "file_name": "image_1.jpg", - "height": 256, - "width": 512, - "id": 1, - "video_id": 1, - "frame_id": 0 - }, - { - "file_name": "image_2.jpg", - "height": 256, - "width": 512, - "id": 2, - "video_id": 1, - "frame_id": 1 - }, - { - "file_name": "image_3.jpg", - "height": 256, - "width": 512, - "id": 3, - "video_id": 1, - "frame_id": 2 - }, - { - "file_name": "image_4.jpg", - "height": 256, - "width": 512, - "id": 4, - "video_id": 1, - "frame_id": 3 - }, - { - "file_name": "image_5.jpg", - "height": 256, - 
"width": 512, - "id": 5, - "video_id": 1, - "frame_id": 4 - }, - { - "file_name": "image_6.jpg", - "height": 256, - "width": 512, - "id": 6, - "video_id": 1, - "frame_id": 5 - }, - { - "file_name": "image_7.jpg", - "height": 256, - "width": 512, - "id": 7, - "video_id": 1, - "frame_id": 6 - }, - { - "file_name": "image_8.jpg", - "height": 256, - "width": 512, - "id": 8, - "video_id": 1, - "frame_id": 7 - } - ], - "annotations": [ - { - "id": 1, - "image_id": 2, - "video_id": 1, - "category_id": 1, - "instance_id": 1, - "bbox": [ - 44.96680450439453, - 46.50063705444336, - 51.31086730957031, - 4.578987121582031 - ], - "area": 234.9518006077269, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 2, - "image_id": 3, - "video_id": 1, - "category_id": 1, - "instance_id": 1, - "bbox": [ - 120.4886245727539, - 227.72906494140625, - 45.48944854736328, - 25.938629150390625 - ], - "area": 1179.9339361258317, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 3, - "image_id": 4, - "video_id": 1, - "category_id": 1, - "instance_id": 1, - "bbox": [ - 2.3915271759033203, - 129.34030151367188, - 79.5280590057373, - 111.25004577636719 - ], - "area": 8847.500204893906, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 4, - "image_id": 5, - "video_id": 1, - "category_id": 1, - "instance_id": 1, - "bbox": [ - 197.55728149414062, - 32.26101303100586, - 21.632125854492188, - 208.88010025024414 - ], - "area": 4518.520617112226, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 5, - "image_id": 5, - "video_id": 1, - "category_id": 2, - "instance_id": 2, - "bbox": [ - 182.20187377929688, - 227.82843017578125, - 8.478378295898438, - 14.818862915039062 - ], - "area": 125.63992570876144, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 6, - "image_id": 6, - "video_id": 1, - "category_id": 2, - "instance_id": 2, - "bbox": [ - 5.008738994598389, - 64.79338836669922, - 228.1608624458313, - 113.8602066040039 - ], - "area": 25978.442937030068, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 7, - "image_id": 7, - "video_id": 1, - "category_id": 2, - "instance_id": 2, - "bbox": [ - 91.12625122070312, - 89.11300659179688, - 14.969261169433594, - 91.34773254394531 - ], - "area": 1367.408065685886, - "occluded": false, - "truncated": false, - "iscrowd": false, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 8, - "image_id": 1, - "video_id": 1, - "category_id": 1, - "instance_id": 3, - "bbox": [ - 76.43632507324219, - 39.33003234863281, - 148.8035125732422, - 29.683120727539062 - ], - "area": 4416.952628393425, - "occluded": false, - "truncated": false, - "iscrowd": true, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 9, - "image_id": 1, - "video_id": 1, - "category_id": 1, - "instance_id": 4, - "bbox": [ - 120.81246948242188, - 85.94278717041016, - 8.577041625976562, - 159.13533782958984 - ], - "area": 1364.9104167282348, - "occluded": false, - "truncated": false, - 
"iscrowd": true, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - }, - { - "id": 10, - "image_id": 1, - "video_id": 1, - "category_id": 1, - "instance_id": 5, - "bbox": [ - 66.48412322998047, - 140.93060302734375, - 44.840370178222656, - 70.38531494140625 - ], - "area": 3156.1035770834424, - "occluded": false, - "truncated": false, - "iscrowd": true, - "ignore": false, - "is_vid_train_frame": true, - "visibility": 1.0 - } - ] -} diff --git a/tests/data/demo_cocovid_data/create_assets.py b/tests/data/demo_cocovid_data/create_assets.py deleted file mode 100644 index 2779c62f3..000000000 --- a/tests/data/demo_cocovid_data/create_assets.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections import defaultdict - -import mmcv -from mmdet.core.bbox.demodata import random_boxes - - -def create_dummy_data(): - # 2 categories: ['car', 'person'] - classes = ['car', 'person'] - # 1 video - videos = [ - dict( - name='dummy_video', - images=dict(num=8, shape=(256, 512, 3)), - instances=[ - dict(frames=[1, 2, 3, 4], category='car'), - dict(frames=[4, 5, 6], category='person') - ]) - ] - attrs = dict(occluded=False, truncated=False, iscrowd=False, ignore=False) - attrs['is_vid_train_frame'] = True # ImageNet VID - attrs['visibility'] = 1.0 # MOT17 - # set all corner cases in img_id == 1 - corner_cases = dict(ignore=0, iscrowd=3) - - ann = defaultdict(list) - img_id, ann_id, ins_id = 1, 1, 1 - for cls_id, cls in enumerate(classes, 1): - ann['categories'].append(dict(id=cls_id, name=cls)) - - for vid_id, video in enumerate(videos, 1): - ann['videos'].append(dict(id=vid_id, name=video['name'])) - - img_info = video['images'] - frame2id = dict() - for i in range(img_info['num']): - img_name = f'image_{img_id}.jpg' - # img = np.ones(img_info['shape']) * 125 - # mmcv.imwrite(img, img_name) - ann['images'].append( - dict( - file_name=img_name, - height=img_info['shape'][0], - width=img_info['shape'][1], - id=img_id, - video_id=vid_id, - frame_id=i)) - frame2id[i] = img_id - img_id += 1 - - ins_info = video['instances'] - for i, ins in enumerate(ins_info): - bboxes = random_boxes( - len(ins['frames']), min(img_info['shape'][:-1])).numpy() - for ind, frame_id in enumerate(ins['frames']): - assert frame_id < img_info['num'] - x1 = float(bboxes[ind][0]) - y1 = float(bboxes[ind][1]) - x2 = float(bboxes[ind][2]) - y2 = float(bboxes[ind][3]) - bbox = [x1, y1, x2 - x1, y2 - y1] - area = float((x2 - x1) * (y2 - y1)) - bbox[2] = 2.0 if bbox[2] < 1 else bbox[2] - bbox[3] = 2.0 if bbox[2] < 1 else bbox[3] - ann['annotations'].append( - dict( - id=ann_id, - image_id=frame2id[frame_id], - video_id=vid_id, - category_id=classes.index(ins['category']) + 1, - instance_id=ins_id, - bbox=bbox, - area=area, - **attrs)) - ann_id += 1 - ins_id += 1 - - for case, num in corner_cases.items(): - bboxes = random_boxes(num, min(img_info['shape'][:-1]) - 1).numpy() - for ind in range(bboxes.shape[0]): - x1 = float(bboxes[ind][0]) - y1 = float(bboxes[ind][1]) - x2 = float(bboxes[ind][2]) - y2 = float(bboxes[ind][3]) - bbox = [x1, y1, x2 - x1, y2 - y1] - bbox[2] = 2.0 if bbox[2] < 1 else bbox[2] - bbox[3] = 2.0 if bbox[3] < 1 else bbox[3] - area = float((x2 - x1) * (y2 - y1)) - _attrs = attrs.copy() - if case == 'ignore': - _attrs['ignore'] = True - elif case == 'iscrowd': - _attrs['iscrowd'] = True - elif case == 'visibility': - _attrs['visibility'] = 1.0 - else: - raise KeyError() - ann['annotations'].append( - dict( - id=ann_id, - image_id=1, - video_id=1, - 
category_id=1, - instance_id=ins_id, - bbox=bbox, - area=area, - **_attrs)) - ann_id += 1 - ins_id += 1 - - mmcv.dump(ann, 'ann.json') - - -if __name__ == '__main__': - create_dummy_data() diff --git a/tests/data/demo_reid_data/mot17_reid/ann.txt b/tests/data/demo_reid_data/mot17_reid/ann.txt deleted file mode 100644 index ab22c93a2..000000000 --- a/tests/data/demo_reid_data/mot17_reid/ann.txt +++ /dev/null @@ -1,704 +0,0 @@ -MOT17-05-FRCNN_000110/000018.jpg 0 -MOT17-05-FRCNN_000110/000015.jpg 0 -MOT17-05-FRCNN_000110/000009.jpg 0 -MOT17-05-FRCNN_000110/000005.jpg 0 -MOT17-05-FRCNN_000110/000016.jpg 0 -MOT17-05-FRCNN_000110/000010.jpg 0 -MOT17-05-FRCNN_000110/000007.jpg 0 -MOT17-05-FRCNN_000110/000008.jpg 0 -MOT17-05-FRCNN_000110/000011.jpg 0 -MOT17-05-FRCNN_000110/000002.jpg 0 -MOT17-05-FRCNN_000110/000020.jpg 0 -MOT17-05-FRCNN_000110/000006.jpg 0 -MOT17-05-FRCNN_000110/000019.jpg 0 -MOT17-05-FRCNN_000110/000004.jpg 0 -MOT17-05-FRCNN_000110/000014.jpg 0 -MOT17-05-FRCNN_000110/000013.jpg 0 -MOT17-05-FRCNN_000110/000017.jpg 0 -MOT17-05-FRCNN_000110/000001.jpg 0 -MOT17-05-FRCNN_000110/000003.jpg 0 -MOT17-05-FRCNN_000110/000000.jpg 0 -MOT17-05-FRCNN_000110/000012.jpg 0 -MOT17-13-FRCNN_000146/000039.jpg 1 -MOT17-13-FRCNN_000146/000018.jpg 1 -MOT17-13-FRCNN_000146/000015.jpg 1 -MOT17-13-FRCNN_000146/000009.jpg 1 -MOT17-13-FRCNN_000146/000022.jpg 1 -MOT17-13-FRCNN_000146/000040.jpg 1 -MOT17-13-FRCNN_000146/000027.jpg 1 -MOT17-13-FRCNN_000146/000024.jpg 1 -MOT17-13-FRCNN_000146/000005.jpg 1 -MOT17-13-FRCNN_000146/000016.jpg 1 -MOT17-13-FRCNN_000146/000010.jpg 1 -MOT17-13-FRCNN_000146/000007.jpg 1 -MOT17-13-FRCNN_000146/000034.jpg 1 -MOT17-13-FRCNN_000146/000036.jpg 1 -MOT17-13-FRCNN_000146/000008.jpg 1 -MOT17-13-FRCNN_000146/000025.jpg 1 -MOT17-13-FRCNN_000146/000011.jpg 1 -MOT17-13-FRCNN_000146/000035.jpg 1 -MOT17-13-FRCNN_000146/000002.jpg 1 -MOT17-13-FRCNN_000146/000026.jpg 1 -MOT17-13-FRCNN_000146/000020.jpg 1 -MOT17-13-FRCNN_000146/000006.jpg 1 -MOT17-13-FRCNN_000146/000019.jpg 1 -MOT17-13-FRCNN_000146/000004.jpg 1 -MOT17-13-FRCNN_000146/000038.jpg 1 -MOT17-13-FRCNN_000146/000014.jpg 1 -MOT17-13-FRCNN_000146/000030.jpg 1 -MOT17-13-FRCNN_000146/000013.jpg 1 -MOT17-13-FRCNN_000146/000017.jpg 1 -MOT17-13-FRCNN_000146/000037.jpg 1 -MOT17-13-FRCNN_000146/000033.jpg 1 -MOT17-13-FRCNN_000146/000042.jpg 1 -MOT17-13-FRCNN_000146/000021.jpg 1 -MOT17-13-FRCNN_000146/000023.jpg 1 -MOT17-13-FRCNN_000146/000028.jpg 1 -MOT17-13-FRCNN_000146/000029.jpg 1 -MOT17-13-FRCNN_000146/000031.jpg 1 -MOT17-13-FRCNN_000146/000001.jpg 1 -MOT17-13-FRCNN_000146/000003.jpg 1 -MOT17-13-FRCNN_000146/000000.jpg 1 -MOT17-13-FRCNN_000146/000043.jpg 1 -MOT17-13-FRCNN_000146/000012.jpg 1 -MOT17-13-FRCNN_000146/000032.jpg 1 -MOT17-13-FRCNN_000146/000041.jpg 1 -MOT17-05-FRCNN_000088/000018.jpg 2 -MOT17-05-FRCNN_000088/000015.jpg 2 -MOT17-05-FRCNN_000088/000009.jpg 2 -MOT17-05-FRCNN_000088/000005.jpg 2 -MOT17-05-FRCNN_000088/000016.jpg 2 -MOT17-05-FRCNN_000088/000010.jpg 2 -MOT17-05-FRCNN_000088/000007.jpg 2 -MOT17-05-FRCNN_000088/000008.jpg 2 -MOT17-05-FRCNN_000088/000011.jpg 2 -MOT17-05-FRCNN_000088/000002.jpg 2 -MOT17-05-FRCNN_000088/000006.jpg 2 -MOT17-05-FRCNN_000088/000004.jpg 2 -MOT17-05-FRCNN_000088/000014.jpg 2 -MOT17-05-FRCNN_000088/000013.jpg 2 -MOT17-05-FRCNN_000088/000017.jpg 2 -MOT17-05-FRCNN_000088/000001.jpg 2 -MOT17-05-FRCNN_000088/000003.jpg 2 -MOT17-05-FRCNN_000088/000000.jpg 2 -MOT17-05-FRCNN_000088/000012.jpg 2 -MOT17-02-FRCNN_000009/000091.jpg 3 -MOT17-02-FRCNN_000009/000067.jpg 3 
-MOT17-02-FRCNN_000009/000083.jpg 3 -MOT17-02-FRCNN_000009/000172.jpg 3 -MOT17-02-FRCNN_000009/000054.jpg 3 -MOT17-02-FRCNN_000009/000077.jpg 3 -MOT17-02-FRCNN_000009/000118.jpg 3 -MOT17-02-FRCNN_000009/000148.jpg 3 -MOT17-02-FRCNN_000009/000039.jpg 3 -MOT17-02-FRCNN_000009/000141.jpg 3 -MOT17-02-FRCNN_000009/000128.jpg 3 -MOT17-02-FRCNN_000009/000216.jpg 3 -MOT17-02-FRCNN_000009/000114.jpg 3 -MOT17-02-FRCNN_000009/000113.jpg 3 -MOT17-02-FRCNN_000009/000018.jpg 3 -MOT17-02-FRCNN_000009/000119.jpg 3 -MOT17-02-FRCNN_000009/000177.jpg 3 -MOT17-02-FRCNN_000009/000192.jpg 3 -MOT17-02-FRCNN_000009/000116.jpg 3 -MOT17-02-FRCNN_000009/000217.jpg 3 -MOT17-02-FRCNN_000009/000046.jpg 3 -MOT17-02-FRCNN_000009/000234.jpg 3 -MOT17-02-FRCNN_000009/000166.jpg 3 -MOT17-02-FRCNN_000009/000209.jpg 3 -MOT17-02-FRCNN_000009/000202.jpg 3 -MOT17-02-FRCNN_000009/000136.jpg 3 -MOT17-02-FRCNN_000009/000242.jpg 3 -MOT17-02-FRCNN_000009/000015.jpg 3 -MOT17-02-FRCNN_000009/000183.jpg 3 -MOT17-02-FRCNN_000009/000081.jpg 3 -MOT17-02-FRCNN_000009/000198.jpg 3 -MOT17-02-FRCNN_000009/000210.jpg 3 -MOT17-02-FRCNN_000009/000009.jpg 3 -MOT17-02-FRCNN_000009/000208.jpg 3 -MOT17-02-FRCNN_000009/000153.jpg 3 -MOT17-02-FRCNN_000009/000064.jpg 3 -MOT17-02-FRCNN_000009/000050.jpg 3 -MOT17-02-FRCNN_000009/000084.jpg 3 -MOT17-02-FRCNN_000009/000022.jpg 3 -MOT17-02-FRCNN_000009/000235.jpg 3 -MOT17-02-FRCNN_000009/000130.jpg 3 -MOT17-02-FRCNN_000009/000140.jpg 3 -MOT17-02-FRCNN_000009/000040.jpg 3 -MOT17-02-FRCNN_000009/000095.jpg 3 -MOT17-02-FRCNN_000009/000221.jpg 3 -MOT17-02-FRCNN_000009/000027.jpg 3 -MOT17-02-FRCNN_000009/000243.jpg 3 -MOT17-02-FRCNN_000009/000180.jpg 3 -MOT17-02-FRCNN_000009/000168.jpg 3 -MOT17-02-FRCNN_000009/000024.jpg 3 -MOT17-02-FRCNN_000009/000231.jpg 3 -MOT17-02-FRCNN_000009/000125.jpg 3 -MOT17-02-FRCNN_000009/000220.jpg 3 -MOT17-02-FRCNN_000009/000110.jpg 3 -MOT17-02-FRCNN_000009/000063.jpg 3 -MOT17-02-FRCNN_000009/000115.jpg 3 -MOT17-02-FRCNN_000009/000239.jpg 3 -MOT17-02-FRCNN_000009/000073.jpg 3 -MOT17-02-FRCNN_000009/000214.jpg 3 -MOT17-02-FRCNN_000009/000226.jpg 3 -MOT17-02-FRCNN_000009/000005.jpg 3 -MOT17-02-FRCNN_000009/000016.jpg 3 -MOT17-02-FRCNN_000009/000051.jpg 3 -MOT17-02-FRCNN_000009/000170.jpg 3 -MOT17-02-FRCNN_000009/000193.jpg 3 -MOT17-02-FRCNN_000009/000196.jpg 3 -MOT17-02-FRCNN_000009/000158.jpg 3 -MOT17-02-FRCNN_000009/000117.jpg 3 -MOT17-02-FRCNN_000009/000206.jpg 3 -MOT17-02-FRCNN_000009/000096.jpg 3 -MOT17-02-FRCNN_000009/000178.jpg 3 -MOT17-02-FRCNN_000009/000144.jpg 3 -MOT17-02-FRCNN_000009/000200.jpg 3 -MOT17-02-FRCNN_000009/000122.jpg 3 -MOT17-02-FRCNN_000009/000189.jpg 3 -MOT17-02-FRCNN_000009/000127.jpg 3 -MOT17-02-FRCNN_000009/000010.jpg 3 -MOT17-02-FRCNN_000009/000007.jpg 3 -MOT17-02-FRCNN_000009/000072.jpg 3 -MOT17-02-FRCNN_000009/000090.jpg 3 -MOT17-02-FRCNN_000009/000229.jpg 3 -MOT17-02-FRCNN_000009/000139.jpg 3 -MOT17-02-FRCNN_000009/000034.jpg 3 -MOT17-02-FRCNN_000009/000112.jpg 3 -MOT17-02-FRCNN_000009/000203.jpg 3 -MOT17-02-FRCNN_000009/000036.jpg 3 -MOT17-02-FRCNN_000009/000212.jpg 3 -MOT17-02-FRCNN_000009/000008.jpg 3 -MOT17-02-FRCNN_000009/000025.jpg 3 -MOT17-02-FRCNN_000009/000227.jpg 3 -MOT17-02-FRCNN_000009/000011.jpg 3 -MOT17-02-FRCNN_000009/000151.jpg 3 -MOT17-02-FRCNN_000009/000076.jpg 3 -MOT17-02-FRCNN_000009/000190.jpg 3 -MOT17-02-FRCNN_000009/000035.jpg 3 -MOT17-02-FRCNN_000009/000099.jpg 3 -MOT17-02-FRCNN_000009/000201.jpg 3 -MOT17-02-FRCNN_000009/000181.jpg 3 -MOT17-02-FRCNN_000009/000225.jpg 3 -MOT17-02-FRCNN_000009/000002.jpg 3 
-MOT17-02-FRCNN_000009/000163.jpg 3 -MOT17-02-FRCNN_000009/000105.jpg 3 -MOT17-02-FRCNN_000009/000145.jpg 3 -MOT17-02-FRCNN_000009/000137.jpg 3 -MOT17-02-FRCNN_000009/000240.jpg 3 -MOT17-02-FRCNN_000009/000094.jpg 3 -MOT17-02-FRCNN_000009/000089.jpg 3 -MOT17-02-FRCNN_000009/000045.jpg 3 -MOT17-02-FRCNN_000009/000026.jpg 3 -MOT17-02-FRCNN_000009/000108.jpg 3 -MOT17-02-FRCNN_000009/000222.jpg 3 -MOT17-02-FRCNN_000009/000097.jpg 3 -MOT17-02-FRCNN_000009/000131.jpg 3 -MOT17-02-FRCNN_000009/000146.jpg 3 -MOT17-02-FRCNN_000009/000176.jpg 3 -MOT17-02-FRCNN_000009/000142.jpg 3 -MOT17-02-FRCNN_000009/000020.jpg 3 -MOT17-02-FRCNN_000009/000006.jpg 3 -MOT17-02-FRCNN_000009/000071.jpg 3 -MOT17-02-FRCNN_000009/000019.jpg 3 -MOT17-02-FRCNN_000009/000075.jpg 3 -MOT17-02-FRCNN_000009/000080.jpg 3 -MOT17-02-FRCNN_000009/000086.jpg 3 -MOT17-02-FRCNN_000009/000124.jpg 3 -MOT17-02-FRCNN_000009/000150.jpg 3 -MOT17-02-FRCNN_000009/000056.jpg 3 -MOT17-04-FRCNN_000122/000091.jpg 4 -MOT17-04-FRCNN_000122/000067.jpg 4 -MOT17-04-FRCNN_000122/000083.jpg 4 -MOT17-04-FRCNN_000122/000172.jpg 4 -MOT17-04-FRCNN_000122/000054.jpg 4 -MOT17-04-FRCNN_000122/000077.jpg 4 -MOT17-04-FRCNN_000122/000118.jpg 4 -MOT17-04-FRCNN_000122/000148.jpg 4 -MOT17-04-FRCNN_000122/000039.jpg 4 -MOT17-04-FRCNN_000122/000141.jpg 4 -MOT17-04-FRCNN_000122/000128.jpg 4 -MOT17-04-FRCNN_000122/000114.jpg 4 -MOT17-04-FRCNN_000122/000113.jpg 4 -MOT17-04-FRCNN_000122/000018.jpg 4 -MOT17-04-FRCNN_000122/000119.jpg 4 -MOT17-04-FRCNN_000122/000177.jpg 4 -MOT17-04-FRCNN_000122/000192.jpg 4 -MOT17-04-FRCNN_000122/000116.jpg 4 -MOT17-04-FRCNN_000122/000046.jpg 4 -MOT17-04-FRCNN_000122/000166.jpg 4 -MOT17-04-FRCNN_000122/000136.jpg 4 -MOT17-04-FRCNN_000122/000015.jpg 4 -MOT17-04-FRCNN_000122/000183.jpg 4 -MOT17-04-FRCNN_000122/000081.jpg 4 -MOT17-04-FRCNN_000122/000009.jpg 4 -MOT17-04-FRCNN_000122/000153.jpg 4 -MOT17-04-FRCNN_000122/000064.jpg 4 -MOT17-04-FRCNN_000122/000050.jpg 4 -MOT17-04-FRCNN_000122/000084.jpg 4 -MOT17-04-FRCNN_000122/000022.jpg 4 -MOT17-04-FRCNN_000122/000130.jpg 4 -MOT17-04-FRCNN_000122/000140.jpg 4 -MOT17-04-FRCNN_000122/000040.jpg 4 -MOT17-04-FRCNN_000122/000095.jpg 4 -MOT17-04-FRCNN_000122/000027.jpg 4 -MOT17-04-FRCNN_000122/000180.jpg 4 -MOT17-04-FRCNN_000122/000168.jpg 4 -MOT17-04-FRCNN_000122/000024.jpg 4 -MOT17-04-FRCNN_000122/000125.jpg 4 -MOT17-04-FRCNN_000122/000110.jpg 4 -MOT17-04-FRCNN_000122/000063.jpg 4 -MOT17-04-FRCNN_000122/000115.jpg 4 -MOT17-04-FRCNN_000122/000073.jpg 4 -MOT17-04-FRCNN_000122/000035.jpg 4 -MOT17-04-FRCNN_000122/000099.jpg 4 -MOT17-04-FRCNN_000122/000181.jpg 4 -MOT17-04-FRCNN_000122/000002.jpg 4 -MOT17-04-FRCNN_000122/000163.jpg 4 -MOT17-04-FRCNN_000122/000105.jpg 4 -MOT17-04-FRCNN_000122/000145.jpg 4 -MOT17-04-FRCNN_000122/000137.jpg 4 -MOT17-04-FRCNN_000122/000094.jpg 4 -MOT17-04-FRCNN_000122/000089.jpg 4 -MOT17-04-FRCNN_000122/000100.jpg 4 -MOT17-04-FRCNN_000122/000149.jpg 4 -MOT17-04-FRCNN_000122/000107.jpg 4 -MOT17-04-FRCNN_000122/000004.jpg 4 -MOT17-04-FRCNN_000122/000038.jpg 4 -MOT17-04-FRCNN_000122/000065.jpg 4 -MOT17-04-FRCNN_000122/000103.jpg 4 -MOT17-04-FRCNN_000122/000171.jpg 4 -MOT17-04-FRCNN_000122/000173.jpg 4 -MOT17-04-FRCNN_000122/000014.jpg 4 -MOT17-04-FRCNN_000122/000058.jpg 4 -MOT17-04-FRCNN_000122/000143.jpg 4 -MOT17-04-FRCNN_000122/000138.jpg 4 -MOT17-04-FRCNN_000122/000068.jpg 4 -MOT17-04-FRCNN_000122/000159.jpg 4 -MOT17-04-FRCNN_000122/000167.jpg 4 -MOT17-04-FRCNN_000122/000030.jpg 4 -MOT17-04-FRCNN_000122/000013.jpg 4 -MOT17-04-FRCNN_000122/000132.jpg 4 
-MOT17-04-FRCNN_000122/000134.jpg 4 -MOT17-04-FRCNN_000122/000082.jpg 4 -MOT17-04-FRCNN_000122/000121.jpg 4 -MOT17-04-FRCNN_000122/000169.jpg 4 -MOT17-04-FRCNN_000122/000188.jpg 4 -MOT17-04-FRCNN_000122/000079.jpg 4 -MOT17-04-FRCNN_000122/000165.jpg 4 -MOT17-04-FRCNN_000122/000109.jpg 4 -MOT17-04-FRCNN_000122/000187.jpg 4 -MOT17-04-FRCNN_000122/000017.jpg 4 -MOT17-04-FRCNN_000122/000037.jpg 4 -MOT17-04-FRCNN_000122/000033.jpg 4 -MOT17-04-FRCNN_000122/000157.jpg 4 -MOT17-04-FRCNN_000122/000074.jpg 4 -MOT17-04-FRCNN_000122/000152.jpg 4 -MOT17-04-FRCNN_000122/000087.jpg 4 -MOT17-04-FRCNN_000122/000135.jpg 4 -MOT17-04-FRCNN_000122/000182.jpg 4 -MOT17-04-FRCNN_000122/000042.jpg 4 -MOT17-04-FRCNN_000122/000052.jpg 4 -MOT17-04-FRCNN_000122/000185.jpg 4 -MOT17-04-FRCNN_000122/000092.jpg 4 -MOT17-04-FRCNN_000122/000106.jpg 4 -MOT17-04-FRCNN_000122/000021.jpg 4 -MOT17-04-FRCNN_000122/000023.jpg 4 -MOT17-04-FRCNN_000122/000066.jpg 4 -MOT17-04-FRCNN_000122/000164.jpg 4 -MOT17-04-FRCNN_000122/000028.jpg 4 -MOT17-04-FRCNN_000122/000029.jpg 4 -MOT17-04-FRCNN_000122/000031.jpg 4 -MOT17-04-FRCNN_000122/000001.jpg 4 -MOT17-04-FRCNN_000122/000048.jpg 4 -MOT17-04-FRCNN_000122/000123.jpg 4 -MOT17-04-FRCNN_000122/000061.jpg 4 -MOT17-04-FRCNN_000122/000062.jpg 4 -MOT17-04-FRCNN_000122/000085.jpg 4 -MOT17-04-FRCNN_000122/000003.jpg 4 -MOT17-04-FRCNN_000122/000000.jpg 4 -MOT17-04-FRCNN_000122/000174.jpg 4 -MOT17-04-FRCNN_000122/000161.jpg 4 -MOT17-04-FRCNN_000122/000098.jpg 4 -MOT17-04-FRCNN_000122/000078.jpg 4 -MOT17-04-FRCNN_000122/000043.jpg 4 -MOT17-04-FRCNN_000122/000053.jpg 4 -MOT17-04-FRCNN_000122/000056.jpg 4 -MOT17-10-FRCNN_000049/000091.jpg 5 -MOT17-10-FRCNN_000049/000067.jpg 5 -MOT17-10-FRCNN_000049/000083.jpg 5 -MOT17-10-FRCNN_000049/000172.jpg 5 -MOT17-10-FRCNN_000049/000054.jpg 5 -MOT17-10-FRCNN_000049/000077.jpg 5 -MOT17-10-FRCNN_000049/000118.jpg 5 -MOT17-10-FRCNN_000049/000148.jpg 5 -MOT17-10-FRCNN_000049/000039.jpg 5 -MOT17-10-FRCNN_000049/000141.jpg 5 -MOT17-10-FRCNN_000049/000128.jpg 5 -MOT17-10-FRCNN_000049/000216.jpg 5 -MOT17-10-FRCNN_000049/000114.jpg 5 -MOT17-10-FRCNN_000049/000113.jpg 5 -MOT17-10-FRCNN_000049/000018.jpg 5 -MOT17-10-FRCNN_000049/000119.jpg 5 -MOT17-10-FRCNN_000049/000177.jpg 5 -MOT17-10-FRCNN_000049/000192.jpg 5 -MOT17-10-FRCNN_000049/000116.jpg 5 -MOT17-10-FRCNN_000049/000271.jpg 5 -MOT17-10-FRCNN_000049/000217.jpg 5 -MOT17-10-FRCNN_000049/000046.jpg 5 -MOT17-10-FRCNN_000049/000234.jpg 5 -MOT17-10-FRCNN_000049/000166.jpg 5 -MOT17-10-FRCNN_000049/000209.jpg 5 -MOT17-10-FRCNN_000049/000202.jpg 5 -MOT17-10-FRCNN_000049/000136.jpg 5 -MOT17-10-FRCNN_000049/000242.jpg 5 -MOT17-10-FRCNN_000049/000015.jpg 5 -MOT17-10-FRCNN_000049/000183.jpg 5 -MOT17-10-FRCNN_000049/000081.jpg 5 -MOT17-10-FRCNN_000049/000198.jpg 5 -MOT17-10-FRCNN_000049/000210.jpg 5 -MOT17-10-FRCNN_000049/000009.jpg 5 -MOT17-10-FRCNN_000049/000208.jpg 5 -MOT17-10-FRCNN_000049/000153.jpg 5 -MOT17-10-FRCNN_000049/000037.jpg 5 -MOT17-10-FRCNN_000049/000033.jpg 5 -MOT17-10-FRCNN_000049/000157.jpg 5 -MOT17-10-FRCNN_000049/000074.jpg 5 -MOT17-10-FRCNN_000049/000152.jpg 5 -MOT17-10-FRCNN_000049/000087.jpg 5 -MOT17-10-FRCNN_000049/000195.jpg 5 -MOT17-10-FRCNN_000049/000215.jpg 5 -MOT17-10-FRCNN_000049/000135.jpg 5 -MOT17-10-FRCNN_000049/000247.jpg 5 -MOT17-10-FRCNN_000049/000257.jpg 5 -MOT17-10-FRCNN_000049/000182.jpg 5 -MOT17-10-FRCNN_000049/000042.jpg 5 -MOT17-10-FRCNN_000049/000052.jpg 5 -MOT17-10-FRCNN_000049/000185.jpg 5 -MOT17-10-FRCNN_000049/000092.jpg 5 -MOT17-10-FRCNN_000049/000241.jpg 5 
-MOT17-10-FRCNN_000049/000106.jpg 5 -MOT17-10-FRCNN_000049/000021.jpg 5 -MOT17-10-FRCNN_000049/000023.jpg 5 -MOT17-10-FRCNN_000049/000066.jpg 5 -MOT17-10-FRCNN_000049/000164.jpg 5 -MOT17-10-FRCNN_000049/000028.jpg 5 -MOT17-10-FRCNN_000049/000029.jpg 5 -MOT17-10-FRCNN_000049/000218.jpg 5 -MOT17-10-FRCNN_000049/000031.jpg 5 -MOT17-10-FRCNN_000049/000256.jpg 5 -MOT17-10-FRCNN_000049/000001.jpg 5 -MOT17-10-FRCNN_000049/000266.jpg 5 -MOT17-10-FRCNN_000049/000048.jpg 5 -MOT17-10-FRCNN_000049/000123.jpg 5 -MOT17-10-FRCNN_000049/000205.jpg 5 -MOT17-10-FRCNN_000049/000061.jpg 5 -MOT17-10-FRCNN_000049/000062.jpg 5 -MOT17-10-FRCNN_000049/000085.jpg 5 -MOT17-10-FRCNN_000049/000003.jpg 5 -MOT17-10-FRCNN_000049/000254.jpg 5 -MOT17-10-FRCNN_000049/000000.jpg 5 -MOT17-10-FRCNN_000049/000275.jpg 5 -MOT17-10-FRCNN_000049/000232.jpg 5 -MOT17-10-FRCNN_000049/000174.jpg 5 -MOT17-10-FRCNN_000049/000161.jpg 5 -MOT17-10-FRCNN_000049/000269.jpg 5 -MOT17-10-FRCNN_000049/000267.jpg 5 -MOT17-10-FRCNN_000049/000230.jpg 5 -MOT17-10-FRCNN_000049/000223.jpg 5 -MOT17-10-FRCNN_000049/000236.jpg 5 -MOT17-10-FRCNN_000049/000098.jpg 5 -MOT17-10-FRCNN_000049/000104.jpg 5 -MOT17-10-FRCNN_000049/000126.jpg 5 -MOT17-10-FRCNN_000049/000272.jpg 5 -MOT17-10-FRCNN_000049/000032.jpg 5 -MOT17-10-FRCNN_000049/000055.jpg 5 -MOT17-10-FRCNN_000049/000175.jpg 5 -MOT17-10-FRCNN_000049/000041.jpg 5 -MOT17-10-FRCNN_000049/000070.jpg 5 -MOT17-10-FRCNN_000049/000056.jpg 5 -MOT17-10-FRCNN_000027/000054.jpg 6 -MOT17-10-FRCNN_000027/000039.jpg 6 -MOT17-10-FRCNN_000027/000018.jpg 6 -MOT17-10-FRCNN_000027/000046.jpg 6 -MOT17-10-FRCNN_000027/000015.jpg 6 -MOT17-10-FRCNN_000027/000009.jpg 6 -MOT17-10-FRCNN_000027/000050.jpg 6 -MOT17-10-FRCNN_000027/000022.jpg 6 -MOT17-10-FRCNN_000027/000040.jpg 6 -MOT17-10-FRCNN_000027/000027.jpg 6 -MOT17-10-FRCNN_000027/000024.jpg 6 -MOT17-10-FRCNN_000027/000005.jpg 6 -MOT17-10-FRCNN_000027/000016.jpg 6 -MOT17-10-FRCNN_000027/000051.jpg 6 -MOT17-10-FRCNN_000027/000010.jpg 6 -MOT17-10-FRCNN_000027/000007.jpg 6 -MOT17-10-FRCNN_000027/000034.jpg 6 -MOT17-10-FRCNN_000027/000036.jpg 6 -MOT17-10-FRCNN_000027/000008.jpg 6 -MOT17-10-FRCNN_000027/000025.jpg 6 -MOT17-10-FRCNN_000027/000011.jpg 6 -MOT17-10-FRCNN_000027/000035.jpg 6 -MOT17-10-FRCNN_000027/000002.jpg 6 -MOT17-10-FRCNN_000027/000045.jpg 6 -MOT17-10-FRCNN_000027/000026.jpg 6 -MOT17-10-FRCNN_000027/000020.jpg 6 -MOT17-10-FRCNN_000027/000006.jpg 6 -MOT17-10-FRCNN_000027/000019.jpg 6 -MOT17-10-FRCNN_000027/000057.jpg 6 -MOT17-10-FRCNN_000027/000049.jpg 6 -MOT17-10-FRCNN_000027/000004.jpg 6 -MOT17-10-FRCNN_000027/000038.jpg 6 -MOT17-10-FRCNN_000027/000014.jpg 6 -MOT17-10-FRCNN_000027/000058.jpg 6 -MOT17-10-FRCNN_000027/000030.jpg 6 -MOT17-10-FRCNN_000027/000013.jpg 6 -MOT17-10-FRCNN_000027/000017.jpg 6 -MOT17-10-FRCNN_000027/000037.jpg 6 -MOT17-10-FRCNN_000027/000033.jpg 6 -MOT17-10-FRCNN_000027/000042.jpg 6 -MOT17-10-FRCNN_000027/000052.jpg 6 -MOT17-10-FRCNN_000027/000021.jpg 6 -MOT17-10-FRCNN_000027/000023.jpg 6 -MOT17-10-FRCNN_000027/000028.jpg 6 -MOT17-10-FRCNN_000027/000029.jpg 6 -MOT17-10-FRCNN_000027/000031.jpg 6 -MOT17-10-FRCNN_000027/000001.jpg 6 -MOT17-10-FRCNN_000027/000048.jpg 6 -MOT17-10-FRCNN_000027/000003.jpg 6 -MOT17-10-FRCNN_000027/000000.jpg 6 -MOT17-10-FRCNN_000027/000043.jpg 6 -MOT17-10-FRCNN_000027/000053.jpg 6 -MOT17-10-FRCNN_000027/000044.jpg 6 -MOT17-10-FRCNN_000027/000047.jpg 6 -MOT17-10-FRCNN_000027/000012.jpg 6 -MOT17-10-FRCNN_000027/000032.jpg 6 -MOT17-10-FRCNN_000027/000055.jpg 6 -MOT17-10-FRCNN_000027/000041.jpg 6 
-MOT17-10-FRCNN_000027/000056.jpg 6 -MOT17-02-FRCNN_000037/000091.jpg 7 -MOT17-02-FRCNN_000037/000067.jpg 7 -MOT17-02-FRCNN_000037/000083.jpg 7 -MOT17-02-FRCNN_000037/000054.jpg 7 -MOT17-02-FRCNN_000037/000077.jpg 7 -MOT17-02-FRCNN_000037/000118.jpg 7 -MOT17-02-FRCNN_000037/000039.jpg 7 -MOT17-02-FRCNN_000037/000114.jpg 7 -MOT17-02-FRCNN_000037/000113.jpg 7 -MOT17-02-FRCNN_000037/000018.jpg 7 -MOT17-02-FRCNN_000037/000119.jpg 7 -MOT17-02-FRCNN_000037/000116.jpg 7 -MOT17-02-FRCNN_000037/000046.jpg 7 -MOT17-02-FRCNN_000037/000015.jpg 7 -MOT17-02-FRCNN_000037/000081.jpg 7 -MOT17-02-FRCNN_000037/000009.jpg 7 -MOT17-02-FRCNN_000037/000064.jpg 7 -MOT17-02-FRCNN_000037/000050.jpg 7 -MOT17-02-FRCNN_000037/000084.jpg 7 -MOT17-02-FRCNN_000037/000022.jpg 7 -MOT17-02-FRCNN_000037/000040.jpg 7 -MOT17-02-FRCNN_000037/000095.jpg 7 -MOT17-02-FRCNN_000037/000027.jpg 7 -MOT17-02-FRCNN_000037/000024.jpg 7 -MOT17-02-FRCNN_000037/000110.jpg 7 -MOT17-02-FRCNN_000037/000063.jpg 7 -MOT17-02-FRCNN_000037/000115.jpg 7 -MOT17-02-FRCNN_000037/000073.jpg 7 -MOT17-02-FRCNN_000037/000005.jpg 7 -MOT17-02-FRCNN_000037/000016.jpg 7 -MOT17-02-FRCNN_000037/000051.jpg 7 -MOT17-02-FRCNN_000037/000117.jpg 7 -MOT17-02-FRCNN_000037/000096.jpg 7 -MOT17-02-FRCNN_000037/000010.jpg 7 -MOT17-02-FRCNN_000037/000007.jpg 7 -MOT17-02-FRCNN_000037/000072.jpg 7 -MOT17-02-FRCNN_000037/000090.jpg 7 -MOT17-02-FRCNN_000037/000034.jpg 7 -MOT17-02-FRCNN_000037/000112.jpg 7 -MOT17-02-FRCNN_000037/000036.jpg 7 -MOT17-02-FRCNN_000037/000008.jpg 7 -MOT17-02-FRCNN_000037/000025.jpg 7 -MOT17-02-FRCNN_000037/000011.jpg 7 -MOT17-02-FRCNN_000037/000076.jpg 7 -MOT17-02-FRCNN_000037/000035.jpg 7 -MOT17-02-FRCNN_000037/000099.jpg 7 -MOT17-02-FRCNN_000037/000002.jpg 7 -MOT17-02-FRCNN_000037/000105.jpg 7 -MOT17-02-FRCNN_000037/000094.jpg 7 -MOT17-02-FRCNN_000037/000089.jpg 7 -MOT17-02-FRCNN_000037/000045.jpg 7 -MOT17-02-FRCNN_000037/000026.jpg 7 -MOT17-02-FRCNN_000037/000108.jpg 7 -MOT17-02-FRCNN_000037/000097.jpg 7 -MOT17-02-FRCNN_000037/000020.jpg 7 -MOT17-02-FRCNN_000037/000006.jpg 7 -MOT17-02-FRCNN_000037/000071.jpg 7 -MOT17-02-FRCNN_000037/000019.jpg 7 -MOT17-02-FRCNN_000037/000075.jpg 7 -MOT17-02-FRCNN_000037/000080.jpg 7 -MOT17-02-FRCNN_000037/000086.jpg 7 -MOT17-02-FRCNN_000037/000111.jpg 7 -MOT17-02-FRCNN_000037/000120.jpg 7 -MOT17-02-FRCNN_000037/000057.jpg 7 -MOT17-02-FRCNN_000037/000101.jpg 7 -MOT17-02-FRCNN_000037/000049.jpg 7 -MOT17-02-FRCNN_000037/000100.jpg 7 -MOT17-02-FRCNN_000037/000107.jpg 7 -MOT17-02-FRCNN_000037/000004.jpg 7 -MOT17-02-FRCNN_000037/000038.jpg 7 -MOT17-02-FRCNN_000037/000065.jpg 7 -MOT17-02-FRCNN_000037/000103.jpg 7 -MOT17-02-FRCNN_000037/000014.jpg 7 -MOT17-02-FRCNN_000037/000058.jpg 7 -MOT17-02-FRCNN_000037/000068.jpg 7 -MOT17-02-FRCNN_000037/000104.jpg 7 -MOT17-02-FRCNN_000037/000032.jpg 7 -MOT17-02-FRCNN_000037/000055.jpg 7 -MOT17-02-FRCNN_000037/000041.jpg 7 -MOT17-02-FRCNN_000037/000070.jpg 7 -MOT17-02-FRCNN_000037/000056.jpg 7 -MOT17-10-FRCNN_000023/000091.jpg 8 -MOT17-10-FRCNN_000023/000067.jpg 8 -MOT17-10-FRCNN_000023/000083.jpg 8 -MOT17-10-FRCNN_000023/000172.jpg 8 -MOT17-10-FRCNN_000023/000054.jpg 8 -MOT17-10-FRCNN_000023/000077.jpg 8 -MOT17-10-FRCNN_000023/000343.jpg 8 -MOT17-10-FRCNN_000023/000118.jpg 8 -MOT17-10-FRCNN_000023/000148.jpg 8 -MOT17-10-FRCNN_000023/000039.jpg 8 -MOT17-10-FRCNN_000023/000334.jpg 8 -MOT17-10-FRCNN_000023/000141.jpg 8 -MOT17-10-FRCNN_000023/000322.jpg 8 -MOT17-10-FRCNN_000023/000128.jpg 8 -MOT17-10-FRCNN_000023/000216.jpg 8 -MOT17-10-FRCNN_000023/000114.jpg 8 
-MOT17-10-FRCNN_000023/000113.jpg 8 -MOT17-10-FRCNN_000023/000377.jpg 8 -MOT17-10-FRCNN_000023/000018.jpg 8 -MOT17-10-FRCNN_000023/000307.jpg 8 -MOT17-10-FRCNN_000023/000396.jpg 8 -MOT17-10-FRCNN_000023/000390.jpg 8 -MOT17-10-FRCNN_000023/000119.jpg 8 -MOT17-10-FRCNN_000023/000177.jpg 8 -MOT17-10-FRCNN_000023/000192.jpg 8 -MOT17-10-FRCNN_000023/000116.jpg 8 -MOT17-10-FRCNN_000023/000271.jpg 8 -MOT17-10-FRCNN_000023/000410.jpg 8 -MOT17-10-FRCNN_000023/000217.jpg 8 -MOT17-10-FRCNN_000023/000046.jpg 8 -MOT17-10-FRCNN_000023/000234.jpg 8 -MOT17-10-FRCNN_000023/000166.jpg 8 -MOT17-10-FRCNN_000023/000316.jpg 8 -MOT17-10-FRCNN_000023/000371.jpg 8 -MOT17-10-FRCNN_000023/000088.jpg 8 -MOT17-10-FRCNN_000023/000424.jpg 8 -MOT17-10-FRCNN_000023/000104.jpg 8 -MOT17-10-FRCNN_000023/000287.jpg 8 -MOT17-10-FRCNN_000023/000344.jpg 8 -MOT17-10-FRCNN_000023/000126.jpg 8 -MOT17-10-FRCNN_000023/000398.jpg 8 -MOT17-10-FRCNN_000023/000272.jpg 8 -MOT17-10-FRCNN_000023/000032.jpg 8 -MOT17-10-FRCNN_000023/000291.jpg 8 -MOT17-10-FRCNN_000023/000055.jpg 8 -MOT17-10-FRCNN_000023/000340.jpg 8 -MOT17-10-FRCNN_000023/000175.jpg 8 -MOT17-10-FRCNN_000023/000361.jpg 8 -MOT17-10-FRCNN_000023/000041.jpg 8 -MOT17-10-FRCNN_000023/000070.jpg 8 -MOT17-10-FRCNN_000023/000412.jpg 8 -MOT17-10-FRCNN_000023/000056.jpg 8 -MOT17-04-FRCNN_000112/000091.jpg 9 -MOT17-04-FRCNN_000112/000067.jpg 9 -MOT17-04-FRCNN_000112/000083.jpg 9 -MOT17-04-FRCNN_000112/000172.jpg 9 -MOT17-04-FRCNN_000112/000054.jpg 9 -MOT17-04-FRCNN_000112/000077.jpg 9 -MOT17-04-FRCNN_000112/000118.jpg 9 -MOT17-04-FRCNN_000112/000148.jpg 9 -MOT17-04-FRCNN_000112/000039.jpg 9 -MOT17-04-FRCNN_000112/000141.jpg 9 -MOT17-04-FRCNN_000112/000128.jpg 9 -MOT17-04-FRCNN_000112/000216.jpg 9 -MOT17-04-FRCNN_000112/000114.jpg 9 -MOT17-04-FRCNN_000112/000113.jpg 9 -MOT17-04-FRCNN_000112/000018.jpg 9 -MOT17-04-FRCNN_000112/000119.jpg 9 -MOT17-04-FRCNN_000112/000177.jpg 9 -MOT17-04-FRCNN_000112/000192.jpg 9 -MOT17-04-FRCNN_000112/000116.jpg 9 -MOT17-04-FRCNN_000112/000217.jpg 9 -MOT17-04-FRCNN_000112/000046.jpg 9 -MOT17-04-FRCNN_000112/000234.jpg 9 -MOT17-04-FRCNN_000112/000166.jpg 9 -MOT17-04-FRCNN_000112/000209.jpg 9 -MOT17-04-FRCNN_000112/000202.jpg 9 -MOT17-04-FRCNN_000112/000136.jpg 9 -MOT17-04-FRCNN_000112/000242.jpg 9 -MOT17-04-FRCNN_000112/000015.jpg 9 -MOT17-04-FRCNN_000112/000183.jpg 9 -MOT17-04-FRCNN_000112/000081.jpg 9 -MOT17-04-FRCNN_000112/000198.jpg 9 -MOT17-04-FRCNN_000112/000210.jpg 9 -MOT17-04-FRCNN_000112/000239.jpg 9 -MOT17-04-FRCNN_000112/000073.jpg 9 -MOT17-04-FRCNN_000112/000214.jpg 9 -MOT17-04-FRCNN_000112/000226.jpg 9 -MOT17-04-FRCNN_000112/000005.jpg 9 -MOT17-04-FRCNN_000112/000016.jpg 9 -MOT17-04-FRCNN_000112/000051.jpg 9 -MOT17-04-FRCNN_000112/000170.jpg 9 -MOT17-04-FRCNN_000112/000193.jpg 9 -MOT17-04-FRCNN_000112/000196.jpg 9 -MOT17-04-FRCNN_000112/000158.jpg 9 -MOT17-04-FRCNN_000112/000117.jpg 9 -MOT17-04-FRCNN_000112/000206.jpg 9 -MOT17-04-FRCNN_000112/000096.jpg 9 -MOT17-04-FRCNN_000112/000178.jpg 9 -MOT17-04-FRCNN_000112/000144.jpg 9 -MOT17-04-FRCNN_000112/000200.jpg 9 -MOT17-04-FRCNN_000112/000122.jpg 9 -MOT17-04-FRCNN_000112/000189.jpg 9 -MOT17-04-FRCNN_000112/000127.jpg 9 -MOT17-04-FRCNN_000112/000010.jpg 9 -MOT17-04-FRCNN_000112/000007.jpg 9 -MOT17-04-FRCNN_000112/000094.jpg 9 -MOT17-04-FRCNN_000112/000089.jpg 9 -MOT17-04-FRCNN_000112/000045.jpg 9 -MOT17-04-FRCNN_000112/000026.jpg 9 -MOT17-04-FRCNN_000112/000108.jpg 9 -MOT17-04-FRCNN_000112/000222.jpg 9 -MOT17-04-FRCNN_000112/000097.jpg 9 -MOT17-04-FRCNN_000112/000131.jpg 9 
-MOT17-04-FRCNN_000112/000146.jpg 9 -MOT17-04-FRCNN_000112/000176.jpg 9 -MOT17-04-FRCNN_000112/000142.jpg 9 -MOT17-04-FRCNN_000112/000049.jpg 9 -MOT17-04-FRCNN_000112/000155.jpg 9 -MOT17-04-FRCNN_000112/000147.jpg 9 -MOT17-04-FRCNN_000112/000162.jpg 9 -MOT17-04-FRCNN_000112/000100.jpg 9 -MOT17-04-FRCNN_000112/000211.jpg 9 -MOT17-04-FRCNN_000112/000149.jpg 9 -MOT17-04-FRCNN_000112/000107.jpg 9 -MOT17-04-FRCNN_000112/000238.jpg 9 -MOT17-04-FRCNN_000112/000004.jpg 9 -MOT17-04-FRCNN_000112/000213.jpg 9 -MOT17-04-FRCNN_000112/000038.jpg 9 -MOT17-04-FRCNN_000112/000065.jpg 9 -MOT17-04-FRCNN_000112/000245.jpg 9 -MOT17-04-FRCNN_000112/000103.jpg 9 -MOT17-04-FRCNN_000112/000171.jpg 9 -MOT17-13-FRCNN_000009/000009.jpg 10 -MOT17-13-FRCNN_000009/000005.jpg 10 -MOT17-13-FRCNN_000009/000010.jpg 10 -MOT17-13-FRCNN_000009/000007.jpg 10 -MOT17-13-FRCNN_000009/000008.jpg 10 -MOT17-13-FRCNN_000009/000002.jpg 10 -MOT17-13-FRCNN_000009/000006.jpg 10 -MOT17-13-FRCNN_000009/000004.jpg 10 -MOT17-13-FRCNN_000009/000001.jpg 10 -MOT17-13-FRCNN_000009/000003.jpg 10 -MOT17-13-FRCNN_000009/000000.jpg 10 diff --git a/tests/data/demo_sot_data/lasot/airplane-1/groundtruth.txt b/tests/data/demo_sot_data/lasot/airplane-1/groundtruth.txt deleted file mode 100644 index 496a7042e..000000000 --- a/tests/data/demo_sot_data/lasot/airplane-1/groundtruth.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,41,16 -366,103,45,16 -364,107,45,15 -362,109,46,16 -362,111,46,18 -362,113,46,18 -364,116,46,17 -366,118,45,17 -362,119,48,17 -359,119,45,17 -358,119,46,17 -360,121,46,17 -360,124,46,17 -359,124,47,17 -360,126,46,17 -356,127,46,18 -354,127,46,17 -352,127,46,17 -352,126,44,17 -349,126,46,17 -347,126,46,17 -346,125,46,17 -345,124,47,17 -345,124,46,17 -344,124,47,17 diff --git a/tests/data/demo_sot_data/lasot/airplane-1/track_results.txt b/tests/data/demo_sot_data/lasot/airplane-1/track_results.txt deleted file mode 100644 index 7648a02f5..000000000 --- a/tests/data/demo_sot_data/lasot/airplane-1/track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,408,117 -367,102,410,118 -363,105,406,121 -362,109,407,124 -361,112,407,128 -362,114,408,130 -364,116,410,132 -364,118,411,134 -360,120,408,136 -356,119,404,135 -356,119,404,135 -359,121,407,137 -359,124,407,141 -359,125,407,141 -358,126,406,143 -354,127,402,144 -351,127,400,144 -350,127,398,143 -349,127,397,143 -346,126,394,142 -344,126,392,143 -343,125,392,142 -343,123,392,140 -343,124,392,141 -341,124,392,141 diff --git a/tests/data/demo_sot_data/lasot/airplane-2/groundtruth.txt b/tests/data/demo_sot_data/lasot/airplane-2/groundtruth.txt deleted file mode 100644 index 2580b2402..000000000 --- a/tests/data/demo_sot_data/lasot/airplane-2/groundtruth.txt +++ /dev/null @@ -1,25 +0,0 @@ -76,74,367,151 -75,76,369,150 -78,76,368,150 -81,77,366,149 -82,76,367,150 -81,74,370,151 -81,74,370,152 -84,77,370,151 -89,79,371,149 -88,78,372,149 -88,78,372,150 -90,79,374,149 -90,80,374,149 -89,81,374,150 -92,81,375,150 -94,80,378,150 -95,80,379,150 -96,79,376,151 -96,79,375,152 -100,81,377,150 -102,81,377,150 -99,79,376,152 -99,82,379,150 -104,82,375,150 -100,81,379,152 diff --git a/tests/data/demo_sot_data/lasot/airplane-2/track_results.txt b/tests/data/demo_sot_data/lasot/airplane-2/track_results.txt deleted file mode 100644 index 2716acd33..000000000 --- a/tests/data/demo_sot_data/lasot/airplane-2/track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -15,123,544,267 -18,130,545,274 -23,106,553,252 -20,117,547,264 -17,122,545,267 -13,129,540,273 -24,104,551,249 -29,110,559,255 -34,113,566,258 -31,122,557,266 
-32,127,552,271 -30,135,548,276 -37,110,554,254 -31,112,558,258 -31,119,560,264 -21,124,547,268 -48,132,578,277 -22,102,553,249 -11,105,544,253 -19,110,551,257 -22,113,557,257 -32,112,567,255 -30,115,566,258 -34,116,570,261 -28,120,556,265 diff --git a/tests/data/demo_sot_data/lasot/create_assets.py b/tests/data/demo_sot_data/lasot/create_assets.py deleted file mode 100644 index 0dace74ab..000000000 --- a/tests/data/demo_sot_data/lasot/create_assets.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from collections import defaultdict - -import mmcv - - -def create_dummy_data(): - lasot_test = defaultdict(list) - records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1) - videos_list = ['airplane-1', 'airplane-2'] - - lasot_test['categories'] = [dict(id=0, name=0)] - - for video_name in videos_list: - video_path = video_name - video = dict(id=records['vid_id'], name=video_name) - lasot_test['videos'].append(video) - - gt_bboxes = mmcv.list_from_file( - osp.join(video_path, 'groundtruth.txt')) - - height, width, _ = (360, 640, 3) - for frame_id, gt_bbox in enumerate(gt_bboxes): - file_name = '%08d' % (frame_id + 1) + '.jpg' - file_name = osp.join(video_name, 'img', file_name) - image = dict( - file_name=file_name, - height=height, - width=width, - id=records['img_id'], - frame_id=frame_id, - video_id=records['vid_id']) - lasot_test['images'].append(image) - - x1, y1, w, h = gt_bbox.split(',') - ann = dict( - id=records['ann_id'], - image_id=records['img_id'], - instance_id=records['global_instance_id'], - category_id=0, - bbox=[int(x1), int(y1), int(w), - int(h)], - area=int(w) * int(h), - full_occlusion=False, - out_of_view=False) - lasot_test['annotations'].append(ann) - - records['ann_id'] += 1 - records['img_id'] += 1 - records['global_instance_id'] += 1 - records['vid_id'] += 1 - - mmcv.dump(lasot_test, 'lasot_test_dummy.json') - - -if __name__ == '__main__': - create_dummy_data() diff --git a/tests/data/demo_sot_data/lasot/lasot_test_dummy.json b/tests/data/demo_sot_data/lasot/lasot_test_dummy.json deleted file mode 100644 index 3d57d95a0..000000000 --- a/tests/data/demo_sot_data/lasot/lasot_test_dummy.json +++ /dev/null @@ -1,1172 +0,0 @@ -{ - "categories": [ - { - "id": 0, - "name": 0 - } - ], - "videos": [ - { - "id": 1, - "name": "airplane-1" - }, - { - "id": 2, - "name": "airplane-2" - } - ], - "images": [ - { - "file_name": "airplane-1/img/00000001.jpg", - "height": 360, - "width": 640, - "id": 1, - "frame_id": 0, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000002.jpg", - "height": 360, - "width": 640, - "id": 2, - "frame_id": 1, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000003.jpg", - "height": 360, - "width": 640, - "id": 3, - "frame_id": 2, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000004.jpg", - "height": 360, - "width": 640, - "id": 4, - "frame_id": 3, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000005.jpg", - "height": 360, - "width": 640, - "id": 5, - "frame_id": 4, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000006.jpg", - "height": 360, - "width": 640, - "id": 6, - "frame_id": 5, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000007.jpg", - "height": 360, - "width": 640, - "id": 7, - "frame_id": 6, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000008.jpg", - "height": 360, - "width": 640, - "id": 8, - "frame_id": 7, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000009.jpg", - "height": 360, - 
"width": 640, - "id": 9, - "frame_id": 8, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000010.jpg", - "height": 360, - "width": 640, - "id": 10, - "frame_id": 9, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000011.jpg", - "height": 360, - "width": 640, - "id": 11, - "frame_id": 10, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000012.jpg", - "height": 360, - "width": 640, - "id": 12, - "frame_id": 11, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000013.jpg", - "height": 360, - "width": 640, - "id": 13, - "frame_id": 12, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000014.jpg", - "height": 360, - "width": 640, - "id": 14, - "frame_id": 13, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000015.jpg", - "height": 360, - "width": 640, - "id": 15, - "frame_id": 14, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000016.jpg", - "height": 360, - "width": 640, - "id": 16, - "frame_id": 15, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000017.jpg", - "height": 360, - "width": 640, - "id": 17, - "frame_id": 16, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000018.jpg", - "height": 360, - "width": 640, - "id": 18, - "frame_id": 17, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000019.jpg", - "height": 360, - "width": 640, - "id": 19, - "frame_id": 18, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000020.jpg", - "height": 360, - "width": 640, - "id": 20, - "frame_id": 19, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000021.jpg", - "height": 360, - "width": 640, - "id": 21, - "frame_id": 20, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000022.jpg", - "height": 360, - "width": 640, - "id": 22, - "frame_id": 21, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000023.jpg", - "height": 360, - "width": 640, - "id": 23, - "frame_id": 22, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000024.jpg", - "height": 360, - "width": 640, - "id": 24, - "frame_id": 23, - "video_id": 1 - }, - { - "file_name": "airplane-1/img/00000025.jpg", - "height": 360, - "width": 640, - "id": 25, - "frame_id": 24, - "video_id": 1 - }, - { - "file_name": "airplane-2/img/00000001.jpg", - "height": 360, - "width": 640, - "id": 26, - "frame_id": 0, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000002.jpg", - "height": 360, - "width": 640, - "id": 27, - "frame_id": 1, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000003.jpg", - "height": 360, - "width": 640, - "id": 28, - "frame_id": 2, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000004.jpg", - "height": 360, - "width": 640, - "id": 29, - "frame_id": 3, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000005.jpg", - "height": 360, - "width": 640, - "id": 30, - "frame_id": 4, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000006.jpg", - "height": 360, - "width": 640, - "id": 31, - "frame_id": 5, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000007.jpg", - "height": 360, - "width": 640, - "id": 32, - "frame_id": 6, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000008.jpg", - "height": 360, - "width": 640, - "id": 33, - "frame_id": 7, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000009.jpg", - "height": 360, - "width": 640, - "id": 34, - "frame_id": 8, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000010.jpg", - "height": 360, - "width": 640, - "id": 35, - "frame_id": 9, - "video_id": 2 - }, - { - "file_name": 
"airplane-2/img/00000011.jpg", - "height": 360, - "width": 640, - "id": 36, - "frame_id": 10, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000012.jpg", - "height": 360, - "width": 640, - "id": 37, - "frame_id": 11, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000013.jpg", - "height": 360, - "width": 640, - "id": 38, - "frame_id": 12, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000014.jpg", - "height": 360, - "width": 640, - "id": 39, - "frame_id": 13, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000015.jpg", - "height": 360, - "width": 640, - "id": 40, - "frame_id": 14, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000016.jpg", - "height": 360, - "width": 640, - "id": 41, - "frame_id": 15, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000017.jpg", - "height": 360, - "width": 640, - "id": 42, - "frame_id": 16, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000018.jpg", - "height": 360, - "width": 640, - "id": 43, - "frame_id": 17, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000019.jpg", - "height": 360, - "width": 640, - "id": 44, - "frame_id": 18, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000020.jpg", - "height": 360, - "width": 640, - "id": 45, - "frame_id": 19, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000021.jpg", - "height": 360, - "width": 640, - "id": 46, - "frame_id": 20, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000022.jpg", - "height": 360, - "width": 640, - "id": 47, - "frame_id": 21, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000023.jpg", - "height": 360, - "width": 640, - "id": 48, - "frame_id": 22, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000024.jpg", - "height": 360, - "width": 640, - "id": 49, - "frame_id": 23, - "video_id": 2 - }, - { - "file_name": "airplane-2/img/00000025.jpg", - "height": 360, - "width": 640, - "id": 50, - "frame_id": 24, - "video_id": 2 - } - ], - "annotations": [ - { - "id": 1, - "image_id": 1, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 367, - 101, - 41, - 16 - ], - "area": 656, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 2, - "image_id": 2, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 366, - 103, - 45, - 16 - ], - "area": 720, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 3, - "image_id": 3, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 364, - 107, - 45, - 15 - ], - "area": 675, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 4, - "image_id": 4, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 362, - 109, - 46, - 16 - ], - "area": 736, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 5, - "image_id": 5, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 362, - 111, - 46, - 18 - ], - "area": 828, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 6, - "image_id": 6, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 362, - 113, - 46, - 18 - ], - "area": 828, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 7, - "image_id": 7, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 364, - 116, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 8, - "image_id": 8, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 366, - 118, - 45, - 17 - ], - "area": 765, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 9, - "image_id": 9, - "instance_id": 1, - "category_id": 
0, - "bbox": [ - 362, - 119, - 48, - 17 - ], - "area": 816, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 10, - "image_id": 10, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 359, - 119, - 45, - 17 - ], - "area": 765, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 11, - "image_id": 11, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 358, - 119, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 12, - "image_id": 12, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 360, - 121, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 13, - "image_id": 13, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 360, - 124, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 14, - "image_id": 14, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 359, - 124, - 47, - 17 - ], - "area": 799, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 15, - "image_id": 15, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 360, - 126, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 16, - "image_id": 16, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 356, - 127, - 46, - 18 - ], - "area": 828, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 17, - "image_id": 17, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 354, - 127, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 18, - "image_id": 18, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 352, - 127, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 19, - "image_id": 19, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 352, - 126, - 44, - 17 - ], - "area": 748, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 20, - "image_id": 20, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 349, - 126, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 21, - "image_id": 21, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 347, - 126, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 22, - "image_id": 22, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 346, - 125, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 23, - "image_id": 23, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 345, - 124, - 47, - 17 - ], - "area": 799, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 24, - "image_id": 24, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 345, - 124, - 46, - 17 - ], - "area": 782, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 25, - "image_id": 25, - "instance_id": 1, - "category_id": 0, - "bbox": [ - 344, - 124, - 47, - 17 - ], - "area": 799, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 26, - "image_id": 26, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 76, - 74, - 367, - 151 - ], - "area": 55417, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 27, - "image_id": 27, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 75, - 76, - 369, - 150 - ], - "area": 55350, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 28, - "image_id": 28, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 
78, - 76, - 368, - 150 - ], - "area": 55200, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 29, - "image_id": 29, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 81, - 77, - 366, - 149 - ], - "area": 54534, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 30, - "image_id": 30, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 82, - 76, - 367, - 150 - ], - "area": 55050, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 31, - "image_id": 31, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 81, - 74, - 370, - 151 - ], - "area": 55870, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 32, - "image_id": 32, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 81, - 74, - 370, - 152 - ], - "area": 56240, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 33, - "image_id": 33, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 84, - 77, - 370, - 151 - ], - "area": 55870, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 34, - "image_id": 34, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 89, - 79, - 371, - 149 - ], - "area": 55279, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 35, - "image_id": 35, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 88, - 78, - 372, - 149 - ], - "area": 55428, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 36, - "image_id": 36, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 88, - 78, - 372, - 150 - ], - "area": 55800, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 37, - "image_id": 37, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 90, - 79, - 374, - 149 - ], - "area": 55726, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 38, - "image_id": 38, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 90, - 80, - 374, - 149 - ], - "area": 55726, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 39, - "image_id": 39, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 89, - 81, - 374, - 150 - ], - "area": 56100, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 40, - "image_id": 40, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 92, - 81, - 375, - 150 - ], - "area": 56250, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 41, - "image_id": 41, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 94, - 80, - 378, - 150 - ], - "area": 56700, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 42, - "image_id": 42, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 95, - 80, - 379, - 150 - ], - "area": 56850, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 43, - "image_id": 43, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 96, - 79, - 376, - 151 - ], - "area": 56776, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 44, - "image_id": 44, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 96, - 79, - 375, - 152 - ], - "area": 57000, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 45, - "image_id": 45, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 100, - 81, - 377, - 150 - ], - "area": 56550, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 46, - "image_id": 46, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 102, - 81, - 377, - 150 - ], - "area": 56550, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 47, - "image_id": 47, - "instance_id": 2, - 
"category_id": 0, - "bbox": [ - 99, - 79, - 376, - 152 - ], - "area": 57152, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 48, - "image_id": 48, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 99, - 82, - 379, - 150 - ], - "area": 56850, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 49, - "image_id": 49, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 104, - 82, - 375, - 150 - ], - "area": 56250, - "full_occlusion": false, - "out_of_view": false - }, - { - "id": 50, - "image_id": 50, - "instance_id": 2, - "category_id": 0, - "bbox": [ - 100, - 81, - 379, - 152 - ], - "area": 57608, - "full_occlusion": false, - "out_of_view": false - } - ] -} diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/gt_for_eval.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/gt_for_eval.txt deleted file mode 100644 index 496a7042e..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/gt_for_eval.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,41,16 -366,103,45,16 -364,107,45,15 -362,109,46,16 -362,111,46,18 -362,113,46,18 -364,116,46,17 -366,118,45,17 -362,119,48,17 -359,119,45,17 -358,119,46,17 -360,121,46,17 -360,124,46,17 -359,124,47,17 -360,126,46,17 -356,127,46,18 -354,127,46,17 -352,127,46,17 -352,126,44,17 -349,126,46,17 -347,126,46,17 -346,125,46,17 -345,124,47,17 -345,124,46,17 -344,124,47,17 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/track_results.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/track_results.txt deleted file mode 100644 index 7648a02f5..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/anno/track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,408,117 -367,102,410,118 -363,105,406,121 -362,109,407,124 -361,112,407,128 -362,114,408,130 -364,116,410,132 -364,118,411,134 -360,120,408,136 -356,119,404,135 -356,119,404,135 -359,121,407,137 -359,124,407,141 -359,125,407,141 -358,126,406,143 -354,127,402,144 -351,127,400,144 -350,127,398,143 -349,127,397,143 -346,126,394,142 -344,126,392,143 -343,125,392,142 -343,123,392,140 -343,124,392,141 -341,124,392,141 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/full_occlusion.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/full_occlusion.txt deleted file mode 100755 index 15794e007..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/full_occlusion.txt +++ /dev/null @@ -1 +0,0 @@ -0,0 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/out_of_view.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/out_of_view.txt deleted file mode 100755 index 15794e007..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/out_of_view.txt +++ /dev/null @@ -1 +0,0 @@ -0,0 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/00000001.jpg b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/00000001.jpg deleted file mode 100644 index 81e94785b..000000000 Binary files a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/00000001.jpg and /dev/null differ diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/absence.label b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/absence.label deleted file mode 100644 index aa47d0d46..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/absence.label +++ /dev/null @@ -1,2 +0,0 @@ -0 -0 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/cover.label b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/cover.label deleted file mode 100644 index 49019db80..000000000 --- 
a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/cover.label +++ /dev/null @@ -1,2 +0,0 @@ -7 -7 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/gt_for_eval.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/gt_for_eval.txt deleted file mode 100644 index 496a7042e..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/gt_for_eval.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,41,16 -366,103,45,16 -364,107,45,15 -362,109,46,16 -362,111,46,18 -362,113,46,18 -364,116,46,17 -366,118,45,17 -362,119,48,17 -359,119,45,17 -358,119,46,17 -360,121,46,17 -360,124,46,17 -359,124,47,17 -360,126,46,17 -356,127,46,18 -354,127,46,17 -352,127,46,17 -352,126,44,17 -349,126,46,17 -347,126,46,17 -346,125,46,17 -345,124,47,17 -345,124,46,17 -344,124,47,17 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/track_results.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/track_results.txt deleted file mode 100644 index 7648a02f5..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,408,117 -367,102,410,118 -363,105,406,121 -362,109,407,124 -361,112,407,128 -362,114,408,130 -364,116,410,132 -364,118,411,134 -360,120,408,136 -356,119,404,135 -356,119,404,135 -359,121,407,137 -359,124,407,141 -359,125,407,141 -358,126,406,143 -354,127,402,144 -351,127,400,144 -350,127,398,143 -349,127,397,143 -346,126,394,142 -344,126,392,143 -343,125,392,142 -343,123,392,140 -343,124,392,141 -341,124,392,141 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/video-1.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/video-1.txt deleted file mode 100644 index c3d2ab4ee..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/video-1.txt +++ /dev/null @@ -1,2 +0,0 @@ -1,1,100,100 -1,1,100,100 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_gt_for_eval.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_gt_for_eval.txt deleted file mode 100644 index 68f7c1d1b..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_gt_for_eval.txt +++ /dev/null @@ -1,25 +0,0 @@ -367,101,408,101,408,117,367,117 -366,103,411,103,411,119,366,119 -364,107,409,107,409,122,364,122 -362,109,408,109,408,125,362,125 -362,111,408,111,408,129,362,129 -362,113,408,113,408,131,362,131 -364,116,410,116,410,133,364,133 -366,118,411,118,411,135,366,135 -362,119,410,119,410,136,362,136 -359,119,404,119,404,136,359,136 -358,119,404,119,404,136,358,136 -360,121,406,121,406,138,360,138 -360,124,406,124,406,141,360,141 -359,124,406,124,406,141,359,141 -360,126,406,126,406,143,360,143 -356,127,402,127,402,145,356,145 -354,127,400,127,400,144,354,144 -352,127,398,127,398,144,352,144 -352,126,396,126,396,143,352,143 -349,126,395,126,395,143,349,143 -347,126,393,126,393,143,347,143 -346,125,392,125,392,142,346,142 -345,124,392,124,392,141,345,141 -345,124,391,124,391,141,345,141 -344,124,391,124,391,141,344,141 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_track_results.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_track_results.txt deleted file mode 100644 index b735f2e17..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-1/vot2018_track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -1 -367,102,410,118 -363,105,406,121 -362,109,407,124 -361,112,407,128 -362,114,408,130 -364,116,410,132 -2 -0 -0 -0 -0 -1 -359,125,407,141 -358,126,406,143 -354,127,402,144 -351,127,400,144 
-350,127,398,143 -349,127,397,143 -346,126,394,142 -344,126,392,143 -343,125,392,142 -343,123,392,140 -343,124,392,141 -341,124,392,1416 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/00000001.jpg b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/00000001.jpg deleted file mode 100644 index 81e94785b..000000000 Binary files a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/00000001.jpg and /dev/null differ diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/absence.label b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/absence.label deleted file mode 100644 index aa47d0d46..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/absence.label +++ /dev/null @@ -1,2 +0,0 @@ -0 -0 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/cover.label b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/cover.label deleted file mode 100644 index 49019db80..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/cover.label +++ /dev/null @@ -1,2 +0,0 @@ -7 -7 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/gt_for_eval.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/gt_for_eval.txt deleted file mode 100644 index 2580b2402..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/gt_for_eval.txt +++ /dev/null @@ -1,25 +0,0 @@ -76,74,367,151 -75,76,369,150 -78,76,368,150 -81,77,366,149 -82,76,367,150 -81,74,370,151 -81,74,370,152 -84,77,370,151 -89,79,371,149 -88,78,372,149 -88,78,372,150 -90,79,374,149 -90,80,374,149 -89,81,374,150 -92,81,375,150 -94,80,378,150 -95,80,379,150 -96,79,376,151 -96,79,375,152 -100,81,377,150 -102,81,377,150 -99,79,376,152 -99,82,379,150 -104,82,375,150 -100,81,379,152 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/track_results.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/track_results.txt deleted file mode 100644 index 2716acd33..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -15,123,544,267 -18,130,545,274 -23,106,553,252 -20,117,547,264 -17,122,545,267 -13,129,540,273 -24,104,551,249 -29,110,559,255 -34,113,566,258 -31,122,557,266 -32,127,552,271 -30,135,548,276 -37,110,554,254 -31,112,558,258 -31,119,560,264 -21,124,547,268 -48,132,578,277 -22,102,553,249 -11,105,544,253 -19,110,551,257 -22,113,557,257 -32,112,567,255 -30,115,566,258 -34,116,570,261 -28,120,556,265 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/video-2.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/video-2.txt deleted file mode 100644 index c3d2ab4ee..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/video-2.txt +++ /dev/null @@ -1,2 +0,0 @@ -1,1,100,100 -1,1,100,100 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_gt_for_eval.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_gt_for_eval.txt deleted file mode 100644 index 8814fef9e..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_gt_for_eval.txt +++ /dev/null @@ -1,25 +0,0 @@ -76,74,443,74,443,225,76,225 -75,76,444,76,444,226,75,226 -78,76,446,76,446,226,78,226 -81,77,447,77,447,226,81,226 -82,76,449,76,449,226,82,226 -81,74,451,74,451,225,81,225 -81,74,451,74,451,226,81,226 -84,77,454,77,454,228,84,228 -89,79,460,79,460,228,89,228 -88,78,460,78,460,227,88,227 -88,78,460,78,460,228,88,228 -90,79,464,79,464,228,90,228 -90,80,464,80,464,229,90,229 -89,81,463,81,463,231,89,231 -92,81,467,81,467,231,92,231 
-94,80,472,80,472,230,94,230 -95,80,474,80,474,230,95,230 -96,79,472,79,472,230,96,230 -96,79,471,79,471,231,96,231 -100,81,477,81,477,231,100,231 -102,81,479,81,479,231,102,231 -99,79,475,79,475,231,99,231 -99,82,478,82,478,232,99,232 -104,82,479,82,479,232,104,232 -100,81,479,81,479,233,100,233 diff --git a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_track_results.txt b/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_track_results.txt deleted file mode 100644 index 11e302839..000000000 --- a/tests/data/demo_sot_data/trackingnet/TRAIN_0/video-2/vot2018_track_results.txt +++ /dev/null @@ -1,25 +0,0 @@ -1 -18,130,545,274 -23,106,553,252 -20,117,547,264 -17,122,545,267 -13,129,540,273 -24,104,551,249 -2 -0 -0 -0 -0 -1 -31,112,558,258 -31,119,560,264 -21,124,547,268 -48,132,578,277 -22,102,553,249 -11,105,544,253 -19,110,551,257 -22,113,557,257 -32,112,567,255 -30,115,566,258 -34,116,570,261 -2 diff --git a/tests/data/demo_sot_data/trackingnet/annotations/trackingnet_train_infos.txt b/tests/data/demo_sot_data/trackingnet/annotations/trackingnet_train_infos.txt deleted file mode 100644 index e54c2c885..000000000 --- a/tests/data/demo_sot_data/trackingnet/annotations/trackingnet_train_infos.txt +++ /dev/null @@ -1,3 +0,0 @@ -The format of each line in this txt is (video_path,annotation_path,start_frame_id,end_frame_id) -TRAIN_0/video-1,TRAIN_0/video-1/video-1.txt,0,1 -TRAIN_0/video-2,TRAIN_0/video-2/video-2.txt,0,1 diff --git a/tests/data/demo_tao_data/ann.json b/tests/data/demo_tao_data/ann.json deleted file mode 100644 index a38e0d555..000000000 --- a/tests/data/demo_tao_data/ann.json +++ /dev/null @@ -1,253 +0,0 @@ -{ - "videos": [ - { - "id": 0, - "width": 640, - "height": 480, - "neg_category_ids": [ - 342, - 57, - 651, - 357, - 738 - ], - "not_exhaustive_category_ids": [ - 805, - 95 - ], - "name": "train/YFCC100M/v_f69ebe5b731d3e87c1a3992ee39c3b7e", - "metadata": { - "dataset": "YFCC100M", - "user_id": "22634709@N00", - "username": "Amsterdamized" - }, - "frame_range": 30 - } - ], - "annotations": [ - { - "segmentation": [ - [ - 156, - 97, - 306, - 97, - 306, - 398, - 156, - 398 - ] - ], - "bbox": [ - 156.0, - 97.0, - 150.0, - 301.0 - ], - "area": 45150, - "iscrowd": 0, - "id": 1, - "image_id": 0, - "category_id": 805, - "track_id": 1, - "_scale_uuid": "82addabd-8991-49f8-b63e-e0e767d4e695", - "scale_category": "moving object", - "video_id": 0, - "instance_id": 1 - }, - { - "segmentation": [ - [ - 162, - 240, - 296, - 240, - 296, - 480, - 162, - 480 - ] - ], - "bbox": [ - 162.0, - 240.0, - 134.0, - 240.0 - ], - "area": 32160, - "iscrowd": 0, - "id": 3, - "image_id": 0, - "category_id": 95, - "track_id": 3, - "_scale_uuid": "040cfc07-5f1a-4736-b39f-b9d3a9e2a5ef", - "scale_category": "moving object", - "video_id": 0, - "instance_id": 3 - }, - { - "segmentation": [ - [ - 58, - 63, - 465, - 63, - 465, - 480, - 58, - 480 - ] - ], - "bbox": [ - 58.0, - 63.0, - 407.0, - 417.0 - ], - "area": 169719, - "iscrowd": 0, - "id": 6, - "image_id": 1, - "category_id": 805, - "track_id": 1, - "_scale_uuid": "82addabd-8991-49f8-b63e-e0e767d4e695", - "scale_category": "moving object", - "video_id": 0, - "instance_id": 1 - }, - { - "segmentation": [ - [ - 332, - 377, - 440, - 377, - 440, - 480, - 332, - 480 - ] - ], - "bbox": [ - 332.0, - 377.0, - 108.0, - 103.0 - ], - "area": 11124, - "iscrowd": 0, - "id": 7, - "image_id": 1, - "category_id": 95, - "track_id": 3, - "_scale_uuid": "040cfc07-5f1a-4736-b39f-b9d3a9e2a5ef", - "scale_category": "moving object", - "video_id": 0, - 
"instance_id": 3 - } - ], - "tracks": [ - { - "id": 1, - "category_id": 805, - "video_id": 0 - }, - { - "id": 3, - "category_id": 95, - "video_id": 0 - } - ], - "images": [ - { - "id": 0, - "video": "train/YFCC100M/v_f69ebe5b731d3e87c1a3992ee39c3b7e", - "_scale_task_id": "5de800eddb2c18001a56aa11", - "width": 640, - "height": 480, - "file_name": "train/YFCC100M/v_f69ebe5b731d3e87c1a3992ee39c3b7e/frame0391.jpg", - "frame_index": 390, - "license": 0, - "video_id": 0, - "frame_id": 0, - "neg_category_ids": [ - 342, - 57, - 651, - 357, - 738 - ], - "not_exhaustive_category_ids": [ - 805, - 95 - ] - }, - { - "id": 1, - "video": "train/YFCC100M/v_f69ebe5b731d3e87c1a3992ee39c3b7e", - "_scale_task_id": "5de800eddb2c18001a56aa11", - "width": 640, - "height": 480, - "file_name": "train/YFCC100M/v_f69ebe5b731d3e87c1a3992ee39c3b7e/frame0421.jpg", - "frame_index": 420, - "license": 0, - "video_id": 0, - "frame_id": 1, - "neg_category_ids": [ - 342, - 57, - 651, - 357, - 738 - ], - "not_exhaustive_category_ids": [ - 805, - 95 - ] - } - ], - "info": { - "year": 2020, - "version": "0.1.20200120", - "description": "Annotations imported from Scale", - "contributor": "", - "url": "", - "date_created": "2020-01-20 15:49:53.519740" - }, - "categories": [ - { - "frequency": "c", - "id": 950, - "synset": "serving_dish.n.01", - "image_count": 0, - "instance_count": 0, - "synonyms": [ - "serving_dish" - ], - "def": "a dish used for serving food", - "name": "serving_dish" - }, - { - "frequency": "f", - "id": 805, - "synset": "person.n.01", - "image_count": 93, - "instance_count": 487, - "synonyms": [ - "baby", - "child", - "boy", - "girl", - "man", - "woman", - "person", - "human" - ], - "def": "a human being", - "name": "baby" - } - ], - "licenses": [ - "Unknown" - ] -} diff --git a/tests/data/demo_vis_data/ann.json b/tests/data/demo_vis_data/ann.json deleted file mode 100644 index f5e62ba59..000000000 --- a/tests/data/demo_vis_data/ann.json +++ /dev/null @@ -1,269 +0,0 @@ -{ - "categories": [ - { - "supercategory": "object", - "id": 1, - "name": "sedan" - } - ], - "videos": [ - { - "id": 1, - "name": "0043f083b5", - "width": 1280, - "height": 720 - } - ], - "images": [ - { - "file_name": "0043f083b5/00000.jpg", - "height": 720, - "width": 1280, - "id": 1, - "frame_id": 0, - "video_id": 1 - } - ], - "annotations": [ - { - "id": 1, - "video_id": 1, - "image_id": 1, - "category_id": 1, - "instance_id": 1, - "bbox": [ - 1174, - 335, - 104, - 79 - ], - "segmentation": { - "counts":[ - 845683, - 7, - 696, - 10, - 7, - 7, - 685, - 35, - 681, - 43, - 676, - 44, - 675, - 45, - 647, - 14, - 13, - 46, - 647, - 73, - 645, - 75, - 645, - 75, - 645, - 71, - 648, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 653, - 67, - 653, - 67, - 653, - 67, - 653, - 67, - 653, - 67, - 652, - 68, - 652, - 68, - 652, - 68, - 652, - 68, - 651, - 69, - 651, - 69, - 651, - 69, - 651, - 69, - 651, - 69, - 651, - 70, - 650, - 71, - 649, - 71, - 649, - 71, - 649, - 72, - 648, - 72, - 648, - 72, - 648, - 72, - 648, - 72, - 648, - 72, - 648, - 71, - 649, - 71, - 649, - 70, - 650, - 69, - 651, - 68, - 652, - 68, - 652, - 68, - 652, - 68, - 652, - 68, - 652, - 68, - 652, - 67, - 654, - 66, - 654, - 66, - 654, - 67, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 66, - 654, - 71, - 649, - 73, - 647, - 75, - 646, - 74, - 646, - 74, - 646, - 74, - 646, - 74, - 647, - 73, - 647, - 73, - 647, - 73, - 648, - 71, - 649, - 70, - 651, - 69, - 651, - 69, - 651, - 68, - 653, - 65, - 656, - 62, - 659, - 59, - 
662, - 56, - 664, - 55, - 665, - 55, - 665, - 55, - 666, - 54, - 666, - 54, - 667, - 53, - 668, - 52, - 670, - 50, - 672, - 48, - 676, - 44, - 678, - 46, - 676, - 45, - 676, - 45, - 675, - 45, - 674, - 46, - 674, - 46, - 673, - 47, - 673, - 47, - 673, - 6, - 1, - 40, - 673, - 5, - 4, - 38, - 684, - 36, - 686, - 33, - 689, - 30, - 692, - 20, - 1764 - ], - "size": [ - 720, - 1280 - ] - }, - "area": 6377, - "iscrowd": 0 - } - ] -} diff --git a/tests/data/demo_vis_data/results.json b/tests/data/demo_vis_data/results.json deleted file mode 100644 index 5bd519f3d..000000000 --- a/tests/data/demo_vis_data/results.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "track_bboxes": [ - [ - [ - [ - 3, - 1169.290283203125, - 313.9691467285156, - 1210.27490234375, - 390.8688659667969, - 0.998499870300293 - ] - ] - ] - ], - "track_masks": [ - [ - [ - { - "size": [ - 720, - 1280 - ], - "counts": "ckii07he03_ZOMVe0l0L8K1O1TOQOV\\OP1jc0k0N2000L3K600000O100000000O1000000O10000000001O1O00001O0000000000O100O1O1O10000000000O2O00010O0000000000000005K2N2OO0000001O00001N1O2O000O2M3M3M3M2O10001O001O1O2N2N4L22NO00O0O100O1000gNP[OR1Pe0mNS[OQ1Xe0N2M3M3F`Q1" - } - ] - ] - ] -} diff --git a/tests/data/image_1.jpg b/tests/data/image_1.jpg deleted file mode 100644 index 81e94785b..000000000 Binary files a/tests/data/image_1.jpg and /dev/null differ diff --git a/tests/data/image_2.jpg b/tests/data/image_2.jpg deleted file mode 100644 index 81e94785b..000000000 Binary files a/tests/data/image_2.jpg and /dev/null differ diff --git a/tests/data/image_3.jpg b/tests/data/image_3.jpg deleted file mode 100644 index 81e94785b..000000000 Binary files a/tests/data/image_3.jpg and /dev/null differ diff --git a/tests/test_core/test_bbox/test_bbox_transforms.py b/tests/test_core/test_bbox/test_bbox_transforms.py deleted file mode 100644 index 921aff179..000000000 --- a/tests/test_core/test_bbox/test_bbox_transforms.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch - -from mmtrack.core.bbox import (bbox_cxcyah_to_xyxy, bbox_cxcywh_to_x1y1wh, - bbox_xyxy_to_cxcyah, bbox_xyxy_to_x1y1wh, - quad2bbox) - - -def test_quad2bbox(): - quad = torch.zeros((5, 8), dtype=torch.float) - low_coord_index = torch.tensor([0, 1, 3, 6], dtype=torch.long) - high_coord_index = torch.tensor([2, 4, 5, 7], dtype=torch.long) - quad[:, low_coord_index] = torch.randint(1, 10, (5, 4), dtype=torch.float) - quad[:, high_coord_index] = torch.randint( - 10, 20, (5, 4), dtype=torch.float) - bbox = quad2bbox(quad) - assert (bbox > 0).all() - - -def test_bbox_cxcywh_to_x1y1wh(): - cx = torch.randint(1, 10, (5, 1), dtype=torch.float) - cy = torch.randint(1, 10, (5, 1), dtype=torch.float) - w = torch.randint(1, 10, (5, 1), dtype=torch.float) - h = torch.randint(1, 10, (5, 1), dtype=torch.float) - bbox = torch.cat((cx, cy, w, h), dim=-1) - bbox_new = bbox_cxcywh_to_x1y1wh(bbox) - assert (bbox_new[:, :2] < bbox[:, :2]).all() - - -def test_bbox_xyxy_to_x1y1wh(): - x1 = torch.randint(1, 10, (5, 1), dtype=torch.float) - y1 = torch.randint(1, 10, (5, 1), dtype=torch.float) - x2 = torch.randint(10, 20, (5, 1), dtype=torch.float) - y2 = torch.randint(10, 20, (5, 1), dtype=torch.float) - bbox = torch.cat((x1, y1, x2, y2), dim=-1) - bbox_new = bbox_xyxy_to_x1y1wh(bbox) - assert (bbox_new[:, 2:] > 0).all() - - -def test_bbox_xyxy_to_cxcyah(): - x1 = torch.randint(1, 10, (5, 1), dtype=torch.float) - y1 = torch.randint(1, 10, (5, 1), dtype=torch.float) - x2 = torch.randint(10, 20, (5, 1), dtype=torch.float) - y2 = torch.randint(10, 20, (5, 1), dtype=torch.float) - bbox = torch.cat((x1, y1, x2, y2), dim=-1) - bbox_new = bbox_xyxy_to_cxcyah(bbox) - assert (bbox_new > 0).all() - - -def test_bbox_cxcyah_to_xyxy(): - cx = torch.randint(1, 10, (5, 1), dtype=torch.float) - cy = torch.randint(1, 10, (5, 1), dtype=torch.float) - ratio = torch.randint(10, 20, (5, 1), dtype=torch.float) - h = torch.randint(10, 20, (5, 1), dtype=torch.float) - bbox = torch.cat((cx, cy, ratio, h), dim=-1) - bbox_new = bbox_cxcyah_to_xyxy(bbox) - assert bbox_new.shape == bbox.shape diff --git a/tests/test_core/test_track/test_interpolation.py b/tests/test_core/test_track/test_interpolation.py deleted file mode 100644 index bb9960c45..000000000 --- a/tests/test_core/test_track/test_interpolation.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np - - -def test_interpolate_tracks(): - from mmtrack.core import interpolate_tracks - frame_id = np.arange(100) // 10 - tracklet_id = np.random.randint(low=1, high=5, size=(100)) - bboxes = np.random.random((100, 4)) * 100 - scores = np.random.random((100)) * 100 - in_results = np.concatenate( - (frame_id[:, None], tracklet_id[:, None], bboxes, scores[:, None]), - axis=1) - out_results = interpolate_tracks(in_results) - assert out_results.shape[1] == in_results.shape[1] - # the range of frame ids should not change - assert min(out_results[:, 0]) == min(in_results[:, 0]) - assert max(out_results[:, 0]) == max(in_results[:, 0]) - # the range of track ids should not change - assert min(out_results[:, 1]) == min(in_results[:, 1]) - assert max(out_results[:, 1]) == max(in_results[:, 1]) diff --git a/tests/test_core/test_track/test_similarity.py b/tests/test_core/test_track/test_similarity.py deleted file mode 100644 index 8feea0968..000000000 --- a/tests/test_core/test_track/test_similarity.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch - - -def test_embed_similarity(): - from mmtrack.core import embed_similarity - key_embeds = torch.randn(20, 256) - ref_embeds = torch.randn(10, 256) - - sims = embed_similarity( - key_embeds, ref_embeds, method='dot_product', temperature=-1) - assert sims.size() == (20, 10) - - sims = embed_similarity( - key_embeds, ref_embeds, method='dot_product', temperature=0.07) - assert sims.size() == (20, 10) - - sims = embed_similarity( - key_embeds, ref_embeds, method='cosine', temperature=-1) - assert sims.size() == (20, 10) - assert sims.max() <= 1 - - key_embeds = torch.randn(20, 256) - ref_embeds = torch.randn(0, 256) - sims = embed_similarity( - key_embeds, ref_embeds, method='cosine', temperature=-1) - assert sims.size() == (20, 0) - - key_embeds = torch.randn(0, 256) - ref_embeds = torch.randn(10, 256) - sims = embed_similarity( - key_embeds, ref_embeds, method='dot_product', temperature=0.07) - assert sims.size() == (0, 10) diff --git a/tests/test_core/test_track/test_transforms.py b/tests/test_core/test_track/test_transforms.py deleted file mode 100644 index c8bfd53e4..000000000 --- a/tests/test_core/test_track/test_transforms.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmdet.core.bbox.demodata import random_boxes - - -def test_imrenormalize(): - from mmtrack.core import imrenormalize - img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True) - new_img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True) - - img = np.random.randn(128, 256, 3).astype(np.float32) - new_img = imrenormalize(img, img_norm_cfg, new_img_norm_cfg) - assert isinstance(new_img, np.ndarray) - assert new_img.shape == (128, 256, 3) - assert np.allclose(img, new_img, atol=1e-6) - - img = torch.randn(1, 3, 128, 256, dtype=torch.float) - new_img = imrenormalize(img, img_norm_cfg, new_img_norm_cfg) - assert isinstance(new_img, torch.Tensor) - assert new_img.shape == (1, 3, 128, 256) - assert np.allclose(img, new_img, atol=1e-6) - - -def test_outs2results(): - from mmtrack.core import outs2results - - # pseudo data - num_objects, num_classes, image_size = 8, 4, 100 - bboxes = random_boxes(num_objects, image_size) - scores = torch.FloatTensor(num_objects, 1).uniform_(0, 1) - bboxes = torch.cat([bboxes, scores], dim=1) - # leave the results of the last class as empty - labels = torch.randint(0, num_classes - 1, (num_objects, )) - ids = torch.arange(num_objects) - masks = torch.randint(0, 2, (num_objects, image_size, image_size)).bool() - - # test track2result without ids - results = outs2results( - bboxes=bboxes, labels=labels, masks=masks, num_classes=num_classes) - - for key in ['bbox_results', 'mask_results']: - assert key in results - assert len(results['bbox_results']) == num_classes - assert isinstance(results['bbox_results'][0], np.ndarray) - assert results['bbox_results'][-1].shape == (0, 5) - assert len(results['mask_results']) == num_classes - assert isinstance(results['mask_results'][-1], list) - assert len(results['mask_results'][-1]) == 0 - for i in range(num_classes): - assert results['bbox_results'][i].shape[0] == (labels == i).sum() - assert results['bbox_results'][i].shape[1] == 5 - assert len(results['mask_results'][i]) == (labels == i).sum() - if len(results['mask_results'][i]) > 0: - assert results['mask_results'][i][0].shape == (image_size, - image_size) - - # test track2result with ids - results = outs2results( - 
bboxes=bboxes, - labels=labels, - masks=masks, - ids=ids, - num_classes=num_classes) - - for key in ['bbox_results', 'mask_results']: - assert key in results - assert len(results['bbox_results']) == num_classes - assert isinstance(results['bbox_results'][0], np.ndarray) - assert results['bbox_results'][-1].shape == (0, 6) - assert len(results['mask_results']) == num_classes - assert isinstance(results['mask_results'][-1], list) - assert len(results['mask_results'][-1]) == 0 - for i in range(num_classes): - assert results['bbox_results'][i].shape[0] == (labels == i).sum() - assert results['bbox_results'][i].shape[1] == 6 - assert len(results['mask_results'][i]) == (labels == i).sum() - if len(results['mask_results'][i]) > 0: - assert results['mask_results'][i][0].shape == (image_size, - image_size) - - -def test_results2outs(): - from mmtrack.core import results2outs - num_classes = 3 - num_objects = [2, 0, 2] - gt_labels = [] - for id, num in enumerate(num_objects): - gt_labels.extend([id for _ in range(num)]) - image_size = 100 - - bbox_results = [ - np.random.randint(low=0, high=image_size, size=(num_objects[i], 5)) - for i in range(num_classes) - ] - bbox_results_with_ids = [ - np.random.randint(low=0, high=image_size, size=(num_objects[i], 6)) - for i in range(num_classes) - ] - mask_results = [[] for i in range(num_classes)] - for cls_id in range(num_classes): - for obj_id in range(num_objects[cls_id]): - mask_results[cls_id].append( - np.random.randint(0, 2, (image_size, image_size))) - - # test results2outs without ids - outs = results2outs( - bbox_results=bbox_results, - mask_results=mask_results, - mask_shape=(image_size, image_size)) - - for key in ['bboxes', 'labels', 'masks']: - assert key in outs - assert outs['bboxes'].shape == (sum(num_objects), 5) - assert (outs['labels'] == np.array(gt_labels)).all() - assert outs['masks'].shape == (sum(num_objects), image_size, image_size) - - # test results2outs with ids - outs = results2outs( - bbox_results=bbox_results_with_ids, - mask_results=mask_results, - mask_shape=(image_size, image_size)) - - for key in ['bboxes', 'labels', 'ids', 'masks']: - assert key in outs - assert outs['bboxes'].shape == (sum(num_objects), 5) - assert (outs['labels'] == np.array(gt_labels)).all() - assert outs['ids'].shape == (sum(num_objects), ) - assert outs['masks'].shape == (sum(num_objects), image_size, image_size) diff --git a/tests/test_data/test_datasets/__init__.py b/tests/test_data/test_datasets/__init__.py deleted file mode 100644 index 85b6b53f2..000000000 --- a/tests/test_data/test_datasets/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .utils import _create_coco_gt_results - -__all__ = ['_create_coco_gt_results'] diff --git a/tests/test_data/test_datasets/test_coco_video_dataset.py b/tests/test_data/test_datasets/test_coco_video_dataset.py deleted file mode 100644 index a534a1121..000000000 --- a/tests/test_data/test_datasets/test_coco_video_dataset.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os.path as osp - -import pytest - -from mmtrack.datasets import DATASETS as DATASETS -from .utils import _create_coco_gt_results - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -# This is a demo annotation file for CocoVideoDataset -# 1 videos, 2 categories ('car', 'person') -# 8 images, 2 instances -> [4, 3] objects -# 1 ignore, 2 crowd -DEMO_ANN_FILE = f'{PREFIX}/demo_cocovid_data/ann.json' - - -@pytest.mark.parametrize('dataset', ['CocoVideoDataset']) -def test_parse_ann_info(dataset): - dataset_class = DATASETS.get(dataset) - - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, classes=('car', 'person'), pipeline=[]) - - # image 1 doesn't have gt and detected objects - img_id = 1 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (0, 4) - assert ann['bboxes_ignore'].shape == (3, 4) - - # image 5 has 2 objects - img_id = 5 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (2, 4) - assert ann['bboxes_ignore'].shape == (0, 4) - - # image 8 doesn't have objects - img_id = 8 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (0, 4) - assert ann['bboxes_ignore'].shape == (0, 4) - - -@pytest.mark.parametrize('dataset', ['CocoVideoDataset']) -def test_prepare_data(dataset): - dataset_class = DATASETS.get(dataset) - - # train - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, - classes=['car', 'person'], - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=1, - filter_key_img=True, - method='uniform'), - pipeline=[], - test_mode=False) - assert len(dataset) == 7 - - results = dataset.prepare_train_img(0) - assert isinstance(results, list) - assert len(results) == 2 - assert 'ann_info' in results[0] - assert results[0].keys() == results[1].keys() - - dataset.ref_img_sampler = None - results = dataset.prepare_train_img(0) - assert isinstance(results, dict) - assert 'ann_info' in results - - # test - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, - classes=['car', 'person'], - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=1, - filter_key_img=True, - method='uniform'), - pipeline=[], - test_mode=True) - assert len(dataset) == 8 - - results = dataset.prepare_test_img(0) - assert isinstance(results, list) - assert len(results) == 2 - assert 'ann_info' not in results[0] - assert results[0].keys() == results[1].keys() - - dataset.ref_img_sampler = None - results = dataset.prepare_test_img(0) - assert isinstance(results, dict) - assert 'ann_info' not in results - - -@pytest.mark.parametrize('dataset', ['CocoVideoDataset']) -def test_video_data_sampling(dataset): - dataset_class = DATASETS.get(dataset) - - # key image sampling - for interval in [4, 2, 1]: - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, - load_as_video=True, - classes=['car', 'person'], - key_img_sampler=dict(interval=interval), - ref_img_sampler=dict( - num_ref_imgs=1, - frame_range=3, - filter_key_frame=True, - method='uniform'), - pipeline=[], - test_mode=True) - assert len(dataset.data_infos) == 8 // interval - - # ref image sampling - data = dataset.data_infos[3] - sampler = 
dict(num_ref_imgs=1, frame_range=3, method='uniform') - ref_data = dataset.ref_img_sampling(data, **sampler)[1] - assert abs(ref_data['frame_id'] - - data['frame_id']) <= sampler['frame_range'] - sampler = dict(num_ref_imgs=2, frame_range=3, method='bilateral_uniform') - ref_data = dataset.ref_img_sampling(data, **sampler) - assert len(ref_data) == 3 - ref_data = dataset.ref_img_sampling(data, **sampler, return_key_img=False) - assert len(ref_data) == 2 - assert ref_data[0]['frame_id'] < data['frame_id'] - assert ref_data[1]['frame_id'] > data['frame_id'] - assert data['frame_id'] - ref_data[0]['frame_id'] <= sampler['frame_range'] - assert ref_data[1]['frame_id'] - data['frame_id'] <= sampler['frame_range'] - - -def test_coco_video_evaluation(): - classes = ('car', 'person') - dataset_class = DATASETS.get('CocoVideoDataset') - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, classes=classes, pipeline=[]) - results = _create_coco_gt_results(dataset) - eval_results = dataset.evaluate(results, metric=['bbox', 'track']) - assert eval_results['bbox_mAP'] == 1.0 - assert eval_results['bbox_mAP_50'] == 1.0 - assert eval_results['bbox_mAP_75'] == 1.0 - assert 'bbox_mAP_copypaste' in eval_results - assert eval_results['MOTA'] == 1.0 - assert eval_results['IDF1'] == 1.0 - assert eval_results['MT'] == 2 - assert 'track_OVERALL_copypaste' in eval_results - assert 'track_AVERAGE_copypaste' in eval_results - - classes = ('car', ) - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, classes=classes, pipeline=[]) - results = _create_coco_gt_results(dataset) - eval_results = dataset.evaluate(results, metric=['bbox', 'track']) - assert eval_results['bbox_mAP'] == 1.0 - assert eval_results['bbox_mAP_50'] == 1.0 - assert eval_results['bbox_mAP_75'] == 1.0 - assert 'bbox_mAP_copypaste' in eval_results - assert eval_results['MOTA'] == 1.0 - assert eval_results['IDF1'] == 1.0 - assert eval_results['MT'] == 1 - assert 'track_OVERALL_copypaste' in eval_results - assert 'track_AVERAGE_copypaste' in eval_results diff --git a/tests/test_data/test_datasets/test_common.py b/tests/test_data/test_datasets/test_common.py deleted file mode 100644 index 14ee2d562..000000000 --- a/tests/test_data/test_datasets/test_common.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging -import tempfile -from unittest.mock import MagicMock, patch - -import pytest -import torch -import torch.nn as nn -from mmcv.runner import EpochBasedRunner -from torch.utils.data import DataLoader - -from mmtrack.core.evaluation import DistEvalHook, EvalHook - - -@patch('mmtrack.apis.single_gpu_test', MagicMock) -@patch('mmtrack.apis.multi_gpu_test', MagicMock) -@pytest.mark.parametrize('EvalHookParam', (EvalHook, DistEvalHook)) -def test_evaluation_hook(EvalHookParam): - # create dummy data - dataloader = DataLoader(torch.ones((5, 2))) - dataloader.dataset.load_as_video = True - - # 0.1. dataloader is not a DataLoader object - with pytest.raises(TypeError): - EvalHookParam(dataloader=MagicMock(), interval=-1) - - # 0.2. negative interval - with pytest.raises(ValueError): - EvalHookParam(dataloader, interval=-1) - - # 1. start=None, interval=1: perform evaluation after each epoch. - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, interval=1) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 2 - - # 2. start=1, interval=1: perform evaluation after each epoch. 
- runner = _build_demo_runner() - - evalhook = EvalHookParam(dataloader, start=1, interval=1) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 2 - - # 3. start=None, interval=2: perform evaluation after epoch 2, 4, 6, etc - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, interval=2) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 1 # after epoch 2 - - # 4. start=1, interval=2: perform evaluation after epoch 1, 3, 5, etc - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, start=1, interval=2) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 3) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 3 - - # 5. start=0/negative, interval=1: perform evaluation after each epoch and - # before epoch 1. - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, start=0) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 3 # before epoch1 and after e1 & e2 - - # the evaluation start epoch cannot be less than 0 - runner = _build_demo_runner() - with pytest.raises(ValueError): - EvalHookParam(dataloader, start=-2) - - # 6. resuming from epoch i, start = x (x<=i), interval =1: perform - # evaluation after each epoch and before the first epoch. - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, start=1) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner._epoch = 2 - runner.run([dataloader], [('train', 1)], 3) - assert evalhook.evaluate.call_count == 2 # before & after epoch 3 - - # 7. resuming from epoch i, start = i+1/None, interval =1: perform - # evaluation after each epoch. - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, start=2) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner._epoch = 1 - runner.run([dataloader], [('train', 1)], 3) - assert evalhook.evaluate.call_count == 2 # after epoch 2 & 3 - - -def _build_demo_runner(): - - class Model(nn.Module): - - def __init__(self): - super().__init__() - self.linear = nn.Linear(2, 1) - - def forward(self, x): - return self.linear(x) - - def train_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - def val_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - model = Model() - tmp_dir = tempfile.mkdtemp() - - runner = EpochBasedRunner( - model=model, work_dir=tmp_dir, logger=logging.getLogger()) - return runner diff --git a/tests/test_data/test_datasets/test_dataset_wrapers.py b/tests/test_data/test_datasets/test_dataset_wrapers.py deleted file mode 100644 index c9dc8e4c5..000000000 --- a/tests/test_data/test_datasets/test_dataset_wrapers.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os.path as osp - -from mmtrack.datasets import build_dataset - -PREFIX = osp.join(osp.dirname(__file__), '../../data/demo_sot_data/') - - -def test_random_sample_concatdataset(): - train_cfg = dict( - type='RandomSampleConcatDataset', - dataset_sampling_weights=[1, 1], - dataset_cfgs=[ - dict( - type='GOT10kDataset', - ann_file=PREFIX + - 'trackingnet/annotations/trackingnet_train_infos.txt', - img_prefix=PREFIX + 'trackingnet', - pipeline=[], - split='train', - test_mode=False), - dict( - type='TrackingNetDataset', - chunks_list=[0], - ann_file=PREFIX + - 'trackingnet/annotations/trackingnet_train_infos.txt', - img_prefix=PREFIX + 'trackingnet', - pipeline=[], - split='train', - test_mode=False) - ]) - dataset = build_dataset(train_cfg) - results = dataset[0] - assert len(dataset) == 4 - assert dataset.dataset_sampling_probs == [0.5, 0.5] - assert len(results) == 2 diff --git a/tests/test_data/test_datasets/test_mot_challenge_dataset.py b/tests/test_data/test_datasets/test_mot_challenge_dataset.py deleted file mode 100644 index 99ddc6f34..000000000 --- a/tests/test_data/test_datasets/test_mot_challenge_dataset.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import tempfile -from collections import defaultdict -from unittest.mock import MagicMock, patch - -import mmcv -import numpy as np -import pytest - -from mmtrack.datasets import DATASETS as DATASETS -from .utils import _create_coco_gt_results - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -# This is a demo annotation file for CocoVideoDataset -# 1 videos, 2 categories ('car', 'person') -# 8 images, 2 instances -> [4, 3] objects -# 1 ignore, 2 crowd -DEMO_ANN_FILE = f'{PREFIX}/demo_cocovid_data/ann.json' -MOT_ANN_PATH = f'{PREFIX}/demo_MOT15_data/train' - - -@pytest.mark.parametrize('dataset', ['MOTChallengeDataset']) -def test_load_detections(dataset): - dataset_class = DATASETS.get(dataset) - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, - classes=('car', 'person'), - pipeline=[], - test_mode=True) - - tmp_dir = tempfile.TemporaryDirectory() - det_file = osp.join(tmp_dir.name, 'det.pkl') - outputs = _create_coco_gt_results(dataset) - - mmcv.dump(outputs['det_bboxes'], det_file) - detections = dataset.load_detections(det_file) - assert isinstance(detections, list) - assert len(detections) == 8 - - mmcv.dump(outputs, det_file) - detections = dataset.load_detections(det_file) - assert isinstance(detections, list) - assert len(detections) == 8 - dataset.detections = detections - i = np.random.randint(0, len(dataset.data_infos)) - results = dataset.prepare_results(dataset.data_infos[i]) - assert 'detections' in results - for a, b in zip(results['detections'], outputs['det_bboxes'][i]): - assert (a == b).all() - - out = dict() - for i in range(len(dataset.data_infos)): - out[dataset.data_infos[i]['file_name']] = outputs['det_bboxes'][i] - mmcv.dump(out, det_file) - detections = dataset.load_detections(det_file) - assert isinstance(detections, dict) - assert len(detections) == 8 - dataset.detections = detections - i = np.random.randint(0, len(dataset.data_infos)) - results = dataset.prepare_results(dataset.data_infos[i]) - assert 'detections' in results - for a, b in zip(results['detections'], outputs['det_bboxes'][i]): - assert (a == b).all() - - tmp_dir.cleanup() - - -@pytest.mark.parametrize('dataset', ['MOTChallengeDataset']) -def test_parse_ann_info(dataset): - dataset_class = DATASETS.get(dataset) - - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, 
classes=('car', 'person'), pipeline=[]) - - # image 1 doesn't have gt and detected objects - img_id = 1 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (0, 4) - assert ann['bboxes_ignore'].shape == (3, 4) - - # image 5 has 2 objects - img_id = 5 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (2, 4) - assert ann['bboxes_ignore'].shape == (0, 4) - - # image 8 doesn't have objects - img_id = 8 - img_info = dataset.coco.load_imgs([img_id])[0] - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(img_info, ann_info) - assert ann['bboxes'].shape == (0, 4) - assert ann['bboxes_ignore'].shape == (0, 4) - - -def test_mot15_bbox_evaluation(): - classes = ('car', 'person') - dataset_class = DATASETS.get('MOTChallengeDataset') - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, classes=classes, pipeline=[]) - results = _create_coco_gt_results(dataset) - - eval_results = dataset.evaluate(results, metric='bbox') - assert eval_results['mAP'] == 1.0 - eval_results = dataset.evaluate(results['det_bboxes'], metric='bbox') - assert eval_results['mAP'] == 1.0 - - -@patch('mmtrack.datasets.MOTChallengeDataset.load_annotations', MagicMock) -@patch('mmtrack.datasets.MOTChallengeDataset._filter_imgs', MagicMock) -@pytest.mark.parametrize('dataset', ['MOTChallengeDataset']) -def test_mot15_track_evaluation(dataset): - tmp_dir = tempfile.TemporaryDirectory() - videos = ['TUD-Campus', 'TUD-Stadtmitte'] - - dataset_class = DATASETS.get(dataset) - dataset_class.cat_ids = MagicMock() - dataset_class.coco = MagicMock() - - dataset = dataset_class( - ann_file=MagicMock(), visibility_thr=-1, pipeline=[]) - dataset.img_prefix = MOT_ANN_PATH - dataset.vid_ids = [1, 2] - vid_infos = [dict(name=_) for _ in videos] - dataset.coco.load_vids = MagicMock(return_value=vid_infos) - dataset.data_infos = [] - - def _load_results(videos): - track_bboxes, data_infos = [], [] - for video in videos: - dets = mmcv.list_from_file( - osp.join(MOT_ANN_PATH, 'results', f'{video}.txt')) - track_bbox = defaultdict(list) - for det in dets: - det = det.strip().split(',') - frame_id, ins_id = map(int, det[:2]) - bbox = list(map(float, det[2:7])) - track = [ - ins_id, bbox[0], bbox[1], bbox[0] + bbox[2], - bbox[1] + bbox[3], bbox[4] - ] - track_bbox[frame_id].append(track) - max_frame = max(track_bbox.keys()) - for i in range(1, max_frame + 1): - track_bboxes.append( - [np.array(track_bbox[i], dtype=np.float32)]) - data_infos.append(dict(frame_id=i - 1)) - return track_bboxes, data_infos - - track_bboxes, data_infos = _load_results(videos) - dataset.data_infos = data_infos - - eval_results = dataset.evaluate( - dict(track_bboxes=track_bboxes), - metric='track', - logger=None, - resfile_path=None, - track_iou_thr=0.5) - assert eval_results['IDF1'] == 0.624 - assert eval_results['IDP'] == 0.799 - assert eval_results['MOTA'] == 0.555 - assert eval_results['IDs'] == 14 - assert eval_results['HOTA'] == 0.400 - - tmp_dir.cleanup() diff --git a/tests/test_data/test_datasets/test_reid_dataset.py b/tests/test_data/test_datasets/test_reid_dataset.py deleted file mode 100644 index 5893d9dfa..000000000 --- 
a/tests/test_data/test_datasets/test_reid_dataset.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp - -import pytest -import torch - -from mmtrack.datasets import DATASETS as DATASETS - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -# This is a demo annotation file for ReIDDataset -REID_ANN_FILE = f'{PREFIX}/demo_reid_data/mot17_reid/ann.txt' - - -def _create_reid_gt_results(dataset): - results = [] - dataset_infos = dataset.load_annotations() - for dataset_info in dataset_infos: - result = torch.full((128, ), - float(dataset_info['gt_label']), - dtype=torch.float32) - results.append(result) - return results - - -@pytest.mark.parametrize('dataset', ['ReIDDataset']) -def test_reid_dataset_parse_ann_info(dataset): - dataset_class = DATASETS.get(dataset) - - dataset = dataset_class( - data_prefix='reid', ann_file=REID_ANN_FILE, pipeline=[]) - data_infos = dataset.load_annotations() - img_id = 0 - # image 0 has 21 objects - assert len([ - data_info for data_info in data_infos - if data_info['gt_label'] == img_id - ]) == 21 - img_id = 11 - # image 11 doesn't have objects - assert len([ - data_info for data_info in data_infos - if data_info['gt_label'] == img_id - ]) == 0 - - -@pytest.mark.parametrize('dataset', ['ReIDDataset']) -def test_reid_dataset_prepare_data(dataset): - dataset_class = DATASETS.get(dataset) - - num_ids = 8 - ins_per_id = 4 - dataset = dataset_class( - data_prefix='reid', - ann_file=REID_ANN_FILE, - triplet_sampler=dict(num_ids=num_ids, ins_per_id=ins_per_id), - pipeline=[], - test_mode=False) - assert len(dataset) == 704 - - results = dataset.prepare_data(0) - assert isinstance(results, list) - assert len(results) == 32 - assert 'img_info' in results[0] - assert 'gt_label' in results[0] - assert results[0].keys() == results[1].keys() - # triplet sampling - for idx in range(len(results) - 1): - if (idx + 1) % ins_per_id != 0: - assert results[idx]['gt_label'] == results[idx + 1]['gt_label'] - - -@pytest.mark.parametrize('dataset', ['ReIDDataset']) -def test_reid_evaluation(dataset): - dataset_class = DATASETS.get(dataset) - - dataset = dataset_class( - data_prefix='reid', ann_file=REID_ANN_FILE, pipeline=[]) - results = _create_reid_gt_results(dataset) - eval_results = dataset.evaluate(results, metric=['mAP', 'CMC']) - assert eval_results['mAP'] == 1 - assert eval_results['R1'] == 1 - assert eval_results['R5'] == 1 - assert eval_results['R10'] == 1 - assert eval_results['R20'] == 1 diff --git a/tests/test_data/test_datasets/test_sot_dataset.py b/tests/test_data/test_datasets/test_sot_dataset.py deleted file mode 100644 index 0141060f8..000000000 --- a/tests/test_data/test_datasets/test_sot_dataset.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os -import os.path as osp -import tempfile - -import mmcv -import numpy as np -import pytest - -from mmtrack.datasets import DATASETS as DATASETS - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -SOT_DATA_PREFIX = f'{PREFIX}/demo_sot_data' -DATASET_INFOS = dict( - GOT10kDataset=dict( - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - VOTDataset=dict( - dataset_type='vot2018', - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - OTB100Dataset=dict( - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - UAV123Dataset=dict( - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - LaSOTDataset=dict( - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - TrackingNetDataset=dict( - chunks_list=[0], - ann_file=osp.join( - SOT_DATA_PREFIX, - 'trackingnet/annotations/trackingnet_train_infos.txt'), - img_prefix=osp.join(SOT_DATA_PREFIX, 'trackingnet')), - SOTCocoDataset=dict( - ann_file=osp.join(PREFIX, 'demo_cocovid_data', 'ann.json'), - img_prefix=osp.join(PREFIX, 'demo_cocovid_data')), - SOTImageNetVIDDataset=dict( - ann_file=osp.join(PREFIX, 'demo_cocovid_data', 'ann.json'), - img_prefix=osp.join(PREFIX, 'demo_cocovid_data'))) - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'VOTDataset', 'OTB100Dataset', 'UAV123Dataset', - 'LaSOTDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset', - 'SOTCocoDataset' -]) -def test_load_data_infos(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset', - 'SOTImageNetVIDDataset', 'SOTCocoDataset' -]) -def test_get_bboxes_from_video(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - - bboxes = dataset_object.get_bboxes_from_video(0) - assert bboxes.shape[0] == dataset_object.num_frames_per_video[0] - assert bboxes.shape[1] == 4 - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset', - 'SOTImageNetVIDDataset', 'SOTCocoDataset' -]) -def test_get_visibility_from_video(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - visibility = dataset_object.get_visibility_from_video(0) - assert len(visibility['visible']) == dataset_object.num_frames_per_video[0] - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset', - 'SOTCocoDataset', 'VOTDataset', 'LaSOTDataset' -]) -def test_get_ann_infos_from_video(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - dataset_object.get_ann_infos_from_video(0) - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset', - 
'SOTCocoDataset', 'VOTDataset', 'LaSOTDataset' -]) -def test_get_img_infos_from_video(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - dataset_object.get_img_infos_from_video(0) - - -@pytest.mark.parametrize( - 'dataset', - ['GOT10kDataset', 'VOTDataset', 'LaSOTDataset', 'TrackingNetDataset']) -def test_prepare_test_data(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=True) - dataset_object.prepare_test_data(0, 1) - - -@pytest.mark.parametrize('dataset', [ - 'GOT10kDataset', 'TrackingNetDataset', 'SOTImageNetVIDDataset', - 'SOTCocoDataset', 'LaSOTDataset' -]) -def test_prepare_train_data(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=False) - dataset_object.prepare_train_data(0) - - -@pytest.mark.parametrize('dataset', ['GOT10kDataset', 'TrackingNetDataset']) -def test_format_results(dataset): - dataset_class = DATASETS.get(dataset) - - dataset_object = dataset_class( - **DATASET_INFOS[dataset], pipeline=[], split='train', test_mode=True) - - results = [] - for video_name in ['video-1', 'video-2']: - results.extend( - mmcv.list_from_file( - osp.join(SOT_DATA_PREFIX, 'trackingnet', 'TRAIN_0', video_name, - 'track_results.txt'))) - - track_bboxes = [] - for result in results: - x1, y1, x2, y2 = result.split(',') - track_bboxes.append( - np.array([float(x1), - float(y1), - float(x2), - float(y2), 0.])) - - track_results = dict(track_bboxes=track_bboxes) - - tmp_dir = tempfile.TemporaryDirectory() - dataset_object.format_results(track_results, resfile_path=tmp_dir.name) - if osp.isdir(tmp_dir.name): - tmp_dir.cleanup() - if osp.isfile(f'{tmp_dir.name}.zip'): - os.remove(f'{tmp_dir.name}.zip') - - -def test_sot_ope_evaluation(): - dataset_class = DATASETS.get('UAV123Dataset') - dataset_object = dataset_class( - **DATASET_INFOS['UAV123Dataset'], - pipeline=[], - split='test', - test_mode=True) - - dataset_object.num_frames_per_video = [25, 25] - results = [] - data_infos = [] - data_root = osp.join(SOT_DATA_PREFIX, 'trackingnet', 'TRAIN_0') - for video_name in ['video-1', 'video-2']: - bboxes = np.loadtxt( - osp.join(data_root, video_name, 'track_results.txt'), - delimiter=',') - scores = np.zeros((len(bboxes), 1)) - bboxes = np.concatenate((bboxes, scores), axis=-1) - results.extend(bboxes) - data_infos.append( - dict( - video_path=osp.join(data_root, video_name), - ann_path=osp.join(data_root, video_name, 'gt_for_eval.txt'), - start_frame_id=1, - end_frame_id=25, - framename_template='%06d.jpg')) - - dataset_object.data_infos = data_infos - track_results = dict(track_bboxes=results) - eval_results = dataset_object.evaluate(track_results, metric=['track']) - assert eval_results['success'] == 67.524 - assert eval_results['norm_precision'] == 70.0 - assert eval_results['precision'] == 50.0 - - -def test_sot_vot_evaluation(): - dataset_class = DATASETS.get('VOTDataset') - dataset_object = dataset_class( - **DATASET_INFOS['VOTDataset'], - pipeline=[], - split='test', - test_mode=True) - - dataset_object.num_frames_per_video = [25, 25] - data_infos = [] - results = [] - vot_root = osp.join(SOT_DATA_PREFIX, 'trackingnet', 'TRAIN_0') - for video_name in ['video-1', 'video-2']: - results.extend( - mmcv.list_from_file( - osp.join(vot_root, video_name, 'vot2018_track_results.txt'))) - 
data_infos.append( - dict( - video_path=osp.join(vot_root, video_name), - ann_path=osp.join(vot_root, video_name, - 'vot2018_gt_for_eval.txt'), - start_frame_id=1, - end_frame_id=25, - framename_template='%08d.jpg')) - dataset_object.data_infos = data_infos - - track_bboxes = [] - for result in results: - result = result.split(',') - if len(result) == 1: - track_bboxes.append(np.array([float(result[0]), 0.])) - else: - track_bboxes.append( - np.array([ - float(result[0]), - float(result[1]), - float(result[2]), - float(result[3]), 0. - ])) - - track_bboxes = dict(track_bboxes=track_bboxes) - eval_results = dataset_object.evaluate( - track_bboxes, interval=[1, 3], metric=['track']) - assert abs(eval_results['eao'] - 0.6661) < 0.0001 - assert round(eval_results['accuracy'], 4) == 0.5826 - assert round(eval_results['robustness'], 4) == 6.0 diff --git a/tests/test_data/test_datasets/test_sot_train_dataset.py b/tests/test_data/test_datasets/test_sot_train_dataset.py deleted file mode 100644 index a97e6f010..000000000 --- a/tests/test_data/test_datasets/test_sot_train_dataset.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp - -import pytest - -from mmtrack.datasets import DATASETS as DATASETS - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -# This is a demo annotation file for CocoVideoDataset -# 1 videos, 2 categories ('car', 'person') -# 8 images, 2 instances -> [4, 3] objects -# 1 ignore, 2 crowd -DEMO_ANN_FILE = f'{PREFIX}/demo_cocovid_data/ann.json' - - -@pytest.mark.parametrize('dataset', ['SOTTrainDataset']) -def test_sot_train_dataset_parse_ann_info(dataset): - dataset_class = DATASETS.get(dataset) - - dataset = dataset_class(ann_file=DEMO_ANN_FILE, pipeline=[]) - - # image 5 has 2 objects, we only load the object with instance_id = 1 - img_id = 5 - instance_id = 1 - ann_ids = dataset.coco.get_ann_ids([img_id]) - ann_info = dataset.coco.loadAnns(ann_ids) - ann = dataset._parse_ann_info(instance_id, ann_info) - assert ann['bboxes'].shape == (1, 4) - assert ann['labels'].shape == (1, ) and ann['labels'][0] == 0 - - -@pytest.mark.parametrize('dataset', ['SOTTrainDataset']) -def test_sot_train_dataset_prepare_data(dataset): - dataset_class = DATASETS.get(dataset) - - # train - dataset = dataset_class( - ann_file=DEMO_ANN_FILE, - ref_img_sampler=dict( - frame_range=100, - pos_prob=0.8, - filter_key_img=False, - return_key_img=True), - pipeline=[], - test_mode=False) - assert len(dataset) == 1 - - results = dataset.prepare_train_img(0) - assert isinstance(results, list) - assert len(results) == 2 - assert 'ann_info' in results[0] - assert results[0].keys() == results[1].keys() diff --git a/tests/test_data/test_datasets/test_tao_dataset.py b/tests/test_data/test_datasets/test_tao_dataset.py deleted file mode 100644 index 006fd1725..000000000 --- a/tests/test_data/test_datasets/test_tao_dataset.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os.path as osp - -from mmtrack.datasets import DATASETS as DATASETS -from .utils import _create_coco_gt_results - -PREFIX = osp.join(osp.dirname(__file__), '../../data') -DEMO_ANN_FILE = f'{PREFIX}/demo_tao_data/ann.json' -DEMO_TAO_DATA = f'{PREFIX}/demo_tao_data/' - - -def test_load_annotation(): - dataset_class = DATASETS.get('TaoDataset') - dataset_object = dataset_class( - ann_file=DEMO_ANN_FILE, classes=['serving_dish', 'baby'], pipeline=[]) - - dataset_object.load_as_video = True - data_infos = dataset_object.load_lvis_anns(DEMO_ANN_FILE) - assert isinstance(data_infos, list) - assert len(data_infos) == 2 - - dataset_object.load_as_video = False - data_infos = dataset_object.load_tao_anns(DEMO_ANN_FILE) - assert isinstance(data_infos, list) - assert len(data_infos) == 2 - assert len(dataset_object.vid_ids) == 1 - - -def test_tao_evaluation(): - dataset_class = DATASETS.get('TaoDataset') - dataset_object = dataset_class( - ann_file=DEMO_ANN_FILE, classes=['serving_dish', 'baby'], pipeline=[]) - results = _create_coco_gt_results(dataset_object) - eval_results = dataset_object.evaluate(results, metric=['track', 'bbox']) - assert eval_results['bbox_AP'] == 1 - assert eval_results['track_AP'] == 1 diff --git a/tests/test_data/test_datasets/test_vis_dataset.py b/tests/test_data/test_datasets/test_vis_dataset.py deleted file mode 100644 index 3e6deaa35..000000000 --- a/tests/test_data/test_datasets/test_vis_dataset.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from collections import defaultdict - -import mmcv -import numpy as np - -from mmtrack.datasets import DATASETS as DATASETS - -PREFIX = osp.join(osp.dirname(__file__), '../../data') - -DEMO_ANN_FILE = f'{PREFIX}/demo_vis_data/ann.json' -DEMO_RES_FILE = f'{PREFIX}/demo_vis_data/results.json' - - -def test_vis_evaluation(): - dataset_class = DATASETS.get('YouTubeVISDataset') - dataset_object = dataset_class( - '2019', ann_file=DEMO_ANN_FILE, pipeline=[], test_mode=True) - results_json = mmcv.load(DEMO_RES_FILE) - - results = defaultdict(list) - track_bboxes_numpy = [] - for frame_bboxes in results_json['track_bboxes']: - tmp = [] - for bbox in frame_bboxes: - tmp.append(np.array(bbox).reshape(-1, 6)) - track_bboxes_numpy.append(tmp) - results['track_bboxes'] = track_bboxes_numpy - results['track_masks'] = results_json['track_masks'] - - eval_results = dataset_object.evaluate(results, metric=['track_segm']) - assert eval_results['segm_mAP_50'] == 1.0 - assert eval_results['segm_mAP'] == 1.0 diff --git a/tests/test_data/test_datasets/utils.py b/tests/test_data/test_datasets/utils.py deleted file mode 100644 index 75bb9c4c8..000000000 --- a/tests/test_data/test_datasets/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from collections import defaultdict - -import numpy as np - - -def _create_coco_gt_results(dataset): - from mmtrack.core import outs2results - - results = defaultdict(list) - for img_info in dataset.data_infos: - ann = dataset.get_ann_info(img_info) - scores = np.ones((ann['bboxes'].shape[0], 1), dtype=np.float) - bboxes = np.concatenate((ann['bboxes'], scores), axis=1) - det_results = outs2results( - bboxes=bboxes, - labels=ann['labels'], - num_classes=len(dataset.CLASSES)) - track_results = outs2results( - bboxes=bboxes, - labels=ann['labels'], - ids=ann['instance_ids'].astype(np.int), - num_classes=len(dataset.CLASSES)) - results['det_bboxes'].append(det_results['bbox_results']) - results['track_bboxes'].append(track_results['bbox_results']) - return results diff --git a/tests/test_data/test_pipelines/test_formatting.py b/tests/test_data/test_pipelines/test_formatting.py deleted file mode 100644 index 48bb955d4..000000000 --- a/tests/test_data/test_pipelines/test_formatting.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os.path as osp - -import numpy as np -import pytest -from mmcv.utils import build_from_cfg - -from mmtrack.datasets import PIPELINES - - -class TestFormatting(object): - - @classmethod - def setup_class(cls): - cls.data_prefix = osp.join(osp.dirname(__file__), '../../data') - - def test_formatting(self): - img_names = ['image_1.jpg', 'image_2.jpg', 'image_3.jpg'] - collect_keys = ['img', 'gt_bboxes', 'gt_label'] - num_objects = 4 - num_ref_imgs = len(img_names) - 1 - - results = [ - dict(img_prefix=self.data_prefix, img_info=dict(filename=name)) - for name in img_names - ] - - load = dict(type='LoadMultiImagesFromFile') - load = build_from_cfg(load, PIPELINES) - results = load(results) - assert len(results) == len(img_names) - - for _result in results: - _result['padding_mask'] = np.ones_like(_result['img'], dtype=bool) - check_data_validity = dict(type='CheckPadMaskValidity', stride=16) - check_data_validity = build_from_cfg(check_data_validity, PIPELINES) - assert results is not None - - for result in results: - result['gt_bboxes'] = np.random.randn(num_objects, 4) - result['gt_label'] = np.random.randint(0, 10) - - collect = dict(type='VideoCollect', keys=collect_keys) - collect = build_from_cfg(collect, PIPELINES) - results = collect(results) - assert len(results) == len(img_names) - for key in collect_keys: - assert key in results[0] - assert key in results[1] - assert key in results[2] - assert 'img_metas' in results[0] - assert 'img_metas' in results[1] - assert 'img_metas' in results[2] - key_results = results[0] - - # the type of results is a list - # the length of results is greater than 1 - reid_results = copy.deepcopy(results) - bundle = dict(type='ReIDFormatBundle') - bundle = build_from_cfg(bundle, PIPELINES) - reid_results = bundle(reid_results) - assert isinstance(reid_results, dict) - assert 'img' in reid_results - assert not reid_results['img'].cpu_only - assert reid_results['img'].stack - assert reid_results['img'].data.ndim == 4 - assert reid_results['img'].data.size(0) == 3 - assert 'gt_label' in reid_results - assert not reid_results['gt_label'].cpu_only - assert reid_results['gt_label'].stack - assert reid_results['gt_label'].data.ndim == 1 - assert reid_results['img'].data.size(0) == 3 - - # the type of results is a dict - reid_results = copy.deepcopy(results[0]) - reid_results = bundle(reid_results) - assert isinstance(reid_results, dict) - assert 'img' in reid_results - assert not 
reid_results['img'].cpu_only - assert reid_results['img'].stack - assert reid_results['img'].data.ndim == 3 - assert 'gt_label' in reid_results - assert not reid_results['gt_label'].cpu_only - assert reid_results['gt_label'].stack - assert reid_results['gt_label'].data.ndim == 1 - - # the type of results is a tuple - with pytest.raises(TypeError): - reid_results = (copy.deepcopy(results[0]), ) - reid_results = bundle(reid_results) - - # the type of results is a list but it only has one item - with pytest.raises(AssertionError): - reid_results = [copy.deepcopy(results[0])] - reid_results = bundle(reid_results) - - concat2twoparts = dict(type='ConcatSameTypeFrames', num_key_frames=2) - concat2twoparts = build_from_cfg(concat2twoparts, PIPELINES) - concat_video_results = concat2twoparts(copy.deepcopy(results)) - assert len(concat_video_results) == 2 - assert concat_video_results[0]['img'].ndim == 4 - assert concat_video_results[0]['img'].shape[3] == 2 - assert len(concat_video_results[0]['img_metas']) == 2 - assert concat_video_results[0]['gt_bboxes'].ndim == 2 - assert concat_video_results[0]['gt_bboxes'].shape[1] == 5 - assert concat_video_results[0]['gt_bboxes'].shape[0] == ( - num_ref_imgs * num_objects) - - concat_ref = dict(type='ConcatVideoReferences') - concat_ref = build_from_cfg(concat_ref, PIPELINES) - results = concat_ref(results) - assert len(results) == 2 - assert results[0] == key_results - assert results[1]['img'].ndim == 4 - assert results[1]['img'].shape[3] == 2 - assert len(results[1]['img_metas']) == 2 - assert results[1]['gt_bboxes'].ndim == 2 - assert results[1]['gt_bboxes'].shape[1] == 5 - assert results[1]['gt_bboxes'].shape[0] == (num_ref_imgs * num_objects) - - ref_prefix = 'ref' - bundle = dict(type='SeqDefaultFormatBundle', ref_prefix=ref_prefix) - bundle = build_from_cfg(bundle, PIPELINES) - results = bundle(results) - for key in results: - if ref_prefix not in key: - assert f'{ref_prefix}_{key}' in results diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py deleted file mode 100644 index 91653c4e6..000000000 --- a/tests/test_data/test_pipelines/test_loading.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import copy -import os.path as osp - -import numpy as np - -from mmtrack.datasets import PIPELINES - - -class TestLoading(object): - - @classmethod - def setup_class(cls): - cls.data_prefix = osp.join(osp.dirname(__file__), '../../data') - - def test_load_seq_imgs(self): - img_names = ['image_1.jpg', 'image_2.jpg', 'image_3.jpg'] - results = [ - dict(img_prefix=self.data_prefix, img_info=dict(filename=name)) - for name in img_names - ] - load = PIPELINES.get('LoadMultiImagesFromFile')() - all_results = load(copy.deepcopy(results)) - assert isinstance(all_results, list) - for i, results in enumerate(all_results): - assert results['filename'] == osp.join(self.data_prefix, - img_names[i]) - assert results['ori_filename'] == img_names[i] - assert results['img'].shape == (256, 512, 3) - assert results['img'].dtype == np.uint8 - assert results['img_shape'] == (256, 512, 3) - assert results['ori_shape'] == (256, 512, 3) - - def test_load_detections(self): - results = dict() - results['bbox_fields'] = [] - results['detections'] = [np.random.randn(4, 5), np.random.randn(3, 5)] - load = PIPELINES.get('LoadDetections')() - results = load(results) - assert 'public_bboxes' in results - assert 'public_scores' in results - assert 'public_labels' in results - assert results['public_bboxes'].shape == (7, 4) - assert results['public_scores'].shape == (7, ) - assert results['public_labels'].shape == (7, ) - assert 'public_bboxes' in results['bbox_fields'] diff --git a/tests/test_data/test_pipelines/test_processing.py b/tests/test_data/test_pipelines/test_processing.py deleted file mode 100644 index 18d37c0f4..000000000 --- a/tests/test_data/test_pipelines/test_processing.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.utils import build_from_cfg - -from mmtrack.datasets import PIPELINES - - -def test_trident_sampling(): - process = dict( - type='TridentSampling', - num_search_frames=1, - num_template_frames=2, - max_frame_range=[200], - cls_pos_prob=0.5, - train_cls_head=True) - process = build_from_cfg(process, PIPELINES) - - num_frames = 60 - pair_video_infos = [] - - filename = ['{:08d}.jpg'.format(i) for i in range(num_frames)] - frame_ids = np.arange(num_frames) - bboxes = np.ones((num_frames, 4)) - for video_id in range(2): - bboxes_isvalid = np.ones(num_frames, dtype=bool) - random_invalid_index = np.random.randint(0, num_frames, 4) - bboxes_isvalid[random_invalid_index] = False - visible = bboxes_isvalid.copy() - random_invalid_index = np.random.randint(0, num_frames, 4) - visible[random_invalid_index] = False - video_info = dict( - bboxes=bboxes, - bboxes_isvalid=bboxes_isvalid, - visible=visible, - filename=filename, - frame_ids=frame_ids, - video_id=video_id) - pair_video_infos.append(video_info) - - outs = process(pair_video_infos) - if outs is not None: - for out in outs: - assert 0 <= out['img_info']['frame_id'] < num_frames - assert 'labels' in out['ann_info'] - assert (out['ann_info']['bboxes'] == np.ones((1, 4))).all() - - -def test_pair_sampling(): - process = dict( - type='PairSampling', - frame_range=5, - pos_prob=0.8, - filter_template_img=False) - process = build_from_cfg(process, PIPELINES) - - num_frames = 60 - pair_video_infos = [] - - filename = ['{:08d}.jpg'.format(i) for i in range(num_frames)] - frame_ids = np.arange(num_frames) - bboxes = np.ones((num_frames, 4)) - for video_id in range(2): - bboxes_isvalid = np.ones(num_frames, dtype=bool) - visible = bboxes_isvalid.copy() - video_info = dict( - bboxes=bboxes, 
- bboxes_isvalid=bboxes_isvalid, - visible=visible, - filename=filename, - frame_ids=frame_ids, - video_id=video_id) - pair_video_infos.append(video_info) - - outs = process(pair_video_infos) - if outs is not None: - for out in outs: - assert 0 <= out['img_info']['frame_id'] < num_frames - assert 'is_positive_pairs' in out - assert (out['ann_info']['bboxes'] == np.ones((1, 4))).all() - - -def test_match_instances(): - process = dict(type='MatchInstances', skip_nomatch=True) - process = build_from_cfg(process, PIPELINES) - - results = [ - dict(gt_instance_ids=np.array([0, 1, 2, 3, 4])), - dict(gt_instance_ids=np.array([2, 3, 4, 6])) - ] - outs = process(results) - assert (outs[0]['gt_match_indices'] == np.array([-1, -1, 0, 1, 2])).all() - assert (outs[1]['gt_match_indices'] == np.array([2, 3, 4, -1])).all() - - results = [ - dict(gt_instance_ids=np.array([0, 1, 2])), - dict(gt_instance_ids=np.array([3, 4, 6, 7])) - ] - outs = process(results) - assert outs is None - - process.skip_nomatch = False - outs = process(results) - assert (outs[0]['gt_match_indices'] == -1).all() - assert (outs[1]['gt_match_indices'] == -1).all() diff --git a/tests/test_data/test_pipelines/test_transform.py b/tests/test_data/test_pipelines/test_transform.py deleted file mode 100644 index aba027ab9..000000000 --- a/tests/test_data/test_pipelines/test_transform.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os.path as osp - -import numpy as np -import pytest -from mmcv.utils import build_from_cfg -from mmdet.core.bbox.demodata import random_boxes - -from mmtrack.datasets import PIPELINES - - -class TestTransforms(object): - - @classmethod - def setup_class(cls): - cls.data_prefix = osp.join(osp.dirname(__file__), '../../data') - - img_names = ['image_1.jpg', 'image_2.jpg'] - results = [ - dict(img_prefix=cls.data_prefix, img_info=dict(filename=name)) - for name in img_names - ] - load = build_from_cfg( - dict(type='LoadMultiImagesFromFile', to_float32=True), PIPELINES) - cls.results = load(results) - - def test_seq_crop_like_siamfc(self): - results = copy.deepcopy(self.results) - for res in results: - res['gt_bboxes'] = random_boxes(1, 256) - res['bbox_fields'] = ['gt_bboxes'] - - transform = dict( - type='SeqCropLikeSiamFC', - context_amount=0.5, - exemplar_size=127, - crop_size=511) - seq_crop_like_siamfc = build_from_cfg(transform, PIPELINES) - - results = seq_crop_like_siamfc(results) - assert results[0]['img'].shape == (511, 511, 3) - assert results[1]['img'].shape == (511, 511, 3) - - def test_seq_bbox_jitter(self): - results = copy.deepcopy(self.results) - for res in results: - res['gt_bboxes'] = random_boxes(1, 256) - res['bbox_fields'] = ['gt_bboxes'] - - transform = dict( - type='SeqBboxJitter', - center_jitter_factor=[0, 4.5], - scale_jitter_factor=[0, 0.5], - crop_size_factor=[2, 5]) - seq_bbox_jitter = build_from_cfg(transform, PIPELINES) - results = seq_bbox_jitter(results) - assert results[0]['jittered_bboxes'].shape == (1, 4) - assert results[1]['jittered_bboxes'].shape == (1, 4) - - def test_seq_crop_like_stark(self): - results = copy.deepcopy(self.results) - for res in results: - res['gt_bboxes'] = random_boxes(1, 256) - res['jittered_bboxes'] = np.array([[ - res['gt_bboxes'][0][0] - 1, res['gt_bboxes'][0][1] + 2, - res['gt_bboxes'][0][2] + 2, res['gt_bboxes'][0][3] + 3 - ]]) - res['bbox_fields'] = ['gt_bboxes'] - - transform = dict( - type='SeqCropLikeStark', - crop_size_factor=[2, 5], - output_size=[128, 320]) - seq_crop_like_stark = 
build_from_cfg(transform, PIPELINES) - results = seq_crop_like_stark(results) - assert results[0]['img'].shape == (128, 128, 3) - assert results[1]['img'].shape == (320, 320, 3) - - def test_seq_brightness_aug(self): - results = copy.deepcopy(self.results) - imgs_shape = [result['img'].shape for result in results] - - transform = dict(type='SeqBrightnessAug', jitter_range=0.2) - seq_brightness_aug = build_from_cfg(transform, PIPELINES) - - results = seq_brightness_aug(results) - assert results[0]['img'].shape == imgs_shape[0] - assert results[1]['img'].shape == imgs_shape[1] - - def test_seq_gray_aug(self): - results = copy.deepcopy(self.results) - imgs_shape = [result['img'].shape for result in results] - - transform = dict(type='SeqGrayAug', prob=0.2) - seq_gray_aug = build_from_cfg(transform, PIPELINES) - - results = seq_gray_aug(results) - assert results[0]['img'].shape == imgs_shape[0] - assert results[1]['img'].shape == imgs_shape[1] - - def test_seq_shift_scale_aug(self): - results = copy.deepcopy(self.results) - for res in results: - res['gt_bboxes'] = random_boxes(1, 256).numpy() - res['bbox_fields'] = ['gt_bboxes'] - - transform = dict( - type='SeqShiftScaleAug', - target_size=[127, 255], - shift=[4, 64], - scale=[0.05, 0.18]) - seq_shift_scale_aug = build_from_cfg(transform, PIPELINES) - - results = seq_shift_scale_aug(results) - assert results[0]['img'].shape == (127, 127, 3) - assert results[1]['img'].shape == (255, 255, 3) - - def test_seq_color_aug(self): - results = copy.deepcopy(self.results) - imgs_shape = [result['img'].shape for result in results] - - transform = dict( - type='SeqColorAug', - prob=[1.0, 1.0], - rgb_var=[[-0.55919361, 0.98062831, -0.41940627], - [1.72091413, 0.19879334, -1.82968581], - [4.64467907, 4.73710203, 4.88324118]]) - seq_color_aug = build_from_cfg(transform, PIPELINES) - - results = seq_color_aug(results) - assert results[0]['img'].shape == imgs_shape[0] - assert results[1]['img'].shape == imgs_shape[1] - - def test_seq_blur_aug(self): - results = copy.deepcopy(self.results) - imgs_shape = [result['img'].shape for result in results] - - transform = dict(type='SeqBlurAug', prob=[0.0, 0.2]) - seq_blur_aug = build_from_cfg(transform, PIPELINES) - - results = seq_blur_aug(results) - assert results[0]['img'].shape == imgs_shape[0] - assert results[1]['img'].shape == imgs_shape[1] - - def test_seq_resize(self): - results = copy.deepcopy(self.results) - transform = dict( - type='SeqResize', img_scale=(512, 1024), keep_ratio=True) - seq_resize = build_from_cfg(transform, PIPELINES) - - results = seq_resize(results) - assert results[0]['img'].shape == (512, 1024, 3) - assert results[1]['img'].shape == (512, 1024, 3) - - def test_seq_flip(self): - - transform = dict( - type='SeqRandomFlip', share_params=True, flip_ratio=0.5) - flip_module = build_from_cfg(transform, PIPELINES) - - for i in range(8): - results = copy.deepcopy(self.results) - results = flip_module(results) - assert results[0]['flip'] == results[1]['flip'] - assert results[0]['flip_direction'] == results[1]['flip_direction'] - - cases = [False, False] - transform = dict( - type='SeqRandomFlip', share_params=False, flip_ratio=0.5) - flip_module = build_from_cfg(transform, PIPELINES) - for i in range(20): - results = copy.deepcopy(self.results) - results = flip_module(results) - if results[0]['flip'] == results[1]['flip']: - cases[0] = True - else: - cases[1] = True - assert cases[0] is True - assert cases[1] is True - - def test_seq_pad(self): - results = copy.deepcopy(self.results) - - 
transform = dict(type='SeqPad', size_divisor=32) - transform = build_from_cfg(transform, PIPELINES) - results = transform(results) - - for result in results: - img_shape = result['img'].shape - assert img_shape[0] % 32 == 0 - assert img_shape[1] % 32 == 0 - - resize_transform = dict( - type='SeqResize', img_scale=(1333, 800), keep_ratio=True) - resize_module = build_from_cfg(resize_transform, PIPELINES) - results = resize_module(results) - results = transform(results) - for result in results: - img_shape = result['img'].shape - assert img_shape[0] % 32 == 0 - assert img_shape[1] % 32 == 0 - - def test_seq_normalize(self): - results = copy.deepcopy(self.results) - img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True) - transform = dict(type='SeqNormalize', **img_norm_cfg) - transform = build_from_cfg(transform, PIPELINES) - results = transform(results) - - mean = np.array(img_norm_cfg['mean']) - std = np.array(img_norm_cfg['std']) - for i, result in enumerate(results): - converted_img = (self.results[i]['img'][..., ::-1] - mean) / std - assert np.allclose(result['img'], converted_img) - - def test_seq_random_crop(self): - # test assertion for invalid random crop - with pytest.raises(AssertionError): - transform = dict( - type='SeqRandomCrop', crop_size=(-1, 0), share_params=False) - build_from_cfg(transform, PIPELINES) - - crop_size = (256, 384) - transform = dict( - type='SeqRandomCrop', crop_size=crop_size, share_params=False) - crop_module = build_from_cfg(transform, PIPELINES) - - results = copy.deepcopy(self.results) - for res in results: - res['gt_bboxes'] = random_boxes(8, 256) - res['gt_labels'] = np.random.randint(8) - res['gt_instance_ids'] = np.random.randint(8) - res['gt_bboxes_ignore'] = random_boxes(2, 256) - - outs = crop_module(results) - assert len(outs) == len(results) - for res in results: - assert res['img'].shape[:2] == crop_size - # All bboxes should be reserved after crop - assert res['img_shape'][:2] == crop_size - assert res['gt_bboxes'].shape[0] == 8 - assert res['gt_bboxes_ignore'].shape[0] == 2 - assert outs[0]['img_info']['crop_offsets'] != outs[1]['img_info'][ - 'crop_offsets'] - - crop_module.share_params = True - outs = crop_module(results) - assert outs[0]['img_info']['crop_offsets'] == outs[1]['img_info'][ - 'crop_offsets'] - - def test_seq_color_jitter(self): - results = self.results.copy() - transform = dict(type='SeqPhotoMetricDistortion', share_params=False) - transform = build_from_cfg(transform, PIPELINES) - - outs = transform(results) - assert outs[0]['img_info']['color_jitter'] != outs[1]['img_info'][ - 'color_jitter'] - - transform.share_params = True - outs = transform(results) - assert outs[0]['img_info']['color_jitter'] == outs[1]['img_info'][ - 'color_jitter'] diff --git a/tests/test_datasets/test_base_video_dataset.py b/tests/test_datasets/test_base_video_dataset.py new file mode 100644 index 000000000..3a1b18377 --- /dev/null +++ b/tests/test_datasets/test_base_video_dataset.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
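The deleted SeqRandomCrop and SeqPhotoMetricDistortion tests above hinge on the share_params flag: with share_params=True every frame of a clip reuses the same random parameters, otherwise each frame draws its own. A minimal sketch of that idea, assuming a made-up sample_crop_offsets helper rather than the actual mmtrack transform:

import numpy as np

def sample_crop_offsets(img_shapes, crop_size, share_params):
    """Return one (y, x) crop offset per frame of a clip."""

    def _sample(h, w):
        return (np.random.randint(0, max(h - crop_size[0], 1)),
                np.random.randint(0, max(w - crop_size[1], 1)))

    if share_params:
        # draw once and reuse the same offset for every frame
        offset = _sample(*img_shapes[0][:2])
        return [offset] * len(img_shapes)
    # otherwise each frame gets an independent draw
    return [_sample(h, w) for h, w, _ in img_shapes]

# two 512x512 frames with shared offsets -> both entries identical
offsets = sample_crop_offsets([(512, 512, 3)] * 2, (256, 384), True)
assert offsets[0] == offsets[1]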
+import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import BaseVideoDataset + +PREFIX = osp.join(osp.dirname(__file__), '../data') +# This is a demo annotation file for CocoVideoDataset +# 1 videos, 2 categories ('car', 'person') +# 8 images, 2 instances -> [4, 3] objects +# 1 ignore, 2 crowd +DEMO_ANN_FILE_VID = f'{PREFIX}/demo_cocovid_data/ann_vid.json' +DEMO_ANN_FILE_IMG = f'{PREFIX}/demo_cocovid_data/ann_img.json' + + +class TestBasevideoDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.metainfo = dict(CLASSES=('car', )) + cls.ref_img_sampler = dict( + num_ref_imgs=2, + frame_range=4, + filter_key_img=True, + method='bilateral_uniform') + cls.dataset_video = BaseVideoDataset( + ann_file=DEMO_ANN_FILE_VID, + metainfo=cls.metainfo, + load_as_video=True, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + cls.dataset_image = BaseVideoDataset( + ann_file=DEMO_ANN_FILE_IMG, + metainfo=cls.metainfo, + load_as_video=False, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + + def test_get_data_info(self): + # test load_as_video=True + for i in range(len(self.dataset_video)): + data_info = self.dataset_video.get_data_info(i) + assert len(data_info['instances']) > 0 + + # test load_as_video=False + for i in range(len(self.dataset_image)): + data_info = self.dataset_image.get_data_info(i) + assert len(data_info['instances']) > 0 + + def test_len(self): + assert len(self.dataset_video) == 5 + assert len(self.dataset_image) == 5 + + def test_getitem(self): + # test load_as_video=True + for i in range(1, len(self.dataset_video) - 1): + results = self.dataset_video[i] + assert isinstance(results, dict) + assert len(results['frame_id']) == 3 + assert abs(results['frame_id'][1] - results['frame_id'][0] + ) <= self.ref_img_sampler['frame_range'] + assert abs(results['frame_id'][2] - results['frame_id'][0] + ) <= self.ref_img_sampler['frame_range'] + + # test load_as_video=False + for i in range(1, len(self.dataset_image) - 1): + results = self.dataset_image[i] + assert isinstance(results, dict) + assert len(results['img_id']) == 3 + assert len(set(results['img_id'])) == 1, \ + 'all `img_id`s in the same item must be the same.' diff --git a/tests/test_datasets/test_dataset_wrapper.py b/tests/test_datasets/test_dataset_wrapper.py new file mode 100644 index 000000000..e204468ca --- /dev/null +++ b/tests/test_datasets/test_dataset_wrapper.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
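The BaseVideoDataset test above only asserts that each reference frame stays within frame_range of the key frame. For intuition, a minimal sketch of what a 'bilateral_uniform' reference sampler could look like; sample_ref_frame_ids is a made-up helper, not the mmtrack implementation:

import random

def sample_ref_frame_ids(key_id, num_frames, frame_range, num_ref_imgs=2):
    """Draw reference frame ids uniformly from both sides of the key frame."""
    left = list(range(max(0, key_id - frame_range), key_id))
    right = list(range(key_id + 1, min(num_frames, key_id + frame_range + 1)))
    ref_ids = []
    # alternate sides so both directions are represented when possible
    for pool in (left, right) * ((num_ref_imgs + 1) // 2):
        if pool and len(ref_ids) < num_ref_imgs:
            ref_ids.append(random.choice(pool))
    return ref_ids

refs = sample_ref_frame_ids(key_id=4, num_frames=8, frame_range=4)
assert all(abs(r - 4) <= 4 for r in refs)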
+import os.path as osp +from unittest import TestCase + +from mmtrack.registry import DATASETS +from mmtrack.utils import register_all_modules + +PREFIX = osp.join(osp.dirname(__file__), '../data/demo_sot_data/') + + +class TestRandomSampleConcatDataset(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + train_cfg = dict( + type='RandomSampleConcatDataset', + dataset_sampling_weights=[1, 1], + datasets=[ + dict( + type='GOT10kDataset', + data_root=PREFIX, + ann_file= # noqa: E251 + 'trackingnet/annotations/trackingnet_train_infos.txt', # noqa: E501 + data_prefix=dict(img_path='trackingnet'), + pipeline=[], + test_mode=False), + dict( + type='TrackingNetDataset', + chunks_list=[0], + data_root=PREFIX, + ann_file= # noqa: E251 + 'trackingnet/annotations/trackingnet_train_infos.txt', # noqa: E501 + data_prefix=dict(img_path='trackingnet'), + pipeline=[], + test_mode=False) + ]) + + cls.dataset = DATASETS.build(train_cfg) + + def test_get_item(self): + results = self.dataset[0] + assert len(self.dataset) == 4 + assert self.dataset.dataset_sampling_probs == [0.5, 0.5] + assert len(results) == 2 diff --git a/tests/test_datasets/test_imagenet_vid_dataset.py b/tests/test_datasets/test_imagenet_vid_dataset.py new file mode 100644 index 000000000..32edf4c26 --- /dev/null +++ b/tests/test_datasets/test_imagenet_vid_dataset.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import ImagenetVIDDataset + +PREFIX = osp.join(osp.dirname(__file__), '../data') +# This is a demo annotation file for CocoDataset +# 1 videos, 2 categories ('bus', 'car') +# 3 images, 6 instances +DEMO_ANN_FILE_IMG = f'{PREFIX}/demo_imagenetvid_data/ann_img.json' +DEMO_ANN_FILE_VID = f'{PREFIX}/demo_imagenetvid_data/ann_vid.json' + + +class TestImagenetVIDDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.metainfo = dict(CLASSES=('bus', 'car')) + cls.ref_img_sampler = dict( + num_ref_imgs=2, + frame_range=4, + filter_key_img=False, + method='bilateral_uniform') + cls.dataset_video = ImagenetVIDDataset( + ann_file=DEMO_ANN_FILE_VID, + metainfo=cls.metainfo, + load_as_video=True, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + cls.dataset_image = ImagenetVIDDataset( + ann_file=DEMO_ANN_FILE_IMG, + metainfo=cls.metainfo, + load_as_video=False, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + + def test_load_data_list(self): + data_list, valid_data_indices = self.dataset_image.load_data_list() + assert len(data_list) == 3 + assert valid_data_indices == [0, 1, 2] + assert len(self.dataset_image) == 2 + + data_list, valid_data_indices = self.dataset_video.load_data_list() + assert len(data_list) == 8 + assert valid_data_indices == [0, 1, 2, 3, 4, 5, 6, 7] + assert len(self.dataset_video) == 7 diff --git a/tests/test_datasets/test_mot_challenge_dataset.py b/tests/test_datasets/test_mot_challenge_dataset.py new file mode 100644 index 000000000..80e1403e6 --- /dev/null +++ b/tests/test_datasets/test_mot_challenge_dataset.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
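The RandomSampleConcatDataset test above expects dataset_sampling_weights=[1, 1] to be normalized into dataset_sampling_probs=[0.5, 0.5]. A minimal sketch of that normalization and the resulting weighted sub-dataset choice, assuming the wrapper behaves like ordinary weighted sampling (normalize_weights and sample_dataset_index are illustrative helpers):

import random

def normalize_weights(weights):
    """Turn raw sampling weights into probabilities that sum to 1."""
    total = float(sum(weights))
    return [w / total for w in weights]

def sample_dataset_index(weights):
    """Pick which sub-dataset the next sample comes from."""
    probs = normalize_weights(weights)
    return random.choices(range(len(probs)), weights=probs, k=1)[0]

assert normalize_weights([1, 1]) == [0.5, 0.5]
assert sample_dataset_index([1, 1]) in (0, 1)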
+import os.path as osp
+from unittest import TestCase
+
+from mmtrack.datasets import MOTChallengeDataset
+from mmtrack.datasets.api_wrappers import CocoVID
+
+PREFIX = osp.join(osp.dirname(__file__), '../data')
+# This is a demo annotation file for MOTChallengeDataset
+# 1 video, 1 category ('pedestrian')
+# 3 images, 3 instances
+# 0 ignore, 1 crowd
+DEMO_ANN_FILE = f'{PREFIX}/demo_mot_data/ann.json'
+
+
+class TestMOTChallengeDataset(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.metainfo = dict(CLASSES=('pedestrian', ))
+        cls.ref_img_sampler = dict(
+            num_ref_imgs=1,
+            frame_range=2,
+            filter_key_img=True,
+            method='uniform')
+        cls.dataset = MOTChallengeDataset(
+            ann_file=DEMO_ANN_FILE,
+            metainfo=cls.metainfo,
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            ref_img_sampler=cls.ref_img_sampler)
+
+    def test_parse_data_info(self):
+        coco = CocoVID(self.dataset.ann_file)
+
+        img_ids = coco.get_img_ids_from_vid(vidId=1)
+        for img_id in img_ids:
+            # load img info
+            raw_img_info = coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+            raw_img_info['video_length'] = len(img_ids)
+
+            # load ann info
+            ann_ids = coco.get_ann_ids(img_ids=[img_id], cat_ids=1)
+            raw_ann_info = coco.load_anns(ann_ids)
+
+            # get data_info
+            parsed_data_info = self.dataset.parse_data_info(
+                dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info))
+            if img_id == 1:
+                assert len(parsed_data_info['instances']) == 3
+                assert parsed_data_info['instances'][0]['instance_id'] == 0
+            else:
+                assert len(parsed_data_info['instances']) == 0
diff --git a/tests/test_datasets/test_reid_dataset.py b/tests/test_datasets/test_reid_dataset.py
new file mode 100644
index 000000000..a196faf9f
--- /dev/null
+++ b/tests/test_datasets/test_reid_dataset.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from unittest import TestCase
+
+from mmtrack.datasets import ReIDDataset
+
+PREFIX = osp.join(osp.dirname(__file__), '../data')
+# This is a demo annotation file for ReIDDataset.
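For context on the MOTChallengeDataset test above, CocoVID reads a COCO-style annotation file extended with video and tracking fields. A hypothetical, hand-written example of that layout (all ids and values here are made up and are not the contents of demo_mot_data/ann.json):

demo_cocovid_ann = {
    'categories': [{'id': 1, 'name': 'pedestrian'}],
    'videos': [{'id': 1, 'name': 'demo_video'}],
    'images': [
        # each image records which video it belongs to and its frame index
        {'id': 1, 'video_id': 1, 'frame_id': 0,
         'file_name': '000001.jpg', 'width': 640, 'height': 480},
    ],
    'annotations': [
        # tracking annotations carry an instance (track) id in addition
        # to the usual bbox/category fields
        {'id': 1, 'image_id': 1, 'category_id': 1, 'instance_id': 0,
         'bbox': [10, 20, 30, 40], 'area': 1200, 'iscrowd': 0},
    ],
}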
+REID_ANN_FILE = f'{PREFIX}/demo_reid_data/mot17_reid/ann.txt' + + +class TestReIDDataset(TestCase): + + @classmethod + def setUpClass(cls): + cls.num_ids = 8 + cls.ins_per_id = 4 + cls.dataset = ReIDDataset( + pipeline=[], ann_file=REID_ANN_FILE, data_prefix=dict(img_path='')) + cls.dataset_triplet = ReIDDataset( + pipeline=[], + triplet_sampler=dict( + num_ids=cls.num_ids, ins_per_id=cls.ins_per_id), + ann_file=REID_ANN_FILE, + data_prefix=dict(img_path='')) + + def test_get_data_info(self): + # id 0 has 21 objects + img_id = 0 + data_list = [ + self.dataset.get_data_info(i) for i in range(len(self.dataset)) + ] + assert len([ + data_info for data_info in data_list + if data_info['gt_label'] == img_id + ]) == 21 + # id 11 doesn't have objects + img_id = 11 + assert len([ + data_info for data_info in data_list + if data_info['gt_label'] == img_id + ]) == 0 + + def test_len(self): + assert len(self.dataset) == 704 + assert len(self.dataset_triplet) == 704 + + def test_getitem(self): + for i in range(len(self.dataset)): + results = self.dataset[i] + assert isinstance(results, dict) # no triplet -> dict + assert 'img_path' in results + assert 'gt_label' in results + for i in range(len(self.dataset_triplet)): + num = self.num_ids * self.ins_per_id + results = self.dataset_triplet[i] + assert isinstance(results, dict) # triplet -> dict + assert len(results['img_path']) == num + assert 'img_path' in results + assert 'gt_label' in results + for idx in range(num - 1): + if (idx + 1) % self.ins_per_id != 0: + assert results['gt_label'][idx] == \ + results['gt_label'][idx + 1] diff --git a/tests/test_datasets/test_samplers/test_batch_sampler.py b/tests/test_datasets/test_samplers/test_batch_sampler.py new file mode 100644 index 000000000..119ceead2 --- /dev/null +++ b/tests/test_datasets/test_samplers/test_batch_sampler.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import (BaseVideoDataset, EntireVideoBatchSampler, + VideoSampler) + +PREFIX = osp.join(osp.dirname(__file__), '../../data') +DEMO_ANN_FILE = f'{PREFIX}/demo_cocovid_data/ann_vid.json' + + +class TestEntireVideoBatchSampler(TestCase): + + @classmethod + def setUpClass(cls): + cls.metainfo = dict(CLASSES=('car')) + cls.video_dataset = BaseVideoDataset( + ann_file=DEMO_ANN_FILE, + metainfo=cls.metainfo, + ref_img_sampler=None, + test_mode=True) + cls.video_sampler = VideoSampler(cls.video_dataset) + + def test_video_batch(self): + batch_size = 1 + batch_sampler = EntireVideoBatchSampler( + self.video_sampler, batch_size=batch_size) + # 1 video + self.assertEqual(len(batch_sampler), 1) diff --git a/tests/test_datasets/test_samplers/test_quota_sampler.py b/tests/test_datasets/test_samplers/test_quota_sampler.py new file mode 100644 index 000000000..48b874ee7 --- /dev/null +++ b/tests/test_datasets/test_samplers/test_quota_sampler.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
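The ReIDDataset triplet test above depends on the sampled batch being laid out as num_ids consecutive groups of ins_per_id samples that share a label. A minimal sketch of such a triplet-style sampler; triplet_sample is an illustrative helper, not the ReIDDataset code:

import random
from collections import defaultdict

def triplet_sample(labels, num_ids=8, ins_per_id=4):
    """Return indices grouped as num_ids blocks of ins_per_id samples."""
    per_label = defaultdict(list)
    for idx, label in enumerate(labels):
        per_label[label].append(idx)
    chosen_labels = random.sample(list(per_label), num_ids)
    batch = []
    for label in chosen_labels:
        # draw ins_per_id instances of the same identity, back to back
        batch.extend(random.choices(per_label[label], k=ins_per_id))
    return batch

labels = [i // 32 for i in range(704)]  # 22 identities, 32 images each
batch = triplet_sample(labels)
assert len(batch) == 8 * 4
# consecutive samples inside a block share the same identity
assert all(labels[batch[i]] == labels[batch[i + 1]]
           for i in range(len(batch) - 1) if (i + 1) % 4 != 0)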
+import os.path as osp +from collections.abc import Iterable +from unittest import TestCase + +from mmtrack.datasets import LaSOTDataset, QuotaSampler + +PREFIX = osp.join(osp.dirname(__file__), '../../data') +SOT_DATA_PREFIX = f'{PREFIX}/demo_sot_data' + + +class TestBasevideoDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.video_dataset = LaSOTDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=False) + cls.video_sampler = QuotaSampler( + cls.video_dataset, samples_per_epoch=10) + + def test_iter(self): + iterator = iter(self.video_sampler) + assert isinstance(iterator, Iterable) + for i in iterator: + assert i >= 0 and i < len(self.video_dataset) + + def test_len(self): + assert len(self.video_sampler) == 10 diff --git a/tests/test_datasets/test_samplers/test_video_sampler.py b/tests/test_datasets/test_samplers/test_video_sampler.py new file mode 100644 index 000000000..1b4732cb0 --- /dev/null +++ b/tests/test_datasets/test_samplers/test_video_sampler.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections.abc import Iterable +from unittest import TestCase + +from mmtrack.datasets import BaseVideoDataset, LaSOTDataset, VideoSampler + +PREFIX = osp.join(osp.dirname(__file__), '../../data') +DEMO_ANN_FILE = f'{PREFIX}/demo_cocovid_data/ann_vid.json' +SOT_DATA_PREFIX = f'{PREFIX}/demo_sot_data' + + +class TestBasevideoDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.metainfo = dict(CLASSES=('car')) + cls.video_dataset = BaseVideoDataset( + ann_file=DEMO_ANN_FILE, + metainfo=cls.metainfo, + ref_img_sampler=None, + test_mode=True) + cls.video_sampler = VideoSampler(cls.video_dataset) + + cls.sot_video_dataset = LaSOTDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=True) + cls.sot_video_sampler = VideoSampler(cls.sot_video_dataset) + + def test_iter(self): + iterator = iter(self.video_sampler) + assert isinstance(iterator, Iterable) + for i in iterator: + assert i >= 0 and i < len(self.video_dataset) + + iterator = iter(self.sot_video_sampler) + assert isinstance(iterator, Iterable) + for idx in iterator: + assert len(idx) == 2 + video_idx, frame_idx = idx + assert (video_idx >= 0 + and video_idx < self.sot_video_dataset.num_videos) + assert (frame_idx >= 0 and frame_idx < + self.sot_video_dataset.get_len_per_video(video_idx)) + + def test_len(self): + assert len(self.video_sampler) == len(self.video_dataset) + assert len(self.sot_video_sampler) == len(self.sot_video_dataset) diff --git a/tests/test_datasets/test_sot_dataset.py b/tests/test_datasets/test_sot_dataset.py new file mode 100644 index 000000000..60119be90 --- /dev/null +++ b/tests/test_datasets/test_sot_dataset.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
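The QuotaSampler test above only checks the epoch length and index validity: the sampler re-draws dataset indices so each epoch yields a fixed number of samples regardless of the dataset size. A minimal sketch under that reading; FixedQuotaSampler is illustrative, not the mmtrack QuotaSampler:

import random
from torch.utils.data import Sampler

class FixedQuotaSampler(Sampler):

    def __init__(self, dataset_len, samples_per_epoch):
        self.dataset_len = dataset_len
        self.samples_per_epoch = samples_per_epoch

    def __iter__(self):
        # draw with replacement so small datasets can still fill the quota
        return iter(random.choices(range(self.dataset_len),
                                   k=self.samples_per_epoch))

    def __len__(self):
        return self.samples_per_epoch

sampler = FixedQuotaSampler(dataset_len=2, samples_per_epoch=10)
assert len(sampler) == 10
assert all(0 <= i < 2 for i in sampler)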
+import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import (GOT10kDataset, LaSOTDataset, OTB100Dataset, + SOTCocoDataset, SOTImageNetVIDDataset, + TrackingNetDataset, UAV123Dataset, VOTDataset) + +PREFIX = osp.join(osp.dirname(__file__), '../data') +SOT_DATA_PREFIX = f'{PREFIX}/demo_sot_data' + + +class TestLaSOTDataset(TestCase): + + @classmethod + def setUpClass(cls): + cls.dataset = LaSOTDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=False) + + def test_get_bboxes_from_video(self): + for idx in range(self.dataset.num_videos): + bboxes = self.dataset.get_bboxes_from_video(idx) + assert bboxes.shape[0] == self.dataset.get_len_per_video(idx) + assert bboxes.shape[1] == 4 + + def test_get_visibility_from_video(self): + for idx in range(self.dataset.num_videos): + visibility = self.dataset.get_visibility_from_video(idx) + assert len( + visibility['visible']) == self.dataset.get_len_per_video(idx) + + def test_get_ann_infos_from_video(self): + for idx in range(self.dataset.num_videos): + video_info = self.dataset.get_ann_infos_from_video(idx) + assert len( + video_info['bboxes']) == self.dataset.get_len_per_video(idx) + + def test_get_img_infos_from_video(self): + for idx in range(self.dataset.num_videos): + video_info = self.dataset.get_img_infos_from_video(idx) + assert len( + video_info['frame_ids']) == self.dataset.get_len_per_video(idx) + + def test_prepare_test_data(self): + for video_idx in range(self.dataset.num_videos): + for frame_idx in range(self.dataset.get_len_per_video(video_idx)): + test_data = self.dataset.prepare_test_data( + video_idx, frame_idx) + assert len(test_data['instances']) > 0 + + def test_prepare_train_data(self): + for idx in range(self.dataset.num_videos): + train_data = self.dataset.prepare_train_data(idx) + assert len(train_data) == 2 + + def test_get_len_per_video(self): + for idx in range(self.dataset.num_videos): + assert self.dataset.get_len_per_video(idx) == 2 + + def test_num_videos(self): + assert self.dataset.num_videos == 2 + + def test_len(self): + self.dataset.test_mode = False + assert len(self.dataset) == 2 + self.dataset.test_mode = True + assert len(self.dataset) == 4 + self.dataset.test_mode = False + + +class TestGOT10kDataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = GOT10kDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=False) + + +class TestTrackingNetDataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = TrackingNetDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=False) + + +class TestSOTCocoDataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = SOTCocoDataset( + ann_file=osp.join(PREFIX, 'demo_cocovid_data', 'ann_vid.json'), + data_prefix=dict(img_path=PREFIX), + test_mode=False) + + def test_get_len_per_video(self): + for idx in range(len(self.dataset)): + assert self.dataset.get_len_per_video(idx) == 1 + + def test_num_videos(self): + assert self.dataset.num_videos == 7 + + def test_len(self): + assert len(self.dataset) == 7 + + +class TestSOTImageNetVIDDataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = SOTImageNetVIDDataset( + ann_file=osp.join(PREFIX, 
'demo_cocovid_data', 'ann_vid.json'), + data_prefix=dict(img_path=PREFIX), + test_mode=False) + + def test_get_len_per_video(self): + len_videos = [4, 3, 1, 1, 1] + for idx in range(len(self.dataset)): + assert self.dataset.get_len_per_video(idx) == len_videos[idx] + + def test_num_videos(self): + assert self.dataset.num_videos == 5 + + def test_len(self): + assert len(self.dataset) == 5 + + +class TestUAV123Dataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = UAV123Dataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=True) + + +class TestOTB100Dataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = OTB100Dataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=True) + + +class TestVOTDataset(TestLaSOTDataset): + + @classmethod + def setUpClass(cls): + cls.dataset = VOTDataset( + data_root=SOT_DATA_PREFIX, + ann_file='trackingnet/annotations/trackingnet_train_infos.txt', + data_prefix=dict(img_path='trackingnet'), + test_mode=True) diff --git a/tests/test_datasets/test_tao_dataset.py b/tests/test_datasets/test_tao_dataset.py new file mode 100644 index 000000000..7860f8031 --- /dev/null +++ b/tests/test_datasets/test_tao_dataset.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import TaoDataset + +PREFIX = osp.join(osp.dirname(__file__), '../data') +# This is a demo annotation file for CocoDataset +# 1 videos, 2 categories ('bus', 'car') +# 3 images, 6 instances +DEMO_ANN_FILE_IMG = f'{PREFIX}/demo_imagenetvid_data/ann_img.json' +DEMO_ANN_FILE_VID = f'{PREFIX}/demo_imagenetvid_data/ann_vid.json' + + +class TestTaoDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.metainfo = dict(CLASSES=('bus', 'car')) + cls.ref_img_sampler = dict( + num_ref_imgs=1, + frame_range=4, + filter_key_img=True, + method='uniform') + cls.dataset_video = TaoDataset( + ann_file=DEMO_ANN_FILE_VID, + metainfo=cls.metainfo, + load_as_video=True, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + cls.dataset_image = TaoDataset( + ann_file=DEMO_ANN_FILE_IMG, + metainfo=cls.metainfo, + load_as_video=False, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + ref_img_sampler=cls.ref_img_sampler) + + def test_load_data_list(self): + data_list, valid_data_indices = self.dataset_image.load_data_list() + assert len(data_list) == 3 + assert valid_data_indices == [0, 1, 2] + assert len(self.dataset_image) == 2 + + data_list, valid_data_indices = self.dataset_video.load_data_list() + assert len(data_list) == 8 + assert valid_data_indices == [0, 1, 2, 3, 4, 5, 6, 7] + assert len(self.dataset_video) == 7 diff --git a/tests/test_datasets/test_transforms/test_formatting.py b/tests/test_datasets/test_transforms/test_formatting.py new file mode 100644 index 000000000..a2deb59aa --- /dev/null +++ b/tests/test_datasets/test_transforms/test_formatting.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
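The SOT test_len assertions above reflect a convention worth spelling out: during training one index addresses a whole video, while during testing one index addresses a single frame, so the dataset length switches from the number of videos to the total number of frames. A small worked check (sot_dataset_len is an illustrative helper):

def sot_dataset_len(frames_per_video, test_mode):
    """frames_per_video: list with one entry per video."""
    if test_mode:
        return sum(frames_per_video)   # one sample per frame
    return len(frames_per_video)       # one sample per video

# two demo videos with two frames each, as in the tests above
assert sot_dataset_len([2, 2], test_mode=False) == 2
assert sot_dataset_len([2, 2], test_mode=True) == 4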
+from copy import deepcopy +from unittest import TestCase + +import numpy as np +import torch +from mmdet.structures.mask import BitmapMasks +from mmengine.structures import LabelData + +from mmtrack.datasets.transforms import (CheckPadMaskValidity, PackReIDInputs, + PackTrackInputs) +from mmtrack.structures import ReIDDataSample + + +class TestPackTrackInputs: + + def setup_class(cls): + cls.H, cls.W = 100, 120 + cls.img = np.zeros((cls.H, cls.W, 3)) + cls.gt_bboxes = np.zeros((2, 4)) + cls.gt_bboxes_labels = np.zeros((2, )) + cls.gt_masks = BitmapMasks( + np.random.rand(2, cls.H, cls.W), height=cls.H, width=cls.W) + cls.gt_instances_id = np.ones((2, ), dtype=np.int32) + cls.padding_mask = np.zeros((cls.H, cls.W), dtype=np.int8) + cls.frame_id = 0 + cls.scale_factor = 2.0 + cls.flip = False + cls.ori_shape = (cls.H, cls.W) + cls.results_1 = dict( + img=[cls.img.copy(), + cls.img.copy(), + cls.img.copy()], + gt_bboxes=[ + cls.gt_bboxes.copy(), + cls.gt_bboxes.copy(), + cls.gt_bboxes.copy() + ], + gt_bboxes_labels=[ + cls.gt_bboxes_labels.copy(), + cls.gt_bboxes_labels.copy(), + cls.gt_bboxes_labels.copy() + ], + gt_masks=[ + deepcopy(cls.gt_masks), + deepcopy(cls.gt_masks), + deepcopy(cls.gt_masks) + ], + gt_instances_id=[ + cls.gt_instances_id.copy(), + cls.gt_instances_id.copy(), + cls.gt_instances_id.copy(), + ], + frame_id=[cls.frame_id] * 3, + ori_shape=[(cls.H, cls.W)] * 3, + height=[cls.H] * 3, + width=[cls.W] * 3, + scale_factor=[cls.scale_factor] * 3, + flip=[cls.flip] * 3, + padding_mask=[ + cls.padding_mask.copy(), + cls.padding_mask.copy(), + cls.padding_mask.copy() + ]) + + cls.results_2 = deepcopy(cls.results_1) + cls.results_2.update( + dict(gt_ignore_flags=[np.array([0, 1], dtype=np.bool)] * 3)) + + cls.results_3 = dict( + img=cls.img.copy(), + gt_bboxes=cls.gt_bboxes.copy(), + gt_bboxes_labels=cls.gt_bboxes_labels.copy(), + gt_masks=deepcopy(cls.gt_masks), + gt_instances_id=cls.gt_instances_id.copy(), + frame_id=cls.frame_id, + ori_shape=(cls.H, cls.W), + height=cls.H, + width=cls.W, + scale_factor=cls.scale_factor, + flip=cls.flip, + padding_mask=cls.padding_mask.copy()) + + cls.ref_prefix = 'ref' + cls.meta_keys = ('frame_id', 'ori_shape', 'scale_factor', 'flip') + cls.pack_track_inputs = PackTrackInputs( + num_key_frames=1, + ref_prefix=cls.ref_prefix, + meta_keys=cls.meta_keys, + pack_single_img=False) + + def test_transform_without_ignore(self): + self.pack_track_inputs.pack_single_img = False + track_results = self.pack_track_inputs(self.results_1) + assert isinstance(track_results, dict) + + inputs = track_results['inputs'] + assert isinstance(inputs['img'], torch.Tensor) + assert inputs['img'].shape == (1, 3, self.H, self.W) + assert isinstance(inputs['ref_img'], torch.Tensor) + assert inputs['ref_img'].shape == (2, 3, self.H, self.W) + + track_data_sample = track_results['data_samples'] + + assert track_data_sample.gt_instances.bboxes.shape == (2, 4) + assert track_data_sample.ref_gt_instances.bboxes.shape == (4, 4) + + assert track_data_sample.gt_instances.labels.shape == (2, ) + assert track_data_sample.ref_gt_instances.labels.shape == (4, ) + + assert track_data_sample.gt_instances.instances_id.shape == (2, ) + assert track_data_sample.ref_gt_instances.instances_id.shape == (4, ) + + assert (track_data_sample.gt_instances.map_instances_to_img_idx == + torch.tensor([0, 0], dtype=torch.int32)).all() + assert (track_data_sample.ref_gt_instances.map_instances_to_img_idx == + torch.tensor([0, 0, 1, 1], dtype=torch.int32)).all() + + assert 
len(track_data_sample.gt_instances.masks) == 2 + assert track_data_sample.gt_instances.masks.height == self.H + assert track_data_sample.gt_instances.masks.width == self.W + assert len(track_data_sample.ref_gt_instances.masks) == 4 + assert track_data_sample.ref_gt_instances.masks.height == self.H + assert track_data_sample.ref_gt_instances.masks.width == self.W + + track_data_sample.padding_mask.shape == (1, self.H, self.W) + track_data_sample.ref_padding_mask.shape == (2, self.H, self.W) + + for key in self.meta_keys: + assert track_data_sample.metainfo[key] == getattr(self, key) + assert track_data_sample.metainfo[f'ref_{key}'] == [ + getattr(self, key) + ] * 2 + + def test_transform_with_ignore(self): + self.pack_track_inputs.pack_single_img = False + track_results = self.pack_track_inputs(self.results_2) + assert isinstance(track_results, dict) + + inputs = track_results['inputs'] + assert isinstance(inputs['img'], torch.Tensor) + assert inputs['img'].shape == (1, 3, self.H, self.W) + assert isinstance(inputs['ref_img'], torch.Tensor) + assert inputs['ref_img'].shape == (2, 3, self.H, self.W) + + track_data_sample = track_results['data_samples'] + + assert track_data_sample.gt_instances.bboxes.shape == (1, 4) + assert track_data_sample.ref_gt_instances.bboxes.shape == (2, 4) + + assert track_data_sample.gt_instances.labels.shape == (1, ) + assert track_data_sample.ref_gt_instances.labels.shape == (2, ) + + assert track_data_sample.gt_instances.instances_id.shape == (1, ) + assert track_data_sample.ref_gt_instances.instances_id.shape == (2, ) + + assert (track_data_sample.gt_instances.map_instances_to_img_idx == + torch.tensor([0], dtype=torch.int32)).all() + assert (track_data_sample.ref_gt_instances.map_instances_to_img_idx == + torch.tensor([0, 1], dtype=torch.int32)).all() + + assert len(track_data_sample.gt_instances.masks) == 1 + assert track_data_sample.gt_instances.masks.height == self.H + assert track_data_sample.gt_instances.masks.width == self.W + assert len(track_data_sample.ref_gt_instances.masks) == 2 + assert track_data_sample.ref_gt_instances.masks.height == self.H + assert track_data_sample.ref_gt_instances.masks.width == self.W + + track_data_sample.padding_mask.shape == (1, self.H, self.W) + track_data_sample.ref_padding_mask.shape == (2, self.H, self.W) + + for key in self.meta_keys: + assert track_data_sample.metainfo[key] == getattr(self, key) + assert track_data_sample.metainfo[f'ref_{key}'] == [ + getattr(self, key) + ] * 2 + + def test_transform_test_mode(self): + self.pack_track_inputs.pack_single_img = True + track_results = self.pack_track_inputs(self.results_3) + assert isinstance(track_results, dict) + + inputs = track_results['inputs'] + assert isinstance(inputs['img'], torch.Tensor) + assert inputs['img'].shape == (1, 3, self.H, self.W) + + track_data_sample = track_results['data_samples'] + + assert track_data_sample.gt_instances.bboxes.shape == (2, 4) + + assert track_data_sample.gt_instances.labels.shape == (2, ) + + assert track_data_sample.gt_instances.instances_id.shape == (2, ) + + assert (track_data_sample.gt_instances.map_instances_to_img_idx == + torch.tensor([0], dtype=torch.int32)).all() + + assert len(track_data_sample.gt_instances.masks) == 2 + assert track_data_sample.gt_instances.masks.height == self.H + assert track_data_sample.gt_instances.masks.width == self.W + + track_data_sample.padding_mask.shape == (1, self.H, self.W) + + for key in self.meta_keys: + assert track_data_sample.metainfo[key] == getattr(self, key) + + +class 
TestPackReIDInputs(TestCase): + + @classmethod + def setUpClass(cls): + cls.results = dict( + img=np.random.randn(256, 128, 3), + gt_label=0, + img_path='', + ori_shape=(128, 128), + img_shape=(256, 128), + scale=(128, 256), + scale_factor=(1., 2.), + flip=False, + flip_direction=None) + cls.pack_reid_inputs = PackReIDInputs( + meta_keys=('flip', 'flip_direction')) + + def test_transform(self): + results = self.pack_reid_inputs(self.results) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + data_sample = results['data_samples'] + self.assertIsInstance(data_sample, ReIDDataSample) + self.assertIsInstance(data_sample.gt_label, LabelData) + self.assertEqual(data_sample.img_path, '') + self.assertEqual(data_sample.ori_shape, (128, 128)) + self.assertEqual(data_sample.img_shape, (256, 128)) + self.assertEqual(data_sample.scale, (128, 256)) + self.assertEqual(data_sample.scale_factor, (1., 2.)) + self.assertEqual(data_sample.flip, False) + self.assertIsNone(data_sample.flip_direction) + + def test_repr(self): + self.assertEqual( + repr(self.pack_reid_inputs), + f'PackReIDInputs(meta_keys={self.pack_reid_inputs.meta_keys})') + + +class TestCheckPadMaskValidity: + + def setup_class(cls): + dummy = np.zeros((50, 50, 3)) + cls.results = dict( + img=[dummy.copy(), dummy.copy(), + dummy.copy()], + padding_mask=[dummy.copy(), + dummy.copy(), + dummy.copy()]) + + cls.check_pad_mask_validity = CheckPadMaskValidity(stride=16) + + def test_transform(self): + results = self.check_pad_mask_validity(self.results) + assert results is not None + self.results['padding_mask'][1] = np.ones((50, 50, 3)) + results = self.check_pad_mask_validity(self.results) + assert results is None diff --git a/tests/test_datasets/test_transforms/test_loading.py b/tests/test_datasets/test_transforms/test_loading.py new file mode 100644 index 000000000..9799fd05d --- /dev/null +++ b/tests/test_datasets/test_transforms/test_loading.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
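The CheckPadMaskValidity test above drops a sample once a frame's padding mask is entirely padding after downsampling by the model stride. A minimal sketch of that check, assuming a made-up pad_masks_valid helper rather than the actual transform:

import numpy as np

def pad_masks_valid(padding_masks, stride=16):
    """padding_masks: list of HxW(xC) arrays where 1 marks a padded pixel."""
    for mask in padding_masks:
        down = mask[::stride, ::stride]
        if down.all():          # every sampled location is padding
            return False
    return True

ok = [np.zeros((50, 50)), np.zeros((50, 50))]
bad = [np.zeros((50, 50)), np.ones((50, 50))]
assert pad_masks_valid(ok) is True
assert pad_masks_valid(bad) is False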
+import copy +import os.path as osp + +import numpy as np + +from mmtrack.datasets.transforms import LoadTrackAnnotations + + +class TestLoadTrackAnnotations: + + def setup_class(cls): + data_prefix = osp.join(osp.dirname(__file__), '../data') + seg_map = osp.join(data_prefix, 'grayscale.jpg') + cls.results = { + 'seg_map_path': + seg_map, + 'instances': [{ + 'bbox': [0, 0, 10, 20], + 'bbox_label': 1, + 'instance_id': 100, + 'keypoints': [1, 2, 3] + }, { + 'bbox': [10, 10, 110, 120], + 'bbox_label': 2, + 'instance_id': 102, + 'keypoints': [4, 5, 6] + }] + } + + def test_load_instances_id(self): + transform = LoadTrackAnnotations( + with_bbox=False, + with_label=True, + with_instance_id=True, + with_seg=False, + with_keypoints=False, + ) + results = transform(copy.deepcopy(self.results)) + assert 'gt_instances_id' in results + assert (results['gt_instances_id'] == np.array([100, 102])).all() + assert results['gt_instances_id'].dtype == np.int32 + + def test_repr(self): + transform = LoadTrackAnnotations( + with_bbox=True, + with_label=False, + with_instance_id=True, + with_seg=False, + with_mask=False) + assert repr(transform) == ('LoadTrackAnnotations(with_bbox=True, ' + 'with_label=False, with_instance_id=True, ' + 'with_mask=False, with_seg=False, ' + "poly2mask=True, imdecode_backend='cv2', " + "file_client_args={'backend': 'disk'})") diff --git a/tests/test_datasets/test_transforms/test_processing.py b/tests/test_datasets/test_transforms/test_processing.py new file mode 100644 index 000000000..53d69283d --- /dev/null +++ b/tests/test_datasets/test_transforms/test_processing.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from mmtrack.datasets.transforms import PairSampling, TridentSampling + + +class TestPairSampling: + + def setup_class(cls): + num_frames = 60 + pair_video_infos = [] + filenames = ['{:08d}.jpg'.format(i) for i in range(num_frames)] + frame_ids = np.arange(num_frames) + bboxes = np.ones((num_frames, 4)) + bboxes_isvalid = np.ones(num_frames, dtype=bool) + visible = bboxes_isvalid.copy() + + for video_id in range(2): + video_info = dict( + bboxes=bboxes, + bboxes_isvalid=bboxes_isvalid, + visible=visible, + img_paths=filenames, + frame_ids=frame_ids, + video_id=video_id, + video_length=60) + pair_video_infos.append(video_info) + + cls.num_frames = num_frames + cls.pair_video_infos = pair_video_infos + cls.pair_sampling = PairSampling( + frame_range=5, pos_prob=0.8, filter_template_img=False) + + def test_transform(self): + results = self.pair_sampling(self.pair_video_infos) + + frame_ids = results['frame_id'] + assert len(frame_ids) == 2 + for frame_id in frame_ids: + assert 0 <= frame_id < self.num_frames + + instances = results['instances'] + assert len(instances) == 2 + for instance in instances: + assert len(instance) == 1 + assert (instance[0]['bbox'] == np.ones(4)).all() + assert (instance[0]['bbox_label'] <= np.ones(1)).all() + + +class TestTridentSampling: + + def setup_class(cls): + num_frames = 60 + pair_video_infos = [] + filenames = ['{:08d}.jpg'.format(i) for i in range(num_frames)] + frame_ids = np.arange(num_frames) + bboxes = np.ones((num_frames, 4)) + bboxes_isvalid = np.ones(num_frames, dtype=bool) + visible = bboxes_isvalid.copy() + + for video_id in range(2): + bboxes_isvalid = np.ones(num_frames, dtype=bool) + random_invalid_index = np.random.randint(0, num_frames, 4) + bboxes_isvalid[random_invalid_index] = False + visible = bboxes_isvalid.copy() + random_invalid_index = np.random.randint(0, num_frames, 
4) + visible[random_invalid_index] = False + video_info = dict( + bboxes=bboxes, + bboxes_isvalid=bboxes_isvalid, + visible=visible, + img_paths=filenames, + frame_ids=frame_ids, + video_id=video_id, + video_length=60) + pair_video_infos.append(video_info) + + cls.num_frames = num_frames + cls.pair_video_infos = pair_video_infos + cls.pair_sampling = TridentSampling( + num_search_frames=1, + num_template_frames=2, + max_frame_range=[200], + cls_pos_prob=0.5, + train_cls_head=True) + + def test_transform(self): + results = self.pair_sampling(self.pair_video_infos) + + frame_ids = results['frame_id'] + assert len(frame_ids) == 3 + for frame_id in frame_ids: + assert 0 <= frame_id < self.num_frames + + instances = results['instances'] + assert len(instances) == 3 + for instance in instances: + assert len(instance) == 1 + assert (instance[0]['bbox'] == np.ones(4)).all() + assert (instance[0]['bbox_label'] <= np.ones(1)).all() diff --git a/tests/test_datasets/test_transforms/test_transforms.py b/tests/test_datasets/test_transforms/test_transforms.py new file mode 100644 index 000000000..b9db9d63a --- /dev/null +++ b/tests/test_datasets/test_transforms/test_transforms.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from mmtrack.datasets.transforms import (BrightnessAug, CropLikeDiMP, + CropLikeSiamFC, GrayAug, + SeqBboxJitter, SeqBlurAug, + SeqColorAug, SeqCropLikeStark, + SeqShiftScaleAug) +from mmtrack.testing import random_boxes + + +class TestCropLikeSiamFC: + + def setup_class(cls): + cls.crop_like_siamfc = CropLikeSiamFC( + context_amount=0.5, exemplar_size=127, crop_size=255) + cls.results = dict( + img=np.random.randn(500, 500, 3), + gt_bboxes=random_boxes(1, 256).numpy(), + img_shape=(500, 500, 3)) + + def test_transform(self): + results = self.crop_like_siamfc(self.results) + assert results['img'].shape == (255, 255, 3) + assert results['gt_bboxes'].shape == (1, 4) + assert results['img_shape'] == (255, 255, 3) + + +class TestSeqShiftScaleAug: + + def setup_class(cls): + cls.seq_shift_scale_aug = SeqShiftScaleAug( + target_size=[127, 255], shift=[4, 64], scale=[0.05, 0.18]) + img = np.random.randn(500, 500, 3) + gt_bbox = random_boxes(1, 256).numpy() + cls.results = dict( + img=[img.copy(), img.copy()], + gt_bboxes=[gt_bbox.copy(), gt_bbox.copy()], + img_shape=[(500, 500, 3), (500, 500, 3)]) + + def test_transform(self): + results = self.seq_shift_scale_aug(self.results) + assert results['img'][0].shape == (127, 127, 3) + assert results['img'][1].shape == (255, 255, 3) + assert results['gt_bboxes'][0].shape == (1, 4) + assert results['gt_bboxes'][1].shape == (1, 4) + assert results['img_shape'][0] == (127, 127, 3) + assert results['img_shape'][1] == (255, 255, 3) + + +class TestSeqColorAug: + + def setup_class(cls): + cls.seq_color_aug = SeqColorAug(prob=[1.0, 0.5]) + cls.results = dict( + img=[np.random.randn(127, 127, 3), + np.random.randn(255, 255, 3)]) + + def test_transform(self): + results = self.seq_color_aug(self.results) + assert results['img'][0].shape == (127, 127, 3) + assert results['img'][1].shape == (255, 255, 3) + + +class TestSeqBlurAug: + + def setup_class(cls): + cls.seq_blur_aug = SeqBlurAug(prob=[0.2, 0.5]) + cls.results = dict( + img=[np.random.randn(127, 127, 3), + np.random.randn(255, 255, 3)]) + + def test_transform(self): + results = self.seq_blur_aug(self.results) + assert results['img'][0].shape == (127, 127, 3) + assert results['img'][1].shape == (255, 255, 3) + + +class TestGrayAug: + + def setup_class(cls): + 
cls.gray_aug = GrayAug(prob=1) + cls.results = dict( + img=np.random.randint(0, 255, (127, 127, 3)).astype(np.uint8)) + + def test_transform(self): + results = self.gray_aug(self.results) + assert results['img'].shape == (127, 127, 3) + + +class TestBrightnessAug: + + def setup_class(cls): + cls.brightness_aug = BrightnessAug(jitter_range=0.2) + cls.results = dict(img=np.random.randn(127, 127, 3)) + + def test_transform(self): + results = self.brightness_aug(self.results) + assert results['img'].shape == (127, 127, 3) + + +class TestSeqBboxJitter: + + def setup_class(cls): + cls.seq_shift_scale_aug = SeqBboxJitter( + center_jitter_factor=[0, 4.5], + scale_jitter_factor=[0, 0.5], + crop_size_factor=[2, 5]) + gt_bbox = random_boxes(1, 256).numpy() + cls.results = dict(gt_bboxes=[gt_bbox.copy(), gt_bbox.copy()]) + + def test_transform(self): + results = self.seq_shift_scale_aug(self.results) + assert results['jittered_bboxes'][0].shape == (1, 4) + assert results['jittered_bboxes'][1].shape == (1, 4) + + +class TestSeqCropLikeStark: + + def setup_class(cls): + cls.seq_crop_like_stark = SeqCropLikeStark( + crop_size_factor=[2, 5], output_size=[128, 320]) + cls.results = dict( + img=[np.random.randn(500, 500, 3), + np.random.randn(500, 500, 3)], + gt_bboxes=[ + random_boxes(1, 256).numpy(), + random_boxes(1, 256).numpy() + ], + img_shape=[(500, 500, 3), (500, 500, 3)], + jittered_bboxes=[ + random_boxes(1, 256).numpy(), + random_boxes(1, 256).numpy() + ]) + + def test_transform(self): + results = self.seq_crop_like_stark(self.results) + assert results['img'][0].shape == (128, 128, 3) + assert results['img'][1].shape == (320, 320, 3) + assert results['gt_bboxes'][0].shape == (1, 4) + assert results['gt_bboxes'][1].shape == (1, 4) + assert results['img_shape'][0] == (128, 128, 3) + assert results['img_shape'][1] == (320, 320, 3) + assert results['padding_mask'][0].shape == (128, 128) + assert results['padding_mask'][1].shape == (320, 320) + + +class TestCropLikeDiMP: + + def setup_class(cls): + cls.crop_like_dimp = CropLikeDiMP(crop_size_factor=5, output_size=255) + cls.results = dict( + img=np.random.randn(500, 500, 3), + gt_bboxes=random_boxes(1, 100).numpy(), + img_shape=(500, 500, 3), + jittered_bboxes=random_boxes(1, 100).numpy()) + + def test_transform(self): + results = self.crop_like_dimp(self.results) + assert results['img'].shape == (255, 255, 3) + assert results['gt_bboxes'].shape == (1, 4) + assert results['img_shape'] == (255, 255, 3) diff --git a/tests/test_datasets/test_youtube_vis_dataset.py b/tests/test_datasets/test_youtube_vis_dataset.py new file mode 100644 index 000000000..aa10dcb51 --- /dev/null +++ b/tests/test_datasets/test_youtube_vis_dataset.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
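Several fixtures above build ground-truth boxes with mmtrack.testing.random_boxes(num, scale). Assuming it behaves like the usual test helper, it simply returns num well-formed (x1, y1, x2, y2) boxes inside a scale x scale canvas; a minimal stand-in (make_random_boxes is illustrative, not the real helper):

import torch

def make_random_boxes(num, scale):
    xy1 = torch.rand(num, 2) * scale / 2          # top-left in the upper-left half
    wh = torch.rand(num, 2) * scale / 2           # non-negative width/height
    return torch.cat([xy1, xy1 + wh], dim=1)      # (num, 4) with x1<=x2, y1<=y2

boxes = make_random_boxes(1, 256)
assert boxes.shape == (1, 4)
assert (boxes[:, 2:] >= boxes[:, :2]).all()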
+import os.path as osp +from unittest import TestCase + +from mmtrack.datasets import YouTubeVISDataset + +PREFIX = osp.join(osp.dirname(__file__), '../data') +# This is a demo annotation file for YouTubeVISDataset +# 1 video, 1 categories ('sedan') +# 1 images, 1 instances +# 0 crowd +DEMO_ANN_FILE = f'{PREFIX}/demo_vis_data/ann.json' + + +class TestYouTubeVISDataset(TestCase): + + @classmethod + def setUpClass(cls): + + cls.dataset = YouTubeVISDataset( + ann_file=DEMO_ANN_FILE, dataset_version='2019') + + def test_set_dataset_classes(self): + assert isinstance(self.dataset.metainfo, dict) + assert len(self.dataset.metainfo['CLASSES']) == 40 diff --git a/tests/test_engine/test_hooks/test_siamrpn_backbone_unfreeze_hook.py b/tests/test_engine/test_hooks/test_siamrpn_backbone_unfreeze_hook.py new file mode 100644 index 000000000..4e374b17c --- /dev/null +++ b/tests/test_engine/test_hooks/test_siamrpn_backbone_unfreeze_hook.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase +from unittest.mock import Mock + +from torch import nn + +from mmtrack.engine.hooks import SiamRPNBackboneUnfreezeHook + + +class TestSiamRPNBackboneUnfreezeHook(TestCase): + + def test_before_train_epoch(self): + runner = Mock() + runner.model.backbone = Mock() + runner.model.backbone.layer1 = nn.Conv2d(1, 1, 1) + runner.model.backbone.layer2 = nn.Sequential( + nn.Conv2d(1, 1, 1), nn.BatchNorm2d(2)) + for layer in ['layer1', 'layer2']: + for param in getattr(runner.model.backbone, layer).parameters(): + param.requires_grad = False + runner.model.backbone.layer2[1].eval() + hook = SiamRPNBackboneUnfreezeHook( + backbone_start_train_epoch=10, backbone_train_layers=['layer2']) + + runner.epoch = 9 + hook.before_train_epoch(runner) + for layer in ['layer1', 'layer2']: + for param in getattr(runner.model.backbone, layer).parameters(): + self.assertFalse(param.requires_grad) + self.assertFalse(runner.model.backbone.layer2[1].training) + + runner.epoch = 10 + hook.before_train_epoch(runner) + for param in getattr(runner.model.backbone, 'layer1').parameters(): + self.assertFalse(param.requires_grad) + for param in getattr(runner.model.backbone, 'layer2').parameters(): + self.assertTrue(param.requires_grad) + self.assertTrue(runner.model.backbone.layer2[1].training) diff --git a/tests/test_engine/test_hooks/test_visualization_hook.py b/tests/test_engine/test_hooks/test_visualization_hook.py new file mode 100644 index 000000000..c15a36bd6 --- /dev/null +++ b/tests/test_engine/test_hooks/test_visualization_hook.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
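The SiamRPNBackboneUnfreezeHook test above exercises the standard PyTorch freeze/unfreeze pattern: toggle requires_grad on the layer's parameters and switch BatchNorm between eval() and train(). A minimal sketch of that single step (set_layer_trainable is an illustrative helper, not the hook itself):

from torch import nn

def set_layer_trainable(layer: nn.Module, trainable: bool) -> None:
    for param in layer.parameters():
        param.requires_grad = trainable
    # train()/eval() controls whether BatchNorm updates its running stats
    layer.train(mode=trainable)

layer = nn.Sequential(nn.Conv2d(1, 1, 1), nn.BatchNorm2d(1))
set_layer_trainable(layer, False)
assert not any(p.requires_grad for p in layer.parameters())
assert not layer.training
set_layer_trainable(layer, True)
assert all(p.requires_grad for p in layer.parameters())
assert layer.training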
+import os +import os.path as osp +import shutil +import time +from unittest import TestCase +from unittest.mock import Mock + +import torch +from mmengine.structures import InstanceData + +from mmtrack.engine import TrackVisualizationHook +from mmtrack.structures import TrackDataSample +from mmtrack.visualization import TrackLocalVisualizer + + +class TestVisualizationHook(TestCase): + + def setUp(self) -> None: + TrackLocalVisualizer.get_instance('visualizer') + # pseudo data_batch + self.data_batch = dict(data_samples=None, inputs=None) + + pred_instances_data = dict( + bboxes=torch.tensor([[100, 100, 200, 200], [150, 150, 400, 200]]), + instances_id=torch.tensor([1, 2]), + labels=torch.tensor([0, 1]), + scores=torch.tensor([0.955, 0.876])) + pred_instances = InstanceData(**pred_instances_data) + track_data_sample = TrackDataSample() + track_data_sample.pred_track_instances = pred_instances + track_data_sample.gt_instances = pred_instances + track_data_sample.set_metainfo( + dict( + img_path=osp.join( + osp.dirname(__file__), '../../data/image_1.jpg'))) + self.outputs = [track_data_sample] + + def test_after_val_iter(self): + runner = Mock() + runner.iter = 1 + hook = TrackVisualizationHook(interval=10, draw=True) + hook.after_val_iter(runner, 9, self.data_batch, self.outputs) + + def test_after_test_iter(self): + runner = Mock() + runner.iter = 1 + hook = TrackVisualizationHook(interval=10, draw=True) + hook.after_val_iter(runner, 9, self.data_batch, self.outputs) + + # test test_out_dir + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + test_out_dir = timestamp + '1' + runner.work_dir = timestamp + runner.timestamp = '1' + hook = TrackVisualizationHook( + interval=10, draw=True, test_out_dir=test_out_dir) + hook.after_test_iter(runner, 9, self.data_batch, self.outputs) + self.assertTrue(os.path.exists(f'{timestamp}/1/{test_out_dir}')) + shutil.rmtree(f'{timestamp}') diff --git a/tests/test_engine/test_schedulers/test_siamrpn_scheduler.py b/tests/test_engine/test_schedulers/test_siamrpn_scheduler.py new file mode 100644 index 000000000..dacc35b8a --- /dev/null +++ b/tests/test_engine/test_schedulers/test_siamrpn_scheduler.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from unittest import TestCase + +import torch +import torch.nn.functional as F +import torch.optim as optim +from mmengine.optim.scheduler import _ParamScheduler +from mmengine.testing import assert_allclose + +from mmtrack.engine import SiamRPNExpLR, SiamRPNExpParamScheduler + + +class ToyModel(torch.nn.Module): + + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv2(F.relu(self.conv1(x))) + + +class TestSiamRPNExpScheduler(TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.model = ToyModel() + self.base_lr = 0.0005 + self.optimizer = optim.SGD( + self.model.parameters(), + lr=self.base_lr, + momentum=0.01, + weight_decay=5e-4) + + def _test_scheduler_value(self, + schedulers, + targets, + epochs=5, + param_name='lr'): + if isinstance(schedulers, _ParamScheduler): + schedulers = [schedulers] + for epoch in range(epochs): + for param_group, target in zip(self.optimizer.param_groups, + targets): + print(param_group[param_name]) + assert_allclose( + target[epoch], + param_group[param_name], + msg='{} is wrong in epoch {}: expected {}, got {}'.format( + param_name, epoch, target[epoch], + param_group[param_name]), + atol=1e-5, + rtol=0) + [scheduler.step() for scheduler in schedulers] + + def test_siamrpn_exp_scheduler(self): + with self.assertRaises(ValueError): + SiamRPNExpParamScheduler(self.optimizer, param_name='lr') + epochs = 5 + start_factor = 0.2 + end_factor = 1.0 + mult = math.pow(end_factor / start_factor, 1. / (epochs)) + targets = [[ + self.base_lr * start_factor * (mult**i) for i in range(epochs) + ]] + + scheduler = SiamRPNExpParamScheduler( + self.optimizer, + param_name='lr', + start_factor=start_factor, + end_factor=end_factor, + end=epochs, + endpoint=False) + self._test_scheduler_value(scheduler, targets, epochs) + + def test_siamrpn_exp_scheduler_convert_iterbased(self): + epochs = 5 + epoch_length = 10 + + iters = epochs * epoch_length + start_factor = 0.2 + end_factor = 1.0 + mult = math.pow(end_factor / start_factor, 1. / (iters)) + targets = [[ + self.base_lr * start_factor * (mult**i) for i in range(iters) + ]] + scheduler = SiamRPNExpParamScheduler.build_iter_from_epoch( + self.optimizer, + param_name='lr', + start_factor=start_factor, + end_factor=end_factor, + end=epochs, + endpoint=False, + epoch_length=epoch_length) + self._test_scheduler_value(scheduler, targets, iters) + + def test_siamrpn_exp_lr(self): + epochs = 5 + start_factor = 0.2 + end_factor = 1.0 + mult = math.pow(end_factor / start_factor, 1. / (epochs)) + targets = [[ + self.base_lr * start_factor * (mult**i) for i in range(epochs) + ]] + + scheduler = SiamRPNExpLR( + self.optimizer, + start_factor=start_factor, + end_factor=end_factor, + end=epochs, + endpoint=False) + self._test_scheduler_value(scheduler, targets, epochs) diff --git a/tests/test_evaluation/test_metrics/test_coco_video_metric.py b/tests/test_evaluation/test_metrics/test_coco_video_metric.py new file mode 100644 index 000000000..256207a09 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_coco_video_metric.py @@ -0,0 +1,410 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
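The SiamRPNExpParamScheduler tests above derive their expected learning rates from lr_i = base_lr * start_factor * mult**i with mult = (end_factor / start_factor) ** (1 / end). Worked out for the parameters used there (base_lr=5e-4, start_factor=0.2, end_factor=1.0, end=5):

import math

base_lr, start_factor, end_factor, end = 5e-4, 0.2, 1.0, 5
mult = math.pow(end_factor / start_factor, 1.0 / end)   # 5 ** 0.2 ~= 1.3797
expected = [base_lr * start_factor * mult**i for i in range(end)]
# first step is base_lr * 0.2 = 1e-4; one more multiplication past the
# last step lands exactly on base_lr * end_factor
assert math.isclose(expected[0], 1e-4)
assert math.isclose(expected[-1] * mult, base_lr)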
+import os.path as osp +import tempfile +from unittest import TestCase + +import numpy as np +import pycocotools.mask as mask_util +import torch +from mmengine.fileio import dump + +from mmtrack.evaluation import CocoVideoMetric + + +class TestCocoVideoMetric(TestCase): + + def _create_dummy_coco_json(self, json_name): + dummy_mask = np.zeros((10, 10), order='F', dtype=np.uint8) + dummy_mask[:5, :5] = 1 + rle_mask = mask_util.encode(dummy_mask) + rle_mask['counts'] = rle_mask['counts'].decode('utf-8') + image = { + 'id': 0, + 'width': 640, + 'height': 640, + 'file_name': 'fake_name.jpg', + } + + annotation_1 = { + 'id': 1, + 'image_id': 0, + 'category_id': 0, + 'area': 400, + 'bbox': [50, 60, 20, 20], + 'iscrowd': 0, + 'segmentation': rle_mask, + } + + annotation_2 = { + 'id': 2, + 'image_id': 0, + 'category_id': 0, + 'area': 900, + 'bbox': [100, 120, 30, 30], + 'iscrowd': 0, + 'segmentation': rle_mask, + } + + annotation_3 = { + 'id': 3, + 'image_id': 0, + 'category_id': 1, + 'area': 1600, + 'bbox': [150, 160, 40, 40], + 'iscrowd': 0, + 'segmentation': rle_mask, + } + + annotation_4 = { + 'id': 4, + 'image_id': 0, + 'category_id': 0, + 'area': 10000, + 'bbox': [250, 260, 100, 100], + 'iscrowd': 0, + 'segmentation': rle_mask, + } + + categories = [ + { + 'id': 0, + 'name': 'car', + 'supercategory': 'car', + }, + { + 'id': 1, + 'name': 'bicycle', + 'supercategory': 'bicycle', + }, + ] + + fake_json = { + 'images': [image], + 'annotations': + [annotation_1, annotation_2, annotation_3, annotation_4], + 'categories': categories + } + + dump(fake_json, json_name) + + def _create_dummy_results(self): + bboxes = np.array([[50, 60, 70, 80], [100, 120, 130, 150], + [150, 160, 190, 200], [250, 260, 350, 360]]) + scores = np.array([1.0, 0.98, 0.96, 0.95]) + labels = np.array([0, 0, 1, 0]) + dummy_mask = np.zeros((4, 10, 10), dtype=np.uint8) + dummy_mask[:, :5, :5] = 1 + return dict( + bboxes=torch.from_numpy(bboxes), + scores=torch.from_numpy(scores), + labels=torch.from_numpy(labels), + masks=torch.from_numpy(dummy_mask)) + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmp_dir.cleanup() + + def test_init(self): + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + with self.assertRaisesRegex(KeyError, 'metric should be one of'): + CocoVideoMetric(ann_file=fake_json_file, metric='unknown') + + def test_evaluate(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + dummy_pred = self._create_dummy_results() + + # test single coco dataset evaluation + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, + classwise=False, + outfile_prefix=f'{self.tmp_dir.name}/test') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = { + 'coco/bbox_mAP': 1.0, + 'coco/bbox_mAP_50': 1.0, + 'coco/bbox_mAP_75': 1.0, + 'coco/bbox_mAP_s': 1.0, + 'coco/bbox_mAP_m': 1.0, + 'coco/bbox_mAP_l': 1.0, + } + self.assertDictEqual(eval_results, target) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.bbox.json'))) + + # test box and segm coco dataset evaluation + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, + metric=['bbox', 'segm'], + classwise=False, + 
outfile_prefix=f'{self.tmp_dir.name}/test') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = { + 'coco/bbox_mAP': 1.0, + 'coco/bbox_mAP_50': 1.0, + 'coco/bbox_mAP_75': 1.0, + 'coco/bbox_mAP_s': 1.0, + 'coco/bbox_mAP_m': 1.0, + 'coco/bbox_mAP_l': 1.0, + 'coco/segm_mAP': 1.0, + 'coco/segm_mAP_50': 1.0, + 'coco/segm_mAP_75': 1.0, + 'coco/segm_mAP_s': 1.0, + 'coco/segm_mAP_m': 1.0, + 'coco/segm_mAP_l': 1.0, + } + self.assertDictEqual(eval_results, target) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.bbox.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.segm.json'))) + + # test invalid custom metric_items + with self.assertRaisesRegex(KeyError, + 'metric item "invalid" is not supported'): + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric_items=['invalid']) + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + coco_metric.evaluate(size=1) + + # test custom metric_items + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric_items=['mAP_m']) + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = { + 'coco/bbox_mAP_m': 1.0, + } + self.assertDictEqual(eval_results, target) + + def test_classwise_evaluate(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + dummy_pred = self._create_dummy_results() + + # test single coco dataset evaluation + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric='bbox', classwise=True) + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = { + 'coco/bbox_mAP': 1.0, + 'coco/bbox_mAP_50': 1.0, + 'coco/bbox_mAP_75': 1.0, + 'coco/bbox_mAP_s': 1.0, + 'coco/bbox_mAP_m': 1.0, + 'coco/bbox_mAP_l': 1.0, + 'coco/car_precision': 1.0, + 'coco/bicycle_precision': 1.0, + } + self.assertDictEqual(eval_results, target) + + def test_manually_set_iou_thrs(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + + # test single coco dataset evaluation + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric='bbox', iou_thrs=[0.3, 0.6]) + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + self.assertEqual(coco_metric.iou_thrs, [0.3, 0.6]) + + def test_fast_eval_recall(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + dummy_pred = self._create_dummy_results() + + # test default proposal nums + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric='proposal_fast') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, 
+ img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = {'coco/AR@100': 1.0, 'coco/AR@300': 1.0, 'coco/AR@1000': 1.0} + self.assertDictEqual(eval_results, target) + + # test manually set proposal nums + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, + metric='proposal_fast', + proposal_nums=(2, 4)) + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + target = {'coco/AR@2': 0.5, 'coco/AR@4': 1.0} + self.assertDictEqual(eval_results, target) + + def test_evaluate_proposal(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + dummy_pred = self._create_dummy_results() + + coco_metric = CocoVideoMetric( + ann_file=fake_json_file, metric='proposal') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + eval_results = coco_metric.evaluate(size=1) + print(eval_results) + target = { + 'coco/AR@100': 1, + 'coco/AR@300': 1.0, + 'coco/AR@1000': 1.0, + 'coco/AR_s@1000': 1.0, + 'coco/AR_m@1000': 1.0, + 'coco/AR_l@1000': 1.0 + } + self.assertDictEqual(eval_results, target) + + def test_empty_results(self): + # create dummy data + fake_json_file = osp.join(self.tmp_dir.name, 'fake_data.json') + self._create_dummy_coco_json(fake_json_file) + coco_metric = CocoVideoMetric(ann_file=fake_json_file, metric='bbox') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + bboxes = np.zeros((0, 4)) + labels = np.array([]) + scores = np.array([]) + dummy_mask = np.zeros((0, 10, 10), dtype=np.uint8) + empty_pred = dict( + bboxes=torch.from_numpy(bboxes), + scores=torch.from_numpy(scores), + labels=torch.from_numpy(labels), + masks=torch.from_numpy(dummy_mask)) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=empty_pred, + img_id=0, + ori_shape=(640, 640)) + ]) + # coco api Index error will be caught + coco_metric.evaluate(size=1) + + def test_evaluate_without_json(self): + dummy_pred = self._create_dummy_results() + + dummy_mask = np.zeros((10, 10), order='F', dtype=np.uint8) + dummy_mask[:5, :5] = 1 + rle_mask = mask_util.encode(dummy_mask) + rle_mask['counts'] = rle_mask['counts'].decode('utf-8') + instances = [{ + 'bbox_label': 0, + 'bbox': [50, 60, 70, 80], + 'ignore_flag': 0, + 'mask': rle_mask, + }, { + 'bbox_label': 0, + 'bbox': [100, 120, 130, 150], + 'ignore_flag': 0, + 'mask': rle_mask, + }, { + 'bbox_label': 1, + 'bbox': [150, 160, 190, 200], + 'ignore_flag': 0, + 'mask': rle_mask, + }, { + 'bbox_label': 0, + 'bbox': [250, 260, 350, 360], + 'ignore_flag': 0, + 'mask': rle_mask, + }] + coco_metric = CocoVideoMetric( + ann_file=None, + metric=['bbox', 'segm'], + classwise=False, + outfile_prefix=f'{self.tmp_dir.name}/test') + coco_metric.dataset_meta = dict(CLASSES=['car', 'bicycle']) + coco_metric.process( + dict(inputs=None, data_samples=None), [ + dict( + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(640, 640), + instances=instances) + ]) + eval_results = coco_metric.evaluate(size=1) + print(eval_results) + target = { + 'coco/bbox_mAP': 1.0, + 'coco/bbox_mAP_50': 1.0, + 'coco/bbox_mAP_75': 1.0, + 'coco/bbox_mAP_s': 1.0, + 'coco/bbox_mAP_m': 1.0, + 
'coco/bbox_mAP_l': 1.0, + 'coco/segm_mAP': 1.0, + 'coco/segm_mAP_50': 1.0, + 'coco/segm_mAP_75': 1.0, + 'coco/segm_mAP_s': 1.0, + 'coco/segm_mAP_m': 1.0, + 'coco/segm_mAP_l': 1.0, + } + self.assertDictEqual(eval_results, target) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.bbox.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.segm.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.gt.json'))) diff --git a/tests/test_evaluation/test_metrics/test_mot_challenge_metrics.py b/tests/test_evaluation/test_metrics/test_mot_challenge_metrics.py new file mode 100644 index 000000000..fc5484917 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_mot_challenge_metrics.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from unittest import TestCase + +import torch +from mmengine.structures import InstanceData + +from mmtrack.evaluation import MOTChallengeMetrics + + +class TestMOTChallengeMetrics(TestCase): + + def test_init(self): + with self.assertRaisesRegex(KeyError, 'metric unknown is not'): + MOTChallengeMetrics(metric='unknown') + with self.assertRaises(AssertionError): + MOTChallengeMetrics(benchmark='MOT21') + + @staticmethod + def _get_predictions_demo(): + instances = [{ + 'bbox_label': 0, + 'bbox': [0, 0, 100, 100], + 'ignore_flag': 0, + 'instance_id': 1, + 'mot_conf': 1.0, + 'category_id': 1, + 'visibility': 1.0 + }, { + 'bbox_label': 0, + 'bbox': [0, 0, 100, 100], + 'ignore_flag': 0, + 'instance_id': 2, + 'mot_conf': 1.0, + 'category_id': 1, + 'visibility': 1.0 + }] + sep = os.sep + pred_instances_data = dict( + bboxes=torch.tensor([ + [0, 0, 100, 100], + [0, 0, 100, 40], + ]), + instances_id=torch.tensor([1, 2]), + scores=torch.tensor([1.0, 1.0])) + pred_instances = InstanceData(**pred_instances_data) + predictions = [ + dict( + pred_track_instances=pred_instances, + frame_id=0, + video_length=1, + img_id=1, + img_path=f'xxx{sep}MOT17-09-DPM{sep}img1{sep}000001.jpg', + instances=instances) + ] + return predictions + + def _test_evaluate(self, format_only): + """Test using the metric in the same way as Evaluator.""" + metric = MOTChallengeMetrics( + metric=['HOTA', 'CLEAR', 'Identity'], format_only=format_only) + metric.dataset_meta = {'CLASSES': ('pedestrian', )} + data_batch = dict(input=None, data_samples=None) + predictions = self._get_predictions_demo() + metric.process(data_batch, predictions) + eval_results = metric.evaluate() + return eval_results + + def test_evaluate(self): + eval_results = self._test_evaluate(False) + target = { + 'motchallenge-metric/IDF1': 0.5, + 'motchallenge-metric/MOTA': 0, + 'motchallenge-metric/HOTA': 0.755, + 'motchallenge-metric/IDSW': 0, + } + for key in target: + assert eval_results[key] - target[key] < 1e-3 + + def test_evaluate_format_only(self): + eval_results = self._test_evaluate(True) + assert eval_results == dict() diff --git a/tests/test_evaluation/test_metrics/test_reid_metrics.py b/tests/test_evaluation/test_metrics/test_reid_metrics.py new file mode 100644 index 000000000..82f24f4f4 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_reid_metrics.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
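# A rough, hand-computed sketch of where the expected MOTA/IDF1 values in
# TestMOTChallengeMetrics come from (this is not how MOTChallengeMetrics or
# TrackEval compute them internally): with two GT boxes and two predictions,
# the [0, 0, 100, 40] prediction has IoU 0.4 < 0.5 with its GT box, so the
# frame has 1 TP, 1 FP, 1 FN and no identity switches.
num_gt, fp, fn, idsw = 2, 1, 1, 0
mota = 1 - (fp + fn + idsw) / num_gt            # -> 0
idtp, idfp, idfn = 1, 1, 1
idf1 = 2 * idtp / (2 * idtp + idfp + idfn)      # -> 0.5
assert mota == 0 and idf1 == 0.5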
+from unittest import TestCase + +import torch + +from mmtrack.registry import METRICS +from mmtrack.structures import ReIDDataSample +from mmtrack.utils import register_all_modules + + +class TestReIDMetrics(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + def test_evaluate(self): + """Test using the metric in the same way as Evaluator.""" + data_samples = [ + ReIDDataSample().set_gt_label(i).to_dict() + for i in [0, 0, 1, 1, 1, 1] + ] + pred_batch = [ + dict(pred_feature=torch.tensor( + [1., .0, .1])), # [x,√,x,x,x],R1=0,R5=1,AP=0.50 + dict(pred_feature=torch.tensor( + [.8, .0, .0])), # [x,√,x,x,x],R1=0,R5=1,AP=0.50 + dict(pred_feature=torch.tensor( + [.1, 1., .1])), # [√,√,x,√,x],R1=1,R5=1,AP≈0.92 + dict(pred_feature=torch.tensor( + [.0, .9, .1])), # [√,√,√,x,x],R1=1,R5=1,AP=1.00 + dict(pred_feature=torch.tensor( + [.9, .1, .0])), # [x,x,√,√,√],R1=0,R5=1,AP≈0.48 + dict(pred_feature=torch.tensor( + [.0, .1, 1.])), # [√,√,x,√,x],R1=1,R5=1,AP≈0.92 + ] + # get union + for idx in range(len(data_samples)): + data_samples[idx] = {**data_samples[idx], **pred_batch[idx]} + + metric = METRICS.build( + dict( + type='ReIDMetrics', + metric=['mAP', 'CMC'], + metric_options=dict(rank_list=[1, 5], max_rank=5), + )) + + prefix = 'reid-metric' + data_batch = dict(input=None, data_samples=None) + metric.process(data_batch, data_samples) + results = metric.evaluate(6) + self.assertIsInstance(results, dict) + self.assertEqual(results[f'{prefix}/mAP'], 0.719) + self.assertEqual(results[f'{prefix}/R1'], 0.5) + self.assertEqual(results[f'{prefix}/R5'], 1.0) diff --git a/tests/test_evaluation/test_metrics/test_sot_metrics.py b/tests/test_evaluation/test_metrics/test_sot_metrics.py new file mode 100644 index 000000000..3b459f477 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_sot_metrics.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
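# A standalone sketch of the rank-1 value asserted in TestReIDMetrics above.
# It assumes a plain Euclidean distance matrix with self-matches excluded,
# which reproduces the per-sample R1 pattern [0, 0, 1, 1, 0, 1] noted in the
# inline comments (mean 0.5); the real ReIDMetrics implementation may differ
# in details such as the distance function.
import torch

feats = torch.tensor([[1., .0, .1], [.8, .0, .0], [.1, 1., .1],
                      [.0, .9, .1], [.9, .1, .0], [.0, .1, 1.]])
labels = torch.tensor([0, 0, 1, 1, 1, 1])

dist = torch.cdist(feats, feats)                    # pairwise Euclidean
dist.fill_diagonal_(float('inf'))                   # drop self-matches
nearest = dist.argmin(dim=1)
rank1 = (labels[nearest] == labels).float().mean()  # -> 0.5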
+import os +import os.path as osp +import tempfile +from unittest import TestCase + +import numpy as np +import torch +from mmengine import list_from_file + +from mmtrack.registry import METRICS +from mmtrack.utils import register_all_modules + +SOT_DATA_PREFIX = osp.join(osp.dirname(__file__), '../../data/demo_sot_data') + + +class TestSOTMetric(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.tmp_dir = tempfile.TemporaryDirectory() + cls.outfile_prefix = f'{cls.tmp_dir.name}/test' + cls.sot_metric = METRICS.build( + dict( + type='SOTMetric', + outfile_prefix=cls.outfile_prefix, + options_after_eval=dict(eval_show_video_indices=[0, 1]))) + + @classmethod + def tearDownClass(cls): + cls.tmp_dir.cleanup() + + def _create_eval_inputs(self, pred_file, gt_file): + data_root = osp.join(SOT_DATA_PREFIX, 'trackingnet', 'TRAIN_0') + data_samples = [] + for video_id, video_name in enumerate(['video-1', 'video-2']): + pred_bboxes = list_from_file( + osp.join(data_root, video_name, pred_file)) + gt_bboxes = np.loadtxt( + osp.join(data_root, video_name, gt_file), delimiter=',') + + for i, (pred_bbox, + gt_bbox) in enumerate(zip(pred_bboxes, gt_bboxes)): + pred_bbox = list(map(lambda x: float(x), pred_bbox.split(','))) + pred_track_instances = dict( + bboxes=torch.Tensor(pred_bbox)[None]) + if len(gt_bbox) == 4: + gt_bbox[2:] += gt_bbox[:2] + data_samples.append( + dict( + pred_track_instances=pred_track_instances, + instances=[dict(bbox=gt_bbox, visible=True)], + video_id=video_id, + frame_id=i, + img_path=osp.join(data_root, video_name, 'demo.jpg'), + ori_shape=(256, 512), + video_length=25)) + return data_samples + + def test_evaluate(self): + """Test using the metric in the same way as Evaluator.""" + metric_prefix = self.sot_metric.prefix + + # 1. OPE evaluation + self.sot_metric.metrics = ['OPE'] + data_samples = self._create_eval_inputs('track_results.txt', + 'gt_for_eval.txt') + for data_sample in data_samples: + data_batch = dict(inputs=None, data_samples=None) + self.sot_metric.process(data_batch, [data_sample]) + eval_results = self.sot_metric.evaluate(size=50) + assert round(eval_results[f'{metric_prefix}/success'], 4) == 67.5238 + assert eval_results[f'{metric_prefix}/norm_precision'] == 70.0 + assert eval_results[f'{metric_prefix}/precision'] == 50.0 + + # 2. Format-only + self.sot_metric.format_only = True + for data_sample in data_samples: + data_batch = dict(inputs=None, data_samples=None) + self.sot_metric.process(data_batch, [data_sample]) + eval_results = self.sot_metric.evaluate(size=50) + assert len(eval_results) == 0 + assert os.path.exists(f'{self.outfile_prefix}.zip') + + # 3. 
VOT evaluation + self.sot_metric.format_only = False + self.sot_metric.metrics = ['VOT'] + self.sot_metric.metric_options['interval'] = [1, 3] + data_samples = self._create_eval_inputs('vot2018_track_results.txt', + 'vot2018_gt_for_eval.txt') + for data_sample in data_samples: + data_batch = dict(inputs=None, data_samples=None) + self.sot_metric.process(data_batch, [data_sample]) + eval_results = self.sot_metric.evaluate(size=50) + assert abs(eval_results[f'{metric_prefix}/eao'] - 0.6661) < 0.0001 + assert round(eval_results[f'{metric_prefix}/accuracy'], 4) == 0.5826 + assert round(eval_results[f'{metric_prefix}/robustness'], 4) == 6.0 diff --git a/tests/test_evaluation/test_metrics/test_tao_metric.py b/tests/test_evaluation/test_metrics/test_tao_metric.py new file mode 100644 index 000000000..d185b97f3 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_tao_metric.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import os.path as osp +import tempfile +from unittest import TestCase + +import numpy as np +import torch + +from mmtrack.registry import METRICS +from mmtrack.utils import register_all_modules + + +class TestTAOMetric(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmp_dir.cleanup() + + def _create_dummy_results(self, track_id): + bboxes = np.array([[100, 100, 150, 150]]) + scores = np.array([1.0]) + labels = np.array([0]) + instance_id = np.array([track_id]) + return dict( + bboxes=torch.from_numpy(bboxes), + scores=torch.from_numpy(scores), + labels=torch.from_numpy(labels), + instances_id=torch.from_numpy(instance_id)) + + def test_format_only(self): + outfile_prefix = f'{self.tmp_dir.name}/result' + tao_metric = METRICS.build( + dict( + type='TAOMetric', + format_only=True, + outfile_prefix=outfile_prefix, + )) + dummy_pred = self._create_dummy_results(track_id=0) + instances = [{ + 'bbox_label': 0, + 'bbox': [100, 100, 150, 150], + 'ignore_flag': 0, + 'instance_id': 1, + }] + tao_metric.dataset_meta = dict( + CLASSES=['car', 'train'], + categories={ + 0: dict(id=0, name='car'), + 1: dict(id=1, name='train') + }) + data_batch = dict(inputs=None, data_samples=None) + data_samples = [ + dict( + pred_track_instances=dummy_pred, + pred_det_instances=dummy_pred, + img_id=0, + ori_shape=(720, 1280), + frame_id=0, + frame_index=0, + video_id=1, + video_length=1, + instances=instances, + neg_category_ids=[3, 4], + not_exhaustive_category_ids=[1, 2]) + ] + tao_metric.process(data_batch, data_samples) + tao_metric.evaluate(size=1) + assert osp.exists(f'{outfile_prefix}_track.json') + assert osp.exists(f'{outfile_prefix}_det.json') + + def test_evaluate(self): + """Test using the metric in the same way as Evaluator.""" + dummy_pred_1 = self._create_dummy_results(track_id=1) + dummy_pred_2 = self._create_dummy_results(track_id=1) + + instances = [{ + 'bbox_label': 0, + 'bbox': [100, 100, 150, 150], + 'ignore_flag': 0, + 'instance_id': 1 + }] + tao_metric = METRICS.build( + dict( + type='TAOMetric', + outfile_prefix=f'{self.tmp_dir.name}/test', + )) + + tao_metric.dataset_meta = dict( + CLASSES=['car', 'train'], + categories={ + 0: dict(id=0, name='car'), + 1: dict(id=1, name='train') + }) + data_batch = dict(inputs=None, data_samples=None) + data_samples = [ + dict( + pred_track_instances=dummy_pred_1, + pred_det_instances=dummy_pred_1, + img_id=0, + ori_shape=(720, 1280), + frame_id=0, + frame_index=0, + 
video_id=1, + video_length=1, + instances=instances, + neg_category_ids=[3, 4], + not_exhaustive_category_ids=[1, 2]) + ] + tao_metric.process(data_batch, data_samples) + + data_samples = [ + dict( + pred_track_instances=dummy_pred_2, + pred_det_instances=dummy_pred_2, + img_id=0, + ori_shape=(720, 1280), + frame_id=0, + frame_index=0, + video_id=1, + video_length=1, + instances=instances, + neg_category_ids=[3, 4], + not_exhaustive_category_ids=[1, 2]) + ] + tao_metric.process(data_batch, data_samples) + + eval_results = tao_metric.evaluate(size=2) + target = { + 'tao/track_AP': 1.0, + 'tao/track_AP50': 1.0, + 'tao/track_AP75': 1.0, + } + self.assertDictEqual(eval_results, target) diff --git a/tests/test_evaluation/test_metrics/test_youtube_vis_metric.py b/tests/test_evaluation/test_metrics/test_youtube_vis_metric.py new file mode 100644 index 000000000..934e7473f --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_youtube_vis_metric.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import tempfile +from unittest import TestCase + +import numpy as np +import pycocotools.mask as mask_util +import torch + +from mmtrack.registry import METRICS +from mmtrack.utils import register_all_modules + + +class TestYouTubeVISMetric(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmp_dir.cleanup() + + def _create_dummy_results(self, track_id): + bboxes = np.array([[100, 100, 150, 150]]) + scores = np.array([1.0]) + labels = np.array([0]) + instance_id = np.array([track_id]) + dummy_mask = np.zeros((1, 720, 1280), dtype=np.uint8) + dummy_mask[:, 100:150, 100:150] = 1 + return dict( + bboxes=torch.from_numpy(bboxes), + scores=torch.from_numpy(scores), + labels=torch.from_numpy(labels), + instances_id=torch.from_numpy(instance_id), + masks=torch.from_numpy(dummy_mask)) + + def test_format_only(self): + outfile_prefix = f'{self.tmp_dir.name}/result' + vis_metric = METRICS.build( + dict( + type='YouTubeVISMetric', + format_only=True, + outfile_prefix=outfile_prefix, + )) + dummy_pred = self._create_dummy_results(track_id=0) + dummy_mask = np.zeros((720, 1280), order='F', dtype=np.uint8) + dummy_mask[100:150, 100:150] = 1 + rle_mask = mask_util.encode(dummy_mask) + rle_mask['counts'] = rle_mask['counts'].decode('utf-8') + instances = [{ + 'bbox_label': 0, + 'bbox': [100, 100, 150, 150], + 'ignore_flag': 0, + 'instance_id': 1, + 'mask': rle_mask, + }] + vis_metric.dataset_meta = dict(CLASSES=['car', 'train']) + data_batch = dict(inputs=None, data_samples=None) + data_samples = [ + dict( + pred_track_instances=dummy_pred, + img_id=0, + ori_shape=(720, 1280), + frame_id=0, + video_id=1, + video_length=1, + instances=instances) + ] + vis_metric.process(data_batch, data_samples) + vis_metric.evaluate(size=1) + assert os.path.exists(f'{outfile_prefix}.json') + assert os.path.exists(f'{outfile_prefix}.submission_file.zip') + + def test_evaluate(self): + """Test using the metric in the same way as Evaluator.""" + dummy_pred_1 = self._create_dummy_results(track_id=1) + dummy_pred_2 = self._create_dummy_results(track_id=1) + dummy_pred_3 = self._create_dummy_results(track_id=2) + + dummy_mask = np.zeros((720, 1280), order='F', dtype=np.uint8) + dummy_mask[100:150, 100:150] = 1 + rle_mask = mask_util.encode(dummy_mask) + rle_mask['counts'] = rle_mask['counts'].decode('utf-8') + instances_1 = [{ + 'bbox_label': 0, + 'bbox': [100, 
100, 150, 150], + 'ignore_flag': 0, + 'instance_id': 1, + 'mask': rle_mask, + }] + instances_2 = [{ + 'bbox_label': 0, + 'bbox': [100, 100, 150, 150], + 'ignore_flag': 0, + 'instance_id': 2, + 'mask': rle_mask, + }] + vis_metric = METRICS.build( + dict( + type='YouTubeVISMetric', + outfile_prefix=f'{self.tmp_dir.name}/test', + )) + + vis_metric.dataset_meta = dict(CLASSES=['car', 'train']) + data_batch = dict(inputs=None, data_samples=None) + data_samples = [ + dict( + pred_track_instances=dummy_pred_1, + img_id=1, + ori_shape=(720, 1280), + frame_id=0, + video_id=1, + video_length=2, + instances=instances_1) + ] + vis_metric.process(data_batch, data_samples) + data_samples = [ + dict( + pred_track_instances=dummy_pred_2, + img_id=2, + ori_shape=(720, 1280), + frame_id=1, + video_id=1, + video_length=2, + instances=instances_1) + ] + vis_metric.process(data_batch, data_samples) + data_samples = [ + dict( + pred_track_instances=dummy_pred_3, + img_id=3, + ori_shape=(720, 1280), + frame_id=0, + video_id=2, + video_length=1, + instances=instances_2) + ] + vis_metric.process(data_batch, data_samples) + + eval_results = vis_metric.evaluate(size=3) + target = { + 'youtube_vis/segm_mAP': 1.0, + 'youtube_vis/segm_mAP_50': 1.0, + 'youtube_vis/segm_mAP_75': 1.0, + 'youtube_vis/segm_mAP_s': 1.0, + 'youtube_vis/segm_mAP_m': -1.0, + 'youtube_vis/segm_mAP_l': -1.0, + } + self.assertDictEqual(eval_results, target) diff --git a/tests/test_models/test_aggregators/test_embed_aggregator.py b/tests/test_models/test_aggregators/test_embed_aggregator.py index 967df2518..cb7d9e8ff 100644 --- a/tests/test_models/test_aggregators/test_embed_aggregator.py +++ b/tests/test_models/test_aggregators/test_embed_aggregator.py @@ -1,30 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. -import pytest -import torch +from unittest import TestCase -from mmtrack.models.aggregators import EmbedAggregator +import torch +from mmtrack.models import EmbedAggregator -def test_embed_aggregator(): - """Test embed_aggregator.""" - with pytest.raises(AssertionError): - # The number of convs must be bigger than 1. 
- model = EmbedAggregator(num_convs=0, channels=32, kernel_size=3) - with pytest.raises(AssertionError): - # Only support 'batch_size == 1' for target_x - model = EmbedAggregator(num_convs=3, channels=32, kernel_size=3) - model.train() +class TestEmbedAggregator(TestCase): - target_x = torch.randn(2, 32, 224, 224) - ref_x = torch.randn(4, 32, 224, 224) - agg_x = model(target_x, ref_x) + @classmethod + def setUpClass(cls): + cls.model = EmbedAggregator(num_convs=3, channels=32, kernel_size=3) + cls.model.train() - # Test embed_aggregator forward - model = EmbedAggregator(num_convs=3, channels=32, kernel_size=3) - model.train() + def test_forward(self): + with self.assertRaises(AssertionError): + # Only support 'batch_size == 1' for target_x + target_x = torch.randn(2, 32, 224, 224) + ref_x = torch.randn(4, 32, 224, 224) + agg_x = self.model(target_x, ref_x) - target_x = torch.randn(1, 32, 224, 224) - ref_x = torch.randn(4, 32, 224, 224) - agg_x = model(target_x, ref_x) - assert agg_x.shape == target_x.shape + # Test embed_aggregator forward + target_x = torch.randn(1, 32, 224, 224) + ref_x = torch.randn(4, 32, 224, 224) + agg_x = self.model(target_x, ref_x) + assert agg_x.shape == target_x.shape diff --git a/tests/test_models/test_aggregators/test_selsa_aggregator.py b/tests/test_models/test_aggregators/test_selsa_aggregator.py index aed4b2cc7..4612f40d6 100644 --- a/tests/test_models/test_aggregators/test_selsa_aggregator.py +++ b/tests/test_models/test_aggregators/test_selsa_aggregator.py @@ -1,16 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + import torch from mmtrack.models.aggregators import SelsaAggregator -def test_selsa_aggregator(): - """Test selsa_aggregator.""" - # Test embed_aggregator forward - model = SelsaAggregator(in_channels=16, num_attention_blocks=4) - model.train() +class TestSelsaAggregator(TestCase): + + @classmethod + def setUpClass(cls): + cls.model = SelsaAggregator(in_channels=16, num_attention_blocks=4) + cls.model.train() - target_x = torch.randn(2, 16) - ref_x = torch.randn(4, 16) - agg_x = model(target_x, ref_x) - assert agg_x.shape == target_x.shape + def test_forward(self): + # Test embed_aggregator forward + target_x = torch.randn(2, 16) + ref_x = torch.randn(4, 16) + agg_x = self.model(target_x, ref_x) + assert agg_x.shape == target_x.shape diff --git a/tests/test_models/test_backbones/test_sot_resnet.py b/tests/test_models/test_backbones/test_sot_resnet.py new file mode 100644 index 000000000..9bb2afd7a --- /dev/null +++ b/tests/test_models/test_backbones/test_sot_resnet.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
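# A small usage sketch of the shape contracts exercised by the aggregator
# tests above; the shapes simply mirror the test inputs, and nothing beyond
# that is assumed about the modules.
import torch
from mmtrack.models import EmbedAggregator
from mmtrack.models.aggregators import SelsaAggregator

embed_agg = EmbedAggregator(num_convs=3, channels=32, kernel_size=3)
out = embed_agg(torch.randn(1, 32, 224, 224),   # single target frame
                torch.randn(4, 32, 224, 224))   # reference frames
assert out.shape == (1, 32, 224, 224)           # output keeps the target shape

selsa_agg = SelsaAggregator(in_channels=16, num_attention_blocks=4)
out = selsa_agg(torch.randn(2, 16), torch.randn(4, 16))
assert out.shape == (2, 16)                     # per-proposal features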
+from unittest import TestCase + +import pytest +import torch +import torch.nn as nn + +from mmtrack.models import SOTResNet +from mmtrack.models.backbones.sot_resnet import SOTBottleneck, SOTResLayer + + +class TestSOTRestNet(TestCase): + + @classmethod + def setUpClass(cls): + with pytest.raises(AssertionError): + # ResNet depth should be 50 + SOTResNet(20) + cls.model = SOTResNet( + depth=50, + out_indices=(1, 2, 3), + frozen_stages=3, + strides=(1, 2, 1, 1), + dilations=(1, 1, 2, 4), + norm_eval=True, + base_channels=1) + cls.model.train() + for num in range(1, 4): + layer = getattr(cls.model, f'layer{num}') + for param in layer.parameters(): + assert not param.requires_grad + for m in layer.modules(): + if isinstance(m, nn.BatchNorm2d): + assert not m.training + + for param in cls.model.layer4.parameters(): + assert param.requires_grad + for m in cls.model.layer4.modules(): + if isinstance(m, nn.BatchNorm2d): + assert not m.training + + def test_forward(self): + imgs = torch.randn(1, 3, 32, 32) + feat = self.model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size([1, 8, 3, 3]) + assert feat[1].shape == torch.Size([1, 16, 3, 3]) + assert feat[2].shape == torch.Size([1, 32, 3, 3]) + + +class TestSOTBottleneck(TestCase): + + @classmethod + def setUpClass(cls): + downsample = nn.Sequential( + nn.Conv2d(4, 4, 3, stride=2, padding=1), nn.BatchNorm2d(4)) + cls.model = SOTBottleneck( + 4, 1, stride=2, dilation=2, downsample=downsample) + + def test_forward(self): + x = torch.randn(1, 4, 8, 8) + feats = self.model(x) + assert feats.shape == torch.Size([1, 4, 4, 4]) + + +class TestSOTResLayer(TestCase): + + @classmethod + def setUpClass(cls): + cls.model = SOTResLayer(SOTBottleneck, 4, 8, 2) + cls.model.train() + + def test_forward(self): + x = torch.randn(1, 4, 8, 8) + feats = self.model(x) + assert feats.shape == torch.Size([1, 32, 8, 8]) diff --git a/tests/test_models/test_backones/test_sot_resnet.py b/tests/test_models/test_backones/test_sot_resnet.py deleted file mode 100644 index 3264e4722..000000000 --- a/tests/test_models/test_backones/test_sot_resnet.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import pytest -import torch - -from mmtrack.models.backbones import SOTResNet - - -def test_sot_resnet_backbone(): - """Test sot resnet backbone.""" - with pytest.raises(AssertionError): - # ResNet depth should be 50 - SOTResNet(20) - - # Test SOTResNet50 with layers 2, 3, 4 out forward - cfg = dict( - depth=50, - out_indices=(1, 2, 3), - frozen_stages=4, - strides=(1, 2, 1, 1), - dilations=(1, 1, 2, 4), - norm_eval=True) - model = SOTResNet(**cfg) - model.init_weights() - model.train() - - imgs = torch.randn(1, 3, 127, 127) - feat = model(imgs) - assert len(feat) == 3 - assert feat[0].shape == torch.Size([1, 512, 15, 15]) - assert feat[1].shape == torch.Size([1, 1024, 15, 15]) - assert feat[2].shape == torch.Size([1, 2048, 15, 15]) - - imgs = torch.randn(1, 3, 255, 255) - feat = model(imgs) - assert len(feat) == 3 - assert feat[0].shape == torch.Size([1, 512, 31, 31]) - assert feat[1].shape == torch.Size([1, 1024, 31, 31]) - assert feat[2].shape == torch.Size([1, 2048, 31, 31]) diff --git a/tests/test_models/test_data_preprocessors/test_data_preprocessor.py b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py new file mode 100644 index 000000000..4794d2f6f --- /dev/null +++ b/tests/test_models/test_data_preprocessors/test_data_preprocessor.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
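# A short helper sketch for the frozen-stage checks performed inline in
# TestSOTRestNet.setUpClass above; the helper name is illustrative and the
# logic mirrors those assertions (frozen layers have requires_grad disabled
# and their BatchNorm layers stay in eval mode even after model.train()).
import torch.nn as nn

def assert_stage_frozen(layer: nn.Module) -> None:
    for param in layer.parameters():
        assert not param.requires_grad
    for m in layer.modules():
        if isinstance(m, nn.BatchNorm2d):
            assert not m.training

# e.g. with frozen_stages=3, after model.train():
#   for num in range(1, 4):
#       assert_stage_frozen(getattr(model, f'layer{num}'))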
+from unittest import TestCase + +from mmtrack.models.data_preprocessors import TrackDataPreprocessor +from mmtrack.testing import demo_mm_inputs + + +class TestTrackDataPreprocessor(TestCase): + + def test_init(self): + # test mean is None + processor = TrackDataPreprocessor() + self.assertTrue(not hasattr(processor, 'mean')) + self.assertTrue(processor._enable_normalize is False) + + # test mean is not None + processor = TrackDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1]) + self.assertTrue(hasattr(processor, 'mean')) + self.assertTrue(hasattr(processor, 'std')) + self.assertTrue(processor._enable_normalize) + + # please specify both mean and std + with self.assertRaises(AssertionError): + TrackDataPreprocessor(mean=[0, 0, 0]) + + # bgr2rgb and rgb2bgr cannot be set to True at the same time + with self.assertRaises(AssertionError): + TrackDataPreprocessor(bgr_to_rgb=True, rgb_to_bgr=True) + + def test_forward(self): + processor = TrackDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1]) + + data = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_key_imgs=1, + ref_prefix='search', + image_shapes=[(3, 11, 10)], + num_items=[1]) + out_data = processor(data) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + for _, inputs_single_mode in inputs.items(): + self.assertEqual(inputs_single_mode.shape, (1, 1, 3, 11, 10)) + self.assertEqual(len(data_samples), 1) + + # test channel_conversion + processor = TrackDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True) + out_data = processor(data) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + for _, inputs_single_mode in inputs.items(): + self.assertEqual(inputs_single_mode.shape, (1, 1, 3, 11, 10)) + self.assertEqual(len(data_samples), 1) + + # test padding + data = demo_mm_inputs( + batch_size=2, + frame_id=0, + num_key_imgs=1, + ref_prefix='search', + image_shapes=[(3, 10, 11), (3, 9, 14)], + num_items=[1, 1]) + out_data = processor(data) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + for _, inputs_single_mode in inputs.items(): + self.assertEqual(inputs_single_mode.shape, (2, 1, 3, 10, 14)) + + # test pad_size_divisor + data = demo_mm_inputs( + batch_size=2, + frame_id=0, + num_key_imgs=1, + ref_prefix='search', + image_shapes=[(3, 10, 11), (3, 9, 24)], + num_items=[1, 1]) + processor = TrackDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], pad_size_divisor=5) + out_data = processor(data) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + for _, inputs_single_mode in inputs.items(): + self.assertEqual(inputs_single_mode.shape, (2, 1, 3, 10, 25)) + self.assertEqual(len(data_samples), 2) + for data_sample, expected_shape in zip(data_samples, [(10, 15), + (10, 25)]): + self.assertEqual(data_sample.pad_shape, expected_shape) + self.assertEqual(data_sample.search_pad_shape, expected_shape) + + # test pad_mask=True + data = demo_mm_inputs( + batch_size=2, + frame_id=0, + num_key_imgs=1, + ref_prefix='search', + image_shapes=[(3, 10, 11), (3, 9, 24)], + num_items=[1, 1], + with_mask=True) + processor = TrackDataPreprocessor(pad_mask=True, mask_pad_value=0) + mask_pad_sums = [ + x.gt_instances.masks.masks.sum() for x in data['data_samples'] + ] + out_data = processor(data) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + for data_sample, expected_shape, mask_pad_sum in zip( + data_samples, [(10, 24), (10, 24)], mask_pad_sums): + self.assertEqual(data_sample.gt_instances.masks.masks.shape[-2:], + expected_shape) + 
self.assertEqual(data_sample.gt_instances.masks.masks.sum(), + mask_pad_sum) + self.assertEqual( + data_sample.search_gt_instances.masks.masks.shape[-2:], + expected_shape) + self.assertEqual(data_sample.search_gt_instances.masks.masks.sum(), + mask_pad_sum) diff --git a/tests/test_models/test_filter/test_filter_head.py b/tests/test_models/test_filter/test_filter_head.py new file mode 100644 index 000000000..cc78e4cb9 --- /dev/null +++ b/tests/test_models/test_filter/test_filter_head.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmtrack.models import FilterInitializer + + +def test_filter_classifier_initializer(): + if not torch.cuda.is_available(): + return + classifier_initializer = FilterInitializer( + filter_size=4, feature_dim=8, feature_stride=16).to('cuda:0') + feats = torch.randn(4, 8, 22, 22, device='cuda:0') + bboxes = torch.randn(4, 4, device='cuda:0') * 100 + bboxes[:, 2:] += bboxes[:, :2] + filter = classifier_initializer(feats, bboxes) + assert filter.shape == torch.Size([1, 8, 4, 4]) diff --git a/tests/test_models/test_filter/test_filter_optimizer.py b/tests/test_models/test_filter/test_filter_optimizer.py new file mode 100644 index 000000000..178ee1fa2 --- /dev/null +++ b/tests/test_models/test_filter/test_filter_optimizer.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmtrack.models import PrDiMPFilterOptimizer + + +def test_prdimp_steepest_descent_newton(): + optimizer = PrDiMPFilterOptimizer( + num_iters=5, + feat_stride=16, + init_step_length=1.0, + init_filter_regular=0.05, + gauss_sigma=0.9, + alpha_eps=0.05, + min_filter_regular=0.05, + label_thres=0) + filter = torch.randn(4, 8, 4, 4) + feat = torch.randn(3, 4, 8, 22, 22) + bboxes = torch.randn(3, 4, 4) * 100 + new_filter, filter_iters, losses = optimizer(filter, feat, bboxes) + assert new_filter.shape == filter.shape + assert len(filter_iters) == len(losses) == 6 diff --git a/tests/test_models/test_forward/__init__.py b/tests/test_models/test_forward/__init__.py deleted file mode 100644 index 04900791e..000000000 --- a/tests/test_models/test_forward/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .utils import _demo_mm_inputs, _get_config_module - -__all__ = ['_demo_mm_inputs', '_get_config_module'] diff --git a/tests/test_models/test_forward/test_mot_forward.py b/tests/test_models/test_forward/test_mot_forward.py deleted file mode 100644 index cb300c0e9..000000000 --- a/tests/test_models/test_forward/test_mot_forward.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
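# A back-of-the-envelope sketch of the padded shapes asserted in
# TestTrackDataPreprocessor.test_forward above (it only reproduces the
# arithmetic, not the preprocessor itself): each image is padded so H and W
# become multiples of pad_size_divisor, then the batch is padded to the
# largest resulting per-image shape.
import math

def padded_shape(h, w, divisor):
    return (math.ceil(h / divisor) * divisor, math.ceil(w / divisor) * divisor)

per_image = [padded_shape(10, 11, 5), padded_shape(9, 24, 5)]
assert per_image == [(10, 15), (10, 25)]        # per-sample pad_shape
batch = (max(s[0] for s in per_image), max(s[1] for s in per_image))
assert batch == (10, 25)                        # batched tensor is (2, 1, 3, 10, 25)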
-import copy -from collections import defaultdict - -import pytest -import torch - -from mmtrack.datasets.pipelines.processing import MatchInstances -from .utils import _demo_mm_inputs, _get_config_module - - -@pytest.mark.parametrize( - 'cfg_file', - [ - 'mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py', - 'mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py' # noqa - ]) -def test_mot_forward_train(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - from mmtrack.models import build_model - qdtrack = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (1, 3, 256, 256) - mm_inputs = _demo_mm_inputs( - input_shape, num_items=[10], num_classes=2, with_track=True) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_instance_ids = mm_inputs['gt_instance_ids'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (1, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs( - ref_input_shape, num_items=[10], num_classes=2, with_track=True) - ref_img = ref_mm_inputs.pop('imgs') - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - ref_gt_instance_ids = ref_mm_inputs['gt_instance_ids'] - - match_tool = MatchInstances() - gt_match_indices, _ = match_tool._match_gts(gt_instance_ids[0], - ref_gt_instance_ids[0]) - gt_match_indices = [torch.tensor(gt_match_indices)] - - losses = qdtrack.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_match_indices=gt_match_indices, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = qdtrack._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward train with an empty truth batch - mm_inputs = _demo_mm_inputs( - input_shape, num_items=[0], num_classes=2, with_track=True) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_instance_ids = mm_inputs['gt_instance_ids'] - gt_masks = mm_inputs['gt_masks'] - - ref_mm_inputs = _demo_mm_inputs( - ref_input_shape, num_items=[0], num_classes=2, with_track=True) - ref_img = ref_mm_inputs.pop('imgs') - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - ref_gt_instance_ids = ref_mm_inputs['gt_instance_ids'] - - gt_match_indices, _ = match_tool._match_gts(gt_instance_ids[0], - ref_gt_instance_ids[0]) - gt_match_indices = [torch.tensor(gt_match_indices)] - - losses = qdtrack.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_match_indices=gt_match_indices, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = qdtrack._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - -@pytest.mark.parametrize( - 'cfg_file', - [ - 
'mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_mot17-private-half.py', - 'mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_crowdhuman_mot17-private-half.py', # noqa - 'mot/tracktor/tracktor_faster-rcnn_r50_fpn_4e_mot17-private.py', - 'mot/deepsort/deepsort_faster-rcnn_fpn_4e_mot17-private-half.py', - 'mot/bytetrack/bytetrack_yolox_x_crowdhuman_mot17-private-half.py' - ]) -def test_mot_simple_test(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - from mmtrack.models import build_model - mot = build_model(model) - mot.eval() - - input_shape = (1, 3, 256, 256) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[10], with_track=True) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - with torch.no_grad(): - imgs = torch.cat([imgs, imgs.clone()], dim=0) - img_list = [g[None, :] for g in imgs] - img2_metas = copy.deepcopy(img_metas) - img2_metas[0]['frame_id'] = 1 - img_metas.extend(img2_metas) - results = defaultdict(list) - for one_img, one_meta in zip(img_list, img_metas): - result = mot.forward([one_img], [[one_meta]], return_loss=False) - for k, v in result.items(): - results[k].append(v) diff --git a/tests/test_models/test_forward/test_sot_forward.py b/tests/test_models/test_forward/test_sot_forward.py deleted file mode 100644 index 139c532f8..000000000 --- a/tests/test_models/test_forward/test_sot_forward.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from collections import defaultdict - -import pytest -import torch - -from mmtrack.models import build_model -from .utils import _demo_mm_inputs, _get_config_module - - -@pytest.mark.parametrize('cfg_file', [ - 'sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py', - 'sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py' -]) -def test_siamrpn_forward(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - sot = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (1, 3, 127, 127) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[1]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - - search_input_shape = (1, 3, 255, 255) - search_mm_inputs = _demo_mm_inputs(search_input_shape, num_items=[1]) - search_img = search_mm_inputs.pop('imgs')[None] - search_img_metas = search_mm_inputs.pop('img_metas') - search_gt_bboxes = search_mm_inputs['gt_bboxes'] - img_inds = search_gt_bboxes[0].new_full((search_gt_bboxes[0].size(0), 1), - 0) - search_gt_bboxes[0] = torch.cat((img_inds, search_gt_bboxes[0]), dim=1) - - losses = sot.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - search_img=search_img, - search_img_metas=search_img_metas, - search_gt_bboxes=search_gt_bboxes, - is_positive_pairs=[True], - return_loss=True) - assert isinstance(losses, dict) - loss, _ = sot._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - losses = sot.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - search_img=search_img, - search_img_metas=search_img_metas, - search_gt_bboxes=search_gt_bboxes, - is_positive_pairs=[False], - return_loss=True) - assert isinstance(losses, dict) - loss, _ = sot._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - -def test_stark_forward(): - # test stage-1 forward - config = _get_config_module('sot/stark/stark_st1_r50_500e_got10k.py') - model = copy.deepcopy(config.model) - - 
from mmtrack.models import build_model - sot = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (2, 3, 128, 128) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[1, 1]) - imgs = mm_inputs.pop('imgs')[None] - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - padding_mask = torch.zeros((2, 128, 128), dtype=bool) - padding_mask[0, 100:128, 100:128] = 1 - padding_mask = padding_mask[None] - - search_input_shape = (1, 3, 320, 320) - search_mm_inputs = _demo_mm_inputs(search_input_shape, num_items=[1]) - search_img = search_mm_inputs.pop('imgs')[None] - search_img_metas = search_mm_inputs.pop('img_metas') - search_gt_bboxes = search_mm_inputs['gt_bboxes'] - search_padding_mask = torch.zeros((1, 320, 320), dtype=bool) - search_padding_mask[0, 0:20, 0:20] = 1 - search_padding_mask = search_padding_mask[None] - img_inds = search_gt_bboxes[0].new_full((search_gt_bboxes[0].size(0), 1), - 0) - search_gt_bboxes[0] = torch.cat((img_inds, search_gt_bboxes[0]), dim=1) - - losses = sot.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - padding_mask=padding_mask, - search_img=search_img, - search_img_metas=search_img_metas, - search_gt_bboxes=search_gt_bboxes, - search_padding_mask=search_padding_mask, - return_loss=True) - assert isinstance(losses, dict) - assert losses['loss_bbox'] > 0 - loss, _ = sot._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # test stage-2 forward - config = _get_config_module('sot/stark/stark_st2_r50_50e_got10k.py') - model = copy.deepcopy(config.model) - sot = build_model(model) - search_gt_labels = [torch.ones((1, 2))] - - losses = sot.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - padding_mask=padding_mask, - search_img=search_img, - search_img_metas=search_img_metas, - search_gt_bboxes=search_gt_bboxes, - search_padding_mask=search_padding_mask, - search_gt_labels=search_gt_labels, - return_loss=True) - assert isinstance(losses, dict) - assert losses['loss_cls'] > 0 - loss, _ = sot._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - -@pytest.mark.parametrize('cfg_file', [ - 'sot/siamese_rpn/siamese_rpn_r50_20e_lasot.py', - 'sot/siamese_rpn/siamese_rpn_r50_20e_vot2018.py', - 'sot/stark/stark_st2_r50_50e_got10k.py' -]) -def test_sot_test_forward(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - sot = build_model(model) - sot.eval() - - input_shape = (1, 3, 127, 127) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[1]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - - with torch.no_grad(): - imgs = torch.cat([imgs, imgs.clone()], dim=0) - img_list = [g[None, :] for g in imgs] - img_metas.extend(copy.deepcopy(img_metas)) - for i in range(len(img_metas)): - img_metas[i]['frame_id'] = i - gt_bboxes.extend(copy.deepcopy(gt_bboxes)) - results = defaultdict(list) - for one_img, one_meta, one_gt_bboxes in zip(img_list, img_metas, - gt_bboxes): - result = sot.forward([one_img], [[one_meta]], - gt_bboxes=[one_gt_bboxes], - return_loss=False) - for k, v in result.items(): - results[k].append(v) diff --git a/tests/test_models/test_forward/test_vid_forward.py b/tests/test_models/test_forward/test_vid_forward.py deleted file mode 100644 index 4e85e31a7..000000000 --- a/tests/test_models/test_forward/test_vid_forward.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 
OpenMMLab. All rights reserved. -import copy -from collections import defaultdict - -import pytest -import torch - -from .utils import _demo_mm_inputs, _get_config_module - - -@pytest.mark.parametrize( - 'cfg_file', ['vid/dff/dff_faster_rcnn_r101_dc5_1x_imagenetvid.py']) -def test_vid_dff_style_forward(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - from mmtrack.models import build_model - vid = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (1, 3, 256, 256) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[10]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - img_metas[0]['is_video_data'] = True - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (1, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[11]) - ref_img = ref_mm_inputs.pop('imgs')[None] - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_img_metas[0]['is_video_data'] = True - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - - losses = vid.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_masks=gt_masks, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vid._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward train with an empty truth batch - mm_inputs = _demo_mm_inputs(input_shape, num_items=[0]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - img_metas[0]['is_video_data'] = True - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (1, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[0]) - ref_img = ref_mm_inputs.pop('imgs')[None] - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_img_metas[0]['is_video_data'] = True - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - - losses = vid.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_masks=gt_masks, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vid._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward test - with torch.no_grad(): - imgs = torch.cat([imgs, imgs.clone()], dim=0) - img_list = [g[None, :] for g in imgs] - img_metas.extend(copy.deepcopy(img_metas)) - for i in range(len(img_metas)): - img_metas[i]['frame_id'] = i - results = defaultdict(list) - for one_img, one_meta in zip(img_list, img_metas): - result = vid.forward([one_img], [[one_meta]], return_loss=False) - for k, v in result.items(): - results[k].append(v) - - -@pytest.mark.parametrize('cfg_file', [ - 'vid/fgfa/fgfa_faster_rcnn_r101_dc5_1x_imagenetvid.py', - 'vid/selsa/selsa_faster_rcnn_r101_dc5_1x_imagenetvid.py', - 'vid/temporal_roi_align/' - 'selsa_troialign_faster_rcnn_r101_dc5_7e_imagenetvid.py', -]) -def test_vid_fgfa_style_forward(cfg_file): - config = 
_get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - from mmtrack.models import build_model - vid = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (1, 3, 256, 256) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[10]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - img_metas[0]['is_video_data'] = True - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (2, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[9, 11]) - ref_img = ref_mm_inputs.pop('imgs')[None] - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_img_metas[0]['is_video_data'] = True - ref_img_metas[1]['is_video_data'] = True - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - - losses = vid.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_img, - ref_img_metas=[ref_img_metas], - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_masks=gt_masks, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vid._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward train with an empty truth batch - mm_inputs = _demo_mm_inputs(input_shape, num_items=[0]) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - img_metas[0]['is_video_data'] = True - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_masks = mm_inputs['gt_masks'] - - ref_mm_inputs = _demo_mm_inputs(ref_input_shape, num_items=[0, 0]) - ref_imgs = ref_mm_inputs.pop('imgs')[None] - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_img_metas[0]['is_video_data'] = True - ref_img_metas[1]['is_video_data'] = True - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - - losses = vid.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_imgs, - ref_img_metas=[ref_img_metas], - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_masks=gt_masks, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vid._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward test with frame_stride=1 and frame_range=[-1,0] - with torch.no_grad(): - imgs = torch.cat([imgs, imgs.clone()], dim=0) - img_list = [g[None, :] for g in imgs] - img_metas.extend(copy.deepcopy(img_metas)) - for i in range(len(img_metas)): - img_metas[i]['frame_id'] = i - img_metas[i]['num_left_ref_imgs'] = 1 - img_metas[i]['frame_stride'] = 1 - ref_imgs = [ref_imgs.clone(), imgs[[0]][None].clone()] - ref_img_metas = [ - copy.deepcopy(ref_img_metas), - copy.deepcopy([img_metas[0]]) - ] - results = defaultdict(list) - for one_img, one_meta, ref_img, ref_img_meta in zip( - img_list, img_metas, ref_imgs, ref_img_metas): - result = vid.forward([one_img], [[one_meta]], - ref_img=[ref_img], - ref_img_metas=[[ref_img_meta]], - return_loss=False) - for k, v in result.items(): - results[k].append(v) diff --git a/tests/test_models/test_forward/test_vis_forward.py b/tests/test_models/test_forward/test_vis_forward.py deleted file mode 100644 index d36726951..000000000 --- 
a/tests/test_models/test_forward/test_vis_forward.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from collections import defaultdict - -import pytest -import torch - -from .utils import _demo_mm_inputs, _get_config_module - - -@pytest.mark.parametrize( - 'cfg_file', - ['vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019.py']) -def test_vis_forward(cfg_file): - config = _get_config_module(cfg_file) - model = copy.deepcopy(config.model) - - from mmtrack.models import build_model - vis = build_model(model) - - # Test forward train with a non-empty truth batch - input_shape = (1, 3, 256, 256) - mm_inputs = _demo_mm_inputs(input_shape, num_items=[10], with_track=True) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_instance_ids = mm_inputs['gt_instance_ids'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (1, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs( - ref_input_shape, num_items=[11], with_track=True) - ref_img = ref_mm_inputs.pop('imgs') - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - ref_gt_instance_ids = ref_mm_inputs['gt_instance_ids'] - - losses = vis.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_instance_ids=gt_instance_ids, - gt_masks=gt_masks, - ref_gt_instance_ids=ref_gt_instance_ids, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vis._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward train with an empty truth batch - mm_inputs = _demo_mm_inputs(input_shape, num_items=[0], with_track=True) - imgs = mm_inputs.pop('imgs') - img_metas = mm_inputs.pop('img_metas') - gt_bboxes = mm_inputs['gt_bboxes'] - gt_labels = mm_inputs['gt_labels'] - gt_instance_ids = mm_inputs['gt_instance_ids'] - gt_masks = mm_inputs['gt_masks'] - - ref_input_shape = (1, 3, 256, 256) - ref_mm_inputs = _demo_mm_inputs( - ref_input_shape, num_items=[0], with_track=True) - ref_img = ref_mm_inputs.pop('imgs') - ref_img_metas = ref_mm_inputs.pop('img_metas') - ref_gt_bboxes = ref_mm_inputs['gt_bboxes'] - ref_gt_labels = ref_mm_inputs['gt_labels'] - ref_gt_masks = ref_mm_inputs['gt_masks'] - ref_gt_instance_ids = ref_mm_inputs['gt_instance_ids'] - - losses = vis.forward( - img=imgs, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - ref_img=ref_img, - ref_img_metas=ref_img_metas, - ref_gt_bboxes=ref_gt_bboxes, - ref_gt_labels=ref_gt_labels, - gt_instance_ids=gt_instance_ids, - gt_masks=gt_masks, - ref_gt_instance_ids=ref_gt_instance_ids, - ref_gt_masks=ref_gt_masks, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = vis._parse_losses(losses) - loss.requires_grad_(True) - assert float(loss.item()) > 0 - loss.backward() - - # Test forward test - with torch.no_grad(): - imgs = torch.cat([imgs, imgs.clone()], dim=0) - img_list = [g[None, :] for g in imgs] - img2_metas = copy.deepcopy(img_metas) - img2_metas[0]['frame_id'] = 1 - img_metas.extend(img2_metas) - results = defaultdict(list) - for one_img, one_meta in zip(img_list, img_metas): - result = vis.forward([one_img], [[one_meta]], - rescale=True, - 
return_loss=False) - for k, v in result.items(): - results[k].append(v) diff --git a/tests/test_models/test_forward/utils.py b/tests/test_models/test_forward/utils.py deleted file mode 100644 index d30e07a89..000000000 --- a/tests/test_models/test_forward/utils.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from os.path import dirname, exists, join - -import numpy as np -import torch - - -def _get_config_directory(): - """Find the predefined video detector or tracker config directory.""" - try: - # Assume we are running in the source mmtracking repo - repo_dpath = dirname(dirname(dirname(dirname(__file__)))) - except NameError: - # For IPython development when this __file__ is not defined - import mmtrack - repo_dpath = dirname(dirname(dirname(mmtrack.__file__))) - config_dpath = join(repo_dpath, 'configs') - if not exists(config_dpath): - raise Exception('Cannot find config path') - return config_dpath - - -def _get_config_module(fname): - """Load a configuration as a python module.""" - from mmcv import Config - config_dpath = _get_config_directory() - config_fpath = join(config_dpath, fname) - config_mod = Config.fromfile(config_fpath) - return config_mod - - -def _get_model_cfg(fname): - """Grab configs necessary to create a video detector or tracker. - - These are deep copied to allow for safe modification of parameters without - influencing other tests. - """ - import mmcv - config = _get_config_module(fname) - model = copy.deepcopy(config.model) - train_cfg = mmcv.Config(copy.deepcopy(config.train_cfg)) - test_cfg = mmcv.Config(copy.deepcopy(config.test_cfg)) - return model, train_cfg, test_cfg - - -def _demo_mm_inputs( - input_shape=(1, 3, 300, 300), - num_items=None, - num_classes=10, - with_track=False): - """Create a superset of inputs needed to run test or train batches. 
- - Args: - input_shape (tuple): - input batch dimensions - - num_items (None | List[int]): - specifies the number of boxes in each batch item - - num_classes (int): - number of different labels a box might have - """ - from mmdet.core import BitmapMasks - - (N, C, H, W) = input_shape - - rng = np.random.RandomState(0) - - imgs = rng.rand(*input_shape) - - img_metas = [{ - 'img_shape': (H, W, C), - 'ori_shape': (H, W, C), - 'pad_shape': (H, W, C), - 'filename': '.png', - 'scale_factor': 1.0, - 'flip': False, - 'frame_id': 0, - 'img_norm_cfg': { - 'mean': (128.0, 128.0, 128.0), - 'std': (10.0, 10.0, 10.0) - } - } for i in range(N)] - - gt_bboxes = [] - gt_labels = [] - gt_masks = [] - gt_instance_ids = [] - - for batch_idx in range(N): - if num_items is None: - num_boxes = rng.randint(1, 10) - else: - num_boxes = num_items[batch_idx] - - cx, cy, bw, bh = rng.rand(num_boxes, 4).T - - tl_x = ((cx * W) - (W * bw / 2)).clip(0, W) - tl_y = ((cy * H) - (H * bh / 2)).clip(0, H) - br_x = ((cx * W) + (W * bw / 2)).clip(0, W) - br_y = ((cy * H) + (H * bh / 2)).clip(0, H) - - boxes = np.vstack([tl_x, tl_y, br_x, br_y]).T - class_idxs = rng.randint(1, num_classes, size=num_boxes) - - gt_bboxes.append(torch.FloatTensor(boxes)) - gt_labels.append(torch.LongTensor(class_idxs)) - if with_track: - gt_instance_ids.append(torch.arange(boxes.shape[0])) - - mask = np.random.randint(0, 2, (len(boxes), H, W), dtype=np.uint8) - gt_masks.append(BitmapMasks(mask, H, W)) - - mm_inputs = { - 'imgs': torch.FloatTensor(imgs).requires_grad_(True), - 'img_metas': img_metas, - 'gt_bboxes': gt_bboxes, - 'gt_labels': gt_labels, - 'gt_bboxes_ignore': None, - 'gt_masks': gt_masks, - } - if with_track: - mm_inputs['gt_instance_ids'] = gt_instance_ids - return mm_inputs diff --git a/tests/test_models/test_layers/test_position_encoding.py b/tests/test_models/test_layers/test_position_encoding.py new file mode 100644 index 000000000..551f3f3bf --- /dev/null +++ b/tests/test_models/test_layers/test_position_encoding.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch + +from mmtrack.models.layers import SinePositionalEncoding3D + + +class TestSinePositionalEncoding3D(TestCase): + + def test_sine_positional_encoding_3(self, num_feats=16, batch_size=2): + # test invalid type of scale + with pytest.raises(AssertionError): + module = SinePositionalEncoding3D( + num_feats, scale=(3., ), normalize=True) + + module = SinePositionalEncoding3D(num_feats) + t, h, w = 2, 10, 6 + mask = (torch.rand(batch_size, t, h, w) > 0.5).to(torch.int) + assert not module.normalize + out = module(mask) + assert out.shape == (batch_size, t, num_feats * 2, h, w) + + # set normalize + module = SinePositionalEncoding3D(num_feats, normalize=True) + assert module.normalize + out = module(mask) + assert out.shape == (batch_size, t, num_feats * 2, h, w) diff --git a/tests/test_models/test_losses/test_kl_loss.py b/tests/test_models/test_losses/test_kl_loss.py new file mode 100644 index 000000000..31132f134 --- /dev/null +++ b/tests/test_models/test_losses/test_kl_loss.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmtrack.models import KLGridLoss, KLMCLoss + + +class TestKLMCLoss(TestCase): + + def test_kl_mc_loss(self): + loss = KLMCLoss(eps=1e-9) + scores = torch.Tensor([[0.1, 0.3], [0.1, 0.3]]) + sample_density = torch.Tensor([[0.001, 0.001], [0.001, 0.001]]) + gt_density = torch.Tensor([[0.001, 0.001], [0.001, 0.001]]) + assert torch.allclose( + loss(scores, sample_density, gt_density), torch.tensor(6.9127)) + + +class TestKLGridLoss(TestCase): + + def test_kl_grid_loss(self): + loss = KLGridLoss() + scores = torch.Tensor([[0.1, 0.3], [0.1, 0.3]]) + gt_density = torch.Tensor([[0.001, 0.001], [0.001, 0.001]]) + assert torch.allclose( + loss(scores, gt_density), torch.tensor(0.89773887)) diff --git a/tests/test_models/test_losses/test_l2_loss.py b/tests/test_models/test_losses/test_l2_loss.py new file mode 100644 index 000000000..d84b09561 --- /dev/null +++ b/tests/test_models/test_losses/test_l2_loss.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmtrack.models import L2Loss + + +class TestL2Loss(TestCase): + + def test_l2_loss(self): + pred = torch.Tensor([[1, 1, 0, 0, 0, 0, 1]]) + target = torch.Tensor([[1, 1, 0, 0, 0, 0, 0]]) + + loss = L2Loss( + neg_pos_ub=2, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0) + assert torch.allclose(loss(pred, target), torch.tensor(0.1350)) diff --git a/tests/test_models/test_losses/test_multi_pos_cross_entropy_loss.py b/tests/test_models/test_losses/test_multi_pos_cross_entropy_loss.py index 854558271..996fb4039 100644 --- a/tests/test_models/test_losses/test_multi_pos_cross_entropy_loss.py +++ b/tests/test_models/test_losses/test_multi_pos_cross_entropy_loss.py @@ -1,16 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + import torch from mmtrack.models import MultiPosCrossEntropyLoss -def test_mpce_loss(): - costs = torch.tensor([[1, 0], [0, 1]]) - labels = torch.tensor([[1, 1], [0, 0]]) +class TestMultiPosCrossEntropyLoss(TestCase): + + def test_mpce_loss(self): + costs = torch.tensor([[1, 0], [0, 1]]) + labels = torch.tensor([[1, 1], [0, 0]]) - loss = MultiPosCrossEntropyLoss(reduction='mean', loss_weight=1.0) - assert torch.allclose(loss(costs, labels), torch.tensor(0.)) + loss = MultiPosCrossEntropyLoss(reduction='mean', loss_weight=1.0) + assert torch.allclose(loss(costs, labels), torch.tensor(0.)) - labels = torch.Tensor([[1, 0], [0, 1]]) - loss(costs, labels) - assert torch.allclose(loss(costs, labels), torch.tensor(0.31326)) + labels = torch.Tensor([[1, 0], [0, 1]]) + loss(costs, labels) + assert torch.allclose(loss(costs, labels), torch.tensor(0.31326)) diff --git a/tests/test_models/test_losses/test_triplet_loss.py b/tests/test_models/test_losses/test_triplet_loss.py index 73ec56499..0fd6c5f3b 100644 --- a/tests/test_models/test_losses/test_triplet_loss.py +++ b/tests/test_models/test_losses/test_triplet_loss.py @@ -1,15 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + import torch from mmtrack.models import TripletLoss -def test_triplet_loss(): - feature = torch.Tensor([[1, 1], [1, 1], [0, 0], [0, 0]]) - label = torch.Tensor([1, 1, 0, 0]) +class TestTripletLoss(TestCase): + + def test_triplet_loss(self): + feature = torch.Tensor([[1, 1], [1, 1], [0, 0], [0, 0]]) + label = torch.Tensor([1, 1, 0, 0]) - loss = TripletLoss(margin=0.3, loss_weight=1.0) - assert torch.allclose(loss(feature, label), torch.tensor(0.)) + loss = TripletLoss(margin=0.3, loss_weight=1.0) + assert torch.allclose(loss(feature, label), torch.tensor(0.)) - label = torch.Tensor([1, 0, 1, 0]) - assert torch.allclose(loss(feature, label), torch.tensor(1.7142)) + label = torch.Tensor([1, 0, 1, 0]) + assert torch.allclose(loss(feature, label), torch.tensor(1.7142)) diff --git a/tests/test_models/test_mot/test_byte_track.py b/tests/test_models/test_mot/test_byte_track.py new file mode 100644 index 000000000..df5e2e199 --- /dev/null +++ b/tests/test_models/test_mot/test_byte_track.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestByteTrack(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py', + ]) + def test_bytetrack_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model.detector.neck.out_channels = 1 + model.detector.neck.num_csp_blocks = 1 + model.detector.bbox_head.in_channels = 1 + model.detector.bbox_head.feat_channels = 1 + model = MODELS.build(model) + assert model.detector + assert model.motion + + @parameterized.expand([ + ('mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_bytetrack_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_bytetrack_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + _model.detector.neck.out_channels = 1 + _model.detector.neck.num_csp_blocks = 1 + _model.detector.bbox_head.in_channels = 1 + _model.detector.bbox_head.feat_channels = 1 + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0, num_classes=1) + out_data = model.data_preprocessor(packed_inputs, True) + # Test forward + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('mot/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_bytetrack_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_bytetrack_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 
'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + _model.detector.neck.out_channels = 1 + _model.detector.neck.num_csp_blocks = 1 + _model.detector.bbox_head.in_channels = 1 + _model.detector.bbox_head.feat_channels = 1 + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0, num_classes=1) + out_data = model.data_preprocessor(packed_inputs, False) + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_models/test_mot/test_deep_sort.py b/tests/test_models/test_mot/test_deep_sort.py new file mode 100644 index 000000000..3f800568f --- /dev/null +++ b/tests/test_models/test_mot/test_deep_sort.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestDeepSORT(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model = MODELS.build(model) + assert model.detector + assert model.reid + assert model.motion + assert model.tracker + + @parameterized.expand([ + ('mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_deepsort_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_deepsort_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=0, + image_shapes=[(3, 256, 256)], + num_classes=1) + out_data = model.data_preprocessor(packed_inputs, False) + + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_models/test_mot/test_qdtrack.py b/tests/test_models/test_mot/test_qdtrack.py new file mode 100644 index 000000000..9bed7c853 --- /dev/null +++ b/tests/test_models/test_mot/test_qdtrack.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import numpy as np +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestQDTrack(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_' + 'test-mot17halfval.py', + ]) + def test_qdtrack_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.detector + assert model.track_head + + @parameterized.expand([ + ('mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_qdtrack_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_qdtrack_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=1, + num_classes=1, + num_items=[2]) + out_data = model.data_preprocessor(packed_inputs, True) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + # Test forward + # add gt_match_indices + data_samples[0].gt_match_indices = np.array([0, 1]) + losses = model.forward(inputs, data_samples, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('mot/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_qdtrack_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_bytetrack_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0, num_classes=1) + out_data = model.data_preprocessor(packed_inputs, False) + + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_models/test_mot/test_strong_sort.py b/tests/test_models/test_mot/test_strong_sort.py new file mode 100644 index 000000000..33b16d28c --- /dev/null +++ b/tests/test_models/test_mot/test_strong_sort.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestStrongSORT(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'mot/strongsort/strongsort_yolox_x_8xb4-80e' + '_crowdhuman-mot17halftrain_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model.detector.neck.out_channels = 1 + model.detector.neck.num_csp_blocks = 1 + model.detector.bbox_head.in_channels = 1 + model.detector.bbox_head.feat_channels = 1 + model.reid.backbone.depth = 18 + model.reid.head.fc_channels = 1 + model.reid.head.out_channels = 1 + model.reid.head.num_classes = 2 + model = MODELS.build(model) + assert model.detector + assert model.reid + assert model.kalman + assert model.cmc + assert model.tracker + + @parameterized.expand([ + ('mot/strongsort/strongsort_yolox_x_8xb4-80e' + '_crowdhuman-mot17halftrain_test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_strongsort_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_strongsort_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + _model.detector.neck.out_channels = 1 + _model.detector.neck.num_csp_blocks = 1 + _model.detector.bbox_head.in_channels = 1 + _model.detector.bbox_head.feat_channels = 1 + _model.reid.backbone.depth = 18 + _model.reid.head.in_channels = 512 + _model.reid.head.fc_channels = 1 + _model.reid.head.out_channels = 1 + _model.reid.head.num_classes = 2 + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=0, + image_shapes=[(3, 256, 256)], + num_classes=1) + out_data = model.data_preprocessor(packed_inputs, False) + + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_models/test_mot/test_tracktor.py b/tests/test_models/test_mot/test_tracktor.py new file mode 100644 index 000000000..05b43cf0c --- /dev/null +++ b/tests/test_models/test_mot/test_tracktor.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestTracktor(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model = MODELS.build(model) + assert model.detector + assert model.reid + assert model.motion + assert model.tracker + + @parameterized.expand([ + ('mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py', ('cpu', 'cuda')), + ]) + def test_deepsort_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_deepsort_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=0, + image_shapes=[(3, 256, 256)], + num_classes=1) + out_data = model.data_preprocessor(packed_inputs, False) + + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_models/test_motion/test_camera_motion_compensation.py b/tests/test_models/test_motion/test_camera_motion_compensation.py deleted file mode 100644 index d4a98e3b6..000000000 --- a/tests/test_models/test_motion/test_camera_motion_compensation.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmdet.core.bbox.demodata import random_boxes - -from mmtrack.models.motion import CameraMotionCompensation - - -def test_cmc(): - cmc = CameraMotionCompensation() - img = np.random.randn(256, 256, 3).astype(np.float32) - ref_img = img - - warp_matrix = cmc.get_warp_matrix(img, ref_img) - assert isinstance(warp_matrix, torch.Tensor) - - bboxes = random_boxes(5, 256) - trans_bboxes = cmc.warp_bboxes(bboxes, warp_matrix) - assert (bboxes == trans_bboxes).all() diff --git a/tests/test_models/test_motion/test_flownet_simple.py b/tests/test_models/test_motion/test_flownet_simple.py index 6ee2e5193..102f27e73 100644 --- a/tests/test_models/test_motion/test_flownet_simple.py +++ b/tests/test_models/test_motion/test_flownet_simple.py @@ -1,21 +1,25 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + import torch from mmtrack.models.motion import FlowNetSimple -def test_flownet_simple(): - # Test flownet_simple forward - model = FlowNetSimple(img_scale_factor=0.5) - model.init_weights() - model.train() - - imgs = torch.randn(2, 6, 224, 224) - img_metas = [ - dict( - img_norm_cfg=dict( - mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375)), - img_shape=(224, 224, 3)) - ] - flow = model(imgs, img_metas) - assert flow.shape == torch.Size([2, 2, 224, 224]) +class TestFlowNetSimple(TestCase): + + @classmethod + def setUpClass(cls): + + cls.flownet = FlowNetSimple(img_scale_factor=0.5) + cls.flownet.init_weights() + cls.flownet.train() + + def test_forward(self): + imgs = torch.randn(2, 6, 112, 224) + metainfo = dict(img_shape=(112, 224, 3)) + preprocess_cfg = dict( + mean=(123.675, 116.28, 103.53), std=(58.395, 57.12, 57.375)) + + flow = self.flownet(imgs, metainfo, preprocess_cfg) + assert flow.shape == torch.Size([2, 2, 112, 224]) diff --git a/tests/test_models/test_motion/test_linear_motion.py b/tests/test_models/test_motion/test_linear_motion.py deleted file mode 100644 index f2bb602c1..000000000 --- a/tests/test_models/test_motion/test_linear_motion.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmtrack.models.motion import LinearMotion - - -def test_linear_motion(): - linear_motion = LinearMotion(num_samples=2, center_motion=False) - bboxes = [[1, 1, 1, 1], [3, 3, 3, 3], [6, 6, 6, 6]] - bboxes = [torch.tensor(_, dtype=torch.float32) for _ in bboxes] - bbox = linear_motion.step(bboxes) - assert (bbox == torch.tensor([9., 9., 9., 9.])).all() - - linear_motion = LinearMotion(num_samples=3, center_motion=False) - bboxes = [[1, 1, 1, 1], [3, 3, 3, 3], [6, 6, 6, 6]] - bboxes = [torch.tensor(_, dtype=torch.float32) for _ in bboxes] - bbox = linear_motion.step(bboxes) - assert (bbox == torch.tensor([8.5, 8.5, 8.5, 8.5])).all() - - linear_motion = LinearMotion(num_samples=4, center_motion=False) - bboxes = [[1, 1, 1, 1], [3, 3, 3, 3], [6, 6, 6, 6]] - bboxes = [torch.tensor(_, dtype=torch.float32) for _ in bboxes] - bbox = linear_motion.step(bboxes) - assert (bbox == torch.tensor([8.5, 8.5, 8.5, 8.5])).all() - - linear_motion = LinearMotion(num_samples=4, center_motion=True) - bboxes = [[1, 1, 1, 1], [3, 3, 3, 3], [6, 6, 6, 6]] - bboxes = [torch.tensor(_, dtype=torch.float32) for _ in bboxes] - bbox = linear_motion.step(bboxes) - assert (bbox == torch.tensor([8.5, 8.5, 8.5, 8.5])).all() diff --git a/tests/test_models/test_reid/test_base_reid.py b/tests/test_models/test_reid/test_base_reid.py index fffe94500..a0ab8c0fe 100644 --- a/tests/test_models/test_reid/test_base_reid.py +++ b/tests/test_models/test_reid/test_base_reid.py @@ -1,58 +1,46 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import pytest +from unittest import TestCase + import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import ReIDDataSample +from mmtrack.testing import get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestBaseReID(TestCase): + + @classmethod + def setUpClass(cls) -> None: + register_all_modules() + + @parameterized.expand([ + 'reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py', + ]) + def test_forward(self, cfg_file): + model_cfg = get_model_cfg(cfg_file) + model = MODELS.build(model_cfg) + inputs = torch.rand(1, 4, 3, 256, 128) + data_samples = [ + ReIDDataSample().set_gt_label(label) for label in (0, 0, 1, 1) + ] + + # test mode='tensor' + feats = model(inputs, mode='tensor') + assert feats.shape == (4, 128) + + # test mode='loss' + losses = model(inputs, data_samples, mode='loss') + assert losses.keys() == {'triplet_loss', 'ce_loss', 'accuracy_top-1'} + assert losses['ce_loss'].item() > 0 + assert losses['triplet_loss'].item() > 0 -from mmtrack.models import REID - - -@pytest.mark.parametrize('model_type', ['BaseReID']) -def test_base_reid(model_type): - model_class = REID.get(model_type) - backbone = dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(3, ), - style='pytorch') - neck = dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1) - head = dict( - type='LinearReIDHead', - num_fcs=1, - in_channels=2048, - fc_channels=1024, - out_channels=128, - num_classes=378, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - loss_pairwise=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU')) - model = model_class(backbone=backbone, neck=neck, head=head) - model.train() - x = torch.randn(32, 3, 256, 128) - label = torch.randperm(32) - outputs = model.forward_train(x, label) - assert isinstance(outputs, dict) - assert len(outputs) == 3 - assert 'triplet_loss' in outputs - assert 'ce_loss' in outputs - assert 'accuracy' in outputs - model.eval() - x = torch.randn(1, 3, 256, 128) - outputs = model.simple_test(x) - assert outputs.shape == (1, 128) - - head['num_classes'] = None - # when loss_pairwise is set, num_classes must be a current number - with pytest.raises(TypeError): - model = model_class(backbone=backbone, neck=neck, head=head) - - head['num_classes'] = 378 - head['loss'] = None - # when loss_pairwise is set, num_classes will be ignored. - with pytest.warns(UserWarning): - model = model_class(backbone=backbone, neck=neck, head=head) - - head['loss_pairwise'] = None - # two losses cannot be none at the same time - with pytest.raises(ValueError): - model = model_class(backbone=backbone, neck=neck, head=head) + # test mode='predict' + predictions = model(inputs, data_samples, mode='predict') + for pred in predictions: + assert isinstance(pred, ReIDDataSample) + assert isinstance(pred.pred_feature, torch.Tensor) + assert isinstance(pred.gt_label.label, torch.Tensor) + assert pred.pred_feature.shape == (128, ) diff --git a/tests/test_models/test_reid/test_fc_module.py b/tests/test_models/test_reid/test_fc_module.py new file mode 100644 index 000000000..8c566ee42 --- /dev/null +++ b/tests/test_models/test_reid/test_fc_module.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmtrack.models import FcModule + + +class TestFcModule(TestCase): + + def test_forward(self): + inputs = torch.rand(32, 128) + + # test + fc = FcModule( + in_channels=128, + out_channels=32, + ) + fc.init_weights() + outputs = fc(inputs) + assert outputs.shape == (32, 32) + + # test with norm + fc = FcModule( + in_channels=128, + out_channels=32, + norm_cfg=dict(type='BN1d'), + ) + outputs = fc(inputs) + assert outputs.shape == (32, 32) + + # test with norm and act + fc = FcModule( + in_channels=128, + out_channels=32, + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + ) + outputs = fc(inputs) + assert outputs.shape == (32, 32) diff --git a/tests/test_models/test_reid/test_gap.py b/tests/test_models/test_reid/test_gap.py new file mode 100644 index 000000000..266befe3f --- /dev/null +++ b/tests/test_models/test_reid/test_gap.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmtrack.models import GlobalAveragePooling + + +class TestGlobalAveragePooling(TestCase): + + def test_forward(self): + inputs = torch.rand(32, 128, 14, 14) + + # test AdaptiveAvgPool2d + neck = GlobalAveragePooling() + outputs = neck(inputs) + assert outputs.shape == (32, 128) + + # test kernel_size + neck = GlobalAveragePooling(kernel_size=7) + outputs = neck(inputs) + assert outputs.shape == (32, 128 * 2 * 2) + + # test kernel_size and stride + neck = GlobalAveragePooling(kernel_size=7, stride=2) + outputs = neck(inputs) + assert outputs.shape == (32, 128 * 4 * 4) diff --git a/tests/test_models/test_reid/test_linear_reid_head.py b/tests/test_models/test_reid/test_linear_reid_head.py new file mode 100644 index 000000000..687f06f25 --- /dev/null +++ b/tests/test_models/test_reid/test_linear_reid_head.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase + +import torch + +from mmtrack.registry import MODELS +from mmtrack.structures import ReIDDataSample +from mmtrack.utils import register_all_modules + + +class TestLinearReIDHead(TestCase): + + @classmethod + def setUpClass(cls) -> None: + register_all_modules() + head_cfg = dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=128, + fc_channels=64, + out_channels=32, + num_classes=2, + loss_cls=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')) + cls.head = MODELS.build(head_cfg) + cls.inputs = (torch.rand(4, 128), torch.rand(4, 128)) + cls.data_samples = [ + ReIDDataSample().set_gt_label(label) for label in (0, 0, 1, 1) + ] + + def test_forward(self): + outputs = self.head(self.inputs) + assert outputs.shape == (4, 32) + + def test_loss(self): + losses = self.head.loss(self.inputs, self.data_samples) + assert losses.keys() == {'triplet_loss', 'ce_loss', 'accuracy_top-1'} + assert losses['ce_loss'].item() >= 0 + assert losses['triplet_loss'].item() >= 0 + + def test_predict(self): + predictions = self.head.predict(self.inputs, self.data_samples) + for pred in predictions: + assert isinstance(pred, ReIDDataSample) + assert isinstance(pred.pred_feature, torch.Tensor) + assert isinstance(pred.gt_label.label, torch.Tensor) + assert pred.pred_feature.shape == (32, ) diff --git a/tests/test_models/test_roi_heads/test_bbox_heads/test_selsa_bbox_head.py b/tests/test_models/test_roi_heads/test_bbox_heads/test_selsa_bbox_head.py index 601b1d37b..0daddf543 100644 --- a/tests/test_models/test_roi_heads/test_bbox_heads/test_selsa_bbox_head.py +++ b/tests/test_models/test_roi_heads/test_bbox_heads/test_selsa_bbox_head.py @@ -1,107 +1,41 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import mmcv +from unittest import TestCase + import torch -from mmdet.core import bbox2roi, build_assigner, build_sampler from mmtrack.models.roi_heads.bbox_heads import SelsaBBoxHead -def test_selsa_bbox_head_loss(): - """Tests selsa_bbox_head loss when truth is empty and non-empty.""" - selsa_bbox_head_config = dict( - num_shared_fcs=2, - in_channels=8, - fc_out_channels=16, - roi_feat_size=3, - aggregator=dict( - type='SelsaAggregator', in_channels=16, num_attention_blocks=4)) - self = SelsaBBoxHead(**selsa_bbox_head_config) - - # Dummy proposals - proposal_list = [ - torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]), - ] - - target_cfg = mmcv.Config(dict(pos_weight=1)) - - # Test bbox loss when truth is empty - gt_bboxes = [torch.empty((0, 4))] - gt_labels = [torch.LongTensor([])] - - sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes, - gt_labels) - - bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels, - target_cfg) - labels, label_weights, bbox_targets, bbox_weights = bbox_targets - - # Create dummy features "extracted" for each sampled bbox - num_sampled = sum(len(res.bboxes) for res in sampling_results) - rois = bbox2roi([res.bboxes for res in sampling_results]) - dummy_feats = torch.rand(num_sampled, 8, 3, 3) - ref_dummy_feats = torch.rand(2 * num_sampled, 8, 3, 3) - cls_scores, bbox_preds = self.forward(dummy_feats, ref_dummy_feats) - - losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights, - bbox_targets, bbox_weights) - assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero' - assert losses.get('loss_bbox', 0) == 0, 'empty gt loss should be zero' - - # Test bbox loss when truth is non-empty - gt_bboxes = [ - torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), - ] - gt_labels = [torch.LongTensor([2])] - - sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes, - gt_labels) - rois = bbox2roi([res.bboxes for res in sampling_results]) - - bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels, - target_cfg) - labels, label_weights, bbox_targets, bbox_weights = bbox_targets - - # Create dummy features "extracted" for each sampled bbox - num_sampled = sum(len(res.bboxes) for res in sampling_results) - dummy_feats = torch.rand(num_sampled, 8, 3, 3) - ref_dummy_feats = torch.rand(2 * num_sampled, 8, 3, 3) - cls_scores, bbox_preds = self.forward(dummy_feats, ref_dummy_feats) - - losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights, - bbox_targets, bbox_weights) - assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero' - assert losses.get('loss_bbox', 0) > 0, 'box-loss should be non-zero' - - -def _dummy_bbox_sampling(proposal_list, gt_bboxes, gt_labels): - """Create sample results that can be passed to BBoxHead.get_targets.""" - num_imgs = 1 - feat = torch.rand(1, 1, 3, 3) - assign_config = dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0.5, - ignore_iof_thr=-1) - sampler_config = dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True) - bbox_assigner = build_assigner(assign_config) - bbox_sampler = build_sampler(sampler_config) - gt_bboxes_ignore = [None for _ in range(num_imgs)] - sampling_results = [] - for i in range(num_imgs): - assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], - gt_bboxes_ignore[i], gt_labels[i]) - sampling_result = bbox_sampler.sample( - assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], - feats=feat) - 
sampling_results.append(sampling_result) - - return sampling_results +class TestSelsaBBoxHead(TestCase): + + @classmethod + def setUpClass(cls): + selsa_bbox_head_config = dict( + num_shared_fcs=2, + in_channels=2, + fc_out_channels=4, + roi_feat_size=3, + num_classes=10, + aggregator=dict( + type='SelsaAggregator', in_channels=4, num_attention_blocks=4), + bbox_coder=dict( + type='mmdet.DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.2, 0.2, 0.2, 0.2]), + reg_class_agnostic=False, + reg_predictor_cfg=dict(type='mmdet.Linear'), + cls_predictor_cfg=dict(type='mmdet.Linear'), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.SmoothL1Loss', beta=1.0, loss_weight=1.0)) + cls.model = SelsaBBoxHead(**selsa_bbox_head_config) + + def test_forward(self): + x = torch.randn(2, 2, 3, 3) + ref_x = torch.randn(3, 2, 3, 3) + cls_scores, bbox_preds = self.model.forward(x, ref_x) + assert cls_scores.shape == (2, 11) + assert bbox_preds.shape == (2, 40) diff --git a/tests/test_models/test_roi_heads/test_roi_extractors/test_single_level_roi_extractor.py b/tests/test_models/test_roi_heads/test_roi_extractors/test_single_level_roi_extractor.py deleted file mode 100644 index 10ef8db52..000000000 --- a/tests/test_models/test_roi_heads/test_roi_extractors/test_single_level_roi_extractor.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmtrack.models.roi_heads.roi_extractors import SingleRoIExtractor - - -def test_single_roi_extractor(): - """Tests single roi extractor.""" - single_roi_extractor_config = dict( - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), - out_channels=256, - featmap_strides=[4, 8, 16, 32]) - self = SingleRoIExtractor(**single_roi_extractor_config) - - feats = ( - torch.rand((1, 256, 200, 336)), - torch.rand((1, 256, 100, 168)), - torch.rand((1, 256, 50, 84)), - torch.rand((1, 256, 25, 42)), - ) - - rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]]) - # test allowing to accept external arguments by **kwargs - roi_feats = self(feats, rois, variable=1) - assert roi_feats.shape == torch.Size([1, 256, 7, 7]) diff --git a/tests/test_models/test_roi_heads/test_roi_extractors/test_temporal_roi_align.py b/tests/test_models/test_roi_heads/test_roi_extractors/test_temporal_roi_align.py index 52369582c..504476cae 100644 --- a/tests/test_models/test_roi_heads/test_roi_extractors/test_temporal_roi_align.py +++ b/tests/test_models/test_roi_heads/test_roi_extractors/test_temporal_roi_align.py @@ -1,27 +1,36 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + import torch -from mmtrack.models.roi_heads.roi_extractors import TemporalRoIAlign +from mmtrack.registry import MODELS +from mmtrack.utils import register_all_modules + +class TestTemporalRoIAlign(TestCase): -def test_temporal_roi_align(): - """Test Temporal RoI Align.""" - temporal_roi_align_config = dict( - num_most_similar_points=2, - num_temporal_attention_blocks=4, - roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), - out_channels=256, - featmap_strides=[16]) - self = TemporalRoIAlign(**temporal_roi_align_config) + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + temporal_roi_align_config = dict( + type='TemporalRoIAlign', + num_most_similar_points=2, + num_temporal_attention_blocks=4, + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[16]) + cls.roi_extractor = MODELS.build(temporal_roi_align_config) - feats = (torch.rand((1, 256, 50, 84)), ) - ref_feats = (feats[0].repeat((2, 1, 1, 1)), ) - rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]]) + def test_single_roi_extractor_forward(self): + """Tests single roi extractor.""" + feats = (torch.rand((1, 256, 50, 84)), ) + ref_feats = (feats[0].repeat((2, 1, 1, 1)), ) + rois = torch.tensor([[0.0000, 587.8285, 52.1405, 886.2484, 341.5644]]) - # test when ref_feats is not None - roi_feats = self(feats, rois, ref_feats=ref_feats) - assert roi_feats.shape == torch.Size([1, 256, 7, 7]) + # test when ref_feats is not None + roi_feats = self.roi_extractor(feats, rois, ref_feats=ref_feats) + assert roi_feats.shape == torch.Size([1, 256, 7, 7]) - # test when ref_feats is None - roi_feats = self(feats, rois, ref_feats=None) - assert roi_feats.shape == torch.Size([1, 256, 7, 7]) + # test when ref_feats is None + roi_feats = self.roi_extractor(feats, rois, ref_feats=None) + assert roi_feats.shape == torch.Size([1, 256, 7, 7]) diff --git a/tests/test_models/test_roi_heads/test_selsa_roi_head.py b/tests/test_models/test_roi_heads/test_selsa_roi_head.py new file mode 100644 index 000000000..fea63cc30 --- /dev/null +++ b/tests/test_models/test_roi_heads/test_selsa_roi_head.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import mmengine +import torch +from mmengine.structures import InstanceData + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs +from mmtrack.utils import register_all_modules + + +class TestSelsaRoIHead(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + selsa_bbox_roi_head_cfg = dict( + type='mmtrack.SelsaRoIHead', + _scope_='mmdet', + bbox_roi_extractor=dict( + type='mmtrack.SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=4, + featmap_strides=[16]), + bbox_head=dict( + type='mmtrack.SelsaBBoxHead', + num_shared_fcs=2, + in_channels=4, + fc_out_channels=4, + roi_feat_size=7, + num_classes=30, + aggregator=dict( + type='mmtrack.SelsaAggregator', + in_channels=4, + num_attention_blocks=2), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.2, 0.2, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + test_cfg=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) + cfg = mmengine.Config(selsa_bbox_roi_head_cfg) + cls.roi_head = MODELS.build(cfg) + assert cls.roi_head.with_bbox + + def _fake_inputs(self, img_size, proposal_len): + """Create a fake proposal list and feature maps.""" + img_metas = [{ + 'img_shape': (img_size, img_size), + 'scale_factor': 1, + }] + proposals_list = [] + for i in range(len(img_metas)): + result = InstanceData(metainfo=img_metas[i]) + proposal = torch.randn(proposal_len, 4).to(device='cuda') + result.bboxes = proposal + proposals_list.append(result) + + feats = [] + for i in range(len(self.roi_head.bbox_roi_extractor.featmap_strides)): + feats.append( + torch.rand(1, 4, img_size // (2**(i + 2)), + img_size // (2**(i + 2))).to(device='cuda')) + feats = tuple(feats) + + return proposals_list, feats + + def test_loss(self): + if not torch.cuda.is_available(): + # RoI pooling only support in GPU + return unittest.skip('test requires GPU and torch+cuda') + + self.roi_head = self.roi_head.cuda() + + proposal_list, feats = self._fake_inputs(256, 100) + ref_proposal_list, ref_feats = self._fake_inputs(256, 100) + + # When truth is non-empty then both cls, box + # should be nonzero for random inputs + packed_inputs = demo_mm_inputs( + batch_size=1, + image_shapes=[(3, 256, 256)], + frame_id=0, + num_items=[1], + num_ref_imgs=2) + batch_data_samples = [] + for data_sample in packed_inputs['data_samples']: + batch_data_samples.append(data_sample.to(device='cuda')) + out = self.roi_head.loss(feats, ref_feats, proposal_list, + ref_proposal_list, batch_data_samples) + loss_cls = out['loss_cls'] + loss_bbox = out['loss_bbox'] + self.assertGreater(loss_cls.sum(), 0, 'cls loss should be non-zero') + self.assertGreater(loss_bbox.sum(), 0, 'box loss should be non-zero') + + # When there is no truth, the cls loss should be nonzero but + # there should be no box and mask loss. 
+ proposal_list, feats = self._fake_inputs(256, 100) + ref_proposal_list, ref_feats = self._fake_inputs(256, 100) + packed_inputs = demo_mm_inputs( + batch_size=1, + image_shapes=[(3, 256, 256)], + num_items=[0], + frame_id=0, + num_ref_imgs=2) + batch_data_samples = [] + for data_sample in packed_inputs['data_samples']: + batch_data_samples.append(data_sample.to(device='cuda')) + out = self.roi_head.loss(feats, ref_feats, proposal_list, + ref_proposal_list, batch_data_samples) + empty_cls_loss = out['loss_cls'] + empty_bbox_loss = out['loss_bbox'] + self.assertGreater(empty_cls_loss.sum(), 0, + 'cls loss should be non-zero') + self.assertEqual( + empty_bbox_loss.sum(), 0, + 'there should be no box loss when there are no true boxes') + + def test_predict(self): + if not torch.cuda.is_available(): + # RoI pooling only support in GPU + return unittest.skip('test requires GPU and torch+cuda') + + self.roi_head = self.roi_head.cuda() + + proposal_list, feats = self._fake_inputs(256, 100) + ref_proposal_list, ref_feats = self._fake_inputs(256, 100) + packed_inputs = demo_mm_inputs( + batch_size=1, + image_shapes=[(3, 256, 256)], + frame_id=0, + num_items=[1], + num_ref_imgs=2) + batch_data_samples = [] + for data_sample in packed_inputs['data_samples']: + batch_data_samples.append(data_sample.to(device='cuda')) + out = self.roi_head.predict(feats, ref_feats, proposal_list, + ref_proposal_list, batch_data_samples) + assert out[0]['bboxes'].shape[0] == out[0]['scores'].shape[0] diff --git a/tests/test_models/test_sot/test_prdimp.py b/tests/test_models/test_sot/test_prdimp.py new file mode 100644 index 000000000..30449e6a1 --- /dev/null +++ b/tests/test_models/test_sot/test_prdimp.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestPrDiMP(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand(['sot/prdimp/prdimp_r50_8xb10-50e_got10k.py']) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.backbone + assert model.classifier + assert model.bbox_regressor + + @parameterized.expand(['sot/prdimp/prdimp_r50_8xb10-50e_got10k.py']) + def test_stark_forward_predict_mode(self, cfg_file): + if not torch.cuda.is_available(): + return + + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=i, + num_key_imgs=1, + num_ref_imgs=0, + image_shapes=[(3, 320, 320)], + num_items=[1]) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) + + @parameterized.expand(['sot/prdimp/prdimp_r50_8xb10-50e_got10k.py']) + def test_prdimp_forward_loss_mode(self, cfg_file): + if not torch.cuda.is_available(): + return + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + model = model.cuda() + + # forward in ``loss`` mode + model.train() + packed_inputs = demo_mm_inputs( + batch_size=2, + frame_id=0, + num_key_imgs=3, + num_ref_imgs=3, + 
image_shapes=[(3, 280, 280), (3, 280, 280)], + ref_prefix='search', + num_items=[3, 3]) + out_data = model.data_preprocessor(packed_inputs, True) + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) diff --git a/tests/test_models/test_sot/test_siamrpn.py b/tests/test_models/test_sot/test_siamrpn.py new file mode 100644 index 000000000..60243b71a --- /dev/null +++ b/tests/test_models/test_sot/test_siamrpn.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestSiameseRPN(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py', # noqa: E501 + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.backbone + assert model.neck + assert model.head + + @parameterized.expand([ + ( + 'sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py', # noqa: E501 + ('cpu', 'cuda')), + ]) + def test_siamese_rpn_forward_loss_mode(self, cfg_file, devices): + _model = get_model_cfg(cfg_file) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``loss`` mode + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_template_imgs=1, + num_search_imgs=1, + ref_prefix='search', + num_items=[1]) + out_data = model.data_preprocessor(packed_inputs, True) + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ( + 'sot/siamese_rpn/siamese-rpn_r50_8xb28-20e_imagenetvid-imagenetdet-coco_test-lasot.py', # noqa: E501 + ('cpu', 'cuda')), + ]) + def test_siamese_rpn_forward_predict_mode(self, cfg_file, devices): + _model = get_model_cfg(cfg_file) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=i, + num_key_imgs=1, + num_ref_imgs=0, + num_items=[1]) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) diff --git a/tests/test_models/test_sot/test_stark.py b/tests/test_models/test_sot/test_stark.py new file mode 100644 index 000000000..1f0d7f0ca --- /dev/null +++ b/tests/test_models/test_sot/test_stark.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestStark(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'sot/stark/stark-st1_r50_8xb16-500e_got10k.py', + 'sot/stark/stark-st2_r50_8xb16-50e_got10k.py' + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.backbone + assert model.neck + assert model.head + + # TODO: reduce the channels of models in all configs for speed up + # unit test. + @parameterized.expand([ + ('sot/stark/stark-st1_r50_8xb16-500e_got10k.py', ('cpu', 'cuda')), + ('sot/stark/stark-st2_r50_8xb16-50e_got10k.py', ('cpu', 'cuda')) + ]) + def test_stark_forward_loss_mode(self, cfg_file, devices): + _model = get_model_cfg(cfg_file) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``loss`` mode + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_template_imgs=2, + num_search_imgs=1, + ref_prefix='search', + image_shapes=[[(3, 128, 128), (3, 320, 320)]], + num_items=[1]) + for data_sample in packed_inputs['data_samples']: + data_sample.padding_mask = torch.zeros((2, 128, 128), + dtype=bool) + data_sample.search_padding_mask = torch.zeros((1, 128, 128), + dtype=bool) + out_data = model.data_preprocessor(packed_inputs, True) + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('sot/stark/stark-st1_r50_8xb16-500e_got10k.py', ('cpu', 'cuda')), + ('sot/stark/stark-st2_r50_8xb16-50e_got10k.py', ('cpu', 'cuda')) + ]) + def test_stark_forward_predict_mode(self, cfg_file, devices): + _model = get_model_cfg(cfg_file) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=i, + num_key_imgs=1, + num_ref_imgs=0, + image_shapes=[(3, 320, 320)], + num_items=[1]) + for data_sample in packed_inputs['data_samples']: + data_sample.padding_mask = torch.zeros((1, 320, 320), + dtype=bool) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) diff --git a/tests/test_models/test_task_modules/test_anchor/test_siamese_rpn_anchor_generator.py b/tests/test_models/test_task_modules/test_anchor/test_siamese_rpn_anchor_generator.py new file mode 100644 index 000000000..678e9b3c4 --- /dev/null +++ b/tests/test_models/test_task_modules/test_anchor/test_siamese_rpn_anchor_generator.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmtrack.models.task_modules import SiameseRPNAnchorGenerator + + +class TestSiameseRPNAnchorGenerator(TestCase): + + @classmethod + def setUpClass(cls): + + cls.anchor_generator = SiameseRPNAnchorGenerator( + strides=[(8, 8)], ratios=[0.33, 0.5, 1, 2, 3], scales=[8]) + + def test_gen_2d_hanning_windows(self): + multi_level_windows = self.anchor_generator.gen_2d_hanning_windows( + [(4, 4)], device='cpu') + assert len(multi_level_windows) == 1 + assert len(multi_level_windows[0]) == 4 * 4 * 5 + + def test_gen_single_level_base_anchors(self): + base_anchors = self.anchor_generator.gen_single_level_base_anchors( + base_size=6, + scales=torch.Tensor([8]), + ratios=torch.Tensor([0.33, 0.5, 1, 2, 3]), + center=[2, 2]) + assert base_anchors.shape == torch.Size([5, 4]) diff --git a/tests/test_models/test_task_modules/test_filter/test_filter.py b/tests/test_models/test_task_modules/test_filter/test_filter.py new file mode 100644 index 000000000..4fbb0174a --- /dev/null +++ b/tests/test_models/test_task_modules/test_filter/test_filter.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmtrack.models.task_modules import apply_feat_transpose, apply_filter + + +def test_apply_filter(): + feat = torch.randn(3, 4, 8, 18, 18) + filter = torch.randn(4, 8, 4, 4) + scores = apply_filter(feat, filter) + assert scores.shape == torch.Size([3, 4, 19, 19]) + feat = torch.randn(3, 8, 22, 22) + filter = torch.randn(1, 8, 4, 4) + scores = apply_filter(feat, filter) + assert scores.shape == torch.Size([3, 1, 23, 23]) + + +def test_apply_feat_transpose(): + feat = torch.randn(3, 4, 8, 18, 18) + activation = torch.randn(3, 4, 19, 19) + filter_grad = apply_feat_transpose(feat, activation, (4, 4)) + assert filter_grad.shape == torch.Size([4, 8, 4, 4]) + feat = torch.randn(3, 8, 22, 22) + activation = torch.randn(3, 1, 23, 23) + filter_grad = apply_feat_transpose( + feat, activation, (4, 4), training=False) + assert filter_grad.shape == torch.Size([1, 8, 4, 4]) diff --git a/tests/test_core/test_motion/test_flow.py b/tests/test_models/test_task_modules/test_motion/test_flow.py similarity index 90% rename from tests/test_core/test_motion/test_flow.py rename to tests/test_models/test_task_modules/test_motion/test_flow.py index cd3329d0b..9c7b3a135 100644 --- a/tests/test_core/test_motion/test_flow.py +++ b/tests/test_models/test_task_modules/test_motion/test_flow.py @@ -2,11 +2,11 @@ import pytest import torch -from mmtrack.core import flow_warp_feats +from mmtrack.models.task_modules import flow_warp_feats def test_flow_warp_feats(): - flow = torch.randn(2, 2, 10, 10) + flow = torch.randn(2, 2, 10, 8) ref_x = torch.randn(2, 8, 32, 32) x = flow_warp_feats(ref_x, flow) assert x.shape == ref_x.shape diff --git a/tests/test_models/test_task_modules/test_track/test_aflink.py b/tests/test_models/test_task_modules/test_track/test_aflink.py new file mode 100644 index 000000000..c421e7ecb --- /dev/null +++ b/tests/test_models/test_task_modules/test_track/test_aflink.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np +from torch import nn + +from mmtrack.registry import TASK_UTILS +from mmtrack.utils import register_all_modules + + +class TestAppearanceFreeLink(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.cfg = dict( + type='AppearanceFreeLink', + checkpoint='', + temporal_threshold=(0, 30), + spatial_threshold=75, + confidence_threshold=0.95, + ) + + def test_init(self): + aflink = TASK_UTILS.build(self.cfg) + assert aflink.temporal_threshold == (0, 30) + assert aflink.spatial_threshold == 75 + assert aflink.confidence_threshold == 0.95 + assert isinstance(aflink.model, nn.Module) + + def test_forward(self): + pred_track = np.random.randn(10, 7) + aflink = TASK_UTILS.build(self.cfg) + linked_track = aflink.forward(pred_track) + assert isinstance(linked_track, np.ndarray) + assert linked_track.shape == (10, 7) diff --git a/tests/test_models/test_task_modules/test_track/test_interpolation.py b/tests/test_models/test_task_modules/test_track/test_interpolation.py new file mode 100644 index 000000000..e6d2a784c --- /dev/null +++ b/tests/test_models/test_task_modules/test_track/test_interpolation.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np + +from mmtrack.registry import TASK_UTILS +from mmtrack.utils import register_all_modules + + +class TestInterpolateTracklets(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.cfg = dict( + type='InterpolateTracklets', + min_num_frames=5, + max_num_frames=20, + use_gsi=True, + smooth_tau=10) + + def test_init(self): + interpolation = TASK_UTILS.build(self.cfg) + assert interpolation.min_num_frames == 5 + assert interpolation.max_num_frames == 20 + assert interpolation.use_gsi + assert interpolation.smooth_tau == 10 + + def test_forward(self): + pred_track = np.random.randn(5, 7) + + # set frame_id and target_id + pred_track[:, 0] = np.array([1, 2, 5, 6, 7]) + pred_track[:, 1] = 1 + + interpolation = TASK_UTILS.build(self.cfg) + linked_track = interpolation.forward(pred_track) + assert isinstance(linked_track, np.ndarray) + assert linked_track.shape == (5, 7) diff --git a/tests/test_models/test_track_heads/__init__.py b/tests/test_models/test_track_heads/__init__.py new file mode 100644 index 000000000..ad36d81ea --- /dev/null +++ b/tests/test_models/test_track_heads/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import sys + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/tests/test_models/test_track_heads/test_iounet_head.py b/tests/test_models/test_track_heads/test_iounet_head.py new file mode 100644 index 000000000..f9d3243a0 --- /dev/null +++ b/tests/test_models/test_track_heads/test_iounet_head.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from unittest import TestCase + +import torch +from mmengine.structures import InstanceData + +from mmtrack.models.track_heads.iounet_head import IouNetHead, LinearBlock +from mmtrack.structures import TrackDataSample + + +class TestLinearBlock(TestCase): + + def setUp(self): + self.model = LinearBlock( + 8, 8, 3, bias=True, batch_norm=True, relu=True) + + def test_forward(self): + x = torch.randn(4, 8, 3, 3) + output = self.model(x) + assert output.shape == torch.Size([4, 8]) + + +class TestIouNetHead(TestCase): + + def setUp(self): + cfg = dict( + in_dim=(16, 32), + pred_in_dim=(16, 16), + pred_inter_dim=(8, 8), + bbox_cfg=dict( + num_init_random_boxes=9, + box_jitter_pos=0.1, + box_jitter_sz=0.5, + iounet_topk=3, + box_refine_step_length=2.5e-3, + box_refine_iter=10, + max_aspect_ratio=6, + box_refine_step_decay=1), + test_cfg=dict(img_sample_size=352), + loss_bbox=dict(type='KLMCLoss'), + train_cfg=dict( + proposals_sigma=[(0.05, 0.05), (0.5, 0.5)], + gt_bboxes_sigma=(0.05, 0.05), + num_samples=128, + add_first_bbox=False, + loss_weights=dict(bbox=0.0025))) + + self.model = IouNetHead(**cfg) + + def test_prdimp_cls_head_predict_mode(self): + if not torch.cuda.is_available(): + return + backbone_feats = (torch.randn(1, 16, 22, 22, device='cuda:0'), + torch.randn(1, 32, 22, 22, device='cuda:0')) + target_bboxes = torch.rand(1, 4, device='cuda:0') * 150 + + model = self.model.to('cuda:0') + model.eval() + with torch.no_grad(): + model.init_iou_net(backbone_feats, target_bboxes) + sample_center = torch.randn(1, 2, device='cuda:0') * 150 + model.predict(backbone_feats, None, target_bboxes, sample_center, + 4) + + def test_iou_net_head_loss(self): + if not torch.cuda.is_available(): + return + model = self.model.to('cuda:0') + model.train() + model = self.model.to('cuda:0') + template_feats = (torch.randn(2, 16, 32, 32).to('cuda:0'), + torch.randn(2, 32, 18, 18).to('cuda:0')) + search_feats = (torch.randn(2, 16, 32, 32).to('cuda:0'), + torch.randn(2, 32, 18, 18).to('cuda:0')) + target_bboxes = (torch.rand(1, 4) * 150).to('cuda:0') + + gt_instances = InstanceData() + gt_instances['bboxes'] = target_bboxes + search_gt_instances = copy.deepcopy(gt_instances) + + data_sample = TrackDataSample() + data_sample.gt_instances = gt_instances + data_sample.search_gt_instances = search_gt_instances + + model.loss(template_feats, search_feats, [data_sample, data_sample]) diff --git a/tests/test_models/test_track_heads/test_mask2former_head.py b/tests/test_models/test_track_heads/test_mask2former_head.py new file mode 100644 index 000000000..8cdf302cd --- /dev/null +++ b/tests/test_models/test_track_heads/test_mask2former_head.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine import Config + +from mmtrack.models.track_heads import Mask2FormerHead +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs + + +class TestMask2FormerHead(TestCase): + + @classmethod + def setUpClass(cls): + cls.config = Config( + dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, + 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=40, + num_queries=100, + num_frames=2, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', + 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding3D', + num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', + 'norm', 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 40 + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler')))) + + def test_mask2former_head_loss(self): + mask2former_head = Mask2FormerHead(**self.config) + mask2former_head.init_weights() + s = 256 + feats = [ + torch.rand(2, 256 * (2**i), s // stride, s // stride) + for i, stride in enumerate([8, 16, 32, 64]) + ] + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_items=[3], + num_key_imgs=2, + image_shapes=[(3, s, s)], + num_classes=2, + with_mask=True) + data_sample = packed_inputs['data_samples'][0] + 
data_sample.gt_instances['map_instances_to_img_idx'] = torch.tensor( + [0, 0, 1]) + loss = mask2former_head.loss(feats, [data_sample]) + # loss_cls, loss_mask and loss_dice + assert len(loss) == 30 + + def test_mask2former_head_predict(self): + mask2former_head = Mask2FormerHead(**self.config) + mask2former_head.init_weights() + s = 256 + # assume the video has 30 frames + feats = [ + torch.rand(30, 256 * (2**i), s // stride, s // stride) + for i, stride in enumerate([8, 16, 32, 64]) + ] + + img_metas = dict( + img_shape=(s, s), + ori_shape=(s, s), + scale_factor=(1, 1), + pad_shape=(s, s), + batch_input_shape=(s, s)) + data_sample = TrackDataSample(metainfo=img_metas) + results = mask2former_head.predict(feats, [data_sample]) + + assert len(results) == 1 + assert len(results[0]) == 10 diff --git a/tests/test_models/test_track_heads/test_prdimp_cls_head.py b/tests/test_models/test_track_heads/test_prdimp_cls_head.py new file mode 100644 index 000000000..9484c4d37 --- /dev/null +++ b/tests/test_models/test_track_heads/test_prdimp_cls_head.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from unittest import TestCase + +import torch +from mmengine.structures import InstanceData + +from mmtrack.models.track_heads.prdimp_cls_head import PrDiMPClsHead +from mmtrack.structures import TrackDataSample + + +class TestLinearBlock(TestCase): + + def setUp(self): + cfg = dict( + in_dim=32, + out_dim=16, + filter_initializer=dict( + type='FilterInitializer', + filter_size=4, + feature_dim=16, + feature_stride=16), + filter_optimizer=dict( + type='PrDiMPFilterOptimizer', + num_iters=5, + feat_stride=16, + init_step_length=1.0, + init_filter_regular=0.05, + gauss_sigma=0.9, + alpha_eps=0.05, + min_filter_regular=0.05, + label_thres=0), + locate_cfg=dict( + no_target_min_score=0.04, + distractor_thres=0.8, + hard_neg_thres=0.5, + target_neighborhood_scale=2.2, + dispalcement_scale=0.8, + update_scale_when_uncertain=True), + update_cfg=dict( + sample_memory_size=50, + normal_lr=0.01, + hard_neg_lr=0.02, + init_samples_min_weight=0.25, + train_skipping=20), + optimizer_cfg=dict( + init_update_iters=10, update_iters=2, hard_neg_iters=1), + test_cfg=dict(img_sample_size=352), + loss_cls=dict(type='KLGridLoss'), + train_cfg=dict( + feat_size=(18, 18), + img_size=(288, 288), + sigma_factor=0.05, + end_pad_if_even=True, + gauss_label_bias=0., + use_gauss_density=True, + label_density_norm=True, + label_density_threshold=0., + label_density_shrink=0, + loss_weights=dict(cls_init=0.25, cls_iter=1., cls_final=0.25))) + + self.model = PrDiMPClsHead(**cfg) + + def test_prdimp_cls_head_predict(self): + self.model.eval() + backbone_feats = torch.randn(2, 32, 22, 22) + target_bboxes = torch.rand(4, 4) * 150 + + if torch.cuda.is_available(): + self.model = self.model.to('cuda:0') + backbone_feats = backbone_feats.to('cuda:0') + target_bboxes = target_bboxes.to('cuda:0') + self.model.init_classifier( + backbone_feats, target_bboxes, dropout_probs=[0.2, 0.2]) + else: + self.model.target_filter = torch.randn(1, 16, 4, 4) + cls_feats = self.model.get_cls_feats(backbone_feats) + self.model.init_memory(cls_feats, target_bboxes) + + scores, test_feat = self.model(backbone_feats) + sample_size = torch.Tensor([352., 352.]) + prev_bbox = torch.rand(4) * 150 + if torch.cuda.is_available(): + sample_size = sample_size.to('cuda:0') + prev_bbox = prev_bbox.to('cuda:0') + self.model.predict_by_feat(scores[:1], prev_bbox, + target_bboxes[:1, :2], 4) + if torch.cuda.is_available(): + 
self.model.update_classifier(target_bboxes[1], 1, False) + + def test_prdimp_cls_head_loss(self): + if not torch.cuda.is_available(): + return + self.model.train() + model = self.model.to('cuda:0') + template_feats = (torch.randn(2, 32, 18, 18).to('cuda:0'), ) + search_feats = (torch.randn(2, 32, 18, 18).to('cuda:0'), ) + target_bboxes = (torch.rand(2, 4) * 150).to('cuda:0') + + gt_instances = InstanceData() + gt_instances['bboxes'] = target_bboxes + search_gt_instances = copy.deepcopy(gt_instances) + + data_sample = TrackDataSample() + data_sample.gt_instances = gt_instances + data_sample.search_gt_instances = search_gt_instances + + model.loss(template_feats, search_feats, [data_sample]) diff --git a/tests/test_models/test_track_heads/test_quasi_dense_embed_head.py b/tests/test_models/test_track_heads/test_quasi_dense_embed_head.py index 5dbabd019..2beb0dd3b 100644 --- a/tests/test_models/test_track_heads/test_quasi_dense_embed_head.py +++ b/tests/test_models/test_track_heads/test_quasi_dense_embed_head.py @@ -1,87 +1,118 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv +from unittest import TestCase + +import mmengine import torch -from mmdet.core import build_assigner, build_sampler +from mmengine.structures import InstanceData from mmtrack.models.track_heads import QuasiDenseEmbedHead +from mmtrack.registry import TASK_UTILS -def test_quasi_dense_embed_head(): - cfg = mmcv.Config( - dict( - num_convs=4, - num_fcs=1, - embed_channels=256, - norm_cfg=dict(type='GN', num_groups=32), - loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), - loss_track_aux=dict( - type='L2Loss', - neg_pos_ub=3, - pos_margin=0, - neg_margin=0.1, - hard_mining=True, - loss_weight=1.0))) - - self = QuasiDenseEmbedHead(**cfg) - - gt_match_indices = [torch.tensor([0, 1])] - proposal_list = [ - torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874], - [23.6667, 23.8757, 228.6326, 153.8874]]) - ] - gt_bboxes = [ - torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874], - [23.6667, 23.8757, 228.6326, 153.8874]]) - ] - gt_labels = [torch.LongTensor([1, 1])] - - feats = torch.rand(2, 256, 7, 7) - key_sampling_results = _dummy_bbox_sampling(feats, proposal_list, - gt_bboxes, gt_labels) - ref_sampling_results = key_sampling_results - - key_embeds = self.forward(feats) - ref_embeds = key_embeds - - match_feats = self.match(key_embeds, ref_embeds, key_sampling_results, - ref_sampling_results) - asso_targets = self.get_targets(gt_match_indices, key_sampling_results, - ref_sampling_results) - loss_track = self.loss(*match_feats, *asso_targets) - assert loss_track['loss_track'] >= 0, 'track loss should be zero' - assert loss_track['loss_track_aux'] > 0, 'aux loss should be non-zero' - - -def _dummy_bbox_sampling(feats, proposal_list, gt_bboxes, gt_labels): +def _dummy_bbox_sampling(rpn_results_list, batch_gt_instances): """Create sample results that can be passed to Head.get_targets.""" - num_imgs = len(proposal_list) + num_imgs = len(rpn_results_list) + feat = torch.rand(1, 1, 3, 3) assign_config = dict( - type='MaxIoUAssigner', + type='mmdet.MaxIoUAssigner', + _scope_='mmdet', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1) sampler_config = dict( type='CombinedSampler', + _scope_='mmdet', num=4, pos_fraction=0.5, neg_pos_ub=3, add_gt_as_proposals=True, pos_sampler=dict(type='InstanceBalancedPosSampler'), neg_sampler=dict(type='RandomSampler')) - bbox_assigner = build_assigner(assign_config) - bbox_sampler = build_sampler(sampler_config) - gt_bboxes_ignore = [None for _ in 
range(num_imgs)] + bbox_assigner = TASK_UTILS.build(assign_config) + bbox_sampler = TASK_UTILS.build(sampler_config) + sampling_results = [] for i in range(num_imgs): - assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], - gt_bboxes_ignore[i], gt_labels[i]) + assign_result = bbox_assigner.assign(rpn_results_list[i], + batch_gt_instances[i]) sampling_result = bbox_sampler.sample( assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], - feats=feats) + rpn_results_list[i], + batch_gt_instances[i], + feats=feat) sampling_results.append(sampling_result) return sampling_results + + +class TestQuasiDenseEmbedHead(TestCase): + + def test_quasi_dense_embed_head_loss(self): + cfg = mmengine.Config( + dict( + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict( + type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0))) + + embed_head = QuasiDenseEmbedHead(**cfg) + + key_feats = torch.rand(2, 256, 7, 7) + ref_feats = key_feats + rpn_results = InstanceData() + rpn_results.labels = torch.LongTensor([1, 2]) + rpn_results.priors = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874], + [23.6667, 23.8757, 238.6326, 151.8874]]) + rpn_results_list = [rpn_results] + + gt_instance = InstanceData() + gt_instance.labels = torch.LongTensor([1, 2]) + gt_instance.bboxes = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874], + [23.6667, 23.8757, 238.6326, 151.8874]]) + gt_instance.instances_id = torch.LongTensor([1, 2]) + batch_gt_instances = [gt_instance] + + sampling_results = _dummy_bbox_sampling(rpn_results_list, + batch_gt_instances) + gt_match_indices_list = [torch.Tensor([0, 1])] + loss_track = embed_head.loss(key_feats, ref_feats, sampling_results, + sampling_results, gt_match_indices_list) + assert loss_track['loss_track'] >= 0, 'track loss should be zero' + assert loss_track['loss_track_aux'] > 0, 'aux loss should be non-zero' + + def test_quasi_dense_embed_head_predict(self): + cfg = mmengine.Config( + dict( + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict( + type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0))) + + embed_head = QuasiDenseEmbedHead(**cfg) + + key_feats = torch.rand(2, 256, 7, 7) + track_feats = embed_head.predict(key_feats) + + assert isinstance(track_feats, torch.Tensor) + assert track_feats.size() == (2, 256) diff --git a/tests/test_models/test_track_heads/test_quasi_dense_track_head.py b/tests/test_models/test_track_heads/test_quasi_dense_track_head.py new file mode 100644 index 000000000..17f1065ab --- /dev/null +++ b/tests/test_models/test_track_heads/test_quasi_dense_track_head.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, random_boxes +from mmtrack.utils import register_all_modules + + +def _fake_proposals(img_metas, proposal_len): + """Create a fake proposal list.""" + results = [] + for i in range(len(img_metas)): + result = InstanceData(metainfo=img_metas[i]) + proposal = random_boxes(proposal_len, 10).to(device='cpu') + result.bboxes = proposal + results.append(result) + return results + + +class TestQuasiDenseTrackHead(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cfg = Config( + dict( + type='QuasiDenseTrackHead', + roi_extractor=dict( + _scope_='mmdet', + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type='QuasiDenseEmbedHead', + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict( + type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='L2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + train_cfg=dict( + assigner=dict( + _scope_='mmdet', + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + _scope_='mmdet', + type='CombinedSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict(type='RandomSampler'))))) + cls.track_head = MODELS.build(cfg) + + def test_quasi_dense_track_head_loss(self): + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=1, + image_shapes=[(3, 256, 256)]) + img_metas = [{ + 'img_shape': (256, 256, 3), + 'scale_factor': 1, + }] + proposal_list = _fake_proposals(img_metas, 10) + feats = [] + for i in range(len(self.track_head.roi_extractor.featmap_strides)): + feats.append( + torch.rand(1, 256, 256 // (2**(i + 2)), + 256 // (2**(i + 2))).to(device='cpu')) + key_feats = tuple(feats) + ref_feats = key_feats + loss_track = self.track_head.loss(key_feats, ref_feats, proposal_list, + proposal_list, + [packed_inputs['data_samples'][0]]) + assert loss_track['loss_track'] >= 0, 'track loss should be zero' + assert loss_track['loss_track_aux'] > 0, 'aux loss should be non-zero' + + def test_quasi_dense_track_head_predict(self): + feats = [] + for i in range(len(self.track_head.roi_extractor.featmap_strides)): + feats.append( + torch.rand(1, 256, 256 // (2**(i + 2)), + 256 // (2**(i + 2))).to(device='cpu')) + feats = tuple(feats) + track_feat = self.track_head.predict( + feats, [torch.Tensor([[10, 10, 20, 20]])]) + assert track_feat.size() == (1, 256) diff --git a/tests/test_models/test_track_heads/test_roi_embed_head.py b/tests/test_models/test_track_heads/test_roi_embed_head.py index ee881cf02..f7c2c1e37 100644 --- a/tests/test_models/test_track_heads/test_roi_embed_head.py +++ b/tests/test_models/test_track_heads/test_roi_embed_head.py @@ -1,81 +1,110 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import mmcv +from unittest import TestCase + +import mmengine import torch -from mmdet.core import build_assigner, build_sampler +from mmengine.structures import InstanceData from mmtrack.models.track_heads import RoIEmbedHead +from mmtrack.registry import TASK_UTILS -def test_roi_embed_head_loss(): - """Test roi embed head loss when truth is non-empty.""" - cfg = mmcv.Config( - dict( - num_convs=2, - num_fcs=2, - roi_feat_size=7, - in_channels=16, - fc_out_channels=32)) - - self = RoIEmbedHead(**cfg) - - x = torch.rand(2, 16, 7, 7) - ref_x = torch.rand(2, 16, 7, 7) - num_x_per_img = [1, 1] - num_x_per_ref_img = [1, 1] - similarity_scores = self.forward(x, ref_x, num_x_per_img, - num_x_per_ref_img) - - proposal_list = [ - torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]), - torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]), - ] - gt_bboxes = [ - torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), - torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), - ] - gt_labels = [torch.LongTensor([2]), torch.LongTensor([2])] - gt_instance_ids = [torch.LongTensor([2]), torch.LongTensor([2])] - ref_gt_instance_ids = [torch.LongTensor([2]), torch.LongTensor([2])] - sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes, - gt_labels) - - track_targets = self.get_targets(sampling_results, gt_instance_ids, - ref_gt_instance_ids) - gt_losses = self.loss(similarity_scores, *track_targets) - assert gt_losses['loss_match'] > 0, 'match loss should be non-zero' - assert gt_losses[ - 'match_accuracy'] >= 0, 'match accuracy should be non-zero or zero' - - -def _dummy_bbox_sampling(proposal_list, gt_bboxes, gt_labels): +def _dummy_bbox_sampling(rpn_results_list, batch_gt_instances): """Create sample results that can be passed to Head.get_targets.""" - num_imgs = len(proposal_list) + num_imgs = len(rpn_results_list) feat = torch.rand(1, 1, 3, 3) assign_config = dict( - type='MaxIoUAssigner', + type='mmdet.MaxIoUAssigner', + _scope_='mmdet', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1) sampler_config = dict( - type='RandomSampler', + type='mmdet.RandomSampler', + _scope_='mmdet', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=False) - bbox_assigner = build_assigner(assign_config) - bbox_sampler = build_sampler(sampler_config) - gt_bboxes_ignore = [None for _ in range(num_imgs)] + bbox_assigner = TASK_UTILS.build(assign_config) + bbox_sampler = TASK_UTILS.build(sampler_config) + sampling_results = [] for i in range(num_imgs): - assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], - gt_bboxes_ignore[i], gt_labels[i]) + assign_result = bbox_assigner.assign(rpn_results_list[i], + batch_gt_instances[i]) sampling_result = bbox_sampler.sample( assign_result, - proposal_list[i], - gt_bboxes[i], - gt_labels[i], + rpn_results_list[i], + batch_gt_instances[i], feats=feat) sampling_results.append(sampling_result) return sampling_results + + +class TestRoIEmbedHead(TestCase): + + def test_roi_embed_head_loss(self): + """Test roi embed head loss when truth is non-empty.""" + cfg = mmengine.Config( + dict( + num_convs=2, + num_fcs=2, + roi_feat_size=7, + in_channels=16, + fc_out_channels=32)) + + embed_head = RoIEmbedHead(**cfg) + + x = torch.rand(1, 16, 7, 7) + ref_x = torch.rand(1, 16, 7, 7) + num_x_per_img = [1] + num_x_per_ref_img = [1] + x_split, ref_x_split = embed_head.forward(x, ref_x, num_x_per_img, + num_x_per_ref_img) + + gt_instance_ids = [torch.LongTensor([2])] + ref_gt_instance_ids = [torch.LongTensor([2])] + + rpn_results = 
InstanceData() + rpn_results.labels = torch.LongTensor([2]) + rpn_results.priors = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874]]) + rpn_results_list = [rpn_results] + + gt_instance = InstanceData() + gt_instance.labels = torch.LongTensor([2]) + gt_instance.bboxes = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874]]) + gt_instance.instances_id = torch.LongTensor([2]) + batch_gt_instances = [gt_instance] + + sampling_results = _dummy_bbox_sampling(rpn_results_list, + batch_gt_instances) + + gt_losses = embed_head.loss_by_feat(x_split, ref_x_split, + sampling_results, gt_instance_ids, + ref_gt_instance_ids) + assert gt_losses['loss_match'] > 0, 'match loss should be non-zero' + assert gt_losses[ + 'match_accuracy'] >= 0, 'match accuracy should be non-zero or zero' + + def test_roi_embed_head_predict(self): + cfg = mmengine.Config( + dict( + num_convs=2, + num_fcs=2, + roi_feat_size=7, + in_channels=16, + fc_out_channels=32)) + + embed_head = RoIEmbedHead(**cfg) + + x = torch.rand(1, 16, 7, 7) + ref_x = torch.rand(1, 16, 7, 7) + similarity_logits = embed_head.predict(x, ref_x) + + assert isinstance(similarity_logits, list) + assert len(similarity_logits) == 1 diff --git a/tests/test_models/test_track_heads/test_siamese_rpn_head.py b/tests/test_models/test_track_heads/test_siamese_rpn_head.py index ef0f9babc..958103907 100644 --- a/tests/test_models/test_track_heads/test_siamese_rpn_head.py +++ b/tests/test_models/test_track_heads/test_siamese_rpn_head.py @@ -1,74 +1,190 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv +from copy import deepcopy +from unittest import TestCase + +import mmengine +import numpy as np import torch +from mmengine.structures import InstanceData from mmtrack.models.track_heads import CorrelationHead, SiameseRPNHead +from mmtrack.structures import TrackDataSample +from mmtrack.testing import random_boxes +from mmtrack.utils import register_all_modules + + +class TestCorrelationHead(TestCase): + + @classmethod + def setUpClass(cls): + cls.correlation_head = CorrelationHead(16, 16, 2) + + def test_forward(self): + kernel = torch.rand(1, 16, 7, 7) + search = torch.rand(1, 16, 31, 31) + out = self.correlation_head(kernel, search) + assert out.size() == (1, 2, 25, 25) + + +class TestSiameseRPNHead(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cfg = mmengine.Config( + dict( + anchor_generator=dict( + type='SiameseRPNAnchorGenerator', + strides=[8], + ratios=[0.33, 0.5, 1, 2, 3], + scales=[8]), + in_channels=[1, 1, 1], + weighted_sum=True, + bbox_coder=dict( + type='mmdet.DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[1., 1., 1., 1.]), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='sum', loss_weight=1.2), + train_cfg=dict( + assigner=dict( + type='mmdet.MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.6, + match_low_quality=False, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + sampler=dict( + type='mmdet.RandomSampler', + num=64, + pos_fraction=0.25, + add_gt_as_proposals=False), + num_neg=16, + exemplar_size=127, + search_size=255), + test_cfg=dict(penalty_k=0.05, window_influence=0.42, lr=0.38))) + cls.siamese_rpn_head = SiameseRPNHead(**cfg) + cls.rng = np.random.RandomState(0) + + def test_get_init_targets(self): + bboxes = torch.randn(10, 4) + (labels, labels_weights, bbox_targets, + bbox_weights) = self.siamese_rpn_head._get_init_targets( + 
bboxes, (25, 25)) + assert labels.shape == (25 * 25 * 5, ) + assert labels_weights.shape == (25 * 25 * 5, ) + assert bbox_targets.shape == (25 * 25 * 5, 4) + assert bbox_weights.shape == (25 * 25 * 5, 4) + + def test_get_positive_pair_targets(self): + gt_bboxes = random_boxes(1, 50) + gt_instances = InstanceData() + gt_instances.bboxes = gt_bboxes + gt_instances.labels = torch.Tensor([True]).long() + (labels, labels_weights, bbox_targets, + bbox_weights) = self.siamese_rpn_head.get_targets( + gt_instances, [25, 25]) + assert labels.shape == (1, 25 * 25 * 5) + assert labels_weights.shape == (1, 25 * 25 * 5) + assert bbox_targets.shape == (1, 25 * 25 * 5, 4) + assert bbox_weights.shape == (1, 25 * 25 * 5, 4) + + def test_get_negative_pair_targets(self): + gt_bboxes = random_boxes(1, 50) + gt_instances = InstanceData() + gt_instances.bboxes = gt_bboxes + gt_instances.labels = torch.Tensor([False]).long() + (labels, labels_weights, bbox_targets, + bbox_weights) = self.siamese_rpn_head.get_targets( + gt_instances, [25, 25]) + assert labels.shape == (1, 25 * 25 * 5) + assert labels_weights.shape == (1, 25 * 25 * 5) + assert bbox_targets.shape == (1, 25 * 25 * 5, 4) + assert bbox_weights.shape == (1, 25 * 25 * 5, 4) + + def test_forward(self): + z_feats = tuple([ + torch.rand(1, 1, 7, 7) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + x_feats = tuple([ + torch.rand(1, 1, 31, 31) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + cls_score, bbox_pred = self.siamese_rpn_head.forward(z_feats, x_feats) + assert cls_score.shape == (1, 10, 25, 25) + assert bbox_pred.shape == (1, 20, 25, 25) + + def test_get_targets(self): + batch_gt_instances = [] + gt_bboxes = random_boxes(2, 50) + gt_labels = torch.randint(2, (2, 1)).long() + for i in range(2): + gt_instances = InstanceData() + gt_instances.bboxes = gt_bboxes[i:i + 1] + gt_instances.labels = gt_labels[i] + batch_gt_instances.append(gt_instances) + (labels, labels_weights, bbox_targets, + bbox_weights) = self.siamese_rpn_head.get_targets( + batch_gt_instances, [25, 25]) + assert labels.shape == (2, 25 * 25 * 5) + assert labels_weights.shape == (2, 25 * 25 * 5) + assert bbox_targets.shape == (2, 25 * 25 * 5, 4) + assert bbox_weights.shape == (2, 25 * 25 * 5, 4) + + def test_predict(self): + z_feats = tuple([ + torch.rand(1, 1, 7, 7) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + x_feats = tuple([ + torch.rand(1, 1, 31, 31) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + prev_bbox = random_boxes(1, 50).squeeze() + scale_factor = torch.Tensor([3.]) + + data_sample = TrackDataSample() + data_sample.set_metainfo(dict(ori_shape=(200, 200))) + batch_data_samples = [data_sample] + results = self.siamese_rpn_head.predict(z_feats, x_feats, + batch_data_samples, prev_bbox, + scale_factor) + assert results[0].scores >= 0 + assert results[0].bboxes.shape == (1, 4) + + def test_loss(self): + z_feats = tuple([ + torch.rand(1, 1, 7, 7) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + x_feats = tuple([ + torch.rand(1, 1, 31, 31) + for i in range(len(self.siamese_rpn_head.cls_heads)) + ]) + + data_sample = TrackDataSample() + gt_instances = InstanceData() + gt_instances.bboxes = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874]]) + gt_instances.labels = torch.Tensor([True]).long() + data_sample.search_gt_instances = gt_instances + data_sample.gt_instances = deepcopy(gt_instances) + data_sample.set_metainfo(dict(search_ori_shape=(200, 200))) + batch_data_samples = [data_sample] + gt_losses = 
self.siamese_rpn_head.loss(z_feats, x_feats, + batch_data_samples) + assert gt_losses['loss_rpn_cls'] > 0, 'cls loss should be non-zero' + assert gt_losses[ + 'loss_rpn_bbox'] >= 0, 'box loss should be non-zero or zero' -def test_correlation_head(): - self = CorrelationHead(16, 16, 2) - kernel = torch.rand(1, 16, 7, 7) - search = torch.rand(1, 16, 31, 31) - out = self(kernel, search) - assert out.size() == (1, 2, 25, 25) - - -def test_siamese_rpn_head_loss(): - """Tests siamese rpn head loss when truth is non-empty.""" - cfg = mmcv.Config( - dict( - anchor_generator=dict( - type='SiameseRPNAnchorGenerator', - strides=[8], - ratios=[0.33, 0.5, 1, 2, 3], - scales=[8]), - in_channels=[16, 16, 16], - weighted_sum=True, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[1., 1., 1., 1.]), - loss_cls=dict( - type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), - loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1.2), - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.6, - neg_iou_thr=0.3, - min_pos_iou=0.6, - match_low_quality=False), - sampler=dict( - type='RandomSampler', - num=64, - pos_fraction=0.25, - add_gt_as_proposals=False), - num_neg=16, - exemplar_size=127, - search_size=255), - test_cfg=dict(penalty_k=0.05, window_influence=0.42, lr=0.38))) - - self = SiameseRPNHead(**cfg) - - z_feats = tuple( - [torch.rand(1, 16, 7, 7) for i in range(len(self.cls_heads))]) - x_feats = tuple( - [torch.rand(1, 16, 31, 31) for i in range(len(self.cls_heads))]) - cls_score, bbox_pred = self.forward(z_feats, x_feats) - - gt_bboxes = [ - torch.Tensor([[0., 23.6667, 23.8757, 238.6326, 151.8874]]), - ] - bbox_targets = self.get_targets(gt_bboxes, cls_score.shape[2:], [True]) - gt_losses = self.loss(cls_score, bbox_pred, *bbox_targets) - assert gt_losses['loss_rpn_cls'] > 0, 'cls loss should be non-zero' - assert gt_losses[ - 'loss_rpn_bbox'] >= 0, 'box loss should be non-zero or zero' - - gt_bboxes = [ - torch.Tensor([[0., 23.6667, 23.8757, 238.6326, 151.8874]]), - ] - bbox_targets = self.get_targets(gt_bboxes, cls_score.shape[2:], [False]) - gt_losses = self.loss(cls_score, bbox_pred, *bbox_targets) - assert gt_losses['loss_rpn_cls'] > 0, 'cls loss should be non-zero' - assert gt_losses['loss_rpn_bbox'] == 0, 'box loss should be zero' + gt_instances.labels = torch.Tensor([False]).long() + gt_losses = self.siamese_rpn_head.loss(z_feats, x_feats, + batch_data_samples) + assert gt_losses['loss_rpn_cls'] > 0, 'cls loss should be non-zero' + assert gt_losses['loss_rpn_bbox'] == 0, 'box loss should be zero' diff --git a/tests/test_models/test_track_heads/test_stark_head.py b/tests/test_models/test_track_heads/test_stark_head.py index 8a46f03be..fd879fbd4 100644 --- a/tests/test_models/test_track_heads/test_stark_head.py +++ b/tests/test_models/test_track_heads/test_stark_head.py @@ -1,169 +1,225 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import mmcv +from copy import deepcopy +from unittest import TestCase + +import mmengine import torch +from mmengine.structures import InstanceData from mmtrack.models.track_heads.stark_head import (CornerPredictorHead, ScoreHead, StarkHead, StarkTransformer) +from mmtrack.structures import TrackDataSample +from mmtrack.testing import random_boxes + + +class TestCornerPredictorHead(TestCase): + + @classmethod + def setUpClass(cls): + cls.bbox_head = CornerPredictorHead(8, 8, feat_size=20, stride=16) + + def test_corner_predictor_Head(self): + inputs = torch.randn(1, 8, 20, 20) + outputs = self.bbox_head(inputs) + assert outputs.shape == (1, 4) + +class TestScoreHead(TestCase): -def test_corner_predictor_head(): - bbox_head = CornerPredictorHead(8, 8, feat_size=20, stride=16) - inputs = torch.randn(1, 8, 20, 20) - outputs = bbox_head(inputs) - assert outputs.shape == (1, 4) + @classmethod + def setUpClass(cls): + cls.score_head = ScoreHead(8, 8, 1, 3) + def test_corner_predictor_Head(self): + inputs = torch.randn(1, 1, 1, 8) + outputs = self.score_head(inputs) + assert outputs.shape == (1, 1) -def test_score_head(): - score_head = ScoreHead(8, 8, 1, 3) - inputs = torch.randn(1, 1, 1, 8) - outputs = score_head(inputs) - assert outputs.shape == (1, 1, 1) +class TestStarkTransformer(TestCase): -def test_transormer_head(): - cfg = mmcv.Config( - dict( - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=[ - dict( + @classmethod + def setUpClass(cls): + cfg = mmengine.Config( + dict( + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=16, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict( + type='Dropout', drop_prob=0.1)) + ], + ffn_cfgs=dict( + feedforward_channels=16, + embed_dims=16, + ffn_drop=0.1), + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=False, + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( type='MultiheadAttention', embed_dims=16, num_heads=8, attn_drop=0.1, - dropout_layer=dict(type='Dropout', drop_prob=0.1)) - ], - ffn_cfgs=dict( - feedforward_channels=16, embed_dims=16, ffn_drop=0.1), - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=False, - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=16, - num_heads=8, - attn_drop=0.1, - dropout_layer=dict(type='Dropout', drop_prob=0.1)), - ffn_cfgs=dict( - feedforward_channels=16, embed_dims=16, ffn_drop=0.1), - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))))) - self = StarkTransformer(**cfg) - feat = torch.randn(20, 1, 16) - mask = torch.zeros(1, 20, dtype=bool) - query_embed = torch.randn(1, 16) - pos_embed = torch.randn(20, 1, 16) - out_dec, enc_mem = self.forward(feat, mask, query_embed, pos_embed) - assert out_dec.shape == (1, 1, 1, 16) - assert enc_mem.shape == (20, 1, 16) - - -def test_stark_head_loss(): - """Tests stark head loss when truth is non-empty.""" - head_cfg = dict( - num_query=1, - transformer=dict( - type='StarkTransformer', - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=[ - dict( + 
dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + feedforward_channels=16, + embed_dims=16, + ffn_drop=0.1), + operation_order=('self_attn', 'norm', 'cross_attn', + 'norm', 'ffn', 'norm'))))) + cls.stark_transformer = StarkTransformer(**cfg) + + def test_stark_transformer(self): + feat = torch.randn(20, 1, 16) + mask = torch.zeros(1, 20, dtype=bool) + query_embed = torch.randn(1, 16) + pos_embed = torch.randn(20, 1, 16) + out_dec, enc_mem = self.stark_transformer(feat, mask, query_embed, + pos_embed) + assert out_dec.shape == (1, 1, 1, 16) + assert enc_mem.shape == (20, 1, 16) + + +class TestStarkHead(TestCase): + + @classmethod + def setUpClass(cls): + cls.cfg = dict( + num_query=1, + transformer=dict( + type='StarkTransformer', + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=16, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict( + type='Dropout', drop_prob=0.1)) + ], + ffn_cfgs=dict( + feedforward_channels=16, + embed_dims=16, + ffn_drop=0.1), + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=False, + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( type='MultiheadAttention', embed_dims=16, num_heads=8, attn_drop=0.1, - dropout_layer=dict(type='Dropout', drop_prob=0.1)) - ], - ffn_cfgs=dict( - feedforward_channels=16, embed_dims=16, ffn_drop=0.1), - operation_order=('self_attn', 'norm', 'ffn', 'norm'))), - decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=False, - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=16, - num_heads=8, - attn_drop=0.1, - dropout_layer=dict(type='Dropout', drop_prob=0.1)), - ffn_cfgs=dict( - feedforward_channels=16, embed_dims=16, ffn_drop=0.1), - operation_order=('self_attn', 'norm', 'cross_attn', 'norm', - 'ffn', 'norm'))), - ), - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=8, normalize=True), - bbox_head=dict( - type='CornerPredictorHead', - inplanes=16, - channel=16, - feat_size=20, - stride=16), - loss_bbox=dict(type='L1Loss', loss_weight=5.0), - loss_iou=dict(type='GIoULoss', loss_weight=2.0), - test_cfg=dict( - search_factor=5.0, - search_size=320, - template_factor=2.0, - template_size=128, - update_intervals=[200])) - cfg = mmcv.Config(head_cfg) - - self = StarkHead(**cfg) - - head_inputs = [ - dict( - feat=(torch.rand(1, 16, 8, 8), ), - mask=torch.zeros(1, 128, 128, dtype=bool)), - dict( - feat=(torch.rand(1, 16, 8, 8), ), - mask=torch.zeros(1, 128, 128, dtype=bool)), - dict( - feat=(torch.rand(1, 16, 20, 20), ), - mask=torch.zeros(1, 320, 320, dtype=bool)) - ] - track_results = self.forward(head_inputs) - - gt_bboxes = [ - torch.Tensor([[0., 23.6667, 23.8757, 238.6326, 151.8874]]), - ] - gt_labels = [ - torch.Tensor([[0., 1]]), - ] - bboxes_losses = self.loss(track_results, gt_bboxes, gt_labels, (320, 320)) - assert bboxes_losses['loss_iou'] >= 0, 'iou loss should be' - 'non-zero or zero' - assert bboxes_losses[ - 'loss_bbox'] >= 0, 'bbox loss should be non-zero or zero' - - head_cfg.update( - dict( - cls_head=dict( - type='ScoreHead', - input_dim=16, - hidden_dim=16, - output_dim=1, - num_layers=3, - use_bn=False), - frozen_module=['transformer', 'bbox_head'], - loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True))) - cfg = 
mmcv.Config(head_cfg) - self = StarkHead(**cfg) - track_results = self.forward(head_inputs) - bboxes_losses = self.loss(track_results, gt_bboxes, gt_labels, (320, 320)) - assert bboxes_losses['loss_cls'] >= 0, 'iou loss should be' - 'non-zero or zero' + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + feedforward_channels=16, + embed_dims=16, + ffn_drop=0.1), + operation_order=('self_attn', 'norm', 'cross_attn', + 'norm', 'ffn', 'norm'))), + ), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=8, + normalize=True), + bbox_head=dict( + type='CornerPredictorHead', + inplanes=16, + channel=16, + feat_size=20, + stride=16), + loss_bbox=dict(type='mmdet.L1Loss', loss_weight=5.0), + loss_iou=dict(type='mmdet.GIoULoss', loss_weight=2.0), + test_cfg=dict( + search_factor=5.0, + search_size=320, + template_factor=2.0, + template_size=128, + update_intervals=[200])) + cls.stark_head_st1 = StarkHead(**cls.cfg) + + cls.cfg.update( + dict( + cls_head=dict( + type='ScoreHead', + input_dim=16, + hidden_dim=16, + output_dim=1, + num_layers=3, + use_bn=False), + frozen_module=['transformer', 'bbox_head'], + loss_cls=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True))) + cls.stark_head_st2 = StarkHead(**cls.cfg) + + cls.head_inputs = [ + dict( + feat=(torch.rand(1, 16, 8, 8), ), + mask=torch.zeros(1, 128, 128, dtype=bool)), + dict( + feat=(torch.rand(1, 16, 8, 8), ), + mask=torch.zeros(1, 128, 128, dtype=bool)), + dict( + feat=(torch.rand(1, 16, 20, 20), ), + mask=torch.zeros(1, 320, 320, dtype=bool)) + ] + + data_sample = TrackDataSample() + gt_instances = InstanceData() + gt_instances.bboxes = torch.Tensor( + [[23.6667, 23.8757, 238.6326, 151.8874]]) + gt_instances.labels = torch.Tensor([True]).long() + data_sample.search_gt_instances = gt_instances + data_sample.gt_instances = deepcopy(gt_instances) + data_sample.set_metainfo( + dict( + search_ori_shape=(500, 500), + ori_shape=(500, 500), + search_img_shape=[320, 320])) + cls.batch_data_samples = [data_sample] + + def test_loss(self): + """Test the forward of stark head in loss mode.""" + self.stark_head_st1.train() + losses = self.stark_head_st1.loss(self.head_inputs, + self.batch_data_samples) + assert losses['loss_iou'] >= 0, 'iou loss should be non-zero or zero' + assert losses['loss_bbox'] >= 0, 'box loss should be non-zero or zero' + + self.stark_head_st2.train() + losses = self.stark_head_st2.loss(self.head_inputs, + self.batch_data_samples) + assert losses['loss_cls'] > 0, 'cls loss should be non-zero' + + def test_predict(self): + """Test the forward of stark head in predict mode.""" + prev_bbox = random_boxes(1, 50).squeeze() + scale_factor = torch.Tensor([3.]) + + self.stark_head_st2.eval() + results = self.stark_head_st2.predict(self.head_inputs, + self.batch_data_samples, + prev_bbox, scale_factor) + assert results[0].scores >= 0 + assert results[0].bboxes.shape == (1, 4) diff --git a/tests/test_models/test_trackers/__init__.py b/tests/test_models/test_trackers/__init__.py new file mode 100644 index 000000000..ad36d81ea --- /dev/null +++ b/tests/test_models/test_trackers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import sys + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/tests/test_models/test_trackers/test_base_tracker.py b/tests/test_models/test_trackers/test_base_tracker.py deleted file mode 100644 index 277643073..000000000 --- a/tests/test_models/test_trackers/test_base_tracker.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmdet.core.bbox.demodata import random_boxes - -from mmtrack.models import TRACKERS - - -class TestBaseTracker(object): - - @classmethod - def setup_class(cls): - cfg = dict( - obj_score_thr=0.3, - regression=dict( - obj_score_thr=0.5, - nms=dict(type='nms', iou_threshold=0.6), - match_iou_thr=0.3), - reid=dict( - num_samples=10, - img_scale=(256, 128), - img_norm_cfg=None, - match_score_thr=2.0, - match_iou_thr=0.2), - momentums=dict(embeds=0.5), - num_frames_retain=5) - tracker = TRACKERS.get('TracktorTracker') - cls.tracker = tracker(**cfg) - cls.momentums = cfg['momentums'] - cls.num_frames_retain = cfg['num_frames_retain'] - cls.num_objs = 5 - - def test_init(self): - bboxes = random_boxes(self.num_objs, 512) - labels = torch.zeros(self.num_objs) - embeds = torch.randn(self.num_objs, 256) - ids = torch.arange(self.num_objs) - self.tracker.update( - ids=ids, bboxes=bboxes, labels=labels, embeds=embeds, frame_ids=0) - - assert self.tracker.ids == list(ids) - assert self.tracker.memo_items == [ - 'ids', 'bboxes', 'labels', 'embeds', 'frame_ids' - ] - for k, v in self.tracker.tracks[0].items(): - if k in self.momentums: - assert isinstance(v, torch.Tensor) - else: - assert isinstance(v, list) - - def test_update(self): - for i in range(1, self.num_frames_retain * 2): - bboxes = random_boxes(self.num_objs, 512) - labels = torch.zeros(self.num_objs, dtype=torch.int) - embeds = torch.randn(self.num_objs, 256) - ids = torch.arange(self.num_objs) + i - self.tracker.update( - ids=ids, - bboxes=bboxes, - labels=labels, - embeds=embeds, - frame_ids=i) - if i < self.num_frames_retain: - assert 0 in self.tracker.tracks - else: - assert 0 not in self.tracker.tracks - - def test_memo(self): - memo = self.tracker.memo - num_tracks = self.num_frames_retain * 2 - 1 - assert (memo.ids == torch.arange( - self.num_frames_retain, self.num_frames_retain * 3 - 1)).all() - assert memo.bboxes.shape[0] == num_tracks - - def test_get(self): - ids = [self.num_frames_retain + 1, self.num_frames_retain + 2] - - bboxes = self.tracker.get('bboxes', ids) - assert bboxes.shape == (2, 4) - - bboxes = self.tracker.get('bboxes', ids, num_samples=2) - assert bboxes.shape == (2, 2, 4) - - bboxes = self.tracker.get( - 'bboxes', ids, num_samples=2, behavior='mean') - assert bboxes.shape == (2, 4) diff --git a/tests/test_models/test_trackers/test_byte_tracker.py b/tests/test_models/test_trackers/test_byte_tracker.py new file mode 100644 index 000000000..5ae708b73 --- /dev/null +++ b/tests/test_models/test_trackers/test_byte_tracker.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase +from unittest.mock import MagicMock + +import torch + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.testing import demo_mm_inputs, random_boxes +from mmtrack.utils import register_all_modules + + +class TestByteTracker(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cfg = dict( + type='ByteTracker', + obj_score_thrs=dict(high=0.6, low=0.1), + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), + num_tentatives=3, + num_frames_retain=30) + cls.tracker = MODELS.build(cfg) + cls.tracker.kf = TASK_UTILS.build(dict(type='KalmanFilter')) + cls.num_frames_retain = cfg['num_frames_retain'] + cls.num_objs = 30 + + def test_init(self): + bboxes = random_boxes(self.num_objs, 512) + labels = torch.zeros(self.num_objs) + scores = torch.ones(self.num_objs) + ids = torch.arange(self.num_objs) + self.tracker.update( + ids=ids, bboxes=bboxes, scores=scores, labels=labels, frame_ids=0) + + assert self.tracker.ids == list(ids) + assert self.tracker.memo_items == [ + 'ids', 'bboxes', 'scores', 'labels', 'frame_ids' + ] + + def test_track(self): + img_size = 64 + img = torch.rand((1, 3, img_size, img_size)) + + model = MagicMock() + + for frame_id in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=frame_id, num_ref_imgs=0) + data_sample = packed_inputs['data_samples'][0] + data_sample.pred_det_instances = data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = torch.FloatTensor(scores) + + pred_track_instances = self.tracker.track( + model=model, + img=img, + feats=None, + data_sample=packed_inputs['data_samples'][0]) + + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id + + assert bboxes.shape[1] == 4 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_bytetracker.py b/tests/test_models/test_trackers/test_bytetracker.py deleted file mode 100644 index 181c3aa07..000000000 --- a/tests/test_models/test_trackers/test_bytetracker.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from unittest.mock import MagicMock - -import torch -from mmdet.core.bbox.demodata import random_boxes - -from mmtrack.models import TRACKERS, KalmanFilter - - -class TestByteTracker(object): - - @classmethod - def setup_class(cls): - cfg = dict( - obj_score_thrs=dict(high=0.6, low=0.1), - init_track_thr=0.7, - weight_iou_with_det_scores=True, - match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), - num_tentatives=3, - num_frames_retain=30) - tracker = TRACKERS.get('ByteTracker') - cls.tracker = tracker(**cfg) - cls.tracker.kf = KalmanFilter() - cls.num_objs = 5 - - def test_track(self): - img_size = 64 - img = torch.rand((1, 3, img_size, img_size)) - - img_metas = [dict(scale_factor=1.0)] - - model = MagicMock() - - bboxes = random_boxes(self.num_objs, img_size) - scores = torch.rand((self.num_objs, 1)) - bboxes = torch.cat((bboxes, scores), dim=1) - - labels = torch.arange(self.num_objs) - - for frame_id in range(3): - bboxes, labels, ids = self.tracker.track( - img, img_metas, model, bboxes, labels, frame_id, rescale=True) - assert bboxes.shape[0] == labels.shape[0] - assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_masktrack_rcnn_tracker.py b/tests/test_models/test_trackers/test_masktrack_rcnn_tracker.py index dc924d53a..ef7bfd1d2 100644 --- a/tests/test_models/test_trackers/test_masktrack_rcnn_tracker.py +++ b/tests/test_models/test_trackers/test_masktrack_rcnn_tracker.py @@ -1,57 +1,71 @@ # Copyright (c) OpenMMLab. All rights reserved. -from unittest.mock import MagicMock +from unittest import TestCase import torch -from mmdet.core.bbox.demodata import random_boxes +from parameterized import parameterized -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg, random_boxes +from mmtrack.utils import register_all_modules -class TestMaskTrackRCNNTracker(object): +class TestMaskTrackRCNNTracker(TestCase): @classmethod - def setup_class(cls): - cfg = dict( - match_weights=dict(det_score=1.0, iou=1.0, det_label=1000.0), ) - tracker = TRACKERS.get('MaskTrackRCNNTracker') - cls.tracker = tracker(**cfg) + def setUpClass(cls): + register_all_modules(init_default_scope=True) + tracker_cfg = dict( + type='MaskTrackRCNNTracker', + match_weights=dict(det_score=1.0, iou=2.0, det_label=10.0), + num_frames_retain=20) + cls.tracker = MODELS.build(tracker_cfg) cls.num_objs = 5 - def test_track(self): - img_size, feats_channel = 64, 8 - img = torch.rand((1, 3, img_size, img_size)) + def test_get_match_score(self): + bboxes = random_boxes(self.num_objs, 64) + labels = torch.arange(self.num_objs) + scores = torch.arange(self.num_objs, dtype=torch.float32) + similarity_logits = torch.randn(self.num_objs, self.num_objs + 1) - img_metas = [dict(scale_factor=1.0)] + match_score = self.tracker.get_match_score(bboxes, labels, scores, + bboxes, labels, + similarity_logits) + assert match_score.size() == similarity_logits.size() - model = MagicMock() - model.track_head.extract_roi_feats = MagicMock( - return_value=(torch.rand(self.num_objs, feats_channel, 7, 7), - [self.num_objs])) - model.track_head.simple_test = MagicMock( - return_value=torch.rand((self.num_objs, self.num_objs + 1))) + @parameterized.expand([ + 'vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py' # noqa: E501 + ]) + def test_track(self, cfg_file): + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) - feats = torch.rand((1, feats_channel, 
img_size, img_size)) + for frame_id in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0, with_mask=True) + data_sample = packed_inputs['data_samples'][0] + img = packed_inputs['inputs']['img'][0] + data_sample.pred_det_instances = data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = torch.FloatTensor(scores) + feats = [] + for i in range( + len(model.track_head.roi_extractor.featmap_strides)): + feats.append( + torch.rand(1, 256, 256 // (2**(i + 2)), + 256 // (2**(i + 2))).to(device='cpu')) - bboxes = random_boxes(self.num_objs, img_size) - scores = torch.rand((self.num_objs, 1)) - bboxes = torch.cat((bboxes, scores), dim=1) + pred_track_instances = self.tracker.track( + model=model, + img=img, + feats=tuple(feats), + data_sample=packed_inputs['data_samples'][0]) - labels = torch.arange(self.num_objs) + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id - masks = torch.zeros((self.num_objs, img_size, img_size)) - - for frame_id in range(3): - bboxes, labels, masks, ids = self.tracker.track( - img, - img_metas, - model, - feats, - bboxes, - labels, - masks, - frame_id, - rescale=True) - assert bboxes.shape[0] == self.num_objs - assert labels.shape[0] == self.num_objs - assert masks.shape == (self.num_objs, img_size, img_size) - assert ids.shape[0] == self.num_objs + assert bboxes.shape[1] == 4 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_quasi_dense_tao_tracker.py b/tests/test_models/test_trackers/test_quasi_dense_tao_tracker.py new file mode 100644 index 000000000..314abafb1 --- /dev/null +++ b/tests/test_models/test_trackers/test_quasi_dense_tao_tracker.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase
+from unittest.mock import MagicMock
+
+import torch
+
+from mmtrack.registry import MODELS
+from mmtrack.testing import demo_mm_inputs, random_boxes
+from mmtrack.utils import register_all_modules
+
+
+class TestQuasiDenseTAOTracker(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        register_all_modules(init_default_scope=True)
+        cfg = dict(
+            type='QuasiDenseTAOTracker',
+            init_score_thr=0.0001,
+            obj_score_thr=0.0001,
+            match_score_thr=0.5,
+            memo_frames=10,
+            memo_momentum=0.8,
+            momentum_obj_score=0.5,
+            obj_score_diff_thr=1.0,
+            distractor_nms_thr=0.3,
+            distractor_score_thr=0.5,
+            match_metric='bisoftmax',
+            match_with_cosine=True)
+        cls.tracker = MODELS.build(cfg)
+        cls.num_objs = 5
+
+    def test_update(self):
+        ids = torch.arange(self.num_objs)
+        bboxes = random_boxes(self.num_objs, 64)
+        labels = torch.arange(self.num_objs)
+        scores = torch.arange(self.num_objs)
+        embeds = torch.randn(self.num_objs, 256)
+
+        self.tracker.update(
+            ids=ids,
+            bboxes=bboxes,
+            labels=labels,
+            embeds=embeds,
+            scores=scores,
+            frame_id=0)
+
+        for tid in range(self.num_objs):
+            assert self.tracker.tracks[tid]['bboxes'][0].equal(
+                bboxes[tid])
+            assert self.tracker.tracks[tid]['labels'][0].equal(
+                labels[tid])
+            assert self.tracker.tracks[tid]['scores'][0].equal(
+                scores[tid])
+            assert self.tracker.tracks[tid]['embeds'].equal(
+                embeds[tid])
+
+        ids = torch.tensor([self.num_objs - 1])
+        bboxes = random_boxes(1, 64)
+        labels = torch.tensor([self.num_objs])
+        scores = torch.tensor([self.num_objs])
+        embeds = torch.randn(1, 256)
+        new_embeds = (1 - self.tracker.memo_momentum) * self.tracker.tracks[
+            ids.item()]['embeds'] + self.tracker.memo_momentum * embeds
+
+        self.tracker.update(
+            ids=ids,
+            bboxes=bboxes,
+            labels=labels,
+            embeds=embeds,
+            scores=scores,
+            frame_id=1)
+
+        assert self.tracker.tracks[ids.item()]['embeds'].equal(
+            new_embeds[0])
+
+    def test_track(self):
+        img_size = 64
+        img = torch.rand((1, 3, img_size, img_size))
+        feats = torch.rand((1, 256, img_size, img_size))
+
+        model = MagicMock()
+        for frame_id in range(3):
+            packed_inputs = demo_mm_inputs(
+                batch_size=1, frame_id=0, num_ref_imgs=0)
+            data_sample = packed_inputs['data_samples'][0]
+            data_sample.pred_det_instances = data_sample.gt_instances.clone()
+            # add fake scores
+            scores = torch.ones(5)
+            data_sample.pred_det_instances.scores = torch.FloatTensor(scores)
+            pred_track_instances = self.tracker.track(
+                model=model,
+                img=img,
+                feats=feats,
+                data_sample=packed_inputs['data_samples'][0])
+
+            bboxes = pred_track_instances.bboxes
+            labels = pred_track_instances.labels
+            ids = pred_track_instances.instances_id
+
+            assert bboxes.shape[1] == 4
+            assert bboxes.shape[0] == labels.shape[0]
+            assert bboxes.shape[0] == ids.shape[0]
diff --git a/tests/test_models/test_trackers/test_quasi_dense_tao_trakcer.py b/tests/test_models/test_trackers/test_quasi_dense_tao_trakcer.py
deleted file mode 100644
index 5b1dfeaff..000000000
--- a/tests/test_models/test_trackers/test_quasi_dense_tao_trakcer.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from unittest.mock import MagicMock - -import torch -from mmdet.core.bbox.demodata import random_boxes - -from mmtrack.models import TRACKERS - - -class TestQuasiDenseTAOTracker(object): - - @classmethod - def setup_class(cls): - cfg = dict( - init_score_thr=0.0001, - obj_score_thr=0.0001, - match_score_thr=0.5, - memo_frames=10, - memo_momentum=0.8, - momentum_obj_score=0.5, - obj_score_diff_thr=1.0, - distractor_nms_thr=0.3, - distractor_score_thr=0.5, - match_metric='bisoftmax', - match_with_cosine=True) - tracker = TRACKERS.get('QuasiDenseTAOTracker') - cls.tracker = tracker(**cfg) - cls.num_objs = 5 - - def test_update(self): - ids = torch.arange(self.num_objs) - bboxes = random_boxes(self.num_objs, 64) - labels = torch.arange(self.num_objs) - embeds = torch.randn(self.num_objs, 256) - - self.tracker.update( - ids=ids, bboxes=bboxes, embeds=embeds, labels=labels, frame_id=0) - - for tid in range(self.num_objs): - assert self.tracker.tracks[tid]['bboxes'][-1].equal(bboxes[tid]) - assert self.tracker.tracks[tid]['embeds'].equal(embeds[tid]) - assert self.tracker.tracks[tid]['labels'][-1].equal(labels[tid]) - - ids = torch.tensor([self.num_objs - 1]) - bboxes = random_boxes(1, 64) - labels = torch.tensor([self.num_objs]) - embeds = torch.randn(1, 256) - new_embeds = (1 - self.tracker.memo_momentum) * self.tracker.tracks[ - ids.item()]['embeds'] + self.tracker.memo_momentum * embeds - - self.tracker.update( - ids=ids, bboxes=bboxes, labels=labels, embeds=embeds, frame_id=1) - - assert self.tracker.tracks[ids.item()]['embeds'].equal(new_embeds[0]) - - def test_memo(self): - memo_bboxes, memo_labels, memo_embeds, memo_ids = self.tracker.memo # noqa - assert memo_bboxes.shape[0] == memo_labels.shape[0] - assert memo_labels.shape[0] == memo_embeds.shape[0] - assert memo_embeds.shape[0] == memo_ids.shape[0] - assert memo_ids.shape[0] == memo_bboxes.shape[0] - - def test_track(self): - self.tracker.reset() - img_size, feats_channel = 64, 256 - img_metas = [dict(scale_factor=1.0)] - - model = MagicMock() - model.track_head.extract_roi_feats = MagicMock( - return_value=(torch.rand(self.num_objs, feats_channel, 7, 7), - [self.num_objs])) - model.track_head.simple_test = MagicMock( - return_value=torch.rand((self.num_objs, self.num_objs + 1))) - - feats = torch.rand((1, feats_channel, img_size, img_size)) - - bboxes = random_boxes(self.num_objs, 64) - scores = torch.rand((self.num_objs, 1)) - bboxes = torch.cat((bboxes, scores), dim=1) - - labels = torch.arange(self.num_objs) - - for frame_id in range(3): - bboxes, labels, ids = self.tracker.track(img_metas, feats, model, - bboxes, labels, frame_id) - - assert bboxes.shape[0] == labels.shape[0] - assert labels.shape[0] == labels.shape[0] - assert ids.shape[0] == labels.shape[0] diff --git a/tests/test_models/test_trackers/test_quasi_dense_embed_tracker.py b/tests/test_models/test_trackers/test_quasi_dense_tracker.py similarity index 50% rename from tests/test_models/test_trackers/test_quasi_dense_embed_tracker.py rename to tests/test_models/test_trackers/test_quasi_dense_tracker.py index 1a2151a99..6358637c3 100644 --- a/tests/test_models/test_trackers/test_quasi_dense_embed_tracker.py +++ b/tests/test_models/test_trackers/test_quasi_dense_tracker.py @@ -1,17 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase from unittest.mock import MagicMock import torch -from mmdet.core.bbox.demodata import random_boxes -from mmtrack.models import TRACKERS +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, random_boxes +from mmtrack.utils import register_all_modules -class TestQuasiDenseTracker(object): +class TestQuasiDenseTracker(TestCase): @classmethod - def setup_class(cls): + def setUpClass(cls): + register_all_modules(init_default_scope=True) cfg = dict( + type='QuasiDenseTracker', init_score_thr=0.9, obj_score_thr=0.5, match_score_thr=0.5, @@ -23,23 +27,29 @@ def setup_class(cls): nms_class_iou_thr=0.7, with_cats=True, match_metric='bisoftmax') - tracker = TRACKERS.get('QuasiDenseTracker') - cls.tracker = tracker(**cfg) + cls.tracker = MODELS.build(cfg) cls.num_objs = 5 def test_update(self): ids = torch.arange(self.num_objs) bboxes = random_boxes(self.num_objs, 64) labels = torch.arange(self.num_objs) + scores = torch.arange(self.num_objs) embeds = torch.randn(self.num_objs, 256) self.tracker.update( - ids=ids, bboxes=bboxes, embeds=embeds, labels=labels, frame_id=0) + ids=ids, + bboxes=bboxes, + labels=labels, + embeds=embeds, + scores=scores, + frame_id=0) for tid in range(self.num_objs): assert self.tracker.tracks[tid]['bbox'].equal(bboxes[tid]) assert self.tracker.tracks[tid]['embed'].equal(embeds[tid]) assert self.tracker.tracks[tid]['label'].equal(labels[tid]) + assert self.tracker.tracks[tid]['score'].equal(scores[tid]) assert self.tracker.tracks[tid]['acc_frame'] == 0 assert self.tracker.tracks[tid]['last_frame'] == 0 assert len(self.tracker.tracks[tid]['velocity']) == len( @@ -48,47 +58,46 @@ def test_update(self): ids = torch.tensor([self.num_objs - 1]) bboxes = random_boxes(1, 64) labels = torch.tensor([self.num_objs]) + scores = torch.tensor([self.num_objs]) embeds = torch.randn(1, 256) new_embeds = (1 - self.tracker.memo_momentum) * self.tracker.tracks[ ids.item()]['embed'] + self.tracker.memo_momentum * embeds self.tracker.update( - ids=ids, bboxes=bboxes, labels=labels, embeds=embeds, frame_id=1) + ids=ids, + bboxes=bboxes, + labels=labels, + embeds=embeds, + scores=scores, + frame_id=1) assert self.tracker.tracks[ids.item()]['embed'].equal( new_embeds[0]) == True # noqa - def test_memo(self): - memo_bboxes, memo_labels, memo_embeds, memo_ids, memo_vs = self.tracker.memo # noqa - assert memo_bboxes.shape[0] == memo_labels.shape[0] - assert memo_embeds.shape[0] == memo_labels.shape[0] - assert memo_ids.shape[0] == memo_vs.shape[0] - assert memo_vs.shape[0] == memo_embeds.shape[0] - def test_track(self): - self.tracker.reset() - img_size, feats_channel = 64, 256 - img_metas = [dict(scale_factor=1.0)] + img_size = 64 + img = torch.rand((1, 3, img_size, img_size)) + feats = torch.rand((1, 256, img_size, img_size)) model = MagicMock() - model.track_head.extract_roi_feats = MagicMock( - return_value=(torch.rand(self.num_objs, feats_channel, 7, 7), - [self.num_objs])) - model.track_head.simple_test = MagicMock( - return_value=torch.rand((self.num_objs, self.num_objs + 1))) - - feats = torch.rand((1, feats_channel, img_size, img_size)) - - bboxes = random_boxes(self.num_objs, 64) - scores = torch.rand((self.num_objs, 1)) - bboxes = torch.cat((bboxes, scores), dim=1) - - labels = torch.arange(self.num_objs) - for frame_id in range(3): - bboxes, labels, ids = self.tracker.track(img_metas, feats, model, - bboxes, labels, frame_id) - + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0) + data_sample = 
packed_inputs['data_samples'][0] + data_sample.pred_det_instances = data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = torch.FloatTensor(scores) + pred_track_instances = self.tracker.track( + model=model, + img=img, + feats=feats, + data_sample=packed_inputs['data_samples'][0]) + + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id + + assert bboxes.shape[1] == 4 assert bboxes.shape[0] == labels.shape[0] - assert labels.shape[0] == labels.shape[0] - assert ids.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_sort_tracker.py b/tests/test_models/test_trackers/test_sort_tracker.py new file mode 100644 index 000000000..bdb42309f --- /dev/null +++ b/tests/test_models/test_trackers/test_sort_tracker.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase +from unittest.mock import MagicMock + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.testing import demo_mm_inputs, get_model_cfg, random_boxes +from mmtrack.utils import register_all_modules + + +class TestSORTTracker(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.num_objs = 30 + + @parameterized.expand([ + 'mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + tracker.kf = TASK_UTILS.build(cfg['motion']) + + bboxes = random_boxes(self.num_objs, 512) + labels = torch.zeros(self.num_objs) + scores = torch.ones(self.num_objs) + ids = torch.arange(self.num_objs) + tracker.update( + ids=ids, bboxes=bboxes, scores=scores, labels=labels, frame_ids=0) + + assert tracker.ids == list(ids) + assert tracker.memo_items == [ + 'ids', 'bboxes', 'scores', 'labels', 'frame_ids' + ] + + @parameterized.expand([ + 'mot/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_track(self, cfg_file): + img = torch.rand((1, 3, 128, 128)) + + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + tracker.kf = TASK_UTILS.build(cfg['motion']) + + model = MagicMock() + model.reid = MODELS.build(cfg['reid']) + model.motion = TASK_UTILS.build(cfg['motion']) + + with torch.no_grad(): + for frame_id in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=frame_id, num_ref_imgs=0) + data_sample = packed_inputs['data_samples'][0] + data_sample.pred_det_instances = \ + data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = torch.FloatTensor( + scores) + + pred_track_instances = tracker.track( + model=model, + img=img, + feats=None, + data_sample=packed_inputs['data_samples'][0], + data_preprocessor=cfg['data_preprocessor']) + + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id + + assert bboxes.shape[1] == 4 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_strong_sort_tracker.py b/tests/test_models/test_trackers/test_strong_sort_tracker.py new file mode 100644 index 000000000..60bf391a9 --- /dev/null +++ 
b/tests/test_models/test_trackers/test_strong_sort_tracker.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase +from unittest.mock import MagicMock + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.testing import demo_mm_inputs, get_model_cfg, random_boxes +from mmtrack.utils import register_all_modules + + +class TestStrongSORTTracker(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.num_objs = 30 + + @parameterized.expand([ + 'mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain' + '_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + tracker.kf = TASK_UTILS.build(cfg['kalman']) + tracker.cmc = TASK_UTILS.build(cfg['cmc']) + + bboxes = random_boxes(self.num_objs, 512) + labels = torch.zeros(self.num_objs) + scores = torch.ones(self.num_objs) + ids = torch.arange(self.num_objs) + tracker.update( + ids=ids, bboxes=bboxes, scores=scores, labels=labels, frame_ids=0) + + assert tracker.ids == list(ids) + assert tracker.memo_items == [ + 'ids', 'bboxes', 'scores', 'labels', 'frame_ids' + ] + + @parameterized.expand([ + 'mot/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain' + '_test-mot17halfval.py' + ]) + def test_track(self, cfg_file): + img = torch.rand((1, 3, 128, 128)) + + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + tracker.kf = TASK_UTILS.build(cfg['kalman']) + tracker.cmc = TASK_UTILS.build(cfg['cmc']) + + model = MagicMock() + model.reid = MODELS.build(cfg['reid']) + model.kalman = TASK_UTILS.build(cfg['kalman']) + model.cmc = TASK_UTILS.build(cfg['cmc']) + + with torch.no_grad(): + for frame_id in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=frame_id, num_ref_imgs=0) + data_sample = packed_inputs['data_samples'][0] + data_sample.pred_det_instances = \ + data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = \ + torch.FloatTensor(scores) + + pred_track_instances = tracker.track( + model=model, + img=img, + feats=None, + data_sample=packed_inputs['data_samples'][0], + data_preprocessor=cfg['data_preprocessor']) + + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id + + assert bboxes.shape[1] == 4 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_trackers/test_tracktor_tracker.py b/tests/test_models/test_trackers/test_tracktor_tracker.py new file mode 100644 index 000000000..71c7dcabb --- /dev/null +++ b/tests/test_models/test_trackers/test_tracktor_tracker.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase +from unittest.mock import MagicMock + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS, TASK_UTILS +from mmtrack.testing import demo_mm_inputs, get_model_cfg, random_boxes +from mmtrack.utils import register_all_modules + + +class TestTracktorTracker(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + cls.num_objs = 30 + + @parameterized.expand([ + 'mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_init(self, cfg_file): + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + + bboxes = random_boxes(self.num_objs, 512) + labels = torch.zeros(self.num_objs) + scores = torch.ones(self.num_objs) + ids = torch.arange(self.num_objs) + tracker.update( + ids=ids, bboxes=bboxes, scores=scores, labels=labels, frame_ids=0) + + assert tracker.ids == list(ids) + assert tracker.memo_items == [ + 'ids', 'bboxes', 'scores', 'labels', 'frame_ids' + ] + + @parameterized.expand([ + 'mot/tracktor/tracktor_faster-rcnn_r50_fpn_8xb2-4e' + '_mot17halftrain_test-mot17halfval.py' + ]) + def test_track(self, cfg_file): + img = torch.rand((1, 3, 256, 256)) + x = [torch.rand(1, 256, 4, 4)] + + cfg = get_model_cfg(cfg_file) + tracker = MODELS.build(cfg['tracker']) + + model = MagicMock() + model.detector = MODELS.build(cfg['detector']) + model.reid = MODELS.build(cfg['reid']) + model.cmc = TASK_UTILS.build(cfg['motion']) + model.with_linear_motion = False + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=0) + data_sample = packed_inputs['data_samples'][0] + data_sample.pred_det_instances = data_sample.gt_instances.clone() + # add fake scores + scores = torch.ones(5) + data_sample.pred_det_instances.scores = torch.FloatTensor(scores) + for frame_id in range(3): + pred_track_instances = tracker.track( + model=model, + img=img, + feats=x, + data_sample=packed_inputs['data_samples'][0], + data_preprocessor=cfg['data_preprocessor']) + + bboxes = pred_track_instances.bboxes + labels = pred_track_instances.labels + ids = pred_track_instances.instances_id + + assert bboxes.shape[1] == 4 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[0] == ids.shape[0] diff --git a/tests/test_models/test_vid/__init__.py b/tests/test_models/test_vid/__init__.py new file mode 100644 index 000000000..ad36d81ea --- /dev/null +++ b/tests/test_models/test_vid/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import sys + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/tests/test_models/test_vid/test_dff.py b/tests/test_models/test_vid/test_dff.py new file mode 100644 index 000000000..bc593f8b6 --- /dev/null +++ b/tests/test_models/test_vid/test_dff.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestDFF(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules() + + @parameterized.expand([ + 'vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ]) + def test_dff_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model = MODELS.build(model) + assert model.detector + assert model.motion + + @parameterized.expand([ + ('vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', ('cpu', + 'cuda')) + ]) + def test_dff_forward_loss_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=1) + out_data = model.data_preprocessor(packed_inputs, True) + + # forward in ``loss`` mode + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('vid/dff/dff_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', ('cpu', + 'cuda')) + ]) + def test_dff_forward_predict_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + _model.test_cfg.key_frame_interval = 2 + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=i, num_ref_imgs=0) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) diff --git a/tests/test_models/test_vid/test_fgfa.py b/tests/test_models/test_vid/test_fgfa.py new file mode 100644 index 000000000..e333de6bb --- /dev/null +++ b/tests/test_models/test_vid/test_fgfa.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestVideoDetector(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules() + + @parameterized.expand([ + 'vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model = MODELS.build(model) + assert model.detector + assert model.motion + + @parameterized.expand([ + ('vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', ('cpu', + 'cuda')) + ]) + def test_fgfa_forward_loss_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=2) + out_data = model.data_preprocessor(packed_inputs, True) + # forward in ``loss`` mode + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('vid/fgfa/fgfa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', ('cpu', + 'cuda')) + ]) + def test_fgfa_forward_predict_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=i, num_ref_imgs=2) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) diff --git a/tests/test_models/test_vid/test_selsa.py b/tests/test_models/test_vid/test_selsa.py new file mode 100644 index 000000000..6d06a08fb --- /dev/null +++ b/tests/test_models/test_vid/test_selsa.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.structures import TrackDataSample +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestVideoDetector(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules() + + @parameterized.expand([ + 'vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + 'vid/temporal_roi_align/' + 'selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py' + ]) + def test_init(self, cfg_file): + model = get_model_cfg(cfg_file) + model = MODELS.build(model) + assert model.detector + + @parameterized.expand([ + ('vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ('cpu', 'cuda')), + ('vid/temporal_roi_align/' + 'selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ('cpu', 'cuda')), + ]) + def test_selsa_forward_loss_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=0, num_ref_imgs=2) + out_data = model.data_preprocessor(packed_inputs, True) + # forward in ``loss`` mode + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('vid/selsa/selsa_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ('cpu', 'cuda')), + ('vid/temporal_roi_align/' + 'selsa-troialign_faster-rcnn_r50-dc5_8xb1-7e_imagenetvid.py', + ('cpu', 'cuda')), + ]) + def test_selsa_forward_predict_mode(self, cfg_file, devices): + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + # forward in ``predict`` mode + model.eval() + with torch.no_grad(): + for i in range(3): + packed_inputs = demo_mm_inputs( + batch_size=1, frame_id=i, num_ref_imgs=2) + out_data = model.data_preprocessor(packed_inputs, False) + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 + assert isinstance(batch_results[0], TrackDataSample) diff --git a/tests/test_models/test_vis/__init__.py b/tests/test_models/test_vis/__init__.py new file mode 100644 index 000000000..ad36d81ea --- /dev/null +++ b/tests/test_models/test_vis/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import sys + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/tests/test_models/test_vis/test_mask2former.py b/tests/test_models/test_vis/test_mask2former.py new file mode 100644 index 000000000..7201dfb52 --- /dev/null +++ b/tests/test_models/test_vis/test_mask2former.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestMask2Former(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py', + ]) + def test_mask2former_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.backbone + assert model.track_head + + @parameterized.expand([ + ('vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py', + ('cpu', 'cuda')), + ]) + def test_mask2former_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_mask2former_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_key_imgs=2, + num_classes=2, + with_mask=True) + out_data = model.data_preprocessor(packed_inputs, True) + inputs, data_samples = out_data['inputs'], out_data['data_samples'] + # Test forward + data_samples[0].gt_instances[ + 'map_instances_to_img_idx'] = torch.tensor([0], device=device) + losses = model.forward(inputs, data_samples, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ('vis/mask2former/mask2former_r50_8xb2-8e_youtubevis2019.py', + ('cpu', 'cuda')), + ]) + def test_mask2former_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_mask2former_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=2, + frame_id=0, + image_shapes=[(3, 128, 128), (3, 128, 128)], + num_classes=2, + with_mask=True) + out_data = model.data_preprocessor(packed_inputs, False) + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 2 diff --git a/tests/test_models/test_vis/test_masktrack_rcnn.py b/tests/test_models/test_vis/test_masktrack_rcnn.py new file mode 100644 index 000000000..515a3e990 --- /dev/null +++ b/tests/test_models/test_vis/test_masktrack_rcnn.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import torch +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmtrack.registry import MODELS +from mmtrack.testing import demo_mm_inputs, get_model_cfg +from mmtrack.utils import register_all_modules + + +class TestMaskTrackRCNN(TestCase): + + @classmethod + def setUpClass(cls): + register_all_modules(init_default_scope=True) + + @parameterized.expand([ + 'vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py', # noqa: E501 + ]) + def test_mask_track_rcnn_init(self, cfg_file): + model = get_model_cfg(cfg_file) + + model = MODELS.build(model) + assert model.detector + assert model.track_head + assert model.tracker + + @parameterized.expand([ + ( + 'vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py', # noqa: E501 + ('cpu', 'cuda')), + ]) + def test_mask_track_rcnn_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_mask_track_rcnn_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + # _scope_ will be popped after build + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=1, + num_classes=1, + with_mask=True) + out_data = model.data_preprocessor(packed_inputs, True) + # Test forward + losses = model.forward(**out_data, mode='loss') + assert isinstance(losses, dict) + + @parameterized.expand([ + ( + 'vis/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py', # noqa: E501 + ('cpu', 'cuda')), + ]) + def test_mask_track_rcnn_forward_predict_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_mask_track_rcnn_forward_predict_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + _model = get_model_cfg(cfg_file) + model = MODELS.build(_model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = demo_mm_inputs( + batch_size=1, + frame_id=0, + num_ref_imgs=1, + num_classes=1, + with_mask=True) + out_data = model.data_preprocessor(packed_inputs, False) + # Test forward test + model.eval() + with torch.no_grad(): + batch_results = model.forward(**out_data, mode='predict') + assert len(batch_results) == 1 diff --git a/tests/test_strutures/test_reid_data_sample.py b/tests/test_strutures/test_reid_data_sample.py new file mode 100644 index 000000000..7cfa5382e --- /dev/null +++ b/tests/test_strutures/test_reid_data_sample.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np +import torch +from mmengine.structures import LabelData + +from mmtrack.structures import ReIDDataSample + + +def _equal(a, b): + if isinstance(a, (torch.Tensor, np.ndarray)): + return (a == b).all() + else: + return a == b + + +class TestReIDDataSample(TestCase): + + def test_init(self): + img_shape = (256, 128) + ori_shape = (64, 64) + num_classes = 5 + meta_info = dict( + img_shape=img_shape, ori_shape=ori_shape, num_classes=num_classes) + data_sample = ReIDDataSample(metainfo=meta_info) + self.assertIn('img_shape', data_sample) + self.assertIn('ori_shape', data_sample) + self.assertIn('num_classes', data_sample) + self.assertTrue(_equal(data_sample.get('img_shape'), img_shape)) + self.assertTrue(_equal(data_sample.get('ori_shape'), ori_shape)) + self.assertTrue(_equal(data_sample.get('num_classes'), num_classes)) + + def test_set_gt_label(self): + data_sample = ReIDDataSample(metainfo=dict(num_classes=5)) + method = getattr(data_sample, 'set_' + 'gt_label') + + # Test number + method(1) + label = data_sample.get('gt_label') + self.assertIsInstance(label, LabelData) + self.assertIsInstance(label.label, torch.LongTensor) + + # Test tensor with single number + method(torch.tensor(2)) + label = data_sample.get('gt_label') + self.assertIsInstance(label, LabelData) + self.assertIsInstance(label.label, torch.LongTensor) + + # Test array with single number + method(np.array(3)) + label = data_sample.get('gt_label') + self.assertIsInstance(label, LabelData) + self.assertIsInstance(label.label, torch.LongTensor) + + # Test tensor + _label = torch.tensor([1, 2, 3]) + method(_label) + label = data_sample.get('gt_label') + self.assertIsInstance(label, LabelData) + self.assertIsInstance(label.label, torch.Tensor) + self.assertTrue(_equal(label.label, _label)) + + # Test array + _label = np.array([1, 2, 3]) + method(_label) + label = data_sample.get('gt_label') + self.assertIsInstance(label, LabelData) + self.assertIsInstance(label.label, torch.Tensor) + self.assertTrue(_equal(label.label, torch.from_numpy(_label))) + + # Test Sequence + _label = [1, 2, 3.] 
+        method(_label)
+        label = data_sample.get('gt_label')
+        self.assertIsInstance(label, LabelData)
+        self.assertIsInstance(label.label, torch.Tensor)
+        self.assertTrue(_equal(label.label, torch.tensor(_label)))
+
+        # Test set num_classes
+        self.assertEqual(label.num_classes, 5)
+
+        # Test unavailable type
+        with self.assertRaisesRegex(TypeError, " is not"):
+            method('hi')
+
+    def test_set_gt_score(self):
+        data_sample = ReIDDataSample(metainfo={'num_classes': 5})
+        method = getattr(data_sample, 'set_' + 'gt_score')
+
+        # Test set
+        score = [0.1, 0.1, 0.6, 0.1, 0.1]
+        method(torch.tensor(score))
+        sample_gt_label = getattr(data_sample, 'gt_label')
+        self.assertIn('score', sample_gt_label)
+        torch.testing.assert_allclose(sample_gt_label.score, score)
+        self.assertEqual(sample_gt_label.num_classes, 5)
+
+        # Test set again
+        score = [0.2, 0.1, 0.5, 0.1, 0.1]
+        method(torch.tensor(score))
+        torch.testing.assert_allclose(sample_gt_label.score, score)
+
+        # Test invalid type
+        with self.assertRaisesRegex(AssertionError, 'be a torch.Tensor'):
+            method(score)
+
+        # Test invalid dims
+        with self.assertRaisesRegex(AssertionError, 'but got 2'):
+            method(torch.tensor([score]))
+
+        # Test invalid num_classes
+        with self.assertRaisesRegex(AssertionError, r'length of value \(6\)'):
+            method(torch.tensor(score + [0.1]))
+
+        # Test automatically inferring num_classes
+        data_sample = ReIDDataSample()
+        method = getattr(data_sample, 'set_gt_score')
+        method(torch.tensor(score))
+        sample_gt_label = getattr(data_sample, 'gt_label')
+        self.assertEqual(sample_gt_label.num_classes, len(score))
+
+    def test_del_gt_label(self):
+        data_sample = ReIDDataSample()
+        self.assertNotIn('gt_label', data_sample)
+        data_sample.set_gt_label(1)
+        self.assertIn('gt_label', data_sample)
+        del data_sample.gt_label
+        self.assertNotIn('gt_label', data_sample)
diff --git a/tests/test_strutures/test_track_data_sample.py b/tests/test_strutures/test_track_data_sample.py
new file mode 100644
index 000000000..0870f5a7f
--- /dev/null
+++ b/tests/test_strutures/test_track_data_sample.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase + +import numpy as np +import pytest +import torch +from mmengine.structures import InstanceData + +from mmtrack.structures import TrackDataSample + + +def _equal(a, b): + if isinstance(a, (torch.Tensor, np.ndarray)): + return (a == b).all() + else: + return a == b + + +class TestTrackDataSample(TestCase): + + def test_init(self): + meta_info = dict( + img_size=[256, 256], + scale_factor=np.array([1.5, 1.5]), + img_shape=torch.rand(4), + ref_img_size=[512, 512, 2], + ref_scale_factor=np.array([3, 3]), + ref_img_shape=torch.rand(8)) + + track_data_sample = TrackDataSample(metainfo=meta_info) + assert 'img_size' in track_data_sample + assert track_data_sample.img_size == [256, 256] + assert track_data_sample.get('img_size') == [256, 256] + + def test_setter(self): + track_data_sample = TrackDataSample() + + # test gt_instances + gt_instances_data = dict( + bboxes=torch.rand(4, 4), + labels=torch.rand(4), + masks=np.random.rand(4, 2, 2)) + gt_instances = InstanceData(**gt_instances_data) + track_data_sample.gt_instances = gt_instances + assert 'gt_instances' in track_data_sample + assert _equal(track_data_sample.gt_instances.bboxes, + gt_instances_data['bboxes']) + assert _equal(track_data_sample.gt_instances.labels, + gt_instances_data['labels']) + assert _equal(track_data_sample.gt_instances.masks, + gt_instances_data['masks']) + + # test ignored_instances + ignored_instances_data = dict( + bboxes=torch.rand(4, 4), labels=torch.rand(4)) + ignored_instances = InstanceData(**ignored_instances_data) + track_data_sample.ignored_instances = ignored_instances + assert 'ignored_instances' in track_data_sample + assert _equal(track_data_sample.ignored_instances.bboxes, + ignored_instances_data['bboxes']) + assert _equal(track_data_sample.ignored_instances.labels, + ignored_instances_data['labels']) + + # test proposals + proposals_data = dict(bboxes=torch.rand(4, 4), labels=torch.rand(4)) + proposals = InstanceData(**proposals_data) + track_data_sample.proposals = proposals + assert 'proposals' in track_data_sample + assert _equal(track_data_sample.proposals.bboxes, + proposals_data['bboxes']) + assert _equal(track_data_sample.proposals.labels, + proposals_data['labels']) + + # test pred_det_instances + pred_det_instances_data = dict( + bboxes=torch.rand(2, 4), + labels=torch.rand(2), + masks=np.random.rand(2, 2, 2)) + pred_det_instances = InstanceData(**pred_det_instances_data) + track_data_sample.pred_det_instances = pred_det_instances + assert 'pred_det_instances' in track_data_sample + assert _equal(track_data_sample.pred_det_instances.bboxes, + pred_det_instances_data['bboxes']) + assert _equal(track_data_sample.pred_det_instances.labels, + pred_det_instances_data['labels']) + assert _equal(track_data_sample.pred_det_instances.masks, + pred_det_instances_data['masks']) + + # test pred_track_instances + pred_track_instances_data = dict( + bboxes=torch.rand(2, 4), + labels=torch.rand(2), + masks=np.random.rand(2, 2, 2)) + pred_track_instances = InstanceData(**pred_track_instances_data) + track_data_sample.pred_track_instances = pred_track_instances + assert 'pred_track_instances' in track_data_sample + assert _equal(track_data_sample.pred_track_instances.bboxes, + pred_track_instances_data['bboxes']) + assert _equal(track_data_sample.pred_track_instances.labels, + pred_track_instances_data['labels']) + assert _equal(track_data_sample.pred_track_instances.masks, + pred_track_instances_data['masks']) + + # test type error + with pytest.raises(AssertionError): + 
track_data_sample.pred_det_instances = torch.rand(2, 4) + with pytest.raises(AssertionError): + track_data_sample.pred_track_instances = torch.rand(2, 4) + + def test_deleter(self): + gt_instances_data = dict( + bboxes=torch.rand(4, 4), + labels=torch.rand(4), + masks=np.random.rand(4, 2, 2)) + + track_data_sample = TrackDataSample() + gt_instances = InstanceData(data=gt_instances_data) + track_data_sample.gt_instances = gt_instances + assert 'gt_instances' in track_data_sample + del track_data_sample.gt_instances + assert 'gt_instances' not in track_data_sample diff --git a/tests/test_utils/test_image.py b/tests/test_utils/test_image.py new file mode 100644 index 000000000..eea6fe41e --- /dev/null +++ b/tests/test_utils/test_image.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmtrack.utils import gauss_blur + + +def test_gauss_blur(): + img = torch.randn(1, 2, 10, 10) + blurred_img = gauss_blur(img, kernel_size=(5, 5), sigma=(2, 2)) + assert blurred_img.shape == img.shape diff --git a/tests/test_core/test_utils/test_visualization.py b/tests/test_utils/test_mot_error_visualization.py similarity index 99% rename from tests/test_core/test_utils/test_visualization.py rename to tests/test_utils/test_mot_error_visualization.py index 7d0f5cd82..b6194dd38 100644 --- a/tests/test_core/test_utils/test_visualization.py +++ b/tests/test_utils/test_mot_error_visualization.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from mmtrack.core.utils import visualization as vis +from mmtrack.utils import mot_error_visualization as vis def test_imshow_mot_errors(): diff --git a/tests/test_utils/test_plot_sot_curve.py b/tests/test_utils/test_plot_sot_curve.py new file mode 100644 index 000000000..bc9067c7f --- /dev/null +++ b/tests/test_utils/test_plot_sot_curve.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from mmtrack.utils import (plot_norm_precision_curve, plot_precision_curve, + plot_success_curve) + + +def test_plot_success_curve(): + success_1 = np.arange(100, -1, -5) + (np.random.rand(21) - 0.5) * 4 + success_2 = np.arange(100, -1, -5) + (np.random.rand(21) - 0.5) * 4 + success = np.stack([success_1, success_2]) + plot_success_curve(success, ['tracker-1', 'tracker-2']) + + +def test_plot_norm_precision_curve(): + precision_1 = np.arange(0, 101, 2) + (np.random.rand(51) - 0.5) * 4 + precision_2 = np.arange(0, 101, 2) + (np.random.rand(51) - 0.5) * 4 + precision = np.stack([precision_1, precision_2]) + plot_norm_precision_curve(precision, ['tracker-1', 'tracker-2']) + + +def test_plot_precision_curve(): + precision_1 = np.arange(0, 101, 2) + (np.random.rand(51) - 0.5) * 4 + precision_2 = np.arange(0, 101, 2) + (np.random.rand(51) - 0.5) * 4 + precision = np.stack([precision_1, precision_2]) + plot_precision_curve(precision, ['tracker-1', 'tracker-2']) diff --git a/tests/test_version.py b/tests/test_version.py deleted file mode 100644 index 8453df477..000000000 --- a/tests/test_version.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmtrack import digit_version - - -def test_digit_version(): - assert digit_version('0.2.16') == (0, 2, 16, 0, 0, 0) - assert digit_version('1.2.3') == (1, 2, 3, 0, 0, 0) - assert digit_version('1.2.3rc0') == (1, 2, 3, 0, -1, 0) - assert digit_version('1.2.3rc1') == (1, 2, 3, 0, -1, 1) - assert digit_version('1.0rc0') == (1, 0, 0, 0, -1, 0) - assert digit_version('1.0') == digit_version('1.0.0') - assert digit_version('1.5.0+cuda90_cudnn7.6.3_lms') == digit_version('1.5') - assert digit_version('1.0.0dev') < digit_version('1.0.0a') - assert digit_version('1.0.0a') < digit_version('1.0.0a1') - assert digit_version('1.0.0a') < digit_version('1.0.0b') - assert digit_version('1.0.0b') < digit_version('1.0.0rc') - assert digit_version('1.0.0rc1') < digit_version('1.0.0') - assert digit_version('1.0.0') < digit_version('1.0.0post') - assert digit_version('1.0.0post') < digit_version('1.0.0post1') - assert digit_version('v1') == (1, 0, 0, 0, 0, 0) - assert digit_version('v1.1.5') == (1, 1, 5, 0, 0, 0) diff --git a/tests/test_visualization/test_local_visualizer.py b/tests/test_visualization/test_local_visualizer.py new file mode 100644 index 000000000..26413cb82 --- /dev/null +++ b/tests/test_visualization/test_local_visualizer.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from unittest import TestCase + +import cv2 +import numpy as np +import torch +from mmengine.structures import InstanceData + +from mmtrack.structures import TrackDataSample +from mmtrack.visualization import DetLocalVisualizer, TrackLocalVisualizer + + +class TestTrackLocalVisualizer(TestCase): + + @staticmethod + def _get_gt_instances(): + bboxes = np.array([[912, 484, 1009, 593], [1338, 418, 1505, 797]]) + masks = np.zeros((2, 1080, 1920), dtype=np.bool_) + for i, bbox in enumerate(bboxes): + masks[i, bbox[1]:bbox[3], bbox[0]:bbox[2]] = True + instances_data = dict( + bboxes=torch.tensor(bboxes), + masks=masks, + instances_id=torch.tensor([1, 2]), + labels=torch.tensor([0, 1])) + instances = InstanceData(**instances_data) + return instances + + @staticmethod + def _get_pred_instances(): + instances_data = dict( + bboxes=torch.tensor([[900, 500, 1000, 600], [1300, 400, 1500, + 800]]), + instances_id=torch.tensor([1, 2]), + labels=torch.tensor([0, 1]), + scores=torch.tensor([0.955, 0.876])) + instances = InstanceData(**instances_data) + return instances + + @staticmethod + def _assert_image_and_shape(out_file, out_shape): + assert os.path.exists(out_file) + drawn_img = cv2.imread(out_file) + assert drawn_img.shape == out_shape + os.remove(out_file) + + def test_add_datasample(self): + out_file = 'out_file.jpg' + h, w = 1080, 1920 + image = np.random.randint(0, 256, size=(h, w, 3)).astype('uint8') + gt_instances = self._get_gt_instances() + pred_instances = self._get_pred_instances() + track_data_sample = TrackDataSample() + track_data_sample.gt_instances = gt_instances + track_data_sample.pred_track_instances = pred_instances + + track_local_visualizer = TrackLocalVisualizer(alpha=0.2) + track_local_visualizer.dataset_meta = dict( + CLASSES=['pedestrian', 'vehicle']) + + # test gt_instances + track_local_visualizer.add_datasample('image', image, + track_data_sample, None) + + # test out_file + track_local_visualizer.add_datasample( + 'image', image, track_data_sample, None, out_file=out_file) + self._assert_image_and_shape(out_file, (h, w, 3)) + + # test gt_instances and pred_instances + track_local_visualizer.add_datasample( + 'image', image, track_data_sample, out_file=out_file) + 
self._assert_image_and_shape(out_file, (h, 2 * w, 3)) + + track_local_visualizer.add_datasample( + 'image', + image, + track_data_sample, + draw_gt=False, + out_file=out_file) + self._assert_image_and_shape(out_file, (h, w, 3)) + + track_local_visualizer.add_datasample( + 'image', + image, + track_data_sample, + draw_pred=False, + out_file=out_file) + self._assert_image_and_shape(out_file, (h, w, 3)) + + +class TestDetLocalVisualizer(TestCase): + + @staticmethod + def _get_gt_instances(): + instances_data = dict( + bboxes=np.array([[912, 484, 1009, 593], [1338, 418, 1505, 797]]), + labels=torch.tensor([0, 1]), + scores=torch.tensor([1., 1.])) + instances = InstanceData(**instances_data) + return instances + + @staticmethod + def _get_pred_instances(): + instances_data = dict( + bboxes=np.array([[900, 500, 1000, 600], [1300, 400, 1500, 800]]), + labels=torch.tensor([0, 1]), + scores=torch.tensor([0.955, 0.876])) + instances = InstanceData(**instances_data) + return instances + + @staticmethod + def _assert_image_and_shape(out_file, out_shape): + assert os.path.exists(out_file) + drawn_img = cv2.imread(out_file) + assert drawn_img.shape == out_shape + os.remove(out_file) + + def test_add_datasample(self): + out_file = 'out_file.jpg' + h, w = 1080, 1920 + image = np.random.randint(0, 256, size=(h, w, 3)).astype('uint8') + gt_instances = self._get_gt_instances() + pred_instances = self._get_pred_instances() + track_data_sample = TrackDataSample() + track_data_sample.gt_instances = gt_instances + track_data_sample.pred_det_instances = pred_instances + + det_local_visualizer = DetLocalVisualizer() + det_local_visualizer.dataset_meta = dict( + CLASSES=['pedestrian', 'vehicle']) + det_local_visualizer.add_datasample( + 'image', image, track_data_sample, out_file=out_file) + self._assert_image_and_shape(out_file, (h, 2 * w, 3)) diff --git a/tools/analysis/benchmark.py b/tools/analysis/benchmark.py deleted file mode 100644 index c425532aa..000000000 --- a/tools/analysis/benchmark.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import time - -import torch -from mmcv import Config -from mmcv.cnn import fuse_conv_bn -from mmcv.parallel import MMDataParallel -from mmcv.runner import load_checkpoint, wrap_fp16_model -from mmdet.datasets import replace_ImageToTensor - -from mmtrack.datasets import build_dataloader, build_dataset -from mmtrack.models import build_model - - -def parse_args(): - parser = argparse.ArgumentParser(description='MMTrack benchmark a model') - parser.add_argument('config', help='test config file path') - parser.add_argument('--checkpoint', help='checkpoint file') - parser.add_argument( - '--log-interval', default=50, help='interval of logging') - parser.add_argument( - '--fuse-conv-bn', - action='store_true', - help='Whether to fuse conv and bn, this will slightly increase' - 'the inference speed') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - cfg = Config.fromfile(args.config) - # import modules from string list. 
- if cfg.get('custom_imports', None): - from mmcv.utils import import_modules_from_strings - import_modules_from_strings(**cfg['custom_imports']) - # set cudnn_benchmark - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - if hasattr(cfg.model, 'detector'): - cfg.model.detector.pretrained = None - cfg.data.test.test_mode = True - - # build the dataloader - samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) - if samples_per_gpu > 1: - # Replace 'ImageToTensor' to 'DefaultFormatBundle' - cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) - dataset = build_dataset(cfg.data.test) - data_loader = build_dataloader( - dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=False, - shuffle=False) - - # build the model and load checkpoint - model = build_model(cfg.model) - # We need call `init_weights()` to load pretained weights in MOT task. - model.init_weights() - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(model) - if args.checkpoint is not None: - load_checkpoint(model, args.checkpoint, map_location='cpu') - if args.fuse_conv_bn: - model = fuse_conv_bn(model) - - model = MMDataParallel(model, device_ids=[0]) - - model.eval() - - # the first several iterations may be very slow so skip them - num_warmup = 5 - pure_inf_time = 0 - - # benchmark with 2000 image and take the average - for i, data in enumerate(data_loader): - - torch.cuda.synchronize() - start_time = time.perf_counter() - - with torch.no_grad(): - model(return_loss=False, rescale=True, **data) - - torch.cuda.synchronize() - elapsed = time.perf_counter() - start_time - - if i >= num_warmup: - pure_inf_time += elapsed - if (i + 1) % args.log_interval == 0: - fps = (i + 1 - num_warmup) / pure_inf_time - print(f'Done image [{i + 1:<3}/ 2000], fps: {fps:.1f} img / s') - - if (i + 1) == 2000: - pure_inf_time += elapsed - fps = (i + 1 - num_warmup) / pure_inf_time - print(f'Overall fps: {fps:.1f} img / s') - break - - -if __name__ == '__main__': - main() diff --git a/tools/analysis/mot/mot_param_search.py b/tools/analysis/mot/mot_param_search.py deleted file mode 100644 index 44df9f863..000000000 --- a/tools/analysis/mot/mot_param_search.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import os -from itertools import product - -import mmcv -import torch -from dotty_dict import dotty -from mmcv import Config, DictAction, get_logger, print_log -from mmcv.cnn import fuse_conv_bn -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, - wrap_fp16_model) -from mmdet.datasets import build_dataset - -from mmtrack.models import build_tracker - - -def parse_args(): - parser = argparse.ArgumentParser(description='mmtrack test model') - parser.add_argument('config', help='test config file path') - parser.add_argument('--checkpoint', help='checkpoint file') - parser.add_argument('--out', help='output result file') - parser.add_argument('--log', help='log file') - parser.add_argument( - '--fuse-conv-bn', - action='store_true', - help='Whether to fuse conv and bn, this will slightly increase' - 'the inference speed') - parser.add_argument( - '--format-only', - action='store_true', - help='Format the output results without perform evaluation. 
It is' - 'useful when you want to format the result to a specific format and ' - 'submit it to the test server') - parser.add_argument('--eval', type=str, nargs='+', help='eval types') - parser.add_argument('--show', action='store_true', help='show results') - parser.add_argument( - '--show-dir', help='directory where painted images will be saved') - parser.add_argument( - '--gpu-collect', - action='store_true', - help='whether to use gpu to collect results.') - parser.add_argument( - '--tmpdir', - help='tmp directory used for collecting results from multiple ' - 'workers, available when gpu-collect is not specified') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file.') - parser.add_argument( - '--eval-options', - nargs='+', - action=DictAction, - help='custom options for evaluation, the key-value pair in xxx=yyy ' - 'format will be kwargs for dataset.evaluate() function') - parser.add_argument( - '--launcher', - choices=['none', 'pytorch', 'slurm', 'mpi'], - default='none', - help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) - args = parser.parse_args() - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(args.local_rank) - return args - - -def get_search_params(cfg, search_params=None, prefix=None, logger=None): - if search_params is None: - search_params = dict() - for k, v in cfg.items(): - if prefix is not None: - entire_k = prefix + '.' + k - else: - entire_k = k - if isinstance(v, list): - print_log(f'search `{entire_k}` in {v}.', logger) - search_params[entire_k] = v - if isinstance(v, dict): - search_params = get_search_params(v, search_params, entire_k, - logger) - return search_params - - -def main(): - - args = parse_args() - - assert args.out or args.eval or args.format_only or args.show \ - or args.show_dir, \ - ('Please specify at least one operation (save/eval/format/show the ' - 'results / save the results) with the argument "--out", "--eval"' - ', "--format-only", "--show" or "--show-dir"') - - if args.eval and args.format_only: - raise ValueError('--eval and --format_only cannot be both specified') - - if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): - raise ValueError('The output file must be a pkl file.') - - cfg = Config.fromfile(args.config) - if cfg.get('USE_MMDET', False): - from mmdet.apis import multi_gpu_test, single_gpu_test - from mmdet.datasets import build_dataloader - from mmdet.models import build_detector as build_model - else: - from mmtrack.apis import multi_gpu_test, single_gpu_test - from mmtrack.datasets import build_dataloader - from mmtrack.models import build_model - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - # set cudnn_benchmark - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - # cfg.model.pretrains = None - if hasattr(cfg.model, 'detector'): - cfg.model.detector.pretrained = None - cfg.data.test.test_mode = True - - # init distributed env first, since logger depends on the dist info. 
- if args.launcher == 'none': - distributed = False - else: - distributed = True - init_dist(args.launcher, **cfg.dist_params) - - # build the dataloader - dataset = build_dataset(cfg.data.test) - data_loader = build_dataloader( - dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - - logger = get_logger('ParamsSearcher', log_file=args.log) - # get all cases - search_params = get_search_params(cfg.model.tracker, logger=logger) - combinations = [p for p in product(*search_params.values())] - search_cfgs = [] - for c in combinations: - search_cfg = dotty(cfg.model.tracker.copy()) - for i, k in enumerate(search_params.keys()): - search_cfg[k] = c[i] - search_cfgs.append(dict(search_cfg)) - print_log(f'Totally {len(search_cfgs)} cases.', logger) - # init with the first one - cfg.model.tracker = search_cfgs[0].copy() - - # build the model and load checkpoint - if cfg.get('test_cfg', False): - model = build_model( - cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) - else: - model = build_model(cfg.model) - # We need call `init_weights()` to load pretained weights in MOT task. - model.init_weights() - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(model) - - if args.checkpoint is not None: - checkpoint = load_checkpoint( - model, args.checkpoint, map_location='cpu') - if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: - model.CLASSES = checkpoint['meta']['CLASSES'] - if not hasattr(model, 'CLASSES'): - model.CLASSES = dataset.CLASSES - - if args.fuse_conv_bn: - model = fuse_conv_bn(model) - - if not distributed: - model = MMDataParallel(model, device_ids=[0]) - else: - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False) - - print_log(f'Record {cfg.search_metrics}.', logger) - for i, search_cfg in enumerate(search_cfgs): - if not distributed: - model.module.tracker = build_tracker(search_cfg) - outputs = single_gpu_test(model, data_loader, args.show, - args.show_dir) - else: - model.module.tracker = build_tracker(search_cfg) - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect) - rank, _ = get_dist_info() - if rank == 0: - if args.out: - print(f'\nwriting results to {args.out}') - mmcv.dump(outputs, args.out) - kwargs = {} if args.eval_options is None else args.eval_options - if args.format_only: - dataset.format_results(outputs, **kwargs) - if args.eval: - eval_kwargs = cfg.get('evaluation', {}).copy() - # hard-code way to remove EvalHook args - for key in ['interval', 'tmpdir', 'start', 'gpu_collect']: - eval_kwargs.pop(key, None) - eval_kwargs.update(dict(metric=args.eval, **kwargs)) - results = dataset.evaluate(outputs, **eval_kwargs) - _records = [] - for k in cfg.search_metrics: - if isinstance(results[k], float): - _records.append(f'{(results[k]):.3f}') - else: - _records.append(f'{(results[k])}') - print_log(f'{combinations[i]}: {_records}', logger) - - -if __name__ == '__main__': - main() diff --git a/tools/analysis/sot/sot_siamrpn_param_search.py b/tools/analysis/sot/sot_siamrpn_param_search.py deleted file mode 100644 index 89f7fc803..000000000 --- a/tools/analysis/sot/sot_siamrpn_param_search.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import argparse -import os - -import numpy as np -import torch -from mmcv import Config, DictAction, get_logger, print_log -from mmcv.cnn import fuse_conv_bn -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, - wrap_fp16_model) -from mmdet.datasets import build_dataset - - -def parse_range(range_str): - range_list = range_str.split(',') - assert len(range_list) == 3 and float(range_list[1]) >= float( - range_list[0]) - param = map(float, range_list) - return np.round(np.arange(*param), decimals=2) - - -def parse_args(): - parser = argparse.ArgumentParser(description='mmtrack test model') - parser.add_argument('config', help='test config file path') - parser.add_argument('--checkpoint', help='checkpoint file') - parser.add_argument( - '--penalty-k-range', - type=parse_range, - help="the range of hyper-parameter 'penalty_k' in SiamRPN++; the format \ - is 'start,stop,step'") - parser.add_argument( - '--lr-range', - type=parse_range, - help="the range of hyper-parameter 'lr' in SiamRPN++; the format is \ - 'start,stop,step'") - parser.add_argument( - '--win-influ-range', - type=parse_range, - help="the range of hyper-parameter 'window_influence' in SiamRPN++; the \ - format is 'start,stop,step'") - parser.add_argument( - '--fuse-conv-bn', - action='store_true', - help='Whether to fuse conv and bn, this will slightly increase' - 'the inference speed') - parser.add_argument('--log', help='log file', default=None) - parser.add_argument('--eval', type=str, nargs='+', help='eval types') - parser.add_argument('--show', action='store_true', help='show results') - parser.add_argument( - '--show-score-thr', - type=float, - default=0.3, - help='score threshold (default: 0.3)') - parser.add_argument( - '--show-dir', help='directory where painted images will be saved') - parser.add_argument( - '--gpu-collect', - action='store_true', - help='whether to use gpu to collect results.') - parser.add_argument( - '--tmpdir', - help='tmp directory used for collecting results from multiple ' - 'workers, available when gpu-collect is not specified') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file.') - parser.add_argument( - '--eval-options', - nargs='+', - action=DictAction, - help='custom options for evaluation, the key-value pair in xxx=yyy ' - 'format will be kwargs for dataset.evaluate() function') - parser.add_argument( - '--launcher', - choices=['none', 'pytorch', 'slurm', 'mpi'], - default='none', - help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) - args = parser.parse_args() - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(args.local_rank) - return args - - -def main(): - args = parse_args() - - assert args.eval or args.show \ - or args.show_dir, \ - ('Please specify at least one operation (eval/show the ' - 'results) with the argument "--eval"' - ', "--show" or "--show-dir"') - - cfg = Config.fromfile(args.config) - - if cfg.get('USE_MMDET', False): - from mmdet.apis import multi_gpu_test, single_gpu_test - from mmdet.datasets import build_dataloader - from mmdet.models import build_detector as build_model - if 'detector' in cfg.model: - cfg.model = cfg.model.detector - elif cfg.get('USE_MMCLS', False): - from mmtrack.apis import multi_gpu_test, single_gpu_test - from mmtrack.datasets import build_dataloader - from 
mmtrack.models import build_reid as build_model - if 'reid' in cfg.model: - cfg.model = cfg.model.reid - else: - from mmtrack.apis import multi_gpu_test, single_gpu_test - from mmtrack.datasets import build_dataloader - from mmtrack.models import build_model - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - # set cudnn_benchmark - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - cfg.data.test.test_mode = True - - # init distributed env first, since logger depends on the dist info. - if args.launcher == 'none': - distributed = False - else: - distributed = True - init_dist(args.launcher, **cfg.dist_params) - - # build the dataloader - dataset = build_dataset(cfg.data.test) - data_loader = build_dataloader( - dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - - logger = get_logger('SOTParamsSearcher', log_file=args.log) - - # build the model and load checkpoint - if cfg.get('test_cfg', False): - model = build_model( - cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) - else: - model = build_model(cfg.model) - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(model) - if args.checkpoint is not None: - checkpoint = load_checkpoint( - model, args.checkpoint, map_location='cpu') - if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: - model.CLASSES = checkpoint['meta']['CLASSES'] - if not hasattr(model, 'CLASSES'): - model.CLASSES = dataset.CLASSES - - if args.fuse_conv_bn: - model = fuse_conv_bn(model) - - if not distributed: - model = MMDataParallel(model, device_ids=[0]) - else: - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False) - - # init best_score, best_results and best parames - if 'meta' in checkpoint and 'hook_msgs' in checkpoint[ - 'meta'] and 'best_score' in checkpoint['meta']['hook_msgs']: - best_score = checkpoint['meta']['hook_msgs']['best_score'] - else: - best_score = 0 - - key_metric = cfg.evaluation.save_best - best_result = {f'{key_metric}': best_score} - - best_params = dict( - penalty_k=cfg.model.test_cfg.rpn.penalty_k, - lr=cfg.model.test_cfg.rpn.lr, - win_influ=cfg.model.test_cfg.rpn.window_influence) - print_log(f'init best score as: {best_score}', logger) - print_log(f'init best params as: {best_params}', logger) - - num_cases = len(args.penalty_k_range) * len(args.lr_range) * len( - args.win_influ_range) - case_count = 0 - - # compare function setting in parameter search - rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} - compare_func = rule_map[cfg.evaluation.rule] - - for penalty_k in args.penalty_k_range: - for lr in args.lr_range: - for win_influ in args.win_influ_range: - case_count += 1 - cfg.model.test_cfg.rpn.penalty_k = penalty_k - cfg.model.test_cfg.rpn.lr = lr - cfg.model.test_cfg.rpn.window_influence = win_influ - print_log(f'-----------[{case_count}/{num_cases}]-----------', - logger) - print_log( - f'penalty_k={penalty_k} lr={lr} win_influence={win_influ}', - logger) - - if not distributed: - outputs = single_gpu_test( - model, - data_loader, - args.show, - args.show_dir, - show_score_thr=args.show_score_thr) - else: - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect) - - rank, _ = get_dist_info() - if rank == 0: - kwargs = args.eval_options if args.eval_options else {} - if args.eval: - eval_kwargs = cfg.get('evaluation', {}).copy() - # hard-code way to remove EvalHook 
args - eval_hook_args = [ - 'interval', 'tmpdir', 'start', 'gpu_collect', - 'save_best', 'rule', 'by_epoch' - ] - for key in eval_hook_args: - eval_kwargs.pop(key, None) - eval_kwargs.update(dict(metric=args.eval, **kwargs)) - eval_results = dataset.evaluate(outputs, **eval_kwargs) - print_log(f'evaluation results: {eval_results}', - logger) - print_log('------------------------------------------', - logger) - - if compare_func(eval_results[key_metric], - best_result[key_metric]): - best_result = eval_results - best_params['penalty_k'] = penalty_k, - best_params['lr'] = lr, - best_params['win_influ'] = win_influ - - print_log( - f'The current best evaluation results: \ - {best_result}', logger) - print_log(f'The current best params: {best_params}', - logger) - - print_log( - f'After parameter searching, the best evaluation results: \ - {best_result}', logger) - print_log(f'After parameter searching, the best params: {best_params}', - logger) - - -if __name__ == '__main__': - main() diff --git a/tools/analysis/analyze_logs.py b/tools/analysis_tools/analyze_logs.py similarity index 100% rename from tools/analysis/analyze_logs.py rename to tools/analysis_tools/analyze_logs.py diff --git a/tools/analysis_tools/benchmark.py b/tools/analysis_tools/benchmark.py new file mode 100644 index 000000000..2d25436c9 --- /dev/null +++ b/tools/analysis_tools/benchmark.py @@ -0,0 +1,133 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os + +from mmengine import Config, DictAction, MMLogger +from mmengine.dist import init_dist +from mmengine.utils import mkdir_or_exist + +from mmtrack.utils import register_all_modules +from mmtrack.utils.benchmark import (DataLoaderBenchmark, DatasetBenchmark, + InferenceBenchmark) + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMTrack benchmark') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', help='checkpoint file') + parser.add_argument( + '--task', + choices=['inference', 'dataloader', 'dataset'], + default='dataloader', + help='Which task do you want to go to benchmark') + parser.add_argument( + '--repeat-num', + type=int, + default=1, + help='number of repeat times of measurement for averaging the results') + parser.add_argument( + '--max-iter', type=int, default=2000, help='num of max iter') + parser.add_argument( + '--log-interval', type=int, default=50, help='interval of logging') + parser.add_argument( + '--num-warmup', type=int, default=5, help='Number of warmup') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--dataset-type', + choices=['train', 'val', 'test'], + default='test', + help='Benchmark dataset type. only supports train, val and test') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing ' + 'benchmark metrics') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def inference_benchmark(args, cfg, distributed, logger): + benchmark = InferenceBenchmark( + cfg, + args.checkpoint, + distributed, + args.fuse_conv_bn, + args.max_iter, + args.log_interval, + args.num_warmup, + logger=logger) + return benchmark + + +def dataloader_benchmark(args, cfg, distributed, logger): + benchmark = DataLoaderBenchmark( + cfg, + distributed, + args.dataset_type, + args.max_iter, + args.log_interval, + args.num_warmup, + logger=logger) + return benchmark + + +def dataset_benchmark(args, cfg, distributed, logger): + benchmark = DatasetBenchmark( + cfg, + args.dataset_type, + args.max_iter, + args.log_interval, + args.num_warmup, + logger=logger) + return benchmark + + +def main(): + register_all_modules() + + args = parse_args() + cfg = Config.fromfile(args.config) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + distributed = False + if args.launcher != 'none': + init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {})) + distributed = True + + log_file = None + if args.work_dir: + log_file = os.path.join(args.work_dir, 'benchmark.log') + mkdir_or_exist(args.work_dir) + + logger = MMLogger.get_instance( + 'mmtrack', log_file=log_file, log_level='INFO') + + benchmark = eval(f'{args.task}_benchmark')(args, cfg, distributed, logger) + benchmark.run(args.repeat_num) + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/browse_dataset.py b/tools/analysis_tools/browse_dataset.py new file mode 100644 index 000000000..ffc0fe7f3 --- /dev/null +++ b/tools/analysis_tools/browse_dataset.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +import mmengine +import numpy as np +from mmdet.models.utils import mask2ndarray +from mmengine import Config, DictAction +from mmengine.structures import InstanceData + +from mmtrack.registry import DATASETS, VISUALIZERS +from mmtrack.structures import TrackDataSample +from mmtrack.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=2, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # register all modules in mmtrack into the registries
+    register_all_modules(init_default_scope=True)
+
+    dataset = DATASETS.build(cfg.train_dataloader.dataset)
+
+    visualizer = VISUALIZERS.build(cfg.visualizer)
+    visualizer.dataset_meta = dataset.metainfo
+
+    progress_bar = mmengine.ProgressBar(len(dataset))
+    gt_sample = TrackDataSample()  # just to wrap the `gt_instances`
+    for idx, item in enumerate(dataset):
+        data_sample = item['data_samples']
+        for img_key, imgs in item['inputs'].items():
+            img_paths = data_sample.get(img_key + '_path')
+            img_key_prefix = img_key[:-3]
+            gt_instances = data_sample.get(img_key_prefix + 'gt_instances')
+            if not isinstance(img_paths, list):
+                img_paths = [img_paths]
+            for img_idx in range(imgs.shape[0]):
+                new_gt_instances = InstanceData()
+                img_path = img_paths[img_idx]
+                img = imgs[img_idx].permute(1, 2, 0).numpy()
+                # For each item, their file names may be the same.
+                # Create a new folder to avoid overwriting the image files.
+                out_file = osp.join(args.output_dir,
+                                    str(idx).zfill(6),
+                                    f'{img_key_prefix}img_{img_idx}.jpg'
+                                    ) if args.output_dir is not None else None
+
+                img = img[..., [2, 1, 0]]  # bgr to rgb
+                # Get the correct index for each instance by using
+                # map_instances_to_img_idx
+                map_instances_to_img_idx = gt_instances.\
+                    map_instances_to_img_idx.numpy()
+                idx_bool_flag = (map_instances_to_img_idx == img_idx)
+                for key in ['bboxes', 'labels', 'instances_id']:
+                    if key in gt_instances:
+                        new_gt_instances[key] = gt_instances[key][
+                            idx_bool_flag]
+
+                gt_masks = gt_instances.get('masks', None)
+                if gt_masks is not None:
+                    gt_masks = gt_masks[idx_bool_flag]
+                    masks = mask2ndarray(gt_masks)
+                    # `np.bool` is deprecated in recent NumPy; use `np.bool_`.
+                    new_gt_instances['masks'] = masks.astype(np.bool_)
+
+                gt_sample.gt_instances = new_gt_instances
+
+                visualizer.add_datasample(
+                    osp.basename(img_path),
+                    img,
+                    data_sample=gt_sample,
+                    draw_pred=False,
+                    show=not args.not_show,
+                    wait_time=args.show_interval,
+                    out_file=out_file)
+                # Record file path mapping.
+ if args.output_dir is not None: + with open( + osp.join(args.output_dir, + str(idx).zfill(6), 'info.txt'), 'a') as f: + f.write(f'The source filepath of' + f' `{img_key_prefix}img_{img_idx}.jpg`' + f' is `{img_path}`.\n') + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/analysis/mot/dist_mot_search.sh b/tools/analysis_tools/mot/dist_mot_search.sh similarity index 100% rename from tools/analysis/mot/dist_mot_search.sh rename to tools/analysis_tools/mot/dist_mot_search.sh diff --git a/tools/analysis/mot/mot_dummy_results.py b/tools/analysis_tools/mot/mot_dummy_results.py similarity index 92% rename from tools/analysis/mot/mot_dummy_results.py rename to tools/analysis_tools/mot/mot_dummy_results.py index 0c041e263..baa9fd323 100644 --- a/tools/analysis/mot/mot_dummy_results.py +++ b/tools/analysis_tools/mot/mot_dummy_results.py @@ -3,7 +3,7 @@ import os import os.path as osp -import mmcv +import mmengine def parse_args(): @@ -17,7 +17,7 @@ def parse_args(): def main(): args = parse_args() - anns = mmcv.load(args.json_file) + anns = mmengine.load(args.json_file) if not osp.exists(args.out_folder): os.makedirs(args.out_folder) diff --git a/tools/analysis/mot/mot_error_visualize.py b/tools/analysis_tools/mot/mot_error_visualize.py similarity index 77% rename from tools/analysis/mot/mot_error_visualize.py rename to tools/analysis_tools/mot/mot_error_visualize.py index 62f41644e..8c7e0e3f8 100644 --- a/tools/analysis/mot/mot_error_visualize.py +++ b/tools/analysis_tools/mot/mot_error_visualize.py @@ -2,22 +2,25 @@ import argparse import os import os.path as osp +import re import mmcv import motmetrics as mm import numpy as np -from mmcv import Config -from mmcv.utils import print_log +from mmengine import Config +from mmengine.logging import print_log +from torch.utils.data import Dataset -from mmtrack.core.utils import imshow_mot_errors -from mmtrack.datasets import build_dataset +from mmtrack.registry import DATASETS +from mmtrack.utils import imshow_mot_errors, register_all_modules def parse_args(): parser = argparse.ArgumentParser( description='visualize errors for multiple object tracking') parser.add_argument('config', help='path of the config file') - parser.add_argument('--result-file', help='path of the inference result') + parser.add_argument( + '--result-dir', help='directory of the inference result') parser.add_argument( '--out-dir', help='directory where painted images or videos will be saved') @@ -37,11 +40,11 @@ def parse_args(): return args -def compare_res_gts(resfiles, dataset, video_name): +def compare_res_gts(results_dir: str, dataset: Dataset, video_name: str): """Evaluate the results of the video. Args: - resfiles (dict): A dict containing the directory of the MOT results. + results_dir (str): the directory of the MOT results. dataset (Dataset): MOT dataset of the video to be evaluated. video_name (str): Name of the video to be evaluated. @@ -50,17 +53,19 @@ def compare_res_gts(resfiles, dataset, video_name): res is the results of inference and gt is the ground truth. 
""" if 'half-train' in dataset.ann_file: - gt_file = osp.join(dataset.img_prefix, + gt_file = osp.join(dataset.data_prefix['img_path'], f'{video_name}/gt/gt_half-train.txt') elif 'half-val' in dataset.ann_file: - gt_file = osp.join(dataset.img_prefix, + gt_file = osp.join(dataset.data_prefix['img_path'], f'{video_name}/gt/gt_half-val.txt') else: - gt_file = osp.join(dataset.img_prefix, f'{video_name}/gt/gt.txt') - res_file = osp.join(resfiles['track'], f'{video_name}.txt') + gt_file = osp.join(dataset.data_prefix['img_path'], + f'{video_name}/gt/gt.txt') + res_file = osp.join(results_dir, f'{video_name}.txt') gt = mm.io.loadtxt(gt_file) res = mm.io.loadtxt(res_file) - ini_file = osp.join(dataset.img_prefix, f'{video_name}/seqinfo.ini') + ini_file = osp.join(dataset.data_prefix['img_path'], + f'{video_name}/seqinfo.ini') if osp.exists(ini_file): acc, ana = mm.utils.CLEAR_MOT_M(gt, res, ini_file) else: @@ -76,9 +81,6 @@ def main(): ('Please specify at least one operation (show the results ' '/ save the results) with the argument "--show" or "--out-dir"') - if not args.result_file.endswith(('.pkl', 'pickle')): - raise ValueError('The result file must be a pkl file.') - if args.out_dir is not None: os.makedirs(args.out_dir, exist_ok=True) @@ -88,27 +90,31 @@ def main(): 'and the blue bounding box denotes ID switch.') cfg = Config.fromfile(args.config) - dataset = build_dataset(cfg.data.val, dict(test_mode=True)) - results = mmcv.load(args.result_file) + + register_all_modules(init_default_scope=True) + dataset = DATASETS.build(cfg.val_dataloader.dataset) # create index from frame_id to filename filenames_dict = dict() - for data_info in dataset.data_infos: - video_name = data_info['filename'].split(os.sep, 1)[0] - frame_id = int(data_info['filename'].rsplit(os.sep, - 1)[-1].split('.')[0]) + for i in range(len(dataset)): + data_info = dataset.get_data_info(i) + # the `data_info['file_name']` usually has the same format + # with "MOT17-09-DPM/img1/000003.jpg" + # split with both '\' and '/' to be compatible with different OS. 
+ split_path = re.split(r'[\\/]', data_info['file_name']) + video_name = split_path[-3] + frame_id = int(split_path[-1].split('.')[0]) if video_name not in filenames_dict: filenames_dict[video_name] = dict() - filenames_dict[video_name][frame_id] = data_info['filename'] - - # format the results to txts - resfile_path, resfiles, video_names, tmp_dir = dataset.format_results( - results, None, ['track']) + # the data_info['img_path'] usually has the same format + # with `img_path_prefix + "MOT17-09-DPM/img1/000003.jpg"` + filenames_dict[video_name][frame_id] = data_info['img_path'] + video_names = tuple(filenames_dict.keys()) for video_name in video_names: print_log(f'Start processing video {video_name}') - acc, res, gt = compare_res_gts(resfiles, dataset, video_name) + acc, res, gt = compare_res_gts(args.result_dir, dataset, video_name) frames_id_list = sorted( list(set(acc.mot_events.index.get_level_values(0)))) @@ -118,8 +124,7 @@ def main(): cur_res = res.loc[frame_id] if frame_id in res.index else None cur_gt = gt.loc[frame_id] if frame_id in gt.index else None # path of image - img = osp.join(dataset.img_prefix, - filenames_dict[video_name][frame_id]) + img = filenames_dict[video_name][frame_id] fps = events[events.Type == 'FP'] fns = events[events.Type == 'MISS'] idsws = events[events.Type == 'SWITCH'] diff --git a/tools/analysis_tools/mot/mot_param_search.py b/tools/analysis_tools/mot/mot_param_search.py new file mode 100644 index 000000000..3d7b1866c --- /dev/null +++ b/tools/analysis_tools/mot/mot_param_search.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp +from itertools import product + +from mmengine.config import Config, DictAction +from mmengine.dist import get_dist_info +from mmengine.logging import MMLogger, print_log +from mmengine.runner import Runner + +from mmtrack.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMTrack test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def get_search_params(cfg, search_params=None, prefix=None, logger=None): + if search_params is None: + search_params = dict() + for k, v in cfg.items(): + if prefix is not None: + entire_k = prefix + '.' 
+ k + else: + entire_k = k + if isinstance(v, list): + print_log(f'search `{entire_k}` in {v}.', logger) + search_params[entire_k] = v + if isinstance(v, dict): + search_params = get_search_params(v, search_params, entire_k, + logger) + return search_params + + +def main(): + + args = parse_args() + + # register all modules in mmtrack into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + logger = MMLogger.get_instance(name='ParamsSearcher', logger_name='Logger') + # get all cases + search_params = get_search_params(cfg.model.tracker, logger=logger) + search_params_names = tuple(search_params.keys()) + all_search_cases = [] + for values in product(*search_params.values()): + search = dict() + for k, v in zip(search_params_names, values): + search[k] = v + all_search_cases.append(search) + + print_log(f'Totally {len(all_search_cases)} cases.', logger) + + search_metrics = [] + metrics_types = [cfg.test_evaluator.metric] if isinstance( + cfg.test_evaluator.metric, str) else cfg.test_evaluator.metric + if 'HOTA' in metrics_types: + search_metrics.extend(['HOTA', 'AssA', 'DetA']) + if 'CLEAR' in metrics_types: + search_metrics.extend( + ['MOTA', 'MOTP', 'IDSW', 'TP', 'FN', 'FP', 'Frag', 'MT', 'ML']) + if 'Identity' in metrics_types: + search_metrics.extend(['IDF1', 'IDTP', 'IDFN', 'IDFP', 'IDP', 'IDR']) + print_log(f'Record {search_metrics}.', logger) + + runner = Runner.from_cfg(cfg) + for case in all_search_cases: + for name, value in case.items(): + if hasattr(runner.model, 'module'): + setattr(runner.model.module.tracker, name, value) + else: + setattr(runner.model.tracker, name, value) + runner.test() + rank, _ = get_dist_info() + if rank == 0: + _records = [] + for metric in search_metrics: + res = runner.message_hub.get_scalar( + 'test/motchallenge-metric/' + metric).current() + if isinstance(res, float): + _records.append(f'{res:.3f}') + else: + _records.append(f'{res}') + print_log(f'-------------- {case}: {_records} --------------', + logger) + + +if __name__ == '__main__': + main() diff --git a/tools/analysis/mot/slurm_mot_search.sh b/tools/analysis_tools/mot/slurm_mot_search.sh similarity index 100% rename from tools/analysis/mot/slurm_mot_search.sh rename to tools/analysis_tools/mot/slurm_mot_search.sh diff --git a/tools/analysis/sot/dist_sot_siamrpn_search.sh b/tools/analysis_tools/sot/dist_sot_siamrpn_search.sh similarity index 100% rename from tools/analysis/sot/dist_sot_siamrpn_search.sh rename to tools/analysis_tools/sot/dist_sot_siamrpn_search.sh diff --git a/tools/analysis/sot/slurm_sot_siamrpn_search.sh b/tools/analysis_tools/sot/slurm_sot_siamrpn_search.sh similarity index 100% rename from tools/analysis/sot/slurm_sot_siamrpn_search.sh rename to tools/analysis_tools/sot/slurm_sot_siamrpn_search.sh diff --git a/tools/analysis_tools/sot/sot_playback.py 
b/tools/analysis_tools/sot/sot_playback.py new file mode 100644 index 000000000..ee336ba34 --- /dev/null +++ b/tools/analysis_tools/sot/sot_playback.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile +from argparse import ArgumentParser + +import mmcv +import mmengine +import numpy as np + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('video_path', help='video path') + parser.add_argument('track_results', help='the tracked results') + parser.add_argument('--gt_bboxes', help='the groundtruth bboxes file') + parser.add_argument('--output', help='output video file (mp4 format)') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='whether to show visualizations.') + parser.add_argument('--fps', help='FPS of the output video') + args = parser.parse_args() + return args + + +def main(args): + + # load images + if osp.isdir(args.video_path): + imgs = sorted( + filter(lambda x: x.endswith(('.jpg', '.png', '.jpeg')), + os.listdir(args.video_path)), + key=lambda x: int(x.split('.')[0])) + IN_VIDEO = False + else: + imgs = mmcv.VideoReader(args.video_path) + IN_VIDEO = True + + OUT_VIDEO = False + # define output + if args.output is not None: + if args.output.endswith('.mp4'): + OUT_VIDEO = True + out_dir = tempfile.TemporaryDirectory() + out_path = out_dir.name + _out = args.output.rsplit(os.sep, 1) + if len(_out) > 1: + os.makedirs(_out[0], exist_ok=True) + else: + out_path = args.output + os.makedirs(out_path, exist_ok=True) + fps = args.fps + if args.show or OUT_VIDEO: + if fps is None: + if IN_VIDEO: + fps = imgs.fps + if OUT_VIDEO: + raise ValueError('Please set the FPS for the output video.') + else: + fps = int(fps) + + prog_bar = mmengine.ProgressBar(len(imgs)) + track_bboxes = mmengine.list_from_file(args.track_results) + if args.gt_bboxes is not None: + gt_bboxes = mmengine.list_from_file(args.gt_bboxes) + assert len(track_bboxes) == len(gt_bboxes) + + # test and show/save the images + for i, img in enumerate(imgs): + if isinstance(img, str): + img_path = osp.join(args.video_path, img) + img = mmcv.imread(img_path) + + if args.output is not None: + if IN_VIDEO or OUT_VIDEO: + out_file = osp.join(out_path, f'{i:06d}.jpg') + else: + out_file = osp.join(out_path, img_path.rsplit(os.sep, 1)[-1]) + else: + out_file = None + + draw_bboxes = [] + track_bbox = np.array(list(map(float, + track_bboxes[i].split(','))))[None] + track_bbox[:, 2] += track_bbox[:, 0] + track_bbox[:, 3] += track_bbox[:, 1] + draw_bboxes.append(track_bbox) + colors = 'green' + if args.gt_bboxes is not None: + gt_bbox = np.array(list(map(float, gt_bboxes[i].split(','))))[None] + gt_bbox[:, 2] += gt_bbox[:, 0] + gt_bbox[:, 3] += gt_bbox[:, 1] + draw_bboxes.append(gt_bbox) + colors = ['green', 'blue'] + + mmcv.imshow_bboxes( + img, + draw_bboxes, + show=args.show, + colors=colors, + wait_time=int(1000. / fps) if fps else 0, + out_file=out_file, + thickness=2) + prog_bar.update() + + if args.output and OUT_VIDEO: + print( + f'\nmaking the output video at {args.output} with a FPS of {fps}') + mmcv.frames2video(out_path, args.output, fps=fps, fourcc='mp4v') + out_dir.cleanup() + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/tools/analysis_tools/sot/sot_plot_curve.py b/tools/analysis_tools/sot/sot_plot_curve.py new file mode 100644 index 000000000..f8386e69a --- /dev/null +++ b/tools/analysis_tools/sot/sot_plot_curve.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import argparse +import os +import os.path as osp + +import mmengine +import numpy as np + +from mmtrack.utils import (plot_norm_precision_curve, plot_precision_curve, + plot_success_curve) + + +def main(): + parser = argparse.ArgumentParser(description='sot plot') + parser.add_argument( + 'sot_eval_res', + help='the json/yaml/pickle file path of evaluation results. It can ' + "be a file or path directory. If it's a path directory, all the " + 'files in this directory will be loaded. The final loaded ' + 'content must be a collection of name/value pairs. The ' + 'name is a tracker name. The value is also a collection of name/value ' + 'pairs in the format dict(success=np.ndarray, ' + 'norm_precision=np.ndarray, precision=np.ndarray). The metrics have ' + 'shape (M, ), where M is the number of values corresponding to ' + 'different thresholds.') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='whether to show the plotting results') + parser.add_argument( + '--plot_save_path', + default=None, + type=str, + help='The saved path of the figure.') + args = parser.parse_args() + + if osp.isdir(args.sot_eval_res): + all_eval_results = dict() + for res_file in os.listdir(args.sot_eval_res): + if res_file.endswith(('.json', '.yml', '.yaml', '.pkl')): + eval_res = mmengine.load(osp.join(args.sot_eval_res, res_file)) + all_eval_results.update(eval_res) + else: + assert osp.isfile( + args.sot_eval_res), f'The file {args.sot_eval_res} does not exist' + all_eval_results = mmengine.load(args.sot_eval_res) + assert isinstance(all_eval_results, dict) + + tracker_names = [] + all_success = [] + all_norm_precision = [] + all_precision = [] + for tracker_name, scores in all_eval_results.items(): + tracker_names.append(tracker_name) + if 'success' in scores: + all_success.append(scores['success']) + if 'precision' in scores: + all_precision.append(scores['precision']) + if 'norm_precision' in scores: + all_norm_precision.append(scores['norm_precision']) + + if len(all_success) > 0: + all_success = np.stack(all_success) + plot_success_curve( + all_success, + tracker_names=tracker_names, + plot_save_path=args.plot_save_path, + show=args.show) + if len(all_precision) > 0: + all_precision = np.stack(all_precision) + plot_precision_curve( + all_precision, + tracker_names=tracker_names, + plot_save_path=args.plot_save_path, + show=args.show) + if len(all_norm_precision) > 0: + all_norm_precision = np.stack(all_norm_precision) + plot_norm_precision_curve( + all_norm_precision, + tracker_names=tracker_names, + plot_save_path=args.plot_save_path, + show=args.show) + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/sot/sot_siamrpn_param_search.py b/tools/analysis_tools/sot/sot_siamrpn_param_search.py new file mode 100644 index 000000000..4a40b71b5 --- /dev/null +++ b/tools/analysis_tools/sot/sot_siamrpn_param_search.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os +import os.path as osp + +import numpy as np +from mmengine.config import Config, DictAction +from mmengine.dist import get_dist_info +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmtrack.utils import register_all_modules + + +def parse_range(range_str): + range_list = range_str.split(',') + assert len(range_list) == 3 and float(range_list[1]) >= float( + range_list[0]) + param = map(float, range_list) + return np.round(np.arange(*param), decimals=2) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMTrack test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--penalty-k-range', + type=parse_range, + help="the range of hyper-parameter 'penalty_k' in SiamRPN++; the format \ + is 'start,stop,step'") + parser.add_argument( + '--lr-range', + type=parse_range, + help="the range of hyper-parameter 'lr' in SiamRPN++; the format is \ + 'start,stop,step'") + parser.add_argument( + '--win-influ-range', + type=parse_range, + help="the range of hyper-parameter 'window_influence' in SiamRPN++; the \ + format is 'start,stop,step'") + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def parameter_search(runner, args): + cfg = runner.cfg + logger = runner.logger + + # calculate the number of all search cases and set comparing standard. + num_cases = len(args.penalty_k_range) * len(args.lr_range) * len( + args.win_influ_range) + case_count = 0 + # compare function setting in parameter search. Now, the default comparing + # ruler is `greater` because the model doesn't record comparing ruler + # of metrics in ``MMEngine``. 
+ rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + compare_func = rule_map['greater'] + + if cfg.test_evaluator.metric == 'OPE': + eval_metrics = ['success', 'norm_precision', 'precision'] + key_metric = 'success' + else: + eval_metrics = ['eao', 'accuracy', 'robustness', 'num_fails'] + key_metric = 'eao' + + checkpoint = runner.load_checkpoint(args.checkpoint) + + # init best_score, best_results and best parames + if 'meta' in checkpoint and 'hook_msgs' in checkpoint[ + 'meta'] and key_metric in checkpoint['meta']['hook_msgs']: + best_score = checkpoint['meta']['hook_msgs'][key_metric] + else: + best_score = 0 + best_result = {f'{key_metric}': best_score} + + best_params = dict( + penalty_k=cfg.model.test_cfg.rpn.penalty_k, + lr=cfg.model.test_cfg.rpn.lr, + win_influ=cfg.model.test_cfg.rpn.window_influence) + print_log(f'init best score as: {best_score}', logger) + print_log(f'init best params as: {best_params}', logger) + + for penalty_k in args.penalty_k_range: + for lr in args.lr_range: + for win_influ in args.win_influ_range: + case_count += 1 + runner.model.test_cfg.rpn.penalty_k = penalty_k + runner.model.test_cfg.rpn.lr = lr + runner.model.test_cfg.rpn.window_influence = win_influ + print_log(f'-----------[{case_count}/{num_cases}]-----------', + logger) + print_log( + f'penalty_k={penalty_k} lr={lr} win_influence={win_influ}', + logger) + + # start testing + runner.test() + + # parse the eluation results + res = dict() + for metric in eval_metrics: + res[metric] = runner.message_hub.get_scalar( + 'test/sot/' + metric).current() + + # show results + rank, _ = get_dist_info() + if rank == 0: + print_log(f'evaluation results: {res}', logger) + print_log('------------------------------------------', + logger) + if compare_func(res[key_metric], best_result[key_metric]): + best_result = res + best_params['penalty_k'] = penalty_k + best_params['lr'] = lr + best_params['win_influ'] = win_influ + print_log( + f'The current best evaluation results: {best_result}', + logger) + print_log(f'The current best params: {best_params}', + logger) + + print_log( + 'After parameter searching, the best evaluation results: ' + f'{best_result}', logger) + print_log(f'After parameter searching, the best params: {best_params}', + logger) + + +def main(): + args = parse_args() + + # register all modules in mmtrack into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + # build the runner from config + runner = Runner.from_cfg(cfg) + + parameter_search(runner, args) + + +if __name__ == '__main__': + main() diff --git a/tools/dataset_converters/aicity/aicity2coco.py b/tools/dataset_converters/aicity/aicity2coco.py new file mode 100644 index 000000000..809d65dca --- /dev/null +++ b/tools/dataset_converters/aicity/aicity2coco.py @@ -0,0 +1,157 @@ +import os +import cv2 +from tqdm import tqdm +from 
argparse import ArgumentParser
+import mmengine
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument("data_dir", help="Path to the data directory")
+
+    return parser.parse_args()
+
+def parse_gts(ann_path: str):
+    """
+    Read the annotations from the ground-truth file and convert them to a
+    list of dicts with keys: `id`, `category_id`, `instance_id`, `bbox`, `area`.
+    Note: The `bbox` is in the format `[xtop, ytop, w, h]`.
+
+    Args:
+        ann_path (str): Path to the annotation file.
+            Note: Each line in the annotation file is in the following format:
+            `<frame_id>,<instance_id>,<xtop>,<ytop>,<w>,<h>,1,-1,-1,-1`
+
+    Returns:
+        list: List of annotations.
+    """
+
+    outs = []
+
+    with open(ann_path, "r") as f:
+        while True:
+            ann = f.readline()
+
+            if ann == "":
+                break
+
+            ann = ann.strip().split(",")
+            frame_id, instance_id = map(int, ann[:2])
+            bbox = list(map(float, ann[2:6]))
+            category_id = 1
+            area = bbox[2] * bbox[3]
+
+            ann = dict(
+                id=frame_id,
+                category_id=category_id,
+                instance_id=instance_id,
+                bbox=bbox,
+                area=area)
+
+            outs.append(ann)
+
+    return outs
+
+def get_image_infos(frames_dir: str):
+    """
+    Get the frames information from the directory containing the frame images.
+
+    Args:
+        frames_dir (str): Path to the directory containing the images.
+
+    Returns:
+        list: List of image information ordered by frame_id. Each element is a dict with keys `id`, `file_name`, `height`, `width`, `frame_id`.
+    """
+
+    outs = []
+
+    height, width = None, None
+
+    prev_frame_id = -1
+    # `os.scandir` does not guarantee any ordering, so sort by frame id.
+    for img_path in sorted(
+            os.scandir(frames_dir), key=lambda x: int(x.name.split(".")[0])):
+        frame_id = int(img_path.name.split(".")[0])
+
+        assert frame_id > prev_frame_id, f"Frame ids are not in order: {frame_id} <= {prev_frame_id}"
+        prev_frame_id = frame_id
+
+        if height is None:
+            height, width = cv2.imread(img_path.path).shape[:2]
+
+        info = dict(
+            file_name=img_path.path,
+            height=height,
+            width=width,
+            id=frame_id)
+
+        outs.append(info)
+
+    return outs
+
+
+def main():
+    args = parse_args()
+
+    for subset in ("train", "validation"):
+        subset_anns = {
+            "videos": [],
+            "images": [],
+            "annotations": [],
+            "categories": []
+        }
+        subset_dir = os.path.join(args.data_dir, subset)
+        save_dir = os.path.join(args.data_dir, "annotations")
+
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir)
+
+        print(f'Extracting images from {subset} set')
+        for scene_dir in tqdm(os.scandir(subset_dir)):
+            if scene_dir.is_dir():
+                for camera_id, camera_dir in enumerate(os.scandir(scene_dir)):
+                    if camera_dir.is_dir():
+                        imgs_dir = os.path.join(camera_dir.path, "imgs")
+                        gt_path = os.path.join(camera_dir.path, "label.txt")
+
+                        # Read the annotations
+                        anns = parse_gts(gt_path)
+                        imgs = get_image_infos(imgs_dir)
+
+                        # Match the annotations with the image infos and add
+                        # `frame_id` keys to both of them. Since frames are not
+                        # extracted with the true FPS, only annotations whose
+                        # frame (`id`) has an extracted image are kept.
+                        id_to_frame_id = dict()
+                        for frame_id, img in enumerate(imgs):
+                            img["frame_id"] = frame_id
+                            id_to_frame_id[img["id"]] = frame_id
+
+                        new_anns = []
+                        for ann in anns:
+                            if ann["id"] in id_to_frame_id:
+                                ann["frame_id"] = id_to_frame_id[ann["id"]]
+                                new_anns.append(ann)
+                        anns = new_anns
+
+                        # Add video_id keys to the annotations
+                        for ann in anns:
+                            ann["video_id"] = camera_id
+
+                        # Add video_id keys to the image infos
+                        for img in imgs:
+                            img["video_id"] = camera_id
+
+                        # Add the annotations and image infos to the subset
+                        subset_anns["annotations"].extend(anns)
+                        subset_anns["images"].extend(imgs)
+
+                        # Add the videos to the subset
+                        subset_anns["videos"].append(
+                            dict(
+                                id=camera_id,
+
name=camera_dir.path,)) + + # Add the categories to the subset + subset_anns["categories"].append(dict(id=1, name="person")) + + print("Saving annotations...") + mmengine.dump(subset_anns, os.path.join(save_dir, f"{subset}.json")) diff --git a/tools/dataset_converters/aicity/test.ipynb b/tools/dataset_converters/aicity/test.ipynb new file mode 100644 index 000000000..62601e672 --- /dev/null +++ b/tools/dataset_converters/aicity/test.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from aicity2coco import parse_gts\n", + "\n", + "gts = parse_gts(\"../../../demo/label.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from torchvision.utils import draw_bounding_boxes\n", + "from torchvision.io import read_image\n", + "from torchvision.ops import box_convert\n", + "import torch\n", + "import matplotlib.pyplot as plt\n", + "\n", + "imgs_dir = \"../../../demo/imgs\"\n", + "img_id = 6 * 3\n", + "\n", + "def make_ann_img(img_id, gts):\n", + " try:\n", + " img = read_image(os.path.join(imgs_dir, f\"{img_id:06d}.jpg\"))\n", + "\n", + " bboxes = []\n", + " for ann in gts:\n", + " if ann[\"image_id\"] == img_id:\n", + " bboxes.append(ann[\"bbox\"])\n", + " \n", + " if len(bboxes) == 0:\n", + " return img\n", + " else:\n", + " bboxes = torch.tensor(bboxes)\n", + " bboxes = box_convert(bboxes, 'xywh', 'xyxy')\n", + "\n", + " ret = draw_bounding_boxes(img, bboxes, width=3)\n", + "\n", + " return ret\n", + " except:\n", + " print(bboxes)\n", + "\n", + "import cv2\n", + "writer = cv2.VideoWriter(\"out.avi\", cv2.VideoWriter_fourcc(*\"XVID\"), 5, (1920, 1080))\n", + "\n", + "for img_id in range(6, 20000, 6):\n", + " ret = make_ann_img(img_id, gts)\n", + " ret = ret.moveaxis(0, -1).numpy()\n", + "\n", + " writer.write(ret)\n", + "writer.release()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mot-mmtrack", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "e4bfa47eb115d085a5fd36886584e2476f8ebafa1f4dedfed6cf2234c0e3adec" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tools/dataset_converters/aicity/vid2imgs.py b/tools/dataset_converters/aicity/vid2imgs.py new file mode 100644 index 000000000..2e9c4a6c9 --- /dev/null +++ b/tools/dataset_converters/aicity/vid2imgs.py @@ -0,0 +1,71 @@ +import os, shutil +from warnings import warn +from tqdm import tqdm +from argparse import ArgumentParser +import multiprocessing as mp +from concurrent.futures import ThreadPoolExecutor +from math import ceil +import mmcv + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("data_dir", help="Path to the data directory") + parser.add_argument("--fps", type=int, default=5, help="FPS to extract images at. Default: 5") + parser.add_argument("--num_workers", type=int, default=mp.cpu_count(), help="Number of workers to use. Default: all available cpus") + + return parser.parse_args() + +def extract_images(vid_path: str, imgs_dir: str, fps: int): + """ + Extract images from a video and save them to a directory. + + Args: + vid_path (str): Path to the video file. 
+        imgs_dir (str): Path to the directory where the images will be saved.
+        fps (int): FPS at which the images are extracted.
+    """
+
+    if not os.path.exists(vid_path):
+        print(f"Missing video: {vid_path}")
+        return
+
+    reader = mmcv.VideoReader(vid_path)
+    frame_step = int(reader.fps / fps)
+
+    if os.path.exists(imgs_dir):
+        extracted_imgs = os.listdir(imgs_dir)
+        num_frames = ceil(reader.frame_cnt / frame_step)
+
+        if len(extracted_imgs) == num_frames:
+            print(f"Images already extracted: {imgs_dir}. Skipping...")
+            return
+        elif len(extracted_imgs) > 0:
+            print(f"Mismatch in number of extracted images. Number of extracted images: {len(extracted_imgs)}. Expected {num_frames}")
+            shutil.rmtree(imgs_dir)
+            os.makedirs(imgs_dir)
+
+
+    for frame_id in range(0, reader.frame_cnt, frame_step):
+        frame = reader.get_frame(frame_id)
+        mmcv.imwrite(frame, os.path.join(imgs_dir, f"{frame_id:06d}.jpg"))
+
+def main():
+    args = parse_args()
+
+    def extract_images_for_camera(camera_dir):
+        if camera_dir.is_dir():
+            vid_path = os.path.join(camera_dir.path, "video.mp4")
+            imgs_dir = os.path.join(camera_dir.path, "imgs")
+
+            extract_images(vid_path, imgs_dir, args.fps)
+
+    for subset in ("train", "validation"):
+        subset_dir = os.path.join(args.data_dir, subset)
+
+        print(f'Extracting images from {subset} set')
+        for scene_dir in tqdm(os.scandir(subset_dir)):
+            if scene_dir.is_dir():
+                with ThreadPoolExecutor(max_workers=args.num_workers) as executor:
+                    executor.map(extract_images_for_camera, os.scandir(scene_dir.path))
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/convert_datasets/dancetrack/dancetrack2coco.py b/tools/dataset_converters/dancetrack/dancetrack2coco.py
similarity index 88%
rename from tools/convert_datasets/dancetrack/dancetrack2coco.py
rename to tools/dataset_converters/dancetrack/dancetrack2coco.py
index bc6151c3d..a5e3957cc 100644
--- a/tools/convert_datasets/dancetrack/dancetrack2coco.py
+++ b/tools/dataset_converters/dancetrack/dancetrack2coco.py
@@ -17,9 +17,6 @@
 # Classes in DanceTrack:
 #   1: 'pedestrian'
 #
-# USELESS classes are not included into the json file.
-# IGNORES classes are included with `ignore=True`.
-#
 # This file is adapted from the data label conversion file for MOT
 # But as Dancetrack does not provide public detections and provides
 # official train/val/test splitting, we make necessary adaptation.
@@ -29,11 +26,11 @@ import os.path as osp from collections import defaultdict -import mmcv +import mmengine from tqdm import tqdm -USELESS = [3, 4, 5, 6, 9, 10, 11] -IGNORES = [2, 7, 8, 12, 13] +# Classes in DanceTrack: +CLASSES = [dict(id=1, name='pedestrian')] def parse_args(): @@ -53,21 +50,16 @@ def parse_gts(gts): frame_id, ins_id = map(int, gt[:2]) bbox = list(map(float, gt[2:6])) conf = float(gt[6]) - class_id = int(gt[7]) + category_id = int(gt[7]) visibility = float(gt[8]) - if class_id in USELESS: - continue - elif class_id in IGNORES: - continue anns = dict( - category_id=1, + category_id=category_id, bbox=bbox, area=bbox[2] * bbox[3], iscrowd=False, visibility=visibility, mot_instance_id=ins_id, - mot_conf=conf, - mot_class_id=class_id) + mot_conf=conf) outputs[frame_id].append(anns) return outputs @@ -86,7 +78,7 @@ def main(): in_folder = osp.join(args.input, subset) out_file = osp.join(args.output, f'{subset}_cocoformat.json') outputs = defaultdict(list) - outputs['categories'] = [dict(id=1, name='pedestrian')] + outputs['categories'] = CLASSES video_names = os.listdir(in_folder) video_names = [d for d in video_names if d != '.DS_Store'] @@ -96,7 +88,7 @@ def main(): ins_maps = dict() # load video infos video_folder = osp.join(in_folder, video_name) - infos = mmcv.list_from_file(f'{video_folder}/seqinfo.ini') + infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini') # video-level infos assert video_name == infos[1].strip().split('=')[1] img_folder = infos[2].strip().split('=')[1] @@ -117,7 +109,7 @@ def main(): height=height) # parse annotations if parse_gt: - gts = mmcv.list_from_file(f'{video_folder}/gt/gt.txt') + gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt') img2gts = parse_gts(gts) # image and box level infos @@ -152,7 +144,7 @@ def main(): vid_id += 1 outputs['num_instances'] = ins_id print(f'{subset} has {ins_id} instances.') - mmcv.dump(outputs, out_file) + mmengine.dump(outputs, out_file) print(f'Done! 
Saved as {out_file}') diff --git a/tools/convert_datasets/got10k/gen_got10k_infos.py b/tools/dataset_converters/got10k/gen_got10k_infos.py similarity index 100% rename from tools/convert_datasets/got10k/gen_got10k_infos.py rename to tools/dataset_converters/got10k/gen_got10k_infos.py diff --git a/tools/convert_datasets/got10k/got10k2coco.py b/tools/dataset_converters/got10k/got10k2coco.py similarity index 92% rename from tools/convert_datasets/got10k/got10k2coco.py rename to tools/dataset_converters/got10k/got10k2coco.py index 71d4b6b9a..ae0a92800 100644 --- a/tools/convert_datasets/got10k/got10k2coco.py +++ b/tools/dataset_converters/got10k/got10k2coco.py @@ -6,6 +6,7 @@ from collections import defaultdict import mmcv +import mmengine from tqdm import tqdm @@ -43,14 +44,14 @@ def convert_got10k(ann_dir, save_dir, split='test'): records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1) got10k['categories'] = [dict(id=0, name=0)] - videos_list = mmcv.list_from_file(osp.join(ann_dir, split, 'list.txt')) + videos_list = mmengine.list_from_file(osp.join(ann_dir, split, 'list.txt')) for video_name in tqdm(videos_list, desc=split): video = dict(id=records['vid_id'], name=video_name) got10k['videos'].append(video) video_path = osp.join(ann_dir, split, video_name) ann_file = osp.join(video_path, 'groundtruth.txt') - gt_bboxes = mmcv.list_from_file(ann_file) + gt_bboxes = mmengine.list_from_file(ann_file) img_files = glob.glob(osp.join(video_path, '*.jpg')) img_files = sorted( @@ -58,17 +59,17 @@ def convert_got10k(ann_dir, save_dir, split='test'): img = mmcv.imread(osp.join(video_path, '00000001.jpg')) height, width, _ = img.shape if split in ['train', 'val']: - absence_label = mmcv.list_from_file( + absence_label = mmengine.list_from_file( osp.join(video_path, 'absence.label')) # cover_label denotes the ranges of object visible ratios, ant it's # in range [0,8] which correspond to ranges of object visible # ratios: 0%, (0%, 15%], (15%~30%], (30%, 45%], (45%, 60%], # (60%, 75%], (75%, 90%], (90%, 100%) and 100% respectively - cover_label = mmcv.list_from_file( + cover_label = mmengine.list_from_file( osp.join(video_path, 'cover.label')) # cut_by_image_label denotes whether the object is cut by the image # boundary. 
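As a quick aside on the fileio swap running through these converters: `mmengine.list_from_file` behaves like the old `mmcv` helper and returns one stripped string per line, so the per-frame GOT-10k label files still need an explicit cast. A minimal sketch, assuming a sequence folder containing `absence.label`, `cover.label` and `cut_by_image.label` (the path is hypothetical):

```python
# Sketch only; `video_path` stands in for one GOT-10k training sequence.
import os.path as osp

import mmengine

video_path = 'data/got10k/train/GOT-10k_Train_000001'  # hypothetical path
absence = [int(x) for x in mmengine.list_from_file(osp.join(video_path, 'absence.label'))]
cover = [int(x) for x in mmengine.list_from_file(osp.join(video_path, 'cover.label'))]
cut_by_image = [int(x) for x in mmengine.list_from_file(osp.join(video_path, 'cut_by_image.label'))]
assert len(absence) == len(cover) == len(cut_by_image)  # one entry per frame
```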
- cut_by_image_label = mmcv.list_from_file( + cut_by_image_label = mmengine.list_from_file( osp.join(video_path, 'cut_by_image.label')) for frame_id, img_file in enumerate(img_files): img_name = img_file.split(os.sep)[-1] @@ -115,7 +116,7 @@ def convert_got10k(ann_dir, save_dir, split='test'): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(got10k, osp.join(save_dir, f'got10k_{split}.json')) + mmengine.dump(got10k, osp.join(save_dir, f'got10k_{split}.json')) print(f'-----GOT10k {split} Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/got10k/unzip_got10k.sh b/tools/dataset_converters/got10k/unzip_got10k.sh similarity index 100% rename from tools/convert_datasets/got10k/unzip_got10k.sh rename to tools/dataset_converters/got10k/unzip_got10k.sh diff --git a/tools/convert_datasets/ilsvrc/imagenet2coco_det.py b/tools/dataset_converters/ilsvrc/imagenet2coco_det.py similarity index 97% rename from tools/convert_datasets/ilsvrc/imagenet2coco_det.py rename to tools/dataset_converters/ilsvrc/imagenet2coco_det.py index 25dcdc87d..a6fadcbe9 100644 --- a/tools/convert_datasets/ilsvrc/imagenet2coco_det.py +++ b/tools/dataset_converters/ilsvrc/imagenet2coco_det.py @@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET from collections import defaultdict -import mmcv +import mmengine from tqdm import tqdm CLASSES = ('airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', @@ -137,7 +137,7 @@ def convert_det(DET, ann_dir, save_dir): obj_num_classes = dict() vid_train_img_list = osp.join(ann_dir, 'Lists/DET_train_30classes.txt') - vid_train_img_list = mmcv.list_from_file(vid_train_img_list) + vid_train_img_list = mmengine.list_from_file(vid_train_img_list) vid_train_img_names = [] for vid_train_img_info in vid_train_img_list: vid_train_img_names.append(f"{vid_train_img_info.split(' ')[0]}.JPEG") @@ -170,7 +170,7 @@ def convert_det(DET, ann_dir, save_dir): obj_num_classes) if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(DET, osp.join(save_dir, 'imagenet_det_30plus1cls.json')) + mmengine.dump(DET, osp.join(save_dir, 'imagenet_det_30plus1cls.json')) print('-----ImageNet DET------') print(f'total {records["img_id"] - 1} images') print(f'{records["num_no_objects"]} images have no objects') diff --git a/tools/convert_datasets/ilsvrc/imagenet2coco_vid.py b/tools/dataset_converters/ilsvrc/imagenet2coco_vid.py similarity index 97% rename from tools/convert_datasets/ilsvrc/imagenet2coco_vid.py rename to tools/dataset_converters/ilsvrc/imagenet2coco_vid.py index b438d580b..8878a042f 100644 --- a/tools/convert_datasets/ilsvrc/imagenet2coco_vid.py +++ b/tools/dataset_converters/ilsvrc/imagenet2coco_vid.py @@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET from collections import defaultdict -import mmcv +import mmengine from tqdm import tqdm CLASSES = ('airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', @@ -47,7 +47,7 @@ def parse_args(): def parse_train_list(ann_dir): """Parse the txt file of ImageNet VID train dataset.""" img_list = osp.join(ann_dir, 'Lists/VID_train_15frames.txt') - img_list = mmcv.list_from_file(img_list) + img_list = mmengine.list_from_file(img_list) train_infos = defaultdict(list) for info in img_list: info = info.split(' ') @@ -62,7 +62,7 @@ def parse_train_list(ann_dir): def parse_val_list(ann_dir): """Parse the txt file of ImageNet VID val dataset.""" img_list = osp.join(ann_dir, 'Lists/VID_val_videos.txt') - img_list = mmcv.list_from_file(img_list) + 
img_list = mmengine.list_from_file(img_list) val_infos = defaultdict(list) for info in img_list: info = info.split(' ') @@ -173,7 +173,7 @@ def convert_vid(VID, ann_dir, save_dir, mode='train'): records['vid_id'] += 1 if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(VID, osp.join(save_dir, f'imagenet_vid_{mode}.json')) + mmengine.dump(VID, osp.join(save_dir, f'imagenet_vid_{mode}.json')) print(f'-----ImageNet VID {mode}------') print(f'{records["vid_id"]- 1} videos') print(f'{records["img_id"]- 1} images') diff --git a/tools/convert_datasets/lasot/gen_lasot_infos.py b/tools/dataset_converters/lasot/gen_lasot_infos.py similarity index 100% rename from tools/convert_datasets/lasot/gen_lasot_infos.py rename to tools/dataset_converters/lasot/gen_lasot_infos.py diff --git a/tools/convert_datasets/lasot/lasot2coco.py b/tools/dataset_converters/lasot/lasot2coco.py similarity index 93% rename from tools/convert_datasets/lasot/lasot2coco.py rename to tools/dataset_converters/lasot/lasot2coco.py index f1a8ac44f..57509b1fa 100644 --- a/tools/convert_datasets/lasot/lasot2coco.py +++ b/tools/dataset_converters/lasot/lasot2coco.py @@ -5,6 +5,7 @@ from collections import defaultdict import mmcv +import mmengine from tqdm import tqdm @@ -41,7 +42,7 @@ def convert_lasot(ann_dir, save_dir, split='test'): lasot = defaultdict(list) records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1) lasot['categories'] = [dict(id=0, name=0)] - videos_list = mmcv.list_from_file( + videos_list = mmengine.list_from_file( osp.join(osp.dirname(__file__), 'testing_set.txt')) if split == 'train': train_videos_list = [] @@ -57,12 +58,12 @@ def convert_lasot(ann_dir, save_dir, split='test'): video = dict(id=records['vid_id'], name=video_name) lasot['videos'].append(video) - gt_bboxes = mmcv.list_from_file( + gt_bboxes = mmengine.list_from_file( osp.join(video_path, 'groundtruth.txt')) - full_occlusion = mmcv.list_from_file( + full_occlusion = mmengine.list_from_file( osp.join(video_path, 'full_occlusion.txt')) full_occlusion = full_occlusion[0].split(',') - out_of_view = mmcv.list_from_file( + out_of_view = mmengine.list_from_file( osp.join(video_path, 'out_of_view.txt')) out_of_view = out_of_view[0].split(',') @@ -101,7 +102,7 @@ def convert_lasot(ann_dir, save_dir, split='test'): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(lasot, osp.join(save_dir, f'lasot_{split}.json')) + mmengine.dump(lasot, osp.join(save_dir, f'lasot_{split}.json')) print(f'-----LaSOT {split} Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/lasot/testing_set.txt b/tools/dataset_converters/lasot/testing_set.txt similarity index 100% rename from tools/convert_datasets/lasot/testing_set.txt rename to tools/dataset_converters/lasot/testing_set.txt diff --git a/tools/convert_datasets/mot/crowdhuman2coco.py b/tools/dataset_converters/mot/crowdhuman2coco.py similarity index 97% rename from tools/convert_datasets/mot/crowdhuman2coco.py rename to tools/dataset_converters/mot/crowdhuman2coco.py index e368b4fd5..84af82daf 100644 --- a/tools/convert_datasets/mot/crowdhuman2coco.py +++ b/tools/dataset_converters/mot/crowdhuman2coco.py @@ -5,7 +5,7 @@ import os.path as osp from collections import defaultdict -import mmcv +import mmengine from PIL import Image from tqdm import tqdm @@ -82,7 +82,7 @@ def convert_crowdhuman(ann_dir, save_dir, mode='train'): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(outputs, 
osp.join(save_dir, f'crowdhuman_{mode}.json')) + mmengine.dump(outputs, osp.join(save_dir, f'crowdhuman_{mode}.json')) print(f'-----CrowdHuman {mode} set------') print(f'total {records["img_id"] - 1} images') if mode != 'test': diff --git a/tools/convert_datasets/mot/mot2coco.py b/tools/dataset_converters/mot/mot2coco.py similarity index 86% rename from tools/convert_datasets/mot/mot2coco.py rename to tools/dataset_converters/mot/mot2coco.py index 5d18b02bb..e8e890212 100644 --- a/tools/convert_datasets/mot/mot2coco.py +++ b/tools/dataset_converters/mot/mot2coco.py @@ -12,34 +12,32 @@ # DETs and Results: # , , , , , , , # , , # for 3D objects -# -# Classes in MOT: -# 1: 'pedestrian' -# 2: 'person on vehicle' -# 3: 'car' -# 4: 'bicycle' -# 5: 'motorbike' -# 6: 'non motorized vehicle' -# 7: 'static person' -# 8: 'distractor' -# 9: 'occluder' -# 10: 'occluder on the ground', -# 11: 'occluder full' -# 12: 'reflection' -# -# USELESS classes are not included into the json file. -# IGNORES classes are included with `ignore=True`. + import argparse import os import os.path as osp from collections import defaultdict -import mmcv +import mmengine import numpy as np from tqdm import tqdm -USELESS = [3, 4, 5, 6, 9, 10, 11] -IGNORES = [2, 7, 8, 12, 13] +# Classes in MOT: +CLASSES = [ + dict(id=1, name='pedestrian'), + dict(id=2, name='person_on_vehicle'), + dict(id=3, name='car'), + dict(id=4, name='bicycle'), + dict(id=5, name='motorbike'), + dict(id=6, name='non_mot_vehicle'), + dict(id=7, name='static_person'), + dict(id=8, name='distractor'), + dict(id=9, name='occluder'), + dict(id=10, name='occluder_on_ground'), + dict(id=11, name='occluder_full'), + dict(id=12, name='reflection'), + dict(id=13, name='crowd') +] def parse_args(): @@ -67,25 +65,20 @@ def parse_gts(gts, is_mot15): bbox = list(map(float, gt[2:6])) if is_mot15: conf = 1. - class_id = 1 + category_id = 1 visibility = 1. 
else: conf = float(gt[6]) - class_id = int(gt[7]) + category_id = int(gt[7]) visibility = float(gt[8]) - if class_id in USELESS: - continue - elif class_id in IGNORES: - continue anns = dict( - category_id=1, + category_id=category_id, bbox=bbox, area=bbox[2] * bbox[3], iscrowd=False, visibility=visibility, mot_instance_id=ins_id, - mot_conf=conf, - mot_class_id=class_id) + mot_conf=conf) outputs[frame_id].append(anns) return outputs @@ -125,7 +118,7 @@ def main(): in_folder = osp.join(args.input, subset) out_file = osp.join(args.output, f'{subset}_cocoformat.json') outputs = defaultdict(list) - outputs['categories'] = [dict(id=1, name='pedestrian')] + outputs['categories'] = CLASSES if args.convert_det: det_file = osp.join(args.output, f'{subset}_detections.pkl') detections = dict(det_bboxes=dict()) @@ -136,7 +129,7 @@ def main(): ins_maps = dict() # load video infos video_folder = osp.join(in_folder, video_name) - infos = mmcv.list_from_file(f'{video_folder}/seqinfo.ini') + infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini') # video-level infos assert video_name == infos[1].strip().split('=')[1] img_folder = infos[2].strip().split('=')[1] @@ -155,13 +148,13 @@ def main(): height=height) # parse annotations if parse_gt: - gts = mmcv.list_from_file(f'{video_folder}/gt/gt.txt') + gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt') if 'MOT15' in video_folder: img2gts = parse_gts(gts, True) else: img2gts = parse_gts(gts, False) if args.convert_det: - dets = mmcv.list_from_file(f'{video_folder}/det/det.txt') + dets = mmengine.list_from_file(f'{video_folder}/det/det.txt') img2dets = parse_dets(dets) # make half sets if 'half' in subset: @@ -215,9 +208,9 @@ def main(): vid_id += 1 outputs['num_instances'] = ins_id print(f'{subset} has {ins_id} instances.') - mmcv.dump(outputs, out_file) + mmengine.dump(outputs, out_file) if args.convert_det: - mmcv.dump(detections, det_file) + mmengine.dump(detections, det_file) print(f'Done! Saved as {out_file} and {det_file}') else: print(f'Done! 
Saved as {out_file}') diff --git a/tools/convert_datasets/mot/mot2reid.py b/tools/dataset_converters/mot/mot2reid.py similarity index 97% rename from tools/convert_datasets/mot/mot2reid.py rename to tools/dataset_converters/mot/mot2reid.py index 4986c7b66..e3d86e5c9 100644 --- a/tools/convert_datasets/mot/mot2reid.py +++ b/tools/dataset_converters/mot/mot2reid.py @@ -35,6 +35,7 @@ import random import mmcv +import mmengine import numpy as np from tqdm import tqdm @@ -87,7 +88,7 @@ def main(): for video_name in tqdm(video_names): # load video infos video_folder = osp.join(in_folder, video_name) - infos = mmcv.list_from_file(f'{video_folder}/seqinfo.ini') + infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini') # video-level infos assert video_name == infos[1].strip().split('=')[1] raw_img_folder = infos[2].strip().split('=')[1] @@ -99,7 +100,7 @@ def main(): reid_train_folder = osp.join(args.output, 'imgs') if not osp.exists(reid_train_folder): os.makedirs(reid_train_folder) - gts = mmcv.list_from_file(f'{video_folder}/gt/gt.txt') + gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt') last_frame_id = -1 for gt in gts: gt = gt.strip().split(',') diff --git a/tools/convert_datasets/otb100/download_otb100.py b/tools/dataset_converters/otb100/download_otb100.py similarity index 100% rename from tools/convert_datasets/otb100/download_otb100.py rename to tools/dataset_converters/otb100/download_otb100.py diff --git a/tools/convert_datasets/otb100/otb100_infos.txt b/tools/dataset_converters/otb100/otb100_infos.txt similarity index 100% rename from tools/convert_datasets/otb100/otb100_infos.txt rename to tools/dataset_converters/otb100/otb100_infos.txt diff --git a/tools/convert_datasets/otb100/otb2coco.py b/tools/dataset_converters/otb100/otb2coco.py similarity index 96% rename from tools/convert_datasets/otb100/otb2coco.py rename to tools/dataset_converters/otb100/otb2coco.py index 8ab81d1f1..b6d6a7dca 100644 --- a/tools/convert_datasets/otb100/otb2coco.py +++ b/tools/dataset_converters/otb100/otb2coco.py @@ -7,6 +7,7 @@ from collections import defaultdict import mmcv +import mmengine from tqdm import tqdm @@ -71,7 +72,7 @@ def convert_otb100(otb, ann_dir, save_dir): video = dict(id=records['vid_id'], name=video_name) otb['videos'].append(video) - gt_bboxes = mmcv.list_from_file(gt_file) + gt_bboxes = mmengine.list_from_file(gt_file) if video_name == 'Tiger1': gt_bboxes = gt_bboxes[start_frame_id - 1:] for frame_id, gt_bbox in enumerate(gt_bboxes): @@ -107,7 +108,7 @@ def convert_otb100(otb, ann_dir, save_dir): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(otb, osp.join(save_dir, 'otb100.json')) + mmengine.dump(otb, osp.join(save_dir, 'otb100.json')) print('-----OTB100 Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/otb100/unzip_otb100.sh b/tools/dataset_converters/otb100/unzip_otb100.sh similarity index 100% rename from tools/convert_datasets/otb100/unzip_otb100.sh rename to tools/dataset_converters/otb100/unzip_otb100.sh diff --git a/tools/convert_datasets/tao/merge_coco_with_lvis.py b/tools/dataset_converters/tao/merge_coco_with_lvis.py similarity index 100% rename from tools/convert_datasets/tao/merge_coco_with_lvis.py rename to tools/dataset_converters/tao/merge_coco_with_lvis.py diff --git a/tools/convert_datasets/tao/tao2coco.py b/tools/dataset_converters/tao/tao2coco.py similarity index 93% rename from tools/convert_datasets/tao/tao2coco.py rename to 
tools/dataset_converters/tao/tao2coco.py index edbbe5aa8..07094b392 100644 --- a/tools/convert_datasets/tao/tao2coco.py +++ b/tools/dataset_converters/tao/tao2coco.py @@ -58,7 +58,7 @@ import os.path as osp from collections import defaultdict -import mmcv +import mmengine from tao.toolkit.tao import Tao from tqdm import tqdm @@ -75,16 +75,16 @@ def parse_args(): def get_classes(tao_path, filter_classes=True): - train = mmcv.load(osp.join(tao_path, 'train.json')) + train = mmengine.load(osp.join(tao_path, 'train.json')) train_classes = list(set([_['category_id'] for _ in train['annotations']])) print(f'TAO train set contains {len(train_classes)} categories.') - val = mmcv.load(osp.join(tao_path, 'validation.json')) + val = mmengine.load(osp.join(tao_path, 'validation.json')) val_classes = list(set([_['category_id'] for _ in val['annotations']])) print(f'TAO val set contains {len(val_classes)} categories.') - test = mmcv.load(osp.join(tao_path, 'test_categories.json')) + test = mmengine.load(osp.join(tao_path, 'test_categories.json')) test_classes = list(set([_['id'] for _ in test['categories']])) print(f'TAO test set contains {len(test_classes)} categories.') @@ -106,7 +106,7 @@ def get_classes(tao_path, filter_classes=True): def convert_tao(file, classes): tao = Tao(file) - raw = mmcv.load(file) + raw = mmengine.load(file) out = defaultdict(list) out['tracks'] = raw['tracks'].copy() @@ -151,7 +151,7 @@ def main(): c = '_482' if args.filter_classes else '' prefix = file.split('.')[0].split('_')[0] out_file = f'{prefix}{c}_classes.json' - mmcv.dump(out, osp.join(args.input, out_file)) + mmengine.dump(out, osp.join(args.input, out_file)) if __name__ == '__main__': diff --git a/tools/convert_datasets/trackingnet/gen_trackingnet_infos.py b/tools/dataset_converters/trackingnet/gen_trackingnet_infos.py similarity index 100% rename from tools/convert_datasets/trackingnet/gen_trackingnet_infos.py rename to tools/dataset_converters/trackingnet/gen_trackingnet_infos.py diff --git a/tools/convert_datasets/trackingnet/trackingnet2coco.py b/tools/dataset_converters/trackingnet/trackingnet2coco.py similarity index 96% rename from tools/convert_datasets/trackingnet/trackingnet2coco.py rename to tools/dataset_converters/trackingnet/trackingnet2coco.py index 84e9e038d..7098b5d46 100644 --- a/tools/convert_datasets/trackingnet/trackingnet2coco.py +++ b/tools/dataset_converters/trackingnet/trackingnet2coco.py @@ -5,6 +5,7 @@ from collections import defaultdict import mmcv +import mmengine from tqdm import tqdm @@ -60,7 +61,7 @@ def convert_trackingnet(ann_dir, save_dir, split='test'): trackingnet['videos'].append(video) ann_file = osp.join(chunk_ann_dir, 'anno', video_name + '.txt') - gt_bboxes = mmcv.list_from_file(ann_file) + gt_bboxes = mmengine.list_from_file(ann_file) video_path = osp.join(chunk_ann_dir, 'frames', video_name) img_names = os.listdir(video_path) img_names = sorted(img_names, key=lambda x: int(x[:-4])) @@ -104,7 +105,7 @@ def convert_trackingnet(ann_dir, save_dir, split='test'): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(trackingnet, osp.join(save_dir, f'trackingnet_{split}.json')) + mmengine.dump(trackingnet, osp.join(save_dir, f'trackingnet_{split}.json')) print(f'-----TrackingNet {split} Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/trackingnet/unzip_trackinget.sh b/tools/dataset_converters/trackingnet/unzip_trackinget.sh similarity index 100% rename from 
tools/convert_datasets/trackingnet/unzip_trackinget.sh rename to tools/dataset_converters/trackingnet/unzip_trackinget.sh diff --git a/tools/convert_datasets/uav123/uav123_info_deprecated.txt b/tools/dataset_converters/uav123/uav123_info_deprecated.txt similarity index 100% rename from tools/convert_datasets/uav123/uav123_info_deprecated.txt rename to tools/dataset_converters/uav123/uav123_info_deprecated.txt diff --git a/tools/convert_datasets/uav123/uav123_infos.txt b/tools/dataset_converters/uav123/uav123_infos.txt similarity index 100% rename from tools/convert_datasets/uav123/uav123_infos.txt rename to tools/dataset_converters/uav123/uav123_infos.txt diff --git a/tools/convert_datasets/uav123/uav2coco.py b/tools/dataset_converters/uav123/uav2coco.py similarity index 94% rename from tools/convert_datasets/uav123/uav2coco.py rename to tools/dataset_converters/uav123/uav2coco.py index cd0e46cd2..b023eab77 100644 --- a/tools/convert_datasets/uav123/uav2coco.py +++ b/tools/dataset_converters/uav123/uav2coco.py @@ -5,6 +5,7 @@ from collections import defaultdict import mmcv +import mmengine from tqdm import tqdm @@ -36,7 +37,7 @@ def convert_uav123(uav123, ann_dir, save_dir): # "anno_name,anno_path,video_path,start_frame,end_frame" info_path = osp.join( os.path.dirname(__file__), 'uav123_info_deprecated.txt') - uav_info = mmcv.list_from_file(info_path)[1:] + uav_info = mmengine.list_from_file(info_path)[1:] records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1) uav123['categories'] = [dict(id=0, name=0)] @@ -54,7 +55,7 @@ def convert_uav123(uav123, ann_dir, save_dir): video = dict(id=records['vid_id'], name=video_name) uav123['videos'].append(video) - gt_bboxes = mmcv.list_from_file(osp.join(ann_dir, anno_path)) + gt_bboxes = mmengine.list_from_file(osp.join(ann_dir, anno_path)) assert len(gt_bboxes) == end_frame - start_frame + 1 img = mmcv.imread( @@ -95,7 +96,7 @@ def convert_uav123(uav123, ann_dir, save_dir): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(uav123, osp.join(save_dir, 'uav123.json')) + mmengine.dump(uav123, osp.join(save_dir, 'uav123.json')) print('-----UAV123 Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/vot/download_vot.py b/tools/dataset_converters/vot/download_vot.py similarity index 100% rename from tools/convert_datasets/vot/download_vot.py rename to tools/dataset_converters/vot/download_vot.py diff --git a/tools/convert_datasets/vot/gen_vot_infos.py b/tools/dataset_converters/vot/gen_vot_infos.py similarity index 100% rename from tools/convert_datasets/vot/gen_vot_infos.py rename to tools/dataset_converters/vot/gen_vot_infos.py diff --git a/tools/convert_datasets/vot/vot2coco.py b/tools/dataset_converters/vot/vot2coco.py similarity index 96% rename from tools/convert_datasets/vot/vot2coco.py rename to tools/dataset_converters/vot/vot2coco.py index e4b16fe8e..792dd81f4 100644 --- a/tools/convert_datasets/vot/vot2coco.py +++ b/tools/dataset_converters/vot/vot2coco.py @@ -6,6 +6,7 @@ import cv2 import mmcv +import mmengine import numpy as np from tqdm import tqdm @@ -47,7 +48,7 @@ def parse_attribute(video_path, attr_name, img_num): """ attr_path = osp.join(video_path, attr_name + '.tag') if osp.isfile(attr_path): - attr_list = mmcv.list_from_file(attr_path) + attr_list = mmengine.list_from_file(attr_path) else: attr_list = [] # unspecified tag is '0'(default) @@ -77,7 +78,7 @@ def convert_vot(ann_dir, save_dir, dataset_type): video_path = 
osp.join(ann_dir, 'data', video_name) ann_file = osp.join(video_path, 'groundtruth.txt') - gt_anns = mmcv.list_from_file(ann_file) + gt_anns = mmengine.list_from_file(ann_file) camera_motion = parse_attribute(video_path, 'camera_motion', len(gt_anns)) @@ -141,7 +142,7 @@ def convert_vot(ann_dir, save_dir, dataset_type): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(vot, osp.join(save_dir, f'{dataset_type}.json')) + mmengine.dump(vot, osp.join(save_dir, f'{dataset_type}.json')) print(f'-----VOT Challenge {dataset_type} Dataset------') print(f'{records["vid_id"]- 1} videos') print(f'{records["global_instance_id"]- 1} instances') diff --git a/tools/convert_datasets/youtubevis/youtubevis2coco.py b/tools/dataset_converters/youtubevis/youtubevis2coco.py similarity index 94% rename from tools/convert_datasets/youtubevis/youtubevis2coco.py rename to tools/dataset_converters/youtubevis/youtubevis2coco.py index f1b376d73..a864f43a3 100644 --- a/tools/convert_datasets/youtubevis/youtubevis2coco.py +++ b/tools/dataset_converters/youtubevis/youtubevis2coco.py @@ -5,7 +5,7 @@ import os.path as osp from collections import defaultdict -import mmcv +import mmengine from tqdm import tqdm @@ -48,9 +48,10 @@ def convert_vis(ann_dir, save_dir, dataset_version, mode='train'): obj_num_classes = dict() if dataset_version == '2019': - official_anns = mmcv.load(osp.join(ann_dir, f'{mode}.json')) + official_anns = mmengine.load(osp.join(ann_dir, f'{mode}.json')) elif dataset_version == '2021': - official_anns = mmcv.load(osp.join(ann_dir, mode, 'instances.json')) + official_anns = mmengine.load( + osp.join(ann_dir, mode, 'instances.json')) VIS['categories'] = copy.deepcopy(official_anns['categories']) has_annotations = mode == 'train' @@ -131,8 +132,8 @@ def convert_vis(ann_dir, save_dir, dataset_version, mode='train'): if not osp.isdir(save_dir): os.makedirs(save_dir) - mmcv.dump(VIS, - osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json')) + mmengine.dump( + VIS, osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json')) print(f'-----YouTube VIS {dataset_version} {mode}------') print(f'{records["vid_id"]- 1} videos') print(f'{records["img_id"]- 1} images') diff --git a/tools/dev.sh b/tools/dev.sh new file mode 100644 index 000000000..41c56761b --- /dev/null +++ b/tools/dev.sh @@ -0,0 +1,4 @@ +python demo/demo_mot_vis.py configs/mot/deepsort/my_config.py --input demo/demo.mp4 --output mot.mp4 + +# python demo/demo_mot_vis.py configs/mot/deepsort/my_config.py --input demo/demo.mp4 --output mot.mp4 +# python tools/test.py configs/mot/deepsort/deepsort_pose.py \ No newline at end of file diff --git a/tools/analysis/print_config.py b/tools/misc/print_config.py similarity index 93% rename from tools/analysis/print_config.py rename to tools/misc/print_config.py index c3538ef56..a5e6e641e 100644 --- a/tools/analysis/print_config.py +++ b/tools/misc/print_config.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -from mmcv import Config, DictAction +from mmengine import Config, DictAction def parse_args(): diff --git a/tools/analysis/publish_model.py b/tools/misc/publish_model.py similarity index 86% rename from tools/analysis/publish_model.py rename to tools/misc/publish_model.py index da16e3f99..d00a8aecd 100644 --- a/tools/analysis/publish_model.py +++ b/tools/misc/publish_model.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import glob +import os import os.path as osp import subprocess @@ -18,9 +18,11 @@ def parse_args(): def process_checkpoint(in_file, out_file): exp_dir = osp.dirname(in_file) - log_json_path = list(sorted(glob.glob(osp.join(exp_dir, - '*.log.json'))))[-1] - model_time = osp.split(log_json_path)[-1].split('.')[0] + model_time = sorted([ + x for x in os.listdir(exp_dir) if osp.isdir(osp.join(exp_dir, x)) + ])[-1] + log_json_path = osp.join(exp_dir, + f'{model_time}/vis_data/{model_time}.json') checkpoint = torch.load(in_file, map_location='cpu') # remove optimizer for smaller file size diff --git a/tools/test.py b/tools/test.py index f25a01297..70bc8f367 100644 --- a/tools/test.py +++ b/tools/test.py @@ -2,75 +2,34 @@ import argparse import os import os.path as osp -import time -import mmcv -import torch -from mmcv import Config, DictAction -from mmcv.cnn import fuse_conv_bn -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, - wrap_fp16_model) -from mmdet.apis import set_random_seed +from mmengine.config import Config, DictAction +from mmengine.model import is_model_wrapper +from mmengine.registry import RUNNERS +from mmengine.runner import Runner -from mmtrack.core import setup_multi_processes -from mmtrack.datasets import build_dataset +from mmtrack.utils import register_all_modules +# TODO: support fuse_conv_bn, visualization, and format_only def parse_args(): - parser = argparse.ArgumentParser(description='mmtrack test model') + parser = argparse.ArgumentParser( + description='MMTrack test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('--checkpoint', help='checkpoint file') - parser.add_argument('--out', help='output result file') parser.add_argument( '--work-dir', help='the directory to save the file containing evaluation metrics') - parser.add_argument( - '--fuse-conv-bn', - action='store_true', - help='Whether to fuse conv and bn, this will slightly increase' - 'the inference speed') - parser.add_argument( - '--gpu-id', - type=int, - default=0, - help='id of gpu to use ' - '(only applicable to non-distributed testing)') - parser.add_argument( - '--format-only', - action='store_true', - help='Format the output results without perform evaluation. It is' - 'useful when you want to format the result to a specific format and ' - 'submit it to the test server') - parser.add_argument('--eval', type=str, nargs='+', help='eval types') - parser.add_argument('--show', action='store_true', help='show results') - parser.add_argument( - '--show-score-thr', - type=float, - default=0.3, - help='score threshold (default: 0.3)') - parser.add_argument( - '--show-dir', help='directory where painted images will be saved') - parser.add_argument( - '--gpu-collect', - action='store_true', - help='whether to use gpu to collect results.') - parser.add_argument( - '--tmpdir', - help='tmp directory used for collecting results from multiple ' - 'workers, available when gpu-collect is not specified') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file.') - parser.add_argument( - '--eval-options', - nargs='+', - action=DictAction, - help='custom options for evaluation, the key-value pair in xxx=yyy ' - 'format will be kwargs for dataset.evaluate() function') + 'in xxx=yyy format will be merged into config file. 
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], @@ -86,139 +45,43 @@ def parse_args(): def main(): args = parse_args() - assert args.out or args.eval or args.format_only or args.show \ - or args.show_dir, \ - ('Please specify at least one operation (save/eval/format/show the ' - 'results / save the results) with the argument "--out", "--eval"' - ', "--format-only", "--show" or "--show-dir"') - - if args.eval and args.format_only: - raise ValueError('--eval and --format_only cannot be both specified') - - if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): - raise ValueError('The output file must be a pkl file.') + # register all modules in mmtrack into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + # load config cfg = Config.fromfile(args.config) - if cfg.get('USE_MMDET', False): - from mmdet.apis import multi_gpu_test, single_gpu_test - from mmdet.datasets import build_dataloader - from mmdet.models import build_detector as build_model - if 'detector' in cfg.model: - cfg.model = cfg.model.detector - elif cfg.get('TRAIN_REID', False): - from mmdet.apis import multi_gpu_test, single_gpu_test - from mmdet.datasets import build_dataloader - - from mmtrack.models import build_reid as build_model - if 'reid' in cfg.model: - cfg.model = cfg.model.reid - else: - from mmtrack.apis import multi_gpu_test, single_gpu_test - from mmtrack.datasets import build_dataloader - from mmtrack.models import build_model + cfg.launcher = args.launcher if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) - # set multi-process settings - setup_multi_processes(cfg) - - # set random seeds. Force setting fixed seed and deterministic=True in SOT - # configs - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - if cfg.get('seed', None) is not None: - set_random_seed( - cfg.seed, deterministic=cfg.get('deterministic', False)) - cfg.data.test.test_mode = True - - cfg.gpu_ids = [args.gpu_id] - - # init distributed env first, since logger depends on the dist info. 
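The `--cfg-options` help text above describes mmengine's `DictAction` syntax; each parsed key=value pair is merged into the loaded config via `Config.merge_from_dict` before the Runner is built. A small sketch with made-up keys:

```python
# Sketch only; keys and values are illustrative, not taken from an mmtrack config.
from mmengine.config import Config

cfg = Config(dict(work_dir='./work_dirs/demo',
                  optim_wrapper=dict(type='OptimWrapper')))
# argparse's DictAction turns `--cfg-options optim_wrapper.type=AmpOptimWrapper`
# into {'optim_wrapper.type': 'AmpOptimWrapper'}; dotted keys address nested fields.
cfg.merge_from_dict({'work_dir': './work_dirs/custom',
                     'optim_wrapper.type': 'AmpOptimWrapper'})
print(cfg.work_dir, cfg.optim_wrapper.type)
```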
- if args.launcher == 'none': - distributed = False + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) else: - distributed = True - init_dist(args.launcher, **cfg.dist_params) - - rank, _ = get_dist_info() - # allows not to create - if args.work_dir is not None and rank == 0: - mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) - timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) - json_file = osp.join(args.work_dir, f'eval_{timestamp}.log.json') - - # build the dataloader - dataset = build_dataset(cfg.data.test) - data_loader = build_dataloader( - dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - - # build the model and load checkpoint - if cfg.get('test_cfg', False): - model = build_model( - cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) - else: - model = build_model(cfg.model) - # We need call `init_weights()` to load pretained weights in MOT task. - model.init_weights() - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(model) - if args.checkpoint is not None: - checkpoint = load_checkpoint( - model, args.checkpoint, map_location='cpu') - if 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']: - model.CLASSES = checkpoint['meta']['CLASSES'] - if not hasattr(model, 'CLASSES'): - model.CLASSES = dataset.CLASSES - - if args.fuse_conv_bn: - model = fuse_conv_bn(model) + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) - if not distributed: - model = MMDataParallel(model, device_ids=cfg.gpu_ids) - outputs = single_gpu_test( - model, - data_loader, - args.show, - args.show_dir, - show_score_thr=args.show_score_thr) + if is_model_wrapper(runner.model): + runner.model.module.init_weights() else: - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False) - outputs = multi_gpu_test(model, data_loader, args.tmpdir, - args.gpu_collect) + runner.model.init_weights() - rank, _ = get_dist_info() - if rank == 0: - if args.out: - print(f'\nwriting results to {args.out}') - mmcv.dump(outputs, args.out) - kwargs = {} if args.eval_options is None else args.eval_options - if args.format_only: - dataset.format_results(outputs, **kwargs) - if args.eval: - eval_kwargs = cfg.get('evaluation', {}).copy() - # hard-code way to remove EvalHook args - eval_hook_args = [ - 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', - 'rule', 'by_epoch' - ] - for key in eval_hook_args: - eval_kwargs.pop(key, None) - eval_kwargs.update(dict(metric=args.eval, **kwargs)) - metric = dataset.evaluate(outputs, **eval_kwargs) - print(metric) - metric_dict = dict( - config=args.config, mode='test', epoch=cfg.total_epochs) - metric_dict.update(metric) - if args.work_dir is not None: - mmcv.dump(metric_dict, json_file) + # start testing + runner.test() if __name__ == '__main__': diff --git a/tools/torchserve/mmtrack2torchserve.py b/tools/torchserve/mmtrack2torchserve.py index 
ed0aa6c1f..b49633e54 100644 --- a/tools/torchserve/mmtrack2torchserve.py +++ b/tools/torchserve/mmtrack2torchserve.py @@ -3,7 +3,7 @@ from pathlib import Path from tempfile import TemporaryDirectory -import mmcv +import mmengine try: from model_archiver.model_packaging import package_model @@ -42,9 +42,9 @@ def mmtrack2torchserve( If True, if there is an existing `{model_name}.mar` file under `output_folder` it will be overwritten. """ - mmcv.mkdir_or_exist(output_folder) + mmengine.mkdir_or_exist(output_folder) - config = mmcv.Config.fromfile(config_file) + config = mmengine.Config.fromfile(config_file) with TemporaryDirectory() as tmpdir: config.dump(f'{tmpdir}/config.py') diff --git a/tools/train.py b/tools/train.py index 0bc503460..b2b6cae5b 100644 --- a/tools/train.py +++ b/tools/train.py @@ -1,23 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import copy +import logging import os import os.path as osp -import time -import warnings -import mmcv -import torch -import torch.distributed as dist -from mmcv import Config, DictAction -from mmcv.runner import init_dist -from mmdet.apis import set_random_seed +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.registry import RUNNERS +from mmengine.runner import Runner -from mmtrack import __version__ -from mmtrack.apis import init_random_seed -from mmtrack.core import setup_multi_processes -from mmtrack.datasets import build_dataset -from mmtrack.utils import collect_env, get_root_logger +from mmtrack.utils import register_all_modules def parse_args(): @@ -25,44 +17,28 @@ def parse_args(): parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( - '--resume-from', help='the checkpoint file to resume from') - parser.add_argument( - '--no-validate', + '--amp', action='store_true', - help='whether not to evaluate the checkpoint during training') - group_gpus = parser.add_mutually_exclusive_group() - group_gpus.add_argument( - '--gpus', - type=int, - help='(Deprecated, please use --gpu-id) number of gpus to use ' - '(only applicable to non-distributed training)') - group_gpus.add_argument( - '--gpu-ids', - type=int, - nargs='+', - help='(Deprecated, please use --gpu-id) ids of gpus to use ' - '(only applicable to non-distributed training)') - group_gpus.add_argument( - '--gpu-id', - type=int, - default=0, - help='id of gpu to use ' - '(only applicable to non-distributed training)') - parser.add_argument('--seed', type=int, default=None, help='random seed') + default=False, + help='enable automatic-mixed-precision training') parser.add_argument( - '--diff_seed', + '--auto-scale-lr', action='store_true', - help='Whether or not set different seeds for different ranks') + help='enable automatically scaling LR.') parser.add_argument( - '--deterministic', + '--resume', action='store_true', - help='whether to set deterministic options for CUDNN backend.') + help='resume from the latest checkpoint in the work_dir automatically') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file.') + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], @@ -79,32 +55,16 @@ def parse_args(): def main(): args = parse_args() - cfg = Config.fromfile(args.config) + # register all modules in mmtrack into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) - if cfg.get('USE_MMDET', False): - from mmdet.apis import train_detector as train_model - from mmdet.models import build_detector as build_model - if 'detector' in cfg.model: - cfg.model = cfg.model.detector - elif cfg.get('TRAIN_REID', False): - from mmdet.apis import train_detector as train_model - - from mmtrack.models import build_reid as build_model - if 'reid' in cfg.model: - cfg.model = cfg.model.reid - else: - from mmtrack.apis import train_model - from mmtrack.models import build_model + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) - # set multi-process settings - setup_multi_processes(cfg) - - # set cudnn_benchmark - if cfg.get('cudnn_benchmark', False): - torch.backends.cudnn.benchmark = True - # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None @@ -113,97 +73,46 @@ def main(): # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) - if args.resume_from is not None: - cfg.resume_from = args.resume_from - if args.gpus is not None: - cfg.gpu_ids = range(1) - warnings.warn('`--gpus` is deprecated because we only support ' - 'single GPU mode in non-distributed training. ' - 'Use `gpus=1` now.') - if args.gpu_ids is not None: - cfg.gpu_ids = args.gpu_ids[0:1] - warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' - 'Because we only support single GPU mode in ' - 'non-distributed training. Use the first GPU ' - 'in `gpu_ids` now.') - if args.gpus is None and args.gpu_ids is None: - cfg.gpu_ids = [args.gpu_id] - - # init distributed env first, since logger depends on the dist info. - if args.launcher == 'none': - distributed = False - else: - distributed = True - init_dist(args.launcher, **cfg.dist_params) - - # create work_dir - mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) - # dump config - cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) - # init the logger before other steps - timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) - log_file = osp.join(cfg.work_dir, f'{timestamp}.log') - logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) - - # init the meta dict to record some important information such as - # environment info and seed, which will be logged - meta = dict() - # log env info - env_info_dict = collect_env() - env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) - dash_line = '-' * 60 + '\n' - logger.info('Environment info:\n' + dash_line + env_info + '\n' + - dash_line) - meta['env_info'] = env_info - - # log some basic info - logger.info(f'Distributed training: {distributed}') - logger.info(f'Config:\n{cfg.pretty_text}') - - # set random seeds. 
Force setting fixed seed and deterministic=True in SOT - # configs - if args.seed is not None: - cfg.seed = args.seed - elif cfg.get('seed', None) is None: - cfg.seed = init_random_seed() - cfg.seed = cfg.seed + dist.get_rank() if args.diff_seed else cfg.seed - - deterministic = True if args.deterministic else cfg.get( - 'deterministic', False) - logger.info(f'Set random seed to {cfg.seed}, ' - f'deterministic: {deterministic}') - set_random_seed(cfg.seed, deterministic=deterministic) - meta['seed'] = cfg.seed - - if cfg.get('train_cfg', False): - model = build_model( - cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # enable automatically scaling LR + if args.auto_scale_lr: + if 'auto_scale_lr' in cfg and \ + 'enable' in cfg.auto_scale_lr and \ + 'base_batch_size' in cfg.auto_scale_lr: + cfg.auto_scale_lr.enable = True + else: + raise RuntimeError('Can not find "auto_scale_lr" or ' + '"auto_scale_lr.enable" or ' + '"auto_scale_lr.base_batch_size" in your' + ' configuration file.') + cfg.resume = args.resume + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) else: - model = build_model(cfg.model) - model.init_weights() - - datasets = [build_dataset(cfg.data.train)] - if len(cfg.workflow) == 2: - val_dataset = copy.deepcopy(cfg.data.val) - val_dataset.pipeline = cfg.data.train.pipeline - datasets.append(build_dataset(val_dataset)) - if cfg.checkpoint_config is not None: - # save mmtrack version, config file content and class names in - # checkpoints as meta data - cfg.checkpoint_config.meta = dict( - mmtrack_version=__version__, - config=cfg.pretty_text, - CLASSES=datasets[0].CLASSES) - # add an attribute for visualization convenience - model.CLASSES = datasets[0].CLASSES - train_model( - model, - datasets, - cfg, - distributed=distributed, - validate=(not args.no_validate), - timestamp=timestamp, - meta=meta) + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() if __name__ == '__main__':
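Taken together, the rewritten entry points delegate everything the old scripts did by hand (distributed init, logging, seeding, checkpointing, the train/test loops) to mmengine's Runner. A condensed sketch of the flow the new tools/train.py and tools/test.py implement, assuming a 1.x-style config that defines the model, dataloaders and loops (the config path is a placeholder taken from tools/dev.sh):

```python
# Condensed sketch of the Runner-based flow; not a drop-in replacement for the scripts above.
from mmengine.config import Config
from mmengine.runner import Runner

from mmtrack.utils import register_all_modules

register_all_modules(init_default_scope=False)  # populate mmtrack's registries

cfg = Config.fromfile('configs/mot/deepsort/my_config.py')  # placeholder config
cfg.work_dir = './work_dirs/demo'
cfg.launcher = 'none'            # the Runner handles (non-)distributed setup itself

runner = Runner.from_cfg(cfg)    # builds model, datasets, hooks and loops from the config
runner.train()                   # or set cfg.load_from and call runner.test()
```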