From 23d334b2e05f5fdc7320384c8b47c370c602dd54 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 19 Jul 2022 11:38:17 -0700
Subject: [PATCH 01/30] Add back support for PYTORCH_TEST_WITH_MPS (#66)

Fix the TEST_WITH_MPS macro.
---
 torch/testing/_internal/common_device_type.py | 8 +++-----
 torch/testing/_internal/common_utils.py       | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 8e34ec10a83505..75e87155c7ca01 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -13,7 +13,7 @@
 import torch.backends.mps
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, TEST_WITH_MPS, \
     _TestParametrizer, compose_parametrize_fns, dtype_name, \
     NATIVE_DEVICES, skipIfTorchDynamo
 from torch.testing._internal.common_cuda import _get_torch_cuda_version, \
@@ -555,10 +555,8 @@ def get_device_type_test_bases():
         test_bases.append(CPUTestBase)
         if torch.cuda.is_available():
             test_bases.append(CUDATestBase)
-        # Disable MPS testing in generic device testing temporarily while we're
-        # ramping up support.
-        # elif torch.backends.mps.is_available():
-        #     test_bases.append(MPSTestBase)
+        elif torch.backends.mps.is_available():
+            test_bases.append(MPSTestBase)
     return test_bases

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 03193f5ed7b270..66466c56aa3a93 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -896,6 +896,7 @@ def _check_module_exists(name: str) -> bool:
 TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1'
 TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1'
 TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'
+TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1'

 # Enables tests that are slow to run (disabled by default)
 TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1'

From 12085cffc8cc30658617b147da9db143026d5591 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 24 Jan 2023 12:37:40 -0800
Subject: [PATCH 02/30] Enable MPS CI runners (#252)

* Test MPS CI runners
* Cherry pick remaining files
* Enable lintrunner:
* Change lint runner
* Retrigger checks
* Retrigger checks #2
* Retrigger checks #3
* Retrigger checks #4
* Retrigger checks #5
* Retrigger checks #5
* Retrigger checks #7
* Retrigger checks #8
* Retrigger checks #9
* Retrigger checks #9 (change arch to arm)
* Retrigger checks #10
* Retrigger checks #11
* Retrigger checks #12
* Retrigger checks #13
* Retrigger checks #14
* Retrigger checks #14
* Retrigger checks #15
* Retrigger checks #16
* Retrigger checks #16
* Retrigger checks #17
* Retrigger checks #19
* Retrigger checks #20
* Retrigger checks #21
* Fix lintrunner
* Fix lintrunner
* Remove lint.json
---
 .github/workflows/_mac-build.yml   |   4 +-
 .github/workflows/_mac-test.yml    |   6 +
 .github/workflows/check-labels.yml |  44 ---
 .github/workflows/lint.yml         |  69 ++++-
 .github/workflows/mac-mps.yml      |  11 +-
 test/test_mps.py                   | 468 ++++++++++++++++++++++++++++-
 6 files changed, 547 insertions(+), 55 deletions(-)
 delete mode 100644
.github/workflows/check-labels.yml diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index f5f66ae5129bf7..58c70125b71178 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -63,8 +63,8 @@ on: jobs: build: - # Don't run on forked repos. - if: github.repository_owner == 'pytorch' + # # Don't run on forked repos. + # if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner-type }} env: # For sccache access (only on non-forked PRs) diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index d8ede95f2958dd..f61a3d28a3451c 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -182,6 +182,12 @@ jobs: run: | cat test/**/*.log || true + - name: Print remaining test logs + shell: bash + if: always() + run: | + cat test/**/*.log || true + - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml deleted file mode 100644 index 5fa5fed16daf80..00000000000000 --- a/.github/workflows/check-labels.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Check Labels - -on: - pull_request: - types: [opened, synchronize, reopened, labeled, unlabeled] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - check-labels: - name: Check labels - runs-on: linux.20_04.4x - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - with: - submodules: false - fetch-depth: 1 - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/.github/requirements-gha-cache.txt - - - name: Install requirements - id: requirements - run: | - pip install -r .github/requirements-gha-cache.txt --user - - - name: Check labels - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUM: ${{ github.event.number }} - run: | - set -ex - python3 .github/scripts/check_labels.py "${PR_NUM}" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5dc152286e5039..98a941d48b8385 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -5,9 +5,6 @@ on: push: branches: - master - - main - - release/* - - landchecks/* workflow_dispatch: # The names of steps that actually test the code should be suffixed with `(nonretryable)`. 
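Before the MPS lint job in the next hunk, a note on the gate this CI relies on: the TEST_WITH_MPS constant restored in [PATCH 01/30] is how a run opts in to MPS testing. A minimal sketch of the mechanics; the helper below is illustrative and not part of the diff, since patch 01 only adds the import and re-enables MPSTestBase unconditionally:

    import os

    import torch

    # Same derivation as the constant added to common_utils.py in patch 01:
    # an environment switch the CI job flips with PYTORCH_TEST_WITH_MPS=1.
    TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1'

    def want_mps_test_base() -> bool:
        # Hypothetical helper: register the generic-device MPS test base only
        # when the hardware is present and the flag was exported.
        return torch.backends.mps.is_available() and TEST_WITH_MPS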
@@ -251,6 +248,72 @@ jobs: # All we need to see is that it passes python3 torch/utils/collect_env.py + runs-on: macos-m1-12 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 + + - name: Setup miniconda + uses: pytorch/test-infra/.github/actions/setup-miniconda@main + with: + python-version: 3.9 + environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} + # pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt + + - name: Install requirements + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} + run: | + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} python3 -m pip install --force-reinstall -r .github/requirements-gha-cache.txt + + - name: Initialize lint dependencies + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} + run: | + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} lintrunner init + + - name: Do build steps necessary for linters + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} + run: | + # shellcheck disable=SC1090 + set -ex + ${CONDA_RUN} python3 -m tools.linter.clang_tidy.generate_build_files + ${CONDA_RUN} python3 -m tools.generate_torch_version --is_debug=false + ${CONDA_RUN} python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + + - name: Run lintrunner on all MPS files (nonretryable) + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + PY_VERS: 3.9 + shell: arch -arch arm64 bash {0} + run: | + # shellcheck disable=SC1090 + set -ex + set +e + if ! 
${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + exit 1 + fi + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index 663eac84514fe7..a2ca4867fd76b8 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -1,10 +1,11 @@ name: Mac MPS on: - push: - tags: - - ciflow/mps/* - workflow_dispatch: + # push: + # tags: + # - ciflow/mps/* + # workflow_dispatch: + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -18,7 +19,7 @@ jobs: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 xcode-version: "13.3.1" - runner-type: macos-12-xl + runner-type: macos-m1-13 build-generates-artifacts: true # To match the one pre-installed in the m1 runners python_version: 3.9.12 diff --git a/test/test_mps.py b/test/test_mps.py index b3740b5cd1148c..2b186d8f4c1984 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1212,7 +1212,11 @@ def test_norm(self): self.assertEqual(res, res_cpu) c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps") +<<<<<<< HEAD c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu") +======= + c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu") +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) res = torch.norm(c, dim=0) res_cpu = torch.norm(c_cpu, dim=0) @@ -2371,12 +2375,21 @@ def helper(x, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), False, False) +<<<<<<< HEAD helper(torch.randint(3, (10, )), False, False) helper(torch.randint(3, (10, )), True, False) helper(torch.randint(3, (10, )), False, True) helper(torch.randint(3, (10, )), True, True) helper(torch.randint(3, (1, )), True, True) helper(torch.randint(3, (0, )), True, True) +======= + helper(torch.randint(3, (10,)), False, False) + helper(torch.randint(3, (10,)), True, False) + helper(torch.randint(3, (10,)), False, True) + helper(torch.randint(3, (10,)), True, True) + helper(torch.randint(3, (1,)), True, True) + helper(torch.randint(3, (0,)), True, True) +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) def test_unique_consecutive(self): def helper(x, dim, return_inverse, return_counts): @@ -2388,6 +2401,7 @@ def helper(x, dim, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False) +<<<<<<< HEAD helper(torch.randint(3, (10, )), 0, False, False) helper(torch.randint(3, (10, )), 0, True, False) helper(torch.randint(3, (10, )), 0, False, True) @@ -2395,6 +2409,15 @@ def helper(x, dim, return_inverse, return_counts): helper(torch.randint(3, (10, )), 0, True, True) helper(torch.randint(3, (1, )), 0, True, True) helper(torch.randint(3, (0, )), 0, True, True) +======= + helper(torch.randint(3, (10,)), 0, False, False) + helper(torch.randint(3, (10,)), 0, True, False) + helper(torch.randint(3, (10,)), 0, False, True) + helper(torch.randint(3, (10,)), 0, True, True) + 
helper(torch.randint(3, (10,)), 0, True, True) + helper(torch.randint(3, (1,)), 0, True, True) + helper(torch.randint(3, (0,)), 0, True, True) +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True) @@ -4776,6 +4799,11 @@ def helper(shape, padding, op, value=0): helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) # check the workaround for the right padding bug in Monterey helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d) +<<<<<<< HEAD +======= + # input size < pad size + helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) # Test stack forward def test_stack(self): @@ -5058,6 +5086,7 @@ def helper(shape, dim=0): for dim in range(len(shape)): helper(shape, dim) +<<<<<<< HEAD # Test softplus def test_softplus(self): def helper(shape, beta=1, threshold=20): @@ -5081,8 +5110,31 @@ def helper(shape, beta=1, threshold=20): for beta in [0.5, 1, 2, 3, 4]: for threshold in [0.5, 20, 30, 40, 50]: helper(shape, beta, threshold) +======= + # # Test softplus + # def test_softplus(self): + # def helper(shape, beta=1, threshold=20): + # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + # x = cpu_x.detach().clone().to('mps').requires_grad_() + + # softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x) + # softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x) + + # cpu_grad = torch.randn(softplus_result.shape) + # grad = cpu_grad.to('mps') + + # softplus_result.backward(gradient=grad) + # softplus_result_cpu.backward(gradient=cpu_grad) + + # self.assertEqual(softplus_result, softplus_result_cpu) + # self.assertEqual(x.grad, cpu_x.grad) +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) - # Test silu + # # Test empty shape too + # for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]: + # for beta in [0.5, 1, 2, 3, 4]: + # for threshold in [0.5, 20, 30, 40, 50]: + # helper(shape, beta, threshold) def test_silu(self): def helper(shape): @@ -5776,7 +5828,11 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str=" # for reduce in ["sum", "prod", "amax", "amin"]: for reduce_type in ["add", "multiply"]: +<<<<<<< HEAD helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type) +======= + helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce) +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type) helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type) helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type) @@ -9511,6 +9567,7 @@ class TestConsistency(TestCaseMPS): # If the dtype list is None, all dtypes are excluded. 
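# (Together with FAST_MATH_PRECISION_ISSUES, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES
# and UNIMPLEMENTED_OPS, this dict is folded into MPS_SKIP_LIST via reduce in a
# later hunk of this patch; note the functools reduce import itself only lands
# in [PATCH 04/30].)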
# All the entries in this list should be removed BLOCKLIST = { +<<<<<<< HEAD # Functions that hang 'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool], # + forward when requires_grad=True or running backward @@ -9621,6 +9678,393 @@ class TestConsistency(TestCaseMPS): 'inner': None, 'dstack': None, 'take_along_dim': None, +======= + # Functions that hard crash + 'nn.functional.softplus': [torch.float32], + 'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16], + 'sgn': [torch.bool], + 'linalg.inv': [torch.float32], + 'linalg.inv_ex': [torch.float32], + 'linalg.matrix_power': [torch.float32], + 'nn.functional.interpolate': [torch.float32], + 'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.interpolatearea': [torch.float32], + 'resize_as_': [torch.float16, torch.float32], + 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8], + + # Functions with correctness issues + 'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'divfloor_rounding': [torch.int16, torch.int32, torch.int64], + 'divtrunc_rounding': [torch.float16], + 'norm': [torch.float16], + 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], + 'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'addr': [torch.float16], + 'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'trace': [torch.int64], + 'normalnumber_mean': [torch.float16, torch.float32], + 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'multinomial': [torch.float32], + 'floor_divide': [torch.int16, torch.int32, torch.int64], + 'dist': [torch.float16], + + # failure due to issue: atan2() may generate NAN in output with + 'atan2': [torch.bool, torch.int16, torch.int32, torch.uint8], + + # Unsupported Border padding mode + 'grid_sampler_2d': [torch.float32], + 'nn.functional.grid_sample': [torch.float32], + + # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor() + # when both ceilMode and includeZeroPadToAverage are True + 'nn.functional.avg_pool1d': [torch.float32, torch.int64], + 'nn.functional.avg_pool2d': [torch.float32, torch.int64], + 'nn.functional.adaptive_avg_pool1d': [torch.float32], + 'nn.functional.adaptive_avg_pool2d': [torch.float32], + } + + UNIMPLEMENTED_OPS = { + # Failures due to lack of op implementation on MPS backend + 'linalg.eig': [torch.float32], + 'linalg.eigvals': [torch.float32], + 'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, 
torch.int64, torch.uint8], + 'stft': [torch.float32], + 'nn.functional.conv_transpose3d': [torch.int64, torch.float32], + 'rounddecimals_neg_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rounddecimals_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rounddecimals_0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + '__rmod__': [torch.float16, torch.float32], + '__rsub__': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'aminmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'angle': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'argsort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'bucketize': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cholesky': [torch.float32], + 'cholesky_inverse': [torch.float32], + 'cholesky_solve': [torch.float32], + 'copysign': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cummax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cummin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cumprod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'digamma': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'erfc': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'erfinv': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fmod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'frexp': [torch.float16, torch.float32], + 'gcd': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'geqrf': [torch.float32], + 'heaviside': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'histc': [torch.float32], + 'histogram': [torch.float32], + 'histogramdd': [torch.float32], + 'hypot': [torch.float32], + 'i0': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'igamma': [torch.float16, torch.float32], + 'igammac': [torch.float16, torch.float32], + 'index_copy': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'index_fill': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'index_reduce': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isin': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isneginf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'isposinf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'kthvalue': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lcm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'ldexp': 
[torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lerp': [torch.float32], + 'lgamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.cholesky': [torch.float32], + 'linalg.cholesky_ex': [torch.float32], + 'linalg.cond': [torch.float32], + 'linalg.detsingular': [torch.float32], + 'linalg.det': [torch.float32], + 'linalg.eigh': [torch.float32], + 'linalg.eigvalsh': [torch.float32], + 'linalg.householder_product': [torch.float32], + 'linalg.ldl_factor': [torch.float32], + 'linalg.ldl_factor_ex': [torch.float32], + 'linalg.ldl_solve': [torch.float32], + 'linalg.lstsq': [torch.float32], + 'linalg.lstsqgrad_oriented': [torch.float32], + 'linalg.lu': [torch.float32], + 'linalg.lu_factor': [torch.float32], + 'linalg.lu_factor_ex': [torch.float32], + 'linalg.lu_solve': [torch.float32], + 'linalg.matrix_norm': [torch.float32], + 'linalg.norm': [torch.float32], + 'linalg.normsubgradients_at_zero': [torch.float32], + 'linalg.qr': [torch.float32], + 'linalg.slogdet': [torch.float32], + 'linalg.solve': [torch.float32], + 'linalg.solve_ex': [torch.float32], + 'linalg.svdvals': [torch.float32], + 'linalg.tensorsolve': [torch.float32], + 'linalg.vander': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.vecdot': [torch.float32], + 'logcumsumexp': [torch.float32], + 'logdet': [torch.float32], + 'logit': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'lu': [torch.float32], + 'lu_solve': [torch.float32], + 'lu_unpack': [torch.float32], + 'masked.cumprod': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'masked.median': [torch.float32], + 'masked_scatter': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'matrix_exp': [torch.float32], + 'mode': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'msort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgamma': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_1': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_3': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'mvlgammamvlgamma_p_5': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nanquantile': [torch.float32], + 'nanmean': [torch.float32, torch.float16], + 'nanmedian': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nansum': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'native_dropout_backward': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nextafter': [torch.float32], + 'normnuc': [torch.float32], + 'nn.functional._scaled_dot_product_attention': [torch.float32], + 'nn.functional.fractional_max_pool2d': [torch.float32], + 'nn.functional.fractional_max_pool3d': [torch.float32], + 'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32], + 'nn.functional.adaptive_max_pool3d': [torch.float32], + 'nn.functional.interpolatebicubic': [torch.float32], + 'nn.functional.interpolatelinear': [torch.float32], + 'nn.functional.interpolatetrilinear': [torch.float32], + 'nn.functional.max_unpool1dgrad': [torch.float32], + 'nn.functional.max_unpool2dgrad': [torch.float32], + 'nn.functional.max_unpool3dgrad': [torch.float32], + 
'nn.functional.avg_pool3d': [torch.float32, torch.int64], + 'nn.functional.ctc_loss': [torch.float32], + 'nn.functional.embedding_bag': [torch.float16, torch.float32], + 'nn.functional.max_pool2d': [torch.float32], + 'nn.functional.hardshrink': [torch.float32], + 'nn.functional.hardsigmoid': [torch.float32], + 'nn.functional.logsigmoid': [torch.float32], + 'nn.functional.max_pool3d': [torch.float32], + 'nn.functional.max_unpool1d': [torch.float32], + 'nn.functional.max_unpool2d': [torch.float32], + 'nn.functional.max_unpool3d': [torch.float32], + 'nn.functional.mish': [torch.float32], + 'nn.functional.multi_margin_loss': [torch.float32], + 'nn.functional.multilabel_margin_loss': [torch.float32], + 'nn.functional.multilabel_soft_margin_loss': [torch.float32], + 'nn.functional.pdist': [torch.float32], + 'nn.functional.rrelu': [torch.float32], + 'nn.functional.softshrink': [torch.float32], + 'nn.functional.unfold': [torch.float16, torch.float32], + 'nn.functional.norm': [torch.float32], + 'ormqr': [torch.float32], + 'pca_lowrank': [torch.float32], + 'pinverse': [torch.float32], + 'polar': [torch.float32], + 'polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'qr': [torch.float32], + 'quantile': [torch.float32], + 'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8], + 'renorm': [torch.float16, torch.float32], + 'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducemin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducemean': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reduceprod': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'scatter_reducesum': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'searchsorted': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reduce': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reduceoffsets': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'segment_reducelengths': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'sinc': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'sort': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.airy_ai': [torch.bool, torch.float16, 
torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_j0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.chebyshev_polynomial_t': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.chebyshev_polynomial_u': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.hermite_polynomial_h': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.hermite_polynomial_he': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.laguerre_polynomial_l': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.log_ndtr': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_i0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.ndtri': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.polygammaspecial_polygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.scaled_modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.scaled_modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.spherical_bessel_j0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.xlog1py': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'special.zeta': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'std_mean': [torch.float16, torch.float32], + 'std_meanunbiased': [torch.float16, torch.float32], + 'svd_lowrank': [torch.float32], + 'symeig': [torch.float32], + 'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, 
torch.int32, torch.int64, torch.uint8], + 'var_mean': [torch.float16, torch.float32], + 'var_meanunbiased': [torch.float16, torch.float32], + 'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'view_as_complex': [torch.float16, torch.float32], + 'xlogy': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + } + + EXPECTED_FAILURES = { + # Failures due to unsupported data types on MPS backend + 'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.conv1d': [torch.int64], + 'nn.functional.conv2d': [torch.int64], + 'nn.functional.conv_transpose1d': [torch.int64], + 'nn.functional.softminwith_dtype': [torch.bool, + torch.float16, + torch.float32, + torch.int16, + torch.int32, + torch.int64, + torch.uint8], + 'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + '__rmatmul__': [torch.int16, torch.int32, torch.uint8], + 'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'cdouble': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'cfloat': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'complex': [torch.float16, torch.float32], + 'double': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.fftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.hfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ifftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.irfft': [torch.bool, torch.float32, torch.int16, torch.int32, 
torch.int64, torch.uint8], + 'fft.irfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.irfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'float_power': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'full_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'inner': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.matrix_rank': [torch.float32], + 'linalg.matrix_rankhermitian': [torch.float32], + 'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'linalg.pinv': [torch.float32], + 'linalg.pinvhermitian': [torch.float32], + 'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8], # MPS device does not support mm for non-float inputs + 'mm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'mv': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.batch_norm': [torch.float32], + 'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'signal.windows.blackman': [torch.float16], + 'signal.windows.cosine': [torch.float16], + 'signal.windows.exponential': [torch.float16], + 'signal.windows.gaussian': [torch.float16], + 'signal.windows.general_cosine': [torch.float16], + 'signal.windows.general_hamming': [torch.float16], + 'signal.windows.hamming': [torch.float16], + 'signal.windows.hann': [torch.float16], + 'signal.windows.kaiser': [torch.float16], + 'stft': [torch.float32], + 'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8], + + # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results + 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], + '__rpow__': [torch.int16, torch.int32], + } + + UNDEFINED_BEHAVIOUR = { + # Failures due to random output that they generate using + # Philox engine causing mismatch with CPU results + 'uniform': [torch.float16, torch.float32], + 'rand_like': [torch.float16, torch.float32], + 'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'randn_like': [torch.float16, torch.float32], + 'bernoulli': [torch.float32], + 'normal': [torch.float16, torch.float32, torch.float16, torch.float32], + 'nn.functional.alpha_dropout': [torch.float32], + 'nn.functional.dropout': 
[torch.float32], + 'nn.functional.dropout2d': [torch.float32], + 'nn.functional.dropout3d': [torch.float32], + # these fill tensors with uninitialized data, causing mismatch with CPU + 'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique + 'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # duplicate indices are used in the testcase - undefined behaviour + 'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) } # Those ops worked on MacOS12, but broken on MacOS13 @@ -9642,6 +10086,20 @@ class TestConsistency(TestCaseMPS): 'masked.var', } +<<<<<<< HEAD +======= + dirname = os.path.dirname(__file__) + filename = os.path.join(dirname, "cuda_results.yaml") + with open(filename) as f: + data = yaml.safe_load(f) + CUDA_RESULT = dict() + for key, value in data.items(): + CUDA_RESULT[key] = torch.as_tensor(value) + + MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), ( + FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS)) + +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) # Used for accept mode only NEW_ALLOW_LIST = defaultdict(list) NEW_ALLOW_LIST_GRAD = defaultdict(list) @@ -9728,8 +10186,16 @@ def get_samples(): self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol) except Exception as e: +<<<<<<< HEAD if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]): self.skipTest(f"Expected Runtime Error: {str(e)}") +======= + if any(s in str(e).lower() for s in ["int64", "macos 13"]): + self.skipTest(f"{str(e)}") + + if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol): + continue +>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) if not generate_new_truth: raise e From e8f89dfbb6c91ccad98e8eb2450fd079bafe28a2 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Sat, 4 Feb 2023 11:44:46 -0800 Subject: [PATCH 03/30] Use DISTRIBUTED=1 for MPS CI runners (#292) * Use DISTRIBUTED=1 for MPS CI runners * Disable openmp --- .ci/pytorch/macos-build.sh | 2 +- .github/workflows/_mac-build.yml | 1 + .github/workflows/_mac-test.yml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index dbba68081d3eb5..0b0b1e3599b30b 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -37,7 +37,7 @@ cross_compile_arm64() { # Cross compilation for arm64 # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests # that building with USE_DISTRIBUTED=0 works at all. 
See https://github.com/pytorch/pytorch/issues/86448 - USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel + USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_OPENMP=OFF USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel } compile_x86_64() { diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 58c70125b71178..5a6483ad54b3e8 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -106,6 +106,7 @@ jobs: environment-file: ${{ inputs.environment-file }} - name: Install macOS homebrew dependencies + if: ${{ runner.arch == 'X64' }} run: | # Install dependencies brew install libomp diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index f61a3d28a3451c..fb4ceaad40be98 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -128,6 +128,7 @@ jobs: echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Install macOS homebrew dependencies + if: ${{ runner.arch == 'X64' }} run: | # Install dependencies brew install libomp From 85cdb98935efe7029dbc572d7211fe8a9d6cf417 Mon Sep 17 00:00:00 2001 From: Kulin Seth Date: Tue, 14 Feb 2023 08:21:06 -0800 Subject: [PATCH 04/30] Update the test mps. --- test/cuda_results.yaml | 102 ++ test/test_mps.py | 2157 ++++++++++++++++++++++++---------------- 2 files changed, 1391 insertions(+), 868 deletions(-) create mode 100644 test/cuda_results.yaml diff --git a/test/cuda_results.yaml b/test/cuda_results.yaml new file mode 100644 index 00000000000000..bc6e0948ae5690 --- /dev/null +++ b/test/cuda_results.yaml @@ -0,0 +1,102 @@ +ConsistencyTest: { + nn.functional.conv_transpose2d: + [[[7.399066925048828, 4.4053635597229, -25.85348129272461, + 58.88909149169922, -88.75193786621094, -18.98126983642578, 9.437820434570312], + [-59.78305435180664, -65.34088134765625, -108.04747009277344, 196.6062469482422, + 71.39350891113281, 37.8786735534668, -69.55322265625], [92.78504943847656, + 91.24403381347656, -94.33301544189453, 9.261059761047363, -182.10206604003906, + 141.4270477294922, 146.89010620117188], [-14.363212585449219, 43.454036712646484, + -76.1098403930664, 242.9479522705078, 198.1458282470703, -49.77315139770508, + 5.891449451446533], [-43.56822967529297, 4.782844066619873, -29.526945114135742, + 65.15388488769531, 161.29757690429688, 118.60847473144531, 27.08570671081543], + [68.29853057861328, -11.507468223571777, 2.044086217880249, 11.003862380981445, + 34.993282318115234, -21.256723403930664, 91.49512481689453], [-70.4466781616211, + 69.04386138916016, 7.764842987060547, 7.61972713470459, -28.99899673461914, + 54.575748443603516, -5.762258052825928]], [[-36.238487243652344, 37.29551696777344, + -22.012331008911133, -30.1353702545166, 33.82851028442383, 33.00322341918945, + 2.7218000888824463], [-7.999058246612549, 122.72489929199219, -1.0639530420303345, + 2.9564287662506104, -143.1276092529297, -110.75650024414062, 48.0764274597168], + [-91.0599136352539, -11.656601905822754, 69.62447357177734, 88.12522888183594, + 337.3008728027344, -76.9416732788086, -110.24406433105469], [-108.1512451171875, + 98.42401123046875, 142.46144104003906, -127.48089599609375, -3.367496967315674, + 86.82833099365234, 86.29623413085938], [-14.339198112487793, -52.287410736083984, + 171.43614196777344, 200.14817810058594, 200.35476684570312, 
-189.4150390625, + -46.86980056762695], [30.196495056152344, 25.22877311706543, 95.29426574707031, + 4.455311298370361, 118.48747253417969, 87.11080932617188, -83.6124038696289], + [-2.5434072017669678, 91.8791732788086, -10.615175247192383, -12.58531379699707, + -49.3439826965332, 33.37324523925781, -5.983145713806152]], [[4.551003932952881, + 15.84842586517334, -46.354671478271484, 14.721636772155762, 39.01048278808594, + 49.70054244995117, -18.268564224243164], [16.728954315185547, 129.43505859375, + -4.6139116287231445, -3.382319688796997, -238.76353454589844, 13.42194938659668, + 40.393280029296875], [-2.335604429244995, -85.94283294677734, -142.2253875732422, + 135.27537536621094, 18.01512336730957, -26.331714630126953, -33.35443878173828], + [-79.17593383789062, -93.72674560546875, -110.94194030761719, -61.455223083496094, + 6.811624526977539, 129.06478881835938, 12.435402870178223], [10.859378814697266, + 41.3059196472168, 143.55824279785156, -41.754737854003906, -235.32406616210938, + -70.98460388183594, 130.46929931640625], [193.57574462890625, -142.5060272216797, + -102.45012664794922, 124.68048095703125, 136.05215454101562, -9.650590896606445, + -45.59521484375], [-37.829593658447266, 39.12519454956055, 9.293094635009766, + -18.8004093170166, -0.7294210195541382, 51.884910583496094, 36.15913391113281]], + [[-15.651233673095703, 16.31340980529785, -26.752052307128906, 6.281721115112305, + 43.765541076660156, -13.097319602966309, -30.443206787109375], [10.67841911315918, + 66.1829605102539, -9.394262313842773, -131.45101928710938, -38.621002197265625, + 65.9507064819336, 48.76960372924805], [-76.0918197631836, -9.108996391296387, + 13.64936637878418, 96.7411880493164, 124.2474365234375, -111.50318145751953, + -42.397071838378906], [-83.31562805175781, 32.27967071533203, 250.08163452148438, + 58.24131393432617, 129.95318603515625, -10.683560371398926, -123.84668731689453], + [-11.536887168884277, -15.220125198364258, 197.18821716308594, -31.680112838745117, + -81.35874938964844, 157.96974182128906, 105.61251831054688], [78.15926361083984, + -84.49744415283203, -73.91180419921875, 86.370361328125, 77.87918090820312, + 55.3555908203125, -7.273794651031494], [25.232547760009766, 30.352109909057617, + 53.722267150878906, 44.87421798706055, 44.618812561035156, 4.511796951293945, + 9.039834976196289]]] +} +UnitTest: { + norm: + [ + { + dtype: f16, + args: [[[ 8.9453, 4.0859, 0.1230, 2.1367, -5.0000], + [ 7.2773, -4.6953, -3.5586, 8.2812, -0.8789], + [ 0.7119, -1.4854, 6.8633, -7.9805, -3.6562], + [-1.0195, -7.2695, -0.0264, -3.5078, -0.2900], + [ 8.7656, 5.8984, -2.3125, -0.0352, 5.2812]],], + params: [0.5,], + res: [2000.] 
+ }, + { + dtype: f16, + args: [[[[ 8.9219, 3.0508, -3.0234, -5.6250, -5.3516], + [-5.8906, 5.2109, -7.2500, 7.3047, -0.1846], + [-2.1367, -8.8047, -3.4727, -3.0859, 4.9062], + [ 2.1797, -8.5078, 6.1445, -5.0547, 2.8828], + [-2.6191, 4.6680, -4.1758, 8.7734, -5.4844]], + + [[-5.8984, 7.3281, -7.3672, -0.0879, 7.0039], + [ 2.0117, -6.4258, 8.6250, 2.5137, -2.2676], + [-7.2578, 1.6875, 7.8750, 7.5078, 0.8350], + [-4.8164, -3.6914, -3.9199, 4.9219, -4.6680], + [ 5.0547, -7.1289, 2.3633, 3.7793, -7.4375]], + + [[-8.6953, -3.8750, 0.8965, -4.4453, 6.1328], + [ 8.6719, 2.5586, -3.0664, -7.7891, 2.5234], + [ 5.8008, 0.5977, 4.9219, 3.0156, 3.6211], + [-6.0898, -3.4883, 2.6543, 7.1992, 5.9414], + [-3.6035, 8.3906, 2.2070, -1.1162, 7.2852]], + + [[-2.4531, -2.9180, 6.2422, -6.3711, -8.3516], + [ 3.3398, -8.5078, -8.9375, -2.0312, -4.3320], + [-1.4326, -4.5000, -0.3252, -6.8555, -8.2969], + [ 5.8438, 5.6094, -6.6797, -0.0439, 3.6035], + [ 4.5859, 7.1016, -0.8086, 5.6953, 0.5098]], + + [[ 3.0859, 4.4844, 0.6152, 7.9609, -7.6562], + [-0.7998, -3.4023, 5.7734, -2.4785, 5.9219], + [ 7.1094, 1.4502, -7.1289, 4.7188, -4.8359], + [ 2.7422, -1.9512, 5.6602, -3.6387, -8.6953], + [-4.6953, 0.2900, 2.7148, -0.0176, 7.6992]]],], + params: [1.5], + res: [125.2500] + }, + ], +} diff --git a/test/test_mps.py b/test/test_mps.py index 2b186d8f4c1984..2085d0cebe721a 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -16,8 +16,10 @@ import torch.nn as nn import torch.nn.functional as F import itertools +import yaml +import platform from collections import defaultdict -from torch import inf +from torch._six import inf from torch.nn import Parameter from torch.testing._internal import opinfo from torch.testing._internal.common_utils import \ @@ -26,9 +28,10 @@ from torch.testing import make_tensor from torch.testing._comparison import TensorLikePair from torch.testing._internal.common_dtype import get_all_dtypes, integral_types +import torch.mps import torch.backends.mps from torch.distributions import Uniform, Exponential -from functools import partial +from functools import partial, reduce from torch.testing._internal.common_methods_invocations import ( op_db, @@ -62,6 +65,8 @@ TestCase = object # noqa: F811 NNTestCase = object # noqa: F811 +product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2])) + # Determine whether to enable MPS memory leak check (uses same code as CUDA). 
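+# (product_version above parses platform.mac_ver(), e.g. '13.2.1' -> 13.2, and
+# feeds the @unittest.skipIf(product_version < 13.0, ...) gates added in this
+# patch. TEST_MPS_MEM_LEAK_CHECK below mirrors the CUDA memory-leak harness per
+# the note above; comparing allocator state before and after each test is
+# inferred behavior, not shown in this diff.)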
TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1' @@ -371,6 +376,15 @@ def test_avg_pool2d_ceil_mode(self): class TestMPS(TestCaseMPS): + def help_extra_unit(self, opname, op): + if opname not in OP_UNIT_TEST: + return + for test in OP_UNIT_TEST[opname]: + mps_args = test.sample() + mps_out = op(*mps_args) + mps_out = (mps_out, ) if isinstance(mps_out, torch.Tensor) else mps_out + self.assertEqual(test.expected(), mps_out) + def test_exp(self, device="mps", dtype=torch.float): for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): b = torch.arange(18, device="cpu") / 3 * math.pi @@ -432,6 +446,53 @@ def helper(val, shape): helper(0, [1024]) helper(0.2, [2, 3]) + def test_mm(self): + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.mm(B, C).cpu() + torch.testing.assert_close(D, torch.full((5, 5), 6.0)) + + def test_linalg_cross(self): + def helper(dtype): + device = "mps" + if dtype is torch.int32 or dtype is torch.int64: + x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) + y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) + else: + x = torch.rand(100, 3, 100, dtype=dtype, device=device) + y = torch.rand(100, 3, 100, dtype=dtype, device=device) + x_cpu = x.to("cpu") + y_cpu = y.to("cpu") + res1 = torch.linalg.cross(x, y, dim=1) + res2 = torch.tensor((), dtype=dtype, device=device) + res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) + res2_cpu = torch.tensor((), dtype=dtype, device="cpu") + torch.linalg.cross(x, y, dim=1, out=res2) + torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) + self.assertEqual(res1, res2) + self.assertEqual(res1, res1_cpu) + self.assertEqual(res2, res2_cpu) + + # test for broadcastable inputs + if dtype is torch.int32 or dtype is torch.int64: + x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device) + y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device) + else: + x = torch.rand(1, 3, 2, dtype=dtype, device=device) + y = torch.rand(4, 3, 1, dtype=dtype, device=device) + x_cpu = x.to("cpu") + y_cpu = y.to("cpu") + res1 = torch.linalg.cross(x, y, dim=1) + res2 = torch.tensor((), dtype=dtype, device=device) + res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) + res2_cpu = torch.tensor((), dtype=dtype, device="cpu") + torch.linalg.cross(x, y, dim=1, out=res2) + torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) + self.assertEqual(res1, res2) + self.assertEqual(res1, res1_cpu) + self.assertEqual(res2, res2_cpu) + [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]] + def test_cdist_large(self, device="mps"): for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']: x = torch.randn(100, 10, device=device) @@ -577,53 +638,6 @@ def test_cdist_norm_batch(self, device="mps"): expected = self._brute_cdist(x, y, p=p) self.assertEqual(expected, actual) - def test_mm(self): - B = torch.ones(5, 6).to("mps") - C = torch.ones(6, 5).to("mps") - D = torch.mm(B, C).cpu() - torch.testing.assert_close(D, torch.full((5, 5), 6.0)) - - def test_linalg_cross(self): - def helper(dtype): - device = "mps" - if dtype is torch.int32 or dtype is torch.int64: - x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) - y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device) - else: - x = torch.rand(100, 3, 100, dtype=dtype, device=device) - y = torch.rand(100, 3, 100, dtype=dtype, device=device) - x_cpu = x.to("cpu") - y_cpu = y.to("cpu") - res1 = 
torch.linalg.cross(x, y, dim=1) - res2 = torch.tensor((), dtype=dtype, device=device) - res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) - res2_cpu = torch.tensor((), dtype=dtype, device="cpu") - torch.linalg.cross(x, y, dim=1, out=res2) - torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) - self.assertEqual(res1, res2) - self.assertEqual(res1, res1_cpu) - self.assertEqual(res2, res2_cpu) - - # test for broadcastable inputs - if dtype is torch.int32 or dtype is torch.int64: - x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device) - y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device) - else: - x = torch.rand(1, 3, 2, dtype=dtype, device=device) - y = torch.rand(4, 3, 1, dtype=dtype, device=device) - x_cpu = x.to("cpu") - y_cpu = y.to("cpu") - res1 = torch.linalg.cross(x, y, dim=1) - res2 = torch.tensor((), dtype=dtype, device=device) - res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1) - res2_cpu = torch.tensor((), dtype=dtype, device="cpu") - torch.linalg.cross(x, y, dim=1, out=res2) - torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu) - self.assertEqual(res1, res2) - self.assertEqual(res1, res1_cpu) - self.assertEqual(res2, res2_cpu) - [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]] - def test_cross(self): a = torch.randn(4, 3, device="mps") b = torch.randn(4, 3, device="mps") @@ -640,6 +654,13 @@ def test_addmm(self): D = torch.addmm(A, B, C).to("cpu") torch.testing.assert_close(D, torch.full((5, 5), 7.0)) + def test_addr(self): + A = torch.ones(5, 10).to("mps") + B = torch.ones(5).to("mps") + C = torch.ones(10).to("mps") + D = torch.addr(A, B, C).to("cpu") + torch.testing.assert_close(D, torch.full((5, 10), 2.0)) + def test_bmm(self): batch1_cpu = torch.randn(10, 3, 4) batch2_cpu = torch.randn(10, 4, 5) @@ -653,13 +674,6 @@ def test_bmm(self): self.assertEqual(output_cpu, output_mps) self.assertEqual(output_cpu.size(), output_mps.size()) - def test_addr(self): - A = torch.ones(5, 10).to("mps") - B = torch.ones(5).to("mps") - C = torch.ones(10).to("mps") - D = torch.addr(A, B, C).to("cpu") - torch.testing.assert_close(D, torch.full((5, 10), 2.0)) - def test_trace(self): M_cpu = torch.randn(3, 3) M_mps = M_cpu.detach().clone().to("mps") @@ -1212,11 +1226,7 @@ def test_norm(self): self.assertEqual(res, res_cpu) c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps") -<<<<<<< HEAD - c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu") -======= c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu") ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) res = torch.norm(c, dim=0) res_cpu = torch.norm(c_cpu, dim=0) @@ -1241,6 +1251,8 @@ def test_norm(self): res_cpu = torch.norm(d_cpu[0, :, :]), torch.norm(d_cpu[1, :, :]) self.assertEqual(res, res_cpu) + self.help_extra_unit('norm', torch.norm) + def test_layer_norm(self): # TODO: Test non-contiguous def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dtype=torch.float32): @@ -1822,12 +1834,6 @@ def test_view_slice(self): actual_pts[i, j] = X[pts[i, j], j] self.assertEqual(actual_pts[i, j], actual_pts_mps[i, j]) - def test_slice_scatter(self): - shape = (4, 4) - tensor = torch.randint(10, shape, device="mps") - tensor_before = tensor.clone() - torch.empty(shape[0], shape[1] * 2, device="mps")[:, ::2].copy_(tensor) - torch.testing.assert_close(tensor, tensor_before) def test_slice(self): values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] @@ -1987,99 +1993,6 @@ def helper(shape, repeats): 
helper((3, 4, 5), (2, 3, 4, 5)) helper((3, 4, 5), (2, 2, 2)) - def test_torch_repeat_interleave(self, device="mps"): - y = torch.tensor([[1, 2], [3, 4]], device=device) - # exercise single argument function signature - temp = y.repeat_interleave(2) - self.assertEqual(torch.Size([8]), temp.size()) - - for dtype in [torch.int, torch.long]: - lengths = torch.tensor([1, 2], dtype=dtype, device="mps") - output_size = torch.sum(lengths) - a = torch.repeat_interleave( - y, - lengths, - dim=0, - ) - self.assertEqual(a.dtype, y.dtype) - self.assertEqual(a.size(), torch.Size([3, 2])) - - a_with_output = torch.repeat_interleave( - y, - lengths, - dim=0, - output_size=output_size, - ) - self.assertEqual(a_with_output.dtype, y.dtype) - self.assertEqual(a_with_output.size(), torch.Size([3, 2])) - - def test_repeat_interleave(self, device="mps"): - x = torch.tensor([0, 1, 2, 3], device=device) - expected = torch.tensor([1, 2, 2, 3, 3, 3], dtype=torch.int32, device=device) - self.assertEqual(torch.repeat_interleave(x), expected) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.arange(4, device=device).reshape(2, 2)) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.arange(4.0, device=device)) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(torch.tensor([1, 2, -1, 3, 4], device=device)) - - y = torch.tensor([[1, 2], [3, 4]], device=device) - - y1_v1 = torch.repeat_interleave(y, 2) - y1_v2 = torch.repeat_interleave(y, torch.tensor(2, device=device)) - y1_v3 = torch.repeat_interleave(y, torch.tensor([2], device=device)) - y1_expect = torch.tensor([1, 1, 2, 2, 3, 3, 4, 4], device=device) - self.assertEqual(y1_v1, y1_expect) - self.assertEqual(y1_v2, y1_expect) - self.assertEqual(y1_v3, y1_expect) - - y2 = torch.repeat_interleave(y, 3, dim=1) - y2_expect = torch.tensor([[1, 1, 1, 2, 2, 2], - [3, 3, 3, 4, 4, 4]], device=device) - self.assertEqual(y2, y2_expect) - - y3 = torch.repeat_interleave(y, torch.tensor([1, 2], device=device), dim=0) - y3_expect = torch.tensor([[1, 2], - [3, 4], - [3, 4]], device=device) - self.assertEqual(y3, y3_expect) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(y, torch.tensor([1, 2, 3], device=device), dim=0) - - with self.assertRaises(RuntimeError): - torch.repeat_interleave(y, torch.arange(9, device=device).reshape(3, 3), dim=0) - - # test zero sized dimension - x = torch.zeros((5, 0), device=device) - y = torch.repeat_interleave(x, repeats=3, dim=1) - self.assertEqual(y, x.new_zeros(5, 0, device=device)) - - x = torch.tensor([], dtype=torch.int64, device=device) - y = torch.repeat_interleave(x, x) - self.assertEqual(y, x) - - def test_repeat_interleave_simple(self): - def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None): - x = torch.randn(shape, dtype=dtype, device="mps") - x_cpu = x.detach().clone().cpu() - - num_repeats_cpu = num_repeats.detach().clone().cpu() - - repeats = torch.repeat_interleave(x, num_repeats, dim) - repeats_cpu = torch.repeat_interleave(x_cpu, num_repeats_cpu, dim) - - self.assertEqual(repeats, repeats_cpu) - helper(shape=3, num_repeats=torch.tensor([100], device="mps")) - helper(shape=(2, 2), num_repeats=torch.tensor([3, 3], device="mps"), dim=0) - helper(shape=(10, 15, 8), num_repeats=torch.arange(10, device="mps"), dim=0) - helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1) - helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2) - def test_count_nonzero(self): def 
helper(dtype): n = [ @@ -2155,6 +2068,15 @@ def test_to(self): x_mps = x_cpu.to('mps') self.assertEqual(x_mps.to(torch.float32), x_cpu.to(torch.float32)) + @unittest.skipIf(True, "non-contiguous tensor to mps is incorrect.") + def test_to_non_contiguous(self): + x = torch.arange(16, dtype=torch.float32).reshape(2, 2, 2, 2) + x1 = x[:, :, :1, :] + x2 = x[:, :, 1:, :] + self.assertFalse(x1.is_contiguous()) + self.assertFalse(x2.is_contiguous()) + self.assertEqual(x1, x1.detach().to("mps")) + self.assertEqual(x2, x2.detach().to("mps")) def test_setitem_scalar(self) -> None: device = 'mps' @@ -2228,9 +2150,9 @@ def test_storage_offset_greater_than_src_nbytes(self): tensor_list.append(t) for i in range(0, n_tensors - 1): - t = tensor_list[i].view(1, n_tensor_elems) + t = tensor_list[i].view(1, 784) t_mps = t.to("mps") - self.assertEqual(t, t_mps.cpu(), f"i={i}") + self.assertEqual(t, t_mps.cpu()) # See https://github.com/pytorch/pytorch/issues/82427 # and https://github.com/pytorch/pytorch/issues/83692 @@ -2242,6 +2164,7 @@ def test_full_bugs(self): y_cpu = torch.full((2, 2), 247, device='cpu', dtype=torch.uint8) self.assertEqual(y_mps, y_cpu) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") # See https://github.com/pytorch/pytorch/issues/84995 def test_div_bugs(self): for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']): @@ -2308,6 +2231,7 @@ def ensure_tuple(x): self.assertEqual(expected_inverse.view(additional_shape), y_inverse) self.assertEqual(expected_counts, y_counts) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_unique_all_dtypes(self, device="mps"): def helper(dtype): def ensure_tuple(x): @@ -2363,7 +2287,7 @@ def ensure_tuple(x): if k == i: count += 1 self.assertEqual(j, count) - [helper(dtype) for dtype in [torch.float32, torch.int64, torch.int32, torch.int16, torch.uint8]] + [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]] def test_unique(self): def helper(x, return_inverse, return_counts): @@ -2375,21 +2299,12 @@ def helper(x, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), False, False) -<<<<<<< HEAD - helper(torch.randint(3, (10, )), False, False) - helper(torch.randint(3, (10, )), True, False) - helper(torch.randint(3, (10, )), False, True) - helper(torch.randint(3, (10, )), True, True) - helper(torch.randint(3, (1, )), True, True) - helper(torch.randint(3, (0, )), True, True) -======= helper(torch.randint(3, (10,)), False, False) helper(torch.randint(3, (10,)), True, False) helper(torch.randint(3, (10,)), False, True) helper(torch.randint(3, (10,)), True, True) helper(torch.randint(3, (1,)), True, True) helper(torch.randint(3, (0,)), True, True) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) def test_unique_consecutive(self): def helper(x, dim, return_inverse, return_counts): @@ -2401,15 +2316,6 @@ def helper(x, dim, return_inverse, return_counts): self.assertEqual(result, result_cpu) helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False) -<<<<<<< HEAD - helper(torch.randint(3, (10, )), 0, False, False) - helper(torch.randint(3, (10, )), 0, True, False) - helper(torch.randint(3, (10, )), 0, False, True) - helper(torch.randint(3, (10, )), 0, True, True) - helper(torch.randint(3, (10, )), 0, True, True) - helper(torch.randint(3, (1, )), 0, True, True) - helper(torch.randint(3, (0, )), 0, True, True) -======= helper(torch.randint(3, (10,)), 0, False, False) helper(torch.randint(3, 
(10,)), 0, True, False) helper(torch.randint(3, (10,)), 0, False, True) @@ -2417,7 +2323,6 @@ def helper(x, dim, return_inverse, return_counts): helper(torch.randint(3, (10,)), 0, True, True) helper(torch.randint(3, (1,)), 0, True, True) helper(torch.randint(3, (0,)), 0, True, True) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False) helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True) @@ -2460,134 +2365,6 @@ def test_from_numpy_non_contiguous(self): t_mps = torch.tensor(a, device="mps") self.assertEqual(t_cpu, t_mps.to("cpu")) - # See https://github.com/pytorch/pytorch/issues/86954 - def test_copy_non_contiguous(self): - x = torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1) - self.assertFalse(x.is_contiguous()) - y = x.to('mps') - self.assertFalse(y.is_contiguous()) - self.assertEqual(x, y.to('cpu')) - - x = torch.arange(4**3).reshape(4, 4, 4).permute((2, 0, 1))[1:, ::2] - y = x.to('mps') - self.assertEqual(x, y.to('cpu')) - - x = torch.full((4, 4, 4, 4), 13, device="cpu") - y = torch.full((4, 4, 4, 4), 13, device="mps") - z = torch.arange(4**4).reshape(4, 4, 4, 4).permute(3, 2, 0, 1)[1::, ::2] - x.permute(3, 2, 1, 0)[1::, ::2] = z - # As y is on MPS and z on CPU, this dispatches to a copy operator - y.permute(3, 2, 1, 0)[1::, ::2] = z - self.assertEqual(x, y.to('cpu')) - - # See https://github.com/pytorch/pytorch/pull/84742 - # and https://github.com/pytorch/pytorch/pull/78319 - def test_binops_dtype_precedence(self): - # Test dtype precedence (casting order) in binary operations by comparing to CPU result - # Example values for all dtypes supported on the MPS backend - sample_vals = { - torch.bool: [False, True], - torch.int16: [-15, 0, 1, 10], - torch.int32: [-376, 0, 1, 13], - torch.int64: [-8, 0, 1, 77], - torch.float16: [-234.5, 0.0, 1.0, 2.0], - torch.float32: [-1.0, 0.0, 0.1, 111.99], - } - # Test all combinations of dtypes, operations, dimensionality - for dtype1, dtype2, binop in itertools.product( - sample_vals.keys(), sample_vals.keys(), ['add', 'sub', 'mul', 'div']): - # bool minus bool is generally unsupported, so skip - if binop == 'sub' and (dtype1 == torch.bool or dtype2 == torch.bool): - continue - full_shape = (10,) - for val1, val2 in itertools.product(sample_vals[dtype1], sample_vals[dtype2]): - # print(f'{dtype1},{dtype2}: ({val1}).{binop}({val2})') - # print(getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - # (torch.tensor(val2, dtype=dtype2, device='mps'))) - # print(getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - # (torch.tensor(val2, dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.tensor(val2, dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.tensor(val2, dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop) - (torch.tensor([val2], dtype=dtype2, device='mps')), - getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop) - (torch.tensor([val2], dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.tensor([val2], dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.tensor([val2], dtype=dtype2, device='cpu'))) - self.assertEqual( - getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop) - 
(torch.tensor(val2, dtype=dtype2, device='mps')), - getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop) - (torch.tensor(val2, dtype=dtype2, device='cpu'))) - # Test tensors created with torch.full - x1 = torch.full(full_shape, val1, dtype=dtype1, device='mps') - y1 = torch.tensor(val2, dtype=dtype2, device='mps') - x2 = torch.full(full_shape, val1, dtype=dtype1, device='cpu') - y2 = torch.tensor(val2, dtype=dtype2, device='cpu') - self.assertEqual(getattr(x1, binop)(y1), getattr(x2, binop)(y2)) - x3 = torch.tensor(val1, dtype=dtype1, device='mps') - y3 = torch.full(full_shape, val2, dtype=dtype2, device='mps') - x4 = torch.tensor(val1, dtype=dtype1, device='cpu') - y4 = torch.full(full_shape, val2, dtype=dtype2, device='cpu') - self.assertEqual(getattr(x3, binop)(y3), getattr(x4, binop)(y4)) - self.assertEqual( - getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop) - (torch.full(full_shape, val2, dtype=dtype2, device='mps')), - getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop) - (torch.full(full_shape, val2, dtype=dtype2, device='cpu'))) - - def test_nansum(self): - def helper(dtype, noncontiguous, dim): - zero_cpu = torch.zeros((), dtype=dtype) - - # Randomly scale the values - scale = random.randint(10, 100) - x_cpu: torch.Tensor = make_tensor( - (5, 5), dtype=dtype, device='cpu', - low=-scale, high=scale, noncontiguous=noncontiguous) - - if dtype.is_floating_point: - nan_mask_cpu = x_cpu < (0.2 * scale) - x_no_nan_cpu = torch.where(nan_mask_cpu, zero_cpu, x_cpu) - x_cpu[nan_mask_cpu] = np.nan - else: - x_no_nan_cpu = x_cpu - - x_mps = x_cpu.to('mps') - actual_out_mps = torch.empty(0, dtype=dtype, device='mps') - expect_out_cpu = torch.empty(0, dtype=dtype) - dim_kwargs = {"dim": dim} if dim is not None else {} - expect = torch.sum(x_no_nan_cpu, **dim_kwargs) - - actual_cpu = torch.nansum(x_cpu, **dim_kwargs) - # Sanity check on CPU - self.assertEqual(expect, actual_cpu) - - # Test MPS - actual_mps = torch.nansum(x_mps, **dim_kwargs) - # Test out= variant - torch.nansum(x_mps, out=actual_out_mps, **dim_kwargs) - torch.nansum(x_cpu, out=expect_out_cpu, **dim_kwargs) - self.assertEqual(expect, actual_mps) - self.assertEqual(expect_out_cpu, actual_out_mps) - - args = itertools.product( - (torch.float16, torch.float32, torch.int32, torch.int64), # dtype - (True, False), # noncontiguous - (0, 1, None), # dim - ) - - for dtype, noncontiguous, dim in args: - with self.subTest(dtype=dtype, noncontiguous=noncontiguous, dim=dim): - helper(dtype, noncontiguous, dim) - def test_cumsum_all_dtypes(self): def helper(dtype): t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype) @@ -2605,22 +2382,32 @@ def helper(dtype): e_string = str(e) self.assertEqual(e_string, "MPS does not support cumsum op with int64 input") - def test_cumsum_minus_one_axis(self): - def helper(dtype): - # Test with axis -1 - cpu_x = None - if(dtype == torch.float32): - cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32) - else: - cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32) + def test_gelu_tanh(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) x = cpu_x.detach().clone().to('mps') - cpu_y = cpu_x.cumsum(-1) - y = x.cumsum(-1) + gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh') + gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh') + self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu) - self.assertEqual(y, cpu_y) + helper((2, 8, 4, 5)) - [helper(dtype) for dtype in 
[torch.float32, torch.int16, torch.int32, torch.uint8]] + # # Failures due to precision issues, enable after resolving from mps + # def test_div_floor_int(self): + # def helper(shape, dtype): + # cpu_x = torch.randint(-9999, -1,shape, device='cpu', dtype=dtype) + # x = cpu_x.detach().clone().to('mps') + + # cpu_y = torch.randint(1, 9999, shape, device='cpu', dtype=dtype) + # y = cpu_y.detach().clone().to('mps') + + # div_result = torch.div(x, y,rounding_mode='floor') + # div_result_cpu = torch.div(cpu_x, cpu_y, rounding_mode='floor') + # self.assertEqual(div_result, div_result_cpu) + + # helper((2, 8, 4, 5), torch.int16) + # helper((2, 8, 4, 5), torch.int32) def test_median_int16(self): def helper(shape, dtype): @@ -2633,6 +2420,23 @@ def helper(shape, dtype): helper((2, 8, 4, 5), torch.int16) + def test_cumsum_minus_one_axis(self): + def helper(dtype): + # Test with axis -1 + cpu_x = None + if dtype == torch.float32: + cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32) + else: + cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32) + x = cpu_x.detach().clone().to('mps') + + cpu_y = cpu_x.cumsum(-1) + y = x.cumsum(-1) + + self.assertEqual(y, cpu_y) + + [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]] + class TestLogical(TestCaseMPS): def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False): return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad) @@ -2785,6 +2589,20 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self): class TestNLLLoss(TestCaseMPS): + def test_nll2d_loss_backward(self, device='mps'): + a = torch.randn(3, 5, requires_grad=True, device=device) + b = torch.tensor([1, 0, 4], device=device) + loss = nn.NLLLoss() + out = loss(a, b) + self.assertIsNone(out.grad_fn._saved_weight) + loss = nn.NLLLoss(weight=torch.ones((5,), device=device)) + out = loss(a, b) + self.assertEqual(out.grad_fn._saved_weight, torch.ones((5,))) + + out.sum().backward() + with self.assertRaisesRegex(RuntimeError, "after they have already been freed"): + out.grad_fn._saved_weight + def test_nll_loss_mismatched_batch(self, device='mps'): x = torch.randn((10, 3), requires_grad=True, device=device) # t should have size (10,) @@ -2846,13 +2664,13 @@ def _nll_loss_helper(self, input_size, reduction, expected): input = torch.rand(input_size, requires_grad=True, device='cpu') num_channels = input_size[1] target_size = (input_size[0], ) + tuple(input_size[2:]) - target = torch.randint(num_channels, target_size, device='cpu') weights = torch.randn(num_channels) + weights_mps = weights.to("mps") + target = torch.randint(num_channels, target_size, device='cpu') # MPS input_mps = input.detach().clone().to('mps').requires_grad_() target_mps = target.detach().clone().to('mps') - weights_mps = weights.to("mps") output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction) output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction) @@ -3389,6 +3207,7 @@ def test_eq(self): self.assertEqual(result_cpu, result_mps.to('cpu')) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_signed_vs_unsigned_comparison(self): cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8) mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8) @@ -4542,26 +4361,6 @@ def helper(shape): helper((5, 9, 7, 4)) helper((50, 20, 7, 4)) - def test_sort(self): - for SIZE in (4, 2049): - device = 'mps' - x = torch.rand(4, SIZE, device=device) - res1val, res1ind = 
torch.sort(x) - - res2val = torch.tensor((), device=device) - res2ind = torch.tensor((), device=device, dtype=torch.long) - torch.sort(x, out=(res2val, res2ind)) - self.assertEqual(res1val, res2val, atol=0, rtol=0) - self.assertEqual(res1ind, res2ind, atol=0, rtol=0) - self.assertEqual(torch.argsort(x), res1ind) - self.assertEqual(x.argsort(), res1ind) - - self.assertEqual( - torch.sort(torch.tensor((50, 40, 30, 20, 10), device=device))[0], - torch.tensor((10, 20, 30, 40, 50), device=device), - atol=0, rtol=0 - ) - def test_upsample_nearest2d(self): def helper(N, C, H, W): inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, @@ -4627,6 +4426,7 @@ def helper(N, C, H, W): helper(1, 1, 4, 4) helper(7, 5, 3, 2) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_interpolate(self): def helper(shape, output_size, scales, mode, align_corners=False): inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) @@ -4776,6 +4576,8 @@ def helper(shape, padding, op, value=0): helper((2, 1, 6, 8), 2, nn.ReplicationPad2d) # verify if a change in shape of padding would cause problems with graph caching helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d) + # negative padding + helper((1, 3, 4, 4), (-1, 1, -2, 1), nn.ReplicationPad2d) # Constant Pad 2D helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d) # input size < pad size @@ -4795,15 +4597,10 @@ def helper(shape, padding, op, value=0): helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d) # Constant Pad 3D helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) - # input size < pad size - helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) # check the workaround for the right padding bug in Monterey helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d) -<<<<<<< HEAD -======= # input size < pad size helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) # Test stack forward def test_stack(self): @@ -5086,7 +4883,6 @@ def helper(shape, dim=0): for dim in range(len(shape)): helper(shape, dim) -<<<<<<< HEAD # Test softplus def test_softplus(self): def helper(shape, beta=1, threshold=20): @@ -5110,31 +4906,8 @@ def helper(shape, beta=1, threshold=20): for beta in [0.5, 1, 2, 3, 4]: for threshold in [0.5, 20, 30, 40, 50]: helper(shape, beta, threshold) -======= - # # Test softplus - # def test_softplus(self): - # def helper(shape, beta=1, threshold=20): - # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) - # x = cpu_x.detach().clone().to('mps').requires_grad_() - - # softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x) - # softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x) - - # cpu_grad = torch.randn(softplus_result.shape) - # grad = cpu_grad.to('mps') - # softplus_result.backward(gradient=grad) - # softplus_result_cpu.backward(gradient=cpu_grad) - - # self.assertEqual(softplus_result, softplus_result_cpu) - # self.assertEqual(x.grad, cpu_x.grad) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) - - # # Test empty shape too - # for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]: - # for beta in [0.5, 1, 2, 3, 4]: - # for threshold in [0.5, 20, 30, 40, 50]: - # helper(shape, beta, threshold) + # Test silu def test_silu(self): def helper(shape): @@ -5340,17 +5113,6 @@ def _gelu_ref(X): finally: torch.set_num_threads(num_threads) - def test_gelu_tanh(self): - def helper(shape): - cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) - x = 
cpu_x.detach().clone().to('mps') - - gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh') - gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh') - self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu) - - helper((2, 8, 4, 5)) - # Test hardtanh def test_hardtanh(self): def helper(shape, min_val, max_val, inplace=False): @@ -5527,14 +5289,14 @@ def helper(shape): # Test index add def test_index_add(self): - def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dtype=torch.int32): - cpu_x = torch.randn(shape, device='cpu', dtype=x_dtype, requires_grad=False) + def helper(shape, dim, index, source_shape, alpha, idx_dtype=torch.int32): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) x = cpu_x.detach().clone().to('mps') cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) idx = cpu_idx.detach().clone().to('mps') - cpu_source = torch.randn(source_shape, device='cpu', dtype=x_dtype, requires_grad=False) + cpu_source = torch.randn(source_shape, device='cpu', dtype=torch.float, requires_grad=False) source = cpu_source.detach().clone().to('mps') idx_result = torch.index_add(x, dim=dim, index=idx, source=source, alpha=alpha) @@ -5550,8 +5312,6 @@ def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dt # test result dim=1 helper((2,), 0, [1], (1,), 6.0) helper(2, 0, 1, 1, 6) - # test float16 - helper((2,), 0, [1], (1,), 6.0, x_dtype=torch.float16) # Test flip def test_flip(self): @@ -5595,23 +5355,6 @@ def helper(shape, dim, index, idx_dtype=torch.int32): helper((2, 8, 4, 5), 2, [3, 0, 1]) helper((2, 8, 4, 5), 3, [2, 3, 0]) helper((2, 3, 3), -1, [1, 2]) - helper((), 0, [0]) - helper((5), 0, []) - - def test_index_select_scalar(self): - def helper(value, dim, index, idx_dtype=torch.int32): - cpu_x = torch.tensor(value, device='cpu', dtype=torch.float, requires_grad=False) - x = cpu_x.detach().clone().to('mps') - - cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) - idx = cpu_idx.detach().clone().to('mps') - - idx_result = torch.index_select(x, dim=dim, index=idx) - idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx) - - self.assertEqual(idx_result, idx_result_cpu) - - helper(22, 0, []) def test_embedding_dense_backward(self): def helper(n, d, m, idx): @@ -5828,11 +5571,7 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str=" # for reduce in ["sum", "prod", "amax", "amin"]: for reduce_type in ["add", "multiply"]: -<<<<<<< HEAD helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type) -======= - helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type) helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type) helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type) @@ -5994,13 +5733,6 @@ def test_arange_empty(self): y_cpu = torch.arange(0, 0, 1, out=out_cpu) self.assertEqual(y_mps, y_cpu) - # Test rgange - def test_range(self): - self.assertEqual(np.arange(11, dtype=np.float32), torch.range(0, 10, device='mps')) - self.assertEqual(np.arange(7, 0, -1, dtype=np.float32), torch.range(7, 1, -1, device='mps')) - self.assertEqual(np.array([1.0000, 1.3000, 1.6000, 1.9000], dtype=np.float32), torch.range(1, 2, .3, device='mps')) - self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(0, 6.3, device='mps')) - # Test softmax def test_softmax(self): def 
helper(shape, dim, channels_last=False): @@ -6239,25 +5971,24 @@ def test_device_synchronize(self): torch.mps.synchronize() def test_mps_allocator_module(self): - # first garbage collect and empty the cached blocks + # limit memory allocations up to 1.5x of recommended maximum size from Metal API + torch.mps.set_per_process_memory_fraction(1.5) + + # just running some ops to allocate buffers + net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\ + .to(device='mps', dtype=torch.float) + + x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True) + x = net1(x) + print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, " + f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n") gc.collect() + # running this test alone will not release any buffers as they are in use. + # however, running along with other tests should release the cached allocations. torch.mps.empty_cache() - # measure memory allocations from MPSAllocator - current_alloc_before = torch.mps.current_allocated_memory() - # after garbage collection and emptying the cache the - # current_allocated_memory must be zero - self.assertTrue(current_alloc_before == 0) - # measure total memory allocations from Metal driver - driver_alloc_before = torch.mps.driver_allocated_memory() - # allocate a new 8 MB tensor to force allocation of a new Metal Heap - x = torch.ones(1024 * 1024 * 8, device="mps") - # get memory allocations after allocating tensor x - current_alloc_after = torch.mps.current_allocated_memory() - driver_alloc_after = torch.mps.driver_allocated_memory() - # current and driver memory allocations must have - # grown at this point - self.assertTrue(current_alloc_after > current_alloc_before) - self.assertTrue(driver_alloc_after > driver_alloc_before) + x.backward(torch.randn_like(x)) + print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, " + f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n") # Test random_.to and random_.from def test_random(self): @@ -6425,65 +6156,18 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True): helper(np.array([1, 1, 1, 1, 1]), (0 + 1 + 2 + 3 + 4) / 5, (6 - 2 * 2), 10000) helper(np.array([[1, 1, 1, 1, 1, 1, 1]]), 0, 0, 7, False) - def test_cumsum_dim_check(self): - x = torch.rand((3, 3), device="mps") - self.assertEqual(x.cumsum(1), x.cumsum(-1)) - self.assertEqual(x.cumsum(0), x.cumsum(-2)) - self.assertRaises(IndexError, lambda: x.cumsum(2)) - self.assertRaises(IndexError, lambda: x.cumsum(-3)) - - -class TestTopK(TestCase): - def _test_topk(self, shape, largest): - cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) - x = cpu_x.detach().clone().to('mps') - if isinstance(shape, tuple): - for curr_dim, dim_size in enumerate(shape): - for k in range(1, dim_size + 1): - topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest) - topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest) - self.assertEqual(topk_values, topk_values_cpu) - self.assertEqual(topk_indices, topk_indices_cpu) - else: - for k in range(1, shape): - topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest) - topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest) - self.assertEqual(topk_values, topk_values_cpu) - self.assertEqual(topk_indices, topk_indices_cpu) - - def test_topk(self): - largest_vals = [True, False] - shapes = [ - # Zero Element 
Tensors - 0, - (1, 0), - (0, 1), - (1, 0, 1), - # Multiple Element Tensors - 1, - 2, - (5, 1), - (1, 5), - (5, 9, 7, 4), - ] - - for shape in shapes: - for largest_val in largest_vals: - with self.subTest(shape=shape, largest_val=largest_val): - self._test_topk(shape, largest_val) - class TestNNMPS(NNTestCase): def _create_basic_net(self): class Layer(nn.Module): def __init__(self): - super().__init__() + super(Layer, self).__init__() self.layer_dummy_param = Parameter(torch.empty(3, 5)) self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7)) class Net(nn.Module): def __init__(self): - super().__init__() + super(Net, self).__init__() self.l1 = Layer() self.dummy_param = Parameter(torch.empty(3, 5)) self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1)) @@ -6571,27 +6255,24 @@ def test_zero_grad(self): self.assertIsNotNone(module.weight.grad) self.assertGreater(module.weight.grad.data.abs().sum(), 0) module.zero_grad() - self.assertIsNone(module.weight.grad) + self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) module.bias.requires_grad = True module.zero_grad() - self.assertIsNone(module.weight.grad) + self.assertIsNotNone(module.weight.grad) self.assertIsNone(module.bias.grad) module(i).sum().backward() self.assertIsNotNone(module.weight.grad) self.assertIsNotNone(module.bias.grad) self.assertGreater(module.weight.grad.data.abs().sum(), 0) self.assertGreater(module.bias.grad.data.abs().sum(), 0) - - # Force set to zeros. - module.zero_grad(set_to_none=False) + module.zero_grad() self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_()) - module.zero_grad() + # Force set to None. + module.zero_grad(set_to_none=True) self.assertIsNone(module.weight.grad) - self.assertIsNone(module.bias.grad) - def test_no_grad(self): for dtype in [torch.bfloat16, torch.float, torch.double]: @@ -6706,33 +6387,6 @@ def attention2(key, *, workaround=False, device): r2_cpu = r2.to("cpu") self.assertEqual(r1, r2_cpu) - def test_group_norm_backward(self, device='mps'): - # See https://github.com/pytorch/pytorch/issues/88331 for more detail - shape = [1, 4, 16, 16] - x = torch.full(shape, 7.0, device=device) - - target = torch.ones((1, 3, 128, 128), device=device) - - conv_in = nn.Conv2d(4, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device) - conv_out = nn.Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device) - norm = nn.GroupNorm(32, 128, eps=1e-6, affine=True, device=device) - - with torch.enable_grad(): - x = x.detach().requires_grad_() - out = 5.5 * x - out = conv_in(out) - out = out + norm(out) - out = out + norm(out) - out = out + norm(out) - out = F.interpolate(out, scale_factor=8.0, mode="nearest") - out = norm(out) - out = conv_out(out) - - loss = (out - target).norm(dim=-1).sum() - grad = -torch.autograd.grad(loss, x)[0] - self.assertFalse(grad.detach().isnan().any().item(), 'NaN gradients returned by autograd') - - # def test_conv2d_same_padding(self, device='mps'): # x = torch.rand(1, 1, 10, 11, device=device) # y = torch.rand(1, 1, 4, 5, device=device) @@ -7547,10 +7201,12 @@ def test_T(self, device="mps"): self.assertEqual(t2, t1) b = torch.randn(10, device=device) self.assertEqual(b, b.T) + scalar = torch.tensor(5, device=device) + self.assertEqual(scalar, scalar.T) def test_transposes(self, device="mps", dtype=torch.float32): for op in ("T", "H", "mT", "mH", "adjoint"): - shapes = ((2, 3), (2, 3, 4)) if op[0] == "m" or op 
== "adjoint" else ((2, 3),) + shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),) for shape in shapes: a = make_tensor(shape, device=device, dtype=dtype) t1 = getattr(a, op) @@ -8407,6 +8063,7 @@ def test_bool_indices(self, device="mps"): self.assertEqual(v[boolIndices], torch.tensor([True], dtype=torch.bool, device=device)) self.assertEqual(len(w), 2) + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_bool_indices_accumulate(self, device="mps"): mask = torch.zeros(size=(10, ), dtype=torch.uint8, device=device) mask = mask > 0 @@ -8597,6 +8254,7 @@ def helper(device, dtype): self.assertEqual(res.shape, src.shape) [helper(device="mps", dtype=dtype) for dtype in [torch.float, torch.int32]] + @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12") def test_index_src_datatype(self): def helper(device, dtype): orig_dtype = dtype @@ -9078,73 +8736,134 @@ def test_serialization_map_location(self): for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]: del MPS_DTYPES[MPS_DTYPES.index(t)] +abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()} +class UnitTestSample: + def __init__(self, dtype, args, params, out): + requires_grad = (dtype.is_floating_point or dtype.is_complex) + self.args_ = [t.detach().to('mps').requires_grad_(requires_grad) for t in args] + self.params_ = params + self.out_ = out + + def sample(self): + return self.args_ + self.params_ + + def expected(self): + return tuple(self.out_) + +CUDA_RESULT = dict() +OP_UNIT_TEST = dict() +dirname = os.path.dirname(__file__) +filename = os.path.join(dirname, "cuda_results.yaml") +with open(filename) as f: + data = yaml.safe_load(f) + for key, value in data['ConsistencyTest'].items(): + CUDA_RESULT[key] = torch.as_tensor(value) + for key, samples in data['UnitTest'].items(): + unit_tests = [] + for sample in samples: + dtype = abbrs_to_torch_dtype_dict[sample['dtype']] + args = [torch.as_tensor(arg).to(dtype) for arg in sample['args']] + params = sample['params'] + out = [torch.as_tensor(res).to(dtype) for res in sample['res']] + unit_tests.append(UnitTestSample(dtype, args, params, out)) + OP_UNIT_TEST[key] = unit_tests class TestConsistency(TestCaseMPS): + # TODO: This is only used while some ops are being added. 
# This list should contain all ops and dtypes eventually # This can be generated automatically in the `new_mps_allowlist.txt` file # by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU` # You most likely do NOT want to modify this manually ALLOWLIST_OP = { + 'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'], - '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'], - '__rmatmul__': ['f32'], + '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + '__rmatmul__': ['f32', 'i16', 'i32', 'i64', 'u8'], + '__rmod__': ['f16', 'f32'], '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'], - '__rpow__': ['f16'], + '__rpow__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + '__rsub__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.log_softmax': ['f32'], - 'masked.logaddexp': ['f32'], - 'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.norm': ['f16', 'f32'], - 'masked.normalize': ['f16', 'f32'], - 'masked.softmax': ['f32'], - 'masked.softmin': ['f32'], - 'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.var': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'acos': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'], + '_native_batch_norm_legit': ['f32'], + '_softmax_backward_data': ['f32'], + 'abs': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'acos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'acosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'addbmm': ['f32'], + 'addbmm': ['f32', 'i16', 'i32', 'i64', 'u8'], 'addcdiv': ['f32'], 'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'addmm': ['f32'], - 'addmv': ['f32'], - 'addr': ['f32'], + 'addmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'addmv': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'addr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'allclose': ['f16', 'f32'], + 'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'aminmax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'angle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'amix': ['f32'], - 'asin': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'atan': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'atan2': ['f32'], - 'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'argsort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'argwhere': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'as_strided': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'as_strided_scatter': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'asin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'asinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'atan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 
'atan2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'atanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'baddbmm': ['f32'], + 'baddbmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'bernoulli': ['f32'], + 'bfloat16': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bincount': ['i16', 'i32', 'i64', 'u8'], 'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'], 'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'], 'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'], 'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'bmm': ['f32'], + 'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bmm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'bool': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'broadcast_shapes': ['f32'], + 'broadcast_tensors': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'bucketize': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cartesian_prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'ceil': ['f32', 'int32', 'int64', 'f16'], + 'cdist': ['f32'], + 'cdouble': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ceil': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cfloat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'chalf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cholesky': ['f32'], + 'cholesky_inverse': ['f32'], + 'cholesky_solve': ['f32'], 'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'], 'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], @@ -9152,241 +8871,659 @@ class TestConsistency(TestCaseMPS): 'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'complex': ['f16', 'f32'], 'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'corrcoef': ['f32'], - 'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'cov': ['f32'], - 'cumsum': ['f16', 'f32', 'int16', 'int32'], + 'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'corrcoef': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cov': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cross': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cummax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cummin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'], 'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'diag': ['f32', 'i32'], - 'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'diagflat': ['f32', 'i32'], - 
'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], + 'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'diagonal_scatter': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'dist': ['f32'], + 'digamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dist': ['f16', 'f32'], + 'div': ['f16', 'f32', 'u8', 'b8', 'i16', 'i32', 'i64'], 'dot': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'einsum': ['f32'], + 'double': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'dstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'einsum': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'empty_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'erf': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'exp': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], + 'erf': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'erfc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'erfinv': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'exp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expand': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expand_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'expm1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.fftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.hfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ifftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.ihfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.irfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fft.rfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'flip': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fliplr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'flipud': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'floor': 
['f32', 'f16', 'i16', 'i32', 'i64'], - 'floor_divide': ['f32', 'f16'], - 'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'float_power': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'floor': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'floor_divide': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'fmod': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'frac': ['f16', 'f32'], + 'frexp': ['f16', 'f32'], + 'full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'full_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'gradient': ['f16', 'f32', 'i16'], + 'gcd': ['i16', 'i32', 'i64', 'u8'], 'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'geqrf': ['f32'], + 'gradient': ['f16', 'f32', 'i16', 'i32', 'i64'], + 'grid_sampler_2d': ['f32'], 'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'heaviside': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'histc': ['f32'], + 'histogram': ['f32'], + 'histogramdd': ['f32'], + 'hsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'hypot': ['f32'], + 'i0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'igamma': ['f16', 'f32'], + 'igammac': ['f16', 'f32'], + 'index_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_reduce': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'inner': ['f32', 'i16', 'i32', 'i64', 'u8'], 'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isin': ['f32', 'i16', 'i32', 'i64', 'u8'], 'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isneginf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'isposinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.matrix_norm': ['f16'], + 'kthvalue': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'lcm': ['i16', 'i32', 'i64', 'u8'], + 'ldexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lerp': ['f32'], + 'lgamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.cholesky': ['f32'], + 'linalg.cholesky_ex': ['f32'], + 'linalg.cond': ['f32'], + 'linalg.cross': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.det': ['f32'], + 'linalg.eig': ['f32'], + 'linalg.eigh': ['f32'], + 'linalg.eigvals': ['f32'], + 'linalg.eigvalsh': ['f32'], + 'linalg.householder_product': ['f32'], + 'linalg.inv': ['f32'], + 'linalg.inv_ex': ['f32'], + 'linalg.ldl_factor': ['f32'], + 'linalg.ldl_factor_ex': ['f32'], + 'linalg.ldl_solve': ['f32'], + 'linalg.lstsq': ['f32'], + 'linalg.lu': ['f32'], + 'linalg.lu_factor': ['f32'], + 'linalg.lu_factor_ex': ['f32'], + 'linalg.lu_solve': ['f32'], + 'linalg.matrix_norm': ['f16', 'f32'], + 'linalg.matrix_power': ['f32'], + 'linalg.matrix_rank': ['f32'], + 'linalg.multi_dot': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.norm': ['f16', 
'f32'], + 'linalg.pinv': ['f32'], + 'linalg.qr': ['f32'], + 'linalg.slogdet': ['f32'], + 'linalg.solve': ['f32'], + 'linalg.solve_ex': ['f32'], + 'linalg.solve_triangular': ['f32'], 'linalg.svd': ['f32'], + 'linalg.svdvals': ['f32'], + 'linalg.tensorinv': ['f32'], + 'linalg.tensorsolve': ['f32'], + 'linalg.vander': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'linalg.vecdot': ['f32'], 'linalg.vector_norm': ['f16', 'f32'], 'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'log': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log10': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log2': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'log_softmax': ['f32'], - 'logaddexp': ['f16', 'f32'], - 'logaddexp2': ['f16', 'f32'], + 'log': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log10': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log1p': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'log_softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'logaddexp': ['f32'], + 'logaddexp2': ['f32'], + 'logcumsumexp': ['f32'], + 'logdet': ['f32'], 'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'logit': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'logsumexp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'lu': ['f32'], + 'lu_solve': ['f32'], + 'lu_unpack': ['f32'], + 'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.amax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.amin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.log_softmax': ['f32'], + 'masked.logaddexp': ['f32'], + 'masked.logsumexp': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.median': ['f32'], + 'masked.norm': ['f16', 'f32'], + 'masked.normalize': ['f16', 'f32'], + 'masked.prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.softmax': ['f32'], + 'masked.softmin': ['f32'], + 'masked.std': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked.var': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'masked_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'matmul': ['f32'], - 'mm': ['f32'], - 'mv': ['f32'], + 'matmul': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'matrix_exp': ['f32'], + 'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'max_pool2d_with_indices_backward': ['f32'], + 'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mean': ['f16', 'f32'], + 'median': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'minimum': ['b8', 'f16', 'f32', 'i16', 
'i32', 'i64', 'u8'], + 'mm': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'mode': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'movedim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'msort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'multinomial': ['f32'], + 'mv': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'mvlgamma': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'nn.functional.adaptive_max_pool1d': ['f32'], - 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nanmean': ['f16', 'f32'], + 'nanmedian': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nanquantile': ['f32'], + 'nansum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'narrow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'narrow_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'native_batch_norm': ['f32'], + 'native_dropout_backward': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'native_layer_norm': ['f32'], + 'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'neg': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_empty_strided': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'new_full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'new_zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nextafter': ['f32'], + 'nn.functional._scaled_dot_product_attention': ['f32'], 'nn.functional.adaptive_avg_pool1d': ['f32'], 'nn.functional.adaptive_avg_pool2d': ['f32'], + 'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'], + 'nn.functional.adaptive_max_pool1d': ['f32'], + 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nn.functional.adaptive_max_pool3d': ['f32'], + 'nn.functional.alpha_dropout': ['f32'], 'nn.functional.avg_pool1d': ['f32', 'i64'], 'nn.functional.avg_pool2d': ['f32', 'i64'], + 'nn.functional.avg_pool3d': ['f32', 'i64'], + 'nn.functional.batch_norm': ['f32'], + 'nn.functional.bilinear': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.binary_cross_entropy': ['f32'], 'nn.functional.binary_cross_entropy_with_logits': ['f32'], 'nn.functional.celu': ['f32'], 'nn.functional.conv1d': ['f32'], 'nn.functional.conv2d': ['f32'], 'nn.functional.conv_transpose1d': ['f32'], - 'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.conv_transpose2d': ['f32'], + 'nn.functional.cosine_embedding_loss': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.cosine_similarity': ['f32'], + 'nn.functional.cross_entropy': ['f32'], + 'nn.functional.ctc_loss': ['f32'], + 'nn.functional.dropout': ['f32'], + 'nn.functional.dropout2d': ['f32'], + 'nn.functional.dropout3d': ['f32'], 'nn.functional.elu': ['f32'], - 'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.embedding': ['f16', 'f32'], + 'nn.functional.embedding_bag': ['f16', 'f32'], + 'nn.functional.feature_alpha_dropout': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.fractional_max_pool2d': ['f32'], + 'nn.functional.fractional_max_pool3d': ['f32'], 'nn.functional.gaussian_nll_loss': ['f32'], + 'nn.functional.gelu': ['f32'], 'nn.functional.glu': ['f32'], + 'nn.functional.grid_sample': ['f32'], 'nn.functional.group_norm': ['f32'], + 'nn.functional.hardshrink': ['f32'], + 'nn.functional.hardsigmoid': 
['f32'], + 'nn.functional.hardswish': ['f32'], 'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'], 'nn.functional.hinge_embedding_loss': ['f32'], 'nn.functional.huber_loss': ['f16', 'f32'], 'nn.functional.instance_norm': ['f32'], - 'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'], + 'nn.functional.interpolate': ['f32', 'u8'], + 'nn.functional.kl_div': ['f32'], 'nn.functional.l1_loss': ['f16', 'f32'], + 'nn.functional.layer_norm': ['f32'], 'nn.functional.leaky_relu': ['f32'], - 'nn.functional.linear': ['f32'], - 'nn.functional.local_response_norm': ['f32'], - 'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'], + 'nn.functional.linear': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.local_response_norm': ['f32', 'i64'], + 'nn.functional.logsigmoid': ['f32'], + 'nn.functional.margin_ranking_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.max_pool1d': ['f32'], 'nn.functional.max_pool2d': ['f32'], - 'max_pool2d_with_indices_backward': ['f32'], + 'nn.functional.max_pool3d': ['f32'], + 'nn.functional.max_unpool1d': ['f32'], + 'nn.functional.max_unpool2d': ['f32'], + 'nn.functional.max_unpool3d': ['f32'], + 'nn.functional.mish': ['f32'], 'nn.functional.mse_loss': ['f16', 'f32'], + 'nn.functional.multi_margin_loss': ['f32'], + 'nn.functional.multilabel_margin_loss': ['f32'], + 'nn.functional.multilabel_soft_margin_loss': ['f32'], 'nn.functional.nll_loss': ['f32'], - 'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.padreflect': ['f32'], - 'nn.functional.padreplicate': ['f32'], - 'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64'], - 'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'], + 'nn.functional.normalize': ['f32'], + 'nn.functional.one_hot': ['i64'], + 'nn.functional.pad': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pairwise_distance': ['f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pdist': ['f32'], + 'nn.functional.pixel_shuffle': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.pixel_unshuffle': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.poisson_nll_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'nn.functional.prelu': ['f32'], 'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.rrelu': ['f32'], 'nn.functional.selu': ['f32'], 'nn.functional.silu': ['f32'], 'nn.functional.smooth_l1_loss': ['f16', 'f32'], 'nn.functional.soft_margin_loss': ['f32'], - 'nn.functional.softmin': ['f32'], - 'nn.functional.softplus': ['f32'], - 'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'], - 'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'], + 'nn.functional.softmin': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.softshrink': ['f32'], + 'nn.functional.softsign': ['f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'i64', 'u8'], 'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'], - 'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'], - 'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64'], + 'nn.functional.triplet_margin_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.triplet_margin_with_distance_loss': ['f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'nn.functional.unfold': ['f16', 
'f32'], 'nn.functional.upsample_bilinear': ['f32'], - 'nn.functional.upsample_nearest': ['f32'], + 'nn.functional.upsample_nearest': ['f32', 'u8'], + 'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], 'norm': ['f32', 'f16'], + 'normal': ['f16', 'f32'], + 'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ones_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'ormqr': ['f32'], + 'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'pca_lowrank': ['f32'], + 'permute': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'pinverse': ['f32'], + 'polar': ['f32'], + 'polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'pow': ['f16'], + 'pow': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'qr': ['f32'], + 'quantile': ['f32'], 'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'rand_like': ['f16', 'f32'], + 'randint': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'randint_like': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'randn': ['f16', 'f32'], + 'randn_like': ['f16', 'f32'], + 'ravel': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'remainder' : ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'remainder': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'renorm': ['f16', 'f32'], 'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'], - 'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'], + 'repeat_interleave': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'reshape': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'resize_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'resize_as_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'round': ['f32', 'f16', 'i16', 'i32', 'i64'], - 'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'round': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'rsqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'rsub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'scalar_tensor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'select_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], + 'scatter_reduce': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'searchsorted': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'segment_reduce': ['f16', 'f32'], + 'select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'], - 'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'], - 'sin': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'slice_scatter': ['b8', 'u8', 
'f16', 'f32', 'i16', 'i32', 'i64'], - 'softmax': ['f32'], + 'sigmoid': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'signal.windows.bartlett': ['f16', 'f32'], + 'signal.windows.blackman': ['f16', 'f32'], + 'signal.windows.cosine': ['f16', 'f32'], + 'signal.windows.exponential': ['f16', 'f32'], + 'signal.windows.gaussian': ['f16', 'f32'], + 'signal.windows.general_cosine': ['f16', 'f32'], + 'signal.windows.general_hamming': ['f16', 'f32'], + 'signal.windows.hamming': ['f16', 'f32'], + 'signal.windows.hann': ['f16', 'f32'], + 'signal.windows.kaiser': ['f16', 'f32'], + 'signbit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sinc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'slice': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'], + 'sort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.airy_ai': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_j0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_j1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_y0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.bessel_y1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.chebyshev_polynomial_t': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.chebyshev_polynomial_u': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.entr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.erfcx': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.hermite_polynomial_h': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.hermite_polynomial_he': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.i0e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.i1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.i1e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.laguerre_polynomial_l': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.log_ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.modified_bessel_i0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_i1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_k0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.modified_bessel_k1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], 'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.ndtri': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.scaled_modified_bessel_k0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.scaled_modified_bessel_k1': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.spherical_bessel_j0': ['b8', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'special.xlog1py': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'special.zeta': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'], + 'split_with_sizes': ['b8', + 'f16', + 'f32', + 'i16', + 'i32', + 'i64', + 'u8'], + 'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 
'std': ['f16', 'f32'], + 'std_mean': ['f16', 'f32'], + 'stft': ['f32'], 'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'svd': ['f32'], + 'svd_lowrank': ['f32'], + 'symeig': ['f32'], 't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'tan': ['b8', 'i16', 'i32', 'u8'], - 'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'], - 'tensordot': ['f32'], + 'take': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'take_along_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'tan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'tanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'topk': ['f32', 'f16'], - 'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'], - 'sort': ['f32', 'i16', 'i32', 'i64'], - 'argsort': ['f32', 'i16', 'i32', 'i64'], + 'tensordot': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'to_sparse': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'topk': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'trace': ['f32', 'i16', 'i32', 'i64', 'u8'], + 'transpose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'trapezoid': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'cumulative_trapezoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'triangular_solve': ['f32'], 'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'tril_indices': ['i32', 'i64'], 'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'triu_indices': ['i32', 'i64'], 'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'trunc': ['f32'], + 'trunc': ['f32', 'i16', 'i32', 'i64', 'u8'], 'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'unfold': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'unfold_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'uniform': ['f16', 'f32'], + 'unique_consecutive': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'], 'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'var': ['f16', 'f32'], + 'var_mean': ['f16', 'f32'], + 'vdot': ['f32', 'i16', 'i32', 'i64', 'u8'], 'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'view_as_complex': ['f16', 'f32'], + 'view_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], 'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'], - 'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'std': ['f16', 'f32'], - 'var': ['f16', 'f32'], - 'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'mean': ['f16', 'f32'], - 'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 
'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'native_layer_norm': ['torch.float32'], - 'nn.functional.layer_norm': ['torch.float32'], - 'nn.functional.bilinear': ['f32'], - 'linalg.solve_triangular': ['f32'], - 'triangular_solve': ['f32'], - '_native_batch_norm_legit': ['f32'], - 'native_batch_norm': ['f32'], - 'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'linalg.inv': ['f32'], - 'linalg.inv_ex': ['f32'], - 'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], - 'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'], + 'nn.functional.softplus': ['f32'], } - ALLOWLIST_OP_GRAD = { + 'H': ['f16', 'f32'], + 'T': ['f16', 'f32'], + '__getitem__': ['f16', 'f32'], '__radd__': ['f16', 'f32'], '__rdiv__': ['f16', 'f32'], '__rmatmul__': ['f32'], + '__rmod__': ['f16', 'f32'], '__rmul__': ['f16', 'f32'], - 'masked.log_softmax': ['f32'], - 'masked.logaddexp': ['f32'], - 'masked.softmax': ['f32'], - 'masked.softmin': ['f32'], - 'masked.std': ['f32'], + '__rpow__': ['f32'], + '__rsub__': ['f16', 'f32'], + '_native_batch_norm_legit': ['f32'], + '_softmax_backward_data': ['f32'], 'abs': ['f16', 'f32'], 'acos': ['f32'], 'acosh': ['f32'], @@ -9398,168 +9535,524 @@ class TestConsistency(TestCaseMPS): 'addmv': ['f32'], 'addr': ['f32'], 'all': ['f16', 'f32'], + 'amax': ['f16', 'f32'], + 'amin': ['f16', 'f32'], + 'angle': ['f16', 'f32'], 'any': ['f16', 'f32'], 'arange': ['f16', 'f32'], 'argmax': ['f16', 'f32'], 'argmin': ['f16', 'f32'], + 'argsort': ['f16', 'f32'], + 'argwhere': ['f16', 'f32'], + 'as_strided': ['f16', 'f32'], + 'as_strided_scatter': ['f16', 'f32'], 'asin': ['f32'], 'asinh': ['f32'], 'atan': ['f32'], 'atan2': ['f32'], + 'atanh': ['f32'], 'atleast_1d': ['f16', 'f32'], 'atleast_2d': ['f16', 'f32'], 'atleast_3d': ['f16', 'f32'], 'baddbmm': ['f32'], + 'bernoulli': ['f32'], + 'bfloat16': ['f16', 'f32'], 'block_diag': ['f16', 'f32'], 'bmm': ['f32'], + 'bool': ['f16', 'f32'], 'broadcast_shapes': ['f32'], + 'broadcast_tensors': ['f16', 'f32'], + 'broadcast_to': ['f16', 'f32'], + 'bucketize': ['f16', 'f32'], + 'byte': ['f16', 'f32'], + 'cartesian_prod': ['f16', 'f32'], + 'cat': ['f16', 'f32'], + 'cdist': ['f32'], 'ceil': ['f32'], + 'char': ['f16', 'f32'], + 'cholesky': ['f32'], + 'cholesky_inverse': ['f32'], + 'cholesky_solve': ['f32'], 'chunk': ['f16', 'f32'], + 'clamp': ['f32'], + 'clamp_max': ['f16', 'f32'], + 'clamp_min': ['f16', 'f32'], 'clone': ['f16', 'f32'], 'column_stack': ['f16', 'f32'], + 'combinations': ['f16', 'f32'], 'conj': ['f16', 'f32'], 'conj_physical': ['f16', 'f32'], + 'constant_pad_nd': ['f16', 'f32'], 'contiguous': ['f16', 'f32'], + 'copysign': ['f16', 'f32'], 'corrcoef': ['f32'], 'cos': ['f32'], 'cosh': ['f32'], - 'cumsum': ['f16', 'f32'], + 'count_nonzero': ['f16', 'f32'], + 'cov': ['f32'], + 'cross': ['f32'], + 'cummax': ['f32'], + 'cummin': ['f32'], + 'cumprod': ['f32'], + 'cumsum': ['f32'], + 
'cumulative_trapezoid': ['f32'], 'deg2rad': ['f16', 'f32'], - 'diag': ['f32'], + 'diag': ['f16', 'f32'], 'diag_embed': ['f16', 'f32'], - 'diagflat': ['f32'], + 'diagflat': ['f16', 'f32'], + 'diagonal': ['f16', 'f32'], + 'diagonal_copy': ['f16', 'f32'], 'diagonal_scatter': ['f16', 'f32'], 'diff': ['f16', 'f32'], - 'dist': ['f32'], + 'digamma': ['f32'], + 'dist': ['f16', 'f32'], + 'div': ['f16', 'f32'], 'dot': ['f32'], + 'double': ['f16', 'f32'], + 'dsplit': ['f16', 'f32'], + 'dstack': ['f16', 'f32'], 'einsum': ['f32'], + 'empty_like': ['f16', 'f32'], + 'eq': ['f16', 'f32'], 'erf': ['f32'], + 'erfc': ['f32'], + 'erfinv': ['f32'], 'exp': ['f32'], 'exp2': ['f16', 'f32'], + 'expand': ['f16', 'f32'], + 'expand_as': ['f16', 'f32'], + 'expm1': ['f32'], + 'fft.fftshift': ['f16', 'f32'], + 'fft.hfft': ['f32'], + 'fft.hfft2': ['f32'], + 'fft.hfftn': ['f32'], + 'fft.ifftshift': ['f16', 'f32'], + 'fft.irfft': ['f32'], + 'fft.irfft2': ['f32'], + 'fft.irfftn': ['f32'], 'fill': ['f16', 'f32'], 'flatten': ['f16', 'f32'], 'flip': ['f16', 'f32'], 'fliplr': ['f16', 'f32'], 'flipud': ['f16', 'f32'], - 'float': ['f32'], + 'float': ['f16', 'f32'], + 'float_power': ['f16', 'f32'], 'floor': ['f32'], - 'gradient': ['f32'], - 'half': ['f16'], + 'fmax': ['f16', 'f32'], + 'fmin': ['f16', 'f32'], + 'fmod': ['f16', 'f32'], + 'frac': ['f16', 'f32'], + 'frexp': ['f16', 'f32'], + 'full': ['f16', 'f32'], + 'full_like': ['f16', 'f32'], + 'gather': ['f16', 'f32'], + 'ge': ['f16', 'f32'], + 'gradient': ['f16', 'f32'], + 'grid_sampler_2d': ['f32'], + 'gt': ['f16', 'f32'], + 'half': ['f16', 'f32'], + 'histc': ['f32'], + 'hsplit': ['f16', 'f32'], 'hstack': ['f16', 'f32'], - 'index_select': ['f16', 'f32'], + 'hypot': ['f32'], + 'i0': ['f32'], 'index_add': ['f16', 'f32'], + 'index_copy': ['f16', 'f32'], + 'index_fill': ['f16', 'f32'], + 'index_put': ['f16', 'f32'], + 'index_reduce': ['f16', 'f32'], + 'index_select': ['f16', 'f32'], + 'inner': ['f32'], + 'int': ['f16', 'f32'], 'isclose': ['f16', 'f32'], 'isfinite': ['f16', 'f32'], + 'isin': ['f32'], 'isinf': ['f16', 'f32'], 'isnan': ['f16', 'f32'], + 'isneginf': ['f16', 'f32'], + 'isposinf': ['f16', 'f32'], 'isreal': ['f16', 'f32'], - 'kron': ['f32'], - 'linalg.matrix_norm': ['f16'], + 'kron': ['f16', 'f32'], + 'kthvalue': ['f32'], + 'ldexp': ['f16', 'f32'], + 'le': ['f16', 'f32'], + 'lerp': ['f32'], + 'lgamma': ['f32'], + 'linalg.cholesky': ['f32'], + 'linalg.cholesky_ex': ['f32'], + 'linalg.cond': ['f32'], + 'linalg.cross': ['f32'], + 'linalg.det': ['f32'], + 'linalg.eigh': ['f32'], + 'linalg.eigvalsh': ['f32'], + 'linalg.householder_product': ['f32'], + 'linalg.inv': ['f32'], + 'linalg.inv_ex': ['f32'], + 'linalg.ldl_factor': ['f32'], + 'linalg.ldl_factor_ex': ['f32'], + 'linalg.lstsq': ['f32'], + 'linalg.lu': ['f32'], + 'linalg.lu_factor': ['f32'], + 'linalg.lu_factor_ex': ['f32'], + 'linalg.lu_solve': ['f32'], + 'linalg.matrix_norm': ['f16', 'f32'], + 'linalg.matrix_power': ['f32'], + 'linalg.matrix_rank': ['f32'], + 'linalg.multi_dot': ['f32'], + 'linalg.norm': ['f16', 'f32'], + 'linalg.pinv': ['f32'], + 'linalg.qr': ['f32'], + 'linalg.slogdet': ['f32'], + 'linalg.solve': ['f32'], + 'linalg.solve_ex': ['f32'], + 'linalg.solve_triangular': ['f32'], 'linalg.svd': ['f32'], + 'linalg.svdvals': ['f32'], + 'linalg.tensorinv': ['f32'], + 'linalg.tensorsolve': ['f32'], + 'linalg.vander': ['f32'], + 'linalg.vecdot': ['f32'], + 'linalg.vector_norm': ['f16', 'f32'], 'linspace': ['f16', 'f32'], 'log': ['f32'], 'log10': ['f32'], 'log1p': ['f32'], 'log2': ['f32'], - 'log_softmax': ['f32'], 
+ 'log_softmax': ['f32', 'f16'], 'logaddexp': ['f32'], + 'logaddexp2': ['f32'], + 'logcumsumexp': ['f32'], + 'logdet': ['f32'], + 'logical_and': ['f16', 'f32'], 'logical_not': ['f16', 'f32'], + 'logical_or': ['f16', 'f32'], + 'logical_xor': ['f16', 'f32'], + 'logit': ['f32'], 'logspace': ['f32'], + 'logsumexp': ['f32'], + 'long': ['f16', 'f32'], + 'lt': ['f16', 'f32'], + 'lu': ['f32'], + 'lu_solve': ['f32'], + 'lu_unpack': ['f32'], + 'mH': ['f16', 'f32'], + 'mT': ['f16', 'f32'], + 'masked.amax': ['f16', 'f32'], + 'masked.amin': ['f16', 'f32'], + 'masked.argmax': ['f16', 'f32'], + 'masked.argmin': ['f16', 'f32'], + 'masked.cumprod': ['f32'], + 'masked.cumsum': ['f32'], + 'masked.log_softmax': ['f32'], + 'masked.logaddexp': ['f32'], + 'masked.logsumexp': ['f32'], + 'masked.mean': ['f16', 'f32'], + 'masked.median': ['f32'], + 'masked.norm': ['f16', 'f32'], + 'masked.normalize': ['f16', 'f32'], + 'masked.prod': ['f32'], + 'masked.softmax': ['f32'], + 'masked.softmin': ['f32'], + 'masked.std': ['f32'], + 'masked.sum': ['f16', 'f32'], + 'masked.var': ['f16', 'f32'], + 'masked_fill': ['f16', 'f32'], + 'masked_scatter': ['f16', 'f32'], + 'masked_select': ['f16', 'f32'], 'matmul': ['f32'], + 'matrix_exp': ['f32'], + 'max': ['f16', 'f32'], + 'max_pool2d_with_indices_backward': ['f32'], + 'maximum': ['f16', 'f32'], + 'mean': ['f16', 'f32'], + 'median': ['f32'], + 'meshgrid': ['f16', 'f32'], + 'min': ['f16', 'f32'], + 'minimum': ['f16', 'f32'], 'mm': ['f32'], + 'mode': ['f16', 'f32'], + 'movedim': ['f16', 'f32'], + 'msort': ['f16', 'f32'], + 'mul': ['f16', 'f32'], + 'multinomial': ['f32'], 'mv': ['f32'], + 'mvlgamma': ['f32'], + 'nan_to_num': ['f16', 'f32'], + 'nanmean': ['f16', 'f32'], + 'nanmedian': ['f32'], + 'nanquantile': ['f32'], + 'nansum': ['f16', 'f32'], + 'narrow': ['f16', 'f32'], + 'native_batch_norm': ['f32'], + 'native_dropout_backward': ['f16', 'f32'], + 'native_layer_norm': ['f32'], + 'ne': ['f16', 'f32'], 'neg': ['f16', 'f32'], - 'nn.functional.adaptive_max_pool1d': ['f32'], - 'nn.functional.adaptive_max_pool2d': ['f32'], + 'new_empty': ['f16', 'f32'], + 'new_empty_strided': ['f16', 'f32'], + 'new_full': ['f16', 'f32'], + 'new_ones': ['f16', 'f32'], + 'new_zeros': ['f16', 'f32'], + 'nn.functional._scaled_dot_product_attention': ['f32'], 'nn.functional.adaptive_avg_pool1d': ['f32'], 'nn.functional.adaptive_avg_pool2d': ['f32'], + 'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'], + 'nn.functional.adaptive_max_pool1d': ['f32'], + 'nn.functional.adaptive_max_pool2d': ['f32'], + 'nn.functional.adaptive_max_pool3d': ['f32'], + 'nn.functional.alpha_dropout': ['f32'], 'nn.functional.avg_pool1d': ['f32'], 'nn.functional.avg_pool2d': ['f32'], + 'nn.functional.avg_pool3d': ['f32'], + 'nn.functional.batch_norm': ['f32'], + 'nn.functional.bilinear': ['f32'], 'nn.functional.binary_cross_entropy': ['f32'], + 'nn.functional.binary_cross_entropy_with_logits': ['f32'], 'nn.functional.celu': ['f32'], 'nn.functional.conv1d': ['f32'], 'nn.functional.conv2d': ['f32'], 'nn.functional.conv_transpose1d': ['f32'], + 'nn.functional.conv_transpose2d': ['f32'], + 'nn.functional.conv_transpose3d': ['f32'], 'nn.functional.cosine_embedding_loss': ['f32'], + 'nn.functional.cosine_similarity': ['f32'], + 'nn.functional.cross_entropy': ['f32'], + 'nn.functional.ctc_loss': ['f32'], + 'nn.functional.dropout': ['f32'], + 'nn.functional.dropout2d': ['f32'], + 'nn.functional.dropout3d': ['f32'], 'nn.functional.elu': ['f32'], - 'nn.functional.feature_alpha_dropout': ['f16', 'f32'], + 'nn.functional.embedding': ['f16', 
'f32'], + 'nn.functional.embedding_bag': ['f16', 'f32'], + 'nn.functional.feature_alpha_dropout': ['f32', 'f16'], + 'nn.functional.fractional_max_pool2d': ['f32'], + 'nn.functional.fractional_max_pool3d': ['f32'], + 'nn.functional.gaussian_nll_loss': ['f32'], + 'nn.functional.gelu': ['f32'], 'nn.functional.glu': ['f32'], + 'nn.functional.grid_sample': ['f32'], + 'nn.functional.group_norm': ['f32'], + 'nn.functional.hardshrink': ['f32'], + 'nn.functional.hardsigmoid': ['f32'], + 'nn.functional.hardswish': ['f32'], 'nn.functional.hardtanh': ['f32'], 'nn.functional.hinge_embedding_loss': ['f32'], 'nn.functional.huber_loss': ['f16', 'f32'], 'nn.functional.instance_norm': ['f32'], + 'nn.functional.interpolate': ['f32'], 'nn.functional.kl_div': ['f32'], 'nn.functional.l1_loss': ['f16', 'f32'], + 'nn.functional.layer_norm': ['f32'], 'nn.functional.leaky_relu': ['f32'], + 'nn.functional.linear': ['f32'], 'nn.functional.local_response_norm': ['f32'], + 'nn.functional.logsigmoid': ['f32'], 'nn.functional.margin_ranking_loss': ['f32'], 'nn.functional.max_pool1d': ['f32'], 'nn.functional.max_pool2d': ['f32'], + 'nn.functional.max_pool3d': ['f32'], + 'nn.functional.max_unpool1d': ['f32'], + 'nn.functional.max_unpool2d': ['f32'], + 'nn.functional.max_unpool3d': ['f32'], + 'nn.functional.mish': ['f32'], 'nn.functional.mse_loss': ['f32'], + 'nn.functional.multi_margin_loss': ['f32'], + 'nn.functional.multilabel_margin_loss': ['f32'], + 'nn.functional.multilabel_soft_margin_loss': ['f32'], 'nn.functional.nll_loss': ['f32'], - 'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'], + 'nn.functional.normalize': ['f32'], + 'nn.functional.pad': ['f16', 'f32'], 'nn.functional.pairwise_distance': ['f16', 'f32'], + 'nn.functional.pdist': ['f32'], + 'nn.functional.pixel_shuffle': ['f16', 'f32'], + 'nn.functional.pixel_unshuffle': ['f16', 'f32'], 'nn.functional.poisson_nll_loss': ['f32'], + 'nn.functional.prelu': ['f32'], 'nn.functional.relu': ['f32'], 'nn.functional.relu6': ['f32'], + 'nn.functional.rrelu': ['f32'], 'nn.functional.selu': ['f32'], 'nn.functional.silu': ['f32'], + 'nn.functional.smooth_l1_loss': ['f32'], 'nn.functional.soft_margin_loss': ['f32'], - 'nn.functional.softmin': ['f32'], + 'nn.functional.softmin': ['f32', 'f16'], 'nn.functional.softplus': ['f32'], + 'nn.functional.softshrink': ['f32'], 'nn.functional.softsign': ['f16', 'f32'], - 'nn.functional.smooth_l1_loss': ['f32'], + 'nn.functional.tanhshrink': ['f32'], 'nn.functional.threshold': ['f32'], 'nn.functional.triplet_margin_loss': ['f32'], 'nn.functional.triplet_margin_with_distance_loss': ['f32'], + 'nn.functional.unfold': ['f16', 'f32'], 'nn.functional.upsample_bilinear': ['f32'], - 'norm': ['f32', 'f16'], + 'nn.functional.upsample_nearest': ['f32'], + 'nonzero': ['f16', 'f32'], + 'norm': ['f16', 'f32'], + 'normal': ['f16', 'f32'], + 'ones': ['f16', 'f32'], + 'ones_like': ['f16', 'f32'], + 'ormqr': ['f32'], + 'outer': ['f16', 'f32'], + 'pca_lowrank': ['f32'], + 'permute': ['f16', 'f32'], + 'pinverse': ['f32'], + 'polygamma': ['f32'], 'positive': ['f16', 'f32'], + 'pow': ['f32'], + 'prod': ['f32'], + 'put': ['f16', 'f32'], + 'qr': ['f32'], + 'quantile': ['f32'], 'rad2deg': ['f16', 'f32'], + 'rand_like': ['f16', 'f32'], + 'randint': ['f16', 'f32'], + 'randint_like': ['f16', 'f32'], + 'randn_like': ['f16', 'f32'], + 'ravel': ['f16', 'f32'], 'real': ['f16', 'f32'], 'reciprocal': ['f16', 'f32'], + 'remainder': ['f16', 'f32'], + 'renorm': ['f16', 'f32'], 'repeat': ['f16', 'f32'], 'repeat_interleave': ['f16', 'f32'], + 'reshape': ['f16', 
'f32'], + 'reshape_as': ['f16', 'f32'], 'resolve_conj': ['f16', 'f32'], 'resolve_neg': ['f16', 'f32'], + 'roll': ['f16', 'f32'], + 'rot90': ['f16', 'f32'], 'round': ['f32'], 'rsqrt': ['f32'], + 'rsub': ['f16', 'f32'], + 'scatter': ['f16', 'f32'], + 'scatter_add': ['f16', 'f32'], + 'scatter_reduce': ['f16', 'f32'], + 'searchsorted': ['f16', 'f32'], + 'segment_reduce': ['f16', 'f32'], + 'select': ['f16', 'f32'], 'select_scatter': ['f16', 'f32'], + 'sgn': ['f16', 'f32'], + 'short': ['f16', 'f32'], + 'sigmoid': ['f32'], 'sign': ['f16', 'f32'], + 'signbit': ['f16', 'f32'], 'sin': ['f32'], + 'sinc': ['f32'], 'sinh': ['f32'], + 'slice': ['f16', 'f32'], 'slice_scatter': ['f16', 'f32'], - 'softmax': ['f32'], + 'softmax': ['f32', 'f16'], + 'sort': ['f16', 'f32'], + 'special.airy_ai': ['f32'], + 'special.bessel_j0': ['f32'], + 'special.bessel_j1': ['f32'], + 'special.bessel_y0': ['f32'], + 'special.bessel_y1': ['f32'], + 'special.chebyshev_polynomial_t': ['f32'], + 'special.chebyshev_polynomial_u': ['f32'], + 'special.entr': ['f32'], + 'special.erfcx': ['f32'], + 'special.hermite_polynomial_h': ['f32'], + 'special.hermite_polynomial_he': ['f32'], + 'special.i0e': ['f32'], + 'special.i1': ['f32'], + 'special.i1e': ['f32'], + 'special.laguerre_polynomial_l': ['f32'], + 'special.log_ndtr': ['f32'], + 'special.modified_bessel_i0': ['f32'], + 'special.modified_bessel_i1': ['f32'], + 'special.modified_bessel_k0': ['f32'], + 'special.modified_bessel_k1': ['f32'], + 'special.ndtr': ['f32'], + 'special.ndtri': ['f32'], + 'special.polygamma': ['f32'], + 'special.scaled_modified_bessel_k0': ['f32'], + 'special.scaled_modified_bessel_k1': ['f32'], + 'special.spherical_bessel_j0': ['f32'], + 'special.xlog1py': ['f16', 'f32'], 'split': ['f16', 'f32'], + 'split_with_sizes': ['f16', 'f32'], 'sqrt': ['f32'], 'square': ['f16', 'f32'], 'squeeze': ['f16', 'f32'], 'stack': ['f16', 'f32'], - 'sub': ['f32'], + 'std': ['f16', 'f32'], + 'std_mean': ['f16', 'f32'], + 'sub': ['f16', 'f32'], + 'sum': ['f16', 'f32'], 'sum_to_size': ['f16', 'f32'], 'svd': ['f32'], + 'svd_lowrank': ['f32'], + 'symeig': ['f32'], 't': ['f16', 'f32'], + 'take': ['f16', 'f32'], + 'take_along_dim': ['f16', 'f32'], + 'tan': ['f32'], 'tanh': ['f32'], + 'tensor_split': ['f16', 'f32'], 'tensordot': ['f32'], 'tile': ['f16', 'f32'], + 'to': ['f16', 'f32'], + 'topk': ['f32'], + 'trace': ['f32'], + 'transpose': ['f16', 'f32'], + 'trapezoid': ['f16', 'f32'], + 'trapz': ['f16', 'f32'], + 'triangular_solve': ['f32'], 'tril': ['f16', 'f32'], 'triu': ['f16', 'f32'], 'true_divide': ['f16', 'f32'], 'trunc': ['f32'], 'unbind': ['f16', 'f32'], 'unflatten': ['f16', 'f32'], + 'unfold': ['f16', 'f32'], + 'unfold_copy': ['f16', 'f32'], + 'uniform': ['f16', 'f32'], 'unsqueeze': ['f16', 'f32'], + 'var': ['f16', 'f32'], + 'var_mean': ['f16', 'f32'], + 'vdot': ['f32'], 'view': ['f16', 'f32'], 'view_as': ['f16', 'f32'], + 'view_copy': ['f16', 'f32'], 'vsplit': ['f16', 'f32'], 'vstack': ['f16', 'f32'], + 'where': ['f16', 'f32'], + 'xlogy': ['f16', 'f32'], 'zero_': ['f16', 'f32'], - 'linalg.solve_triangular': ['f32'], - 'triangular_solve': ['f32'], - '_native_batch_norm_legit': ['f32'], - 'native_batch_norm': ['f32'], - 'native_layer_norm': ['f32'], - 'nn.functional.gelu': ['f32'], + 'zeros': ['f16', 'f32'], + 'zeros_like': ['f16', 'f32'], + } + + BLOCKLIST_OP_GRAD = { + # Unimplemented ops + '__getitem__': ['f16'], + 'combinations': ['f16', 'f32'], + 'logaddexp2': ['f32'], + 'masked_select': ['f16', 'f32'], + 'nn.functional.binary_cross_entropy_with_logits': ['f16', 'f32'], + 
'nn.functional.group_norm': ['f32'], + 'prod': ['f32'], + 'sgn': ['f16', 'f32'], + 'unfold_copy': ['f16', 'f32'], + 'unfold': ['f16', 'f32'], + 'trace': ['f32'], + + # Correctness issues + 'nn.functional.prelu': ['f32'], + 'atanh': ['f32'], + 'div': ['f16'], + 'nn.functional.bilinear': ['f32'], + 'nn.functional.embedding': ['f16'], + + # Unsupported dtype + 'special.ndtr': ['f32'], + 'trapezoid': ['f16', 'f32'], + 'trapz': ['f16', 'f32'], + } + + BLOCKLIST_OP_GRAD_MACOS_12 = { + 'remainder': ['f16'], } # These ops are problematic, so never run them even when @@ -9567,145 +10060,22 @@ class TestConsistency(TestCaseMPS): # If the dtype list is None, all dtypes are excluded. # All the entries in this list should be removed BLOCKLIST = { -<<<<<<< HEAD - # Functions that hang - 'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool], - # + forward when requires_grad=True or running backward - 'masked.mean': [torch.bool, torch.float16], - 'masked.prod': [torch.bool], - 'masked.sum': [torch.bool], - # Functions that hard crash - 'std': [torch.float16], - 'stft': [torch.float32], 'var': [torch.float16], - # + forward when requires_grad=True or running backward - 'nn.functional.embedding': [torch.float32, torch.float16], - '__rpow__': [torch.int64], - - 'as_strided_scatter': [torch.uint8], - 'atan2': [torch.int64], - 'bfloat16': None, - 'block_diag': [torch.uint8], - 'byte': None, - 'chalf': None, - 'diag_embed': [torch.uint8], - 'diagonal_scatter': [torch.uint8], - 'long': None, - 'nn.functional.conv1d': [torch.int64], - 'nn.functional.conv2d': [torch.int64], - 'nn.functional.conv_transpose1d': [torch.int64], - 'nn.functional.conv_transpose2d': [torch.int64], - 'nn.functional.conv_transpose3d': [torch.int64, torch.float32], - 'nn.functional.local_response_norm': [torch.int64], - 'nn.functional.padcircular': [torch.uint8], - 'pow': [torch.int64], - 'select_scatter': [torch.uint8], - 'sigmoid': [torch.int64], - - - # failures due to lack of op implementation on MPS backend - 'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - - # These were moved from ALLOWLIST to BLOCK as they are not working - # locally - 'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - '__radd__': ['torch.bool', 'torch.uint8'], - '__rmul__': ['torch.uint8'], - 'neg': ['torch.uint8'], - 'add': ['torch.bool', 'torch.uint8'], - 'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], - 'diag': ['torch.int64'], - 'diagflat': ['torch.int64'], - - # Functions that are flaky - # These are detected as "ok" by the expect case but actually fail to run sometimes - 'as_strided': None, - 'broadcast_tensors': None, - 'broadcast': None, - 'broadcast_to': None, - 'diagonal': None, - 'divfloor_rounding': None, - 'divno_rounding_mode': None, - 'divtrunc_rounding': None, - 'dsplit': None, - 'hsplit': None, - 'empty': None, - 'expand_as': None, - 'expand': None, - 'ge': None, - 'ne': None, - 'le': None, - 'lt': None, - 'gt': None, - 'transpose': None, - 'splitlist_args': None, - 'select': None, - 'reshape': None, - 'reshape_as': None, - 'permute': None, - 'norm': None, - 'nn.functional.pixel_unshuffle': None, - 'nn.functional.pixel_shuffle': None, - 'nn.functional.cross_entropy': None, - 'nn.functional.one_hot': None, - 'narrow': None, - 'movedim': None, - 'minreduction_with_dim': None, - 'minreduction_no_dim': None, - 'minbinary': None, - 'meshgridvariadic_tensors': None, -
'meshgridlist_of_tensors': None, - 'maxreduction_with_dim': None, - 'maxreduction_no_dim': None, - 'maxbinary': None, - 'maximum': None, - 'minimum': None, - 'outer': None, - 'softmaxwith_dtype': None, - 'rounddecimals_neg_3': None, - 'rounddecimals_3': None, - 'rounddecimals_0': None, - 'normnuc': None, - 'nn.functional.softminwith_dtype': None, - 'nn.functional.feature_alpha_dropoutwith_train': None, - 'log_softmaxwith_dtype': None, - 'split_with_sizes': None, - 'trapezoid': None, - 'eq': None, - 'mul': None, - 'cartesian_prod': None, - 'bool': None, - 'inner': None, - 'dstack': None, - 'take_along_dim': None, -======= - # Functions that hard crash - 'nn.functional.softplus': [torch.float32], - 'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16], - 'sgn': [torch.bool], - 'linalg.inv': [torch.float32], - 'linalg.inv_ex': [torch.float32], 'linalg.matrix_power': [torch.float32], - 'nn.functional.interpolate': [torch.float32], 'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], - 'nn.functional.interpolatearea': [torch.float32], 'resize_as_': [torch.float16, torch.float32], 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8], # Functions with correctness issues - 'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], - 'divfloor_rounding': [torch.int16, torch.int32, torch.int64], - 'divtrunc_rounding': [torch.float16], - 'norm': [torch.float16], 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], - 'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], - 'addr': [torch.float16], - 'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'trace': [torch.int64], 'normalnumber_mean': [torch.float16, torch.float32], 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'multinomial': [torch.float32], - 'floor_divide': [torch.int16, torch.int32, torch.int64], + + # cpu result off, showing random values + 'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + # cpu result off, showing inf values 'dist': [torch.float16], # failure due to issue: atan2() may generate NAN in output with @@ -9715,12 +10085,12 @@ class TestConsistency(TestCaseMPS): 'grid_sampler_2d': [torch.float32], 'nn.functional.grid_sample': [torch.float32], - # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor() - # when both ceilMode and includeZeroPadToAverage are True - 'nn.functional.avg_pool1d': [torch.float32, torch.int64], - 'nn.functional.avg_pool2d': [torch.float32, torch.int64], - 'nn.functional.adaptive_avg_pool1d': [torch.float32], - 'nn.functional.adaptive_avg_pool2d': [torch.float32], + # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results + 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], + '__rpow__': [torch.uint8], + + # failures before macOS 13.3 + 'nn.functional.conv_transpose2d': [torch.float32], } UNIMPLEMENTED_OPS = { @@ -9838,6 +10208,7 @@ class TestConsistency(TestCaseMPS): 'nn.functional.fractional_max_pool3d': [torch.float32], 'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32], 'nn.functional.adaptive_max_pool3d': [torch.float32], + 'nn.functional.interpolatearea': [torch.float32], 
'nn.functional.interpolatebicubic': [torch.float32], 'nn.functional.interpolatelinear': [torch.float32], 'nn.functional.interpolatetrilinear': [torch.float32], @@ -9847,7 +10218,6 @@ class TestConsistency(TestCaseMPS): 'nn.functional.avg_pool3d': [torch.float32, torch.int64], 'nn.functional.ctc_loss': [torch.float32], 'nn.functional.embedding_bag': [torch.float16, torch.float32], - 'nn.functional.max_pool2d': [torch.float32], 'nn.functional.hardshrink': [torch.float32], 'nn.functional.hardsigmoid': [torch.float32], 'nn.functional.logsigmoid': [torch.float32], @@ -9876,7 +10246,6 @@ class TestConsistency(TestCaseMPS): 'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'qr': [torch.float32], 'quantile': [torch.float32], - 'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8], 'renorm': [torch.float16, torch.float32], 'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], @@ -9950,6 +10319,7 @@ class TestConsistency(TestCaseMPS): 'symeig': [torch.float32], 'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'var_mean': [torch.float16, torch.float32], 'var_meanunbiased': [torch.float16, torch.float32], 'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], @@ -9977,6 +10347,7 @@ class TestConsistency(TestCaseMPS): 'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8], 'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], 'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8], + 'addr': [torch.int16, torch.int32, torch.int64, torch.uint8], 'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8], 'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8], 'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8], @@ -10037,10 +10408,6 @@ class TestConsistency(TestCaseMPS): 'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8], 'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8], - - # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results - 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], - '__rpow__': [torch.int16, torch.int32], } UNDEFINED_BEHAVIOUR = { @@ -10064,15 +10431,19 @@ class TestConsistency(TestCaseMPS): 'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], # duplicate indices are used in the testcase - undefined behaviour 'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) + # problem 104760543, zero to negative integer powers are undefined + '__rpow__': [torch.int16, torch.int32, torch.int64], } - # Those ops worked on MacOS12, but broken on MacOS13 - VENTURA_BLOCKLIST = { - 'masked.softmax': [torch.float32], + FAST_MATH_PRECISION_ISSUES = { + # Failures due to precision issues + 'tan': [torch.float32], + 'pow': [torch.float32], 'masked.softmin': [torch.float32], + 
'masked.softmax': [torch.float32], 'masked.log_softmax': [torch.float32], - 'dot': [torch.int64], + 'cdist': [torch.float32], + '__rpow__': [torch.float32] } FP16_LOW_PRECISION_LIST = { @@ -10082,28 +10453,63 @@ class TestConsistency(TestCaseMPS): 'true_divide', 'kron', 'gradient', 'var', 'std', 'linalg.vector_norm', - 'masked.sum', 'masked.std', - 'masked.var', + 'addr', + + # for macOS 12 + 'masked.normalize', 'masked.sum', + 'outer', + 'sum_to_size', } -<<<<<<< HEAD -======= - dirname = os.path.dirname(__file__) - filename = os.path.join(dirname, "cuda_results.yaml") - with open(filename) as f: - data = yaml.safe_load(f) - CUDA_RESULT = dict() - for key, value in data.items(): - CUDA_RESULT[key] = torch.as_tensor(value) + BLOCKLIST_MACOS_12 = { + '__rdiv__': [torch.float16], + 'masked.var': [torch.float16], + 'sum': [torch.float16], + 'mul': [torch.float16], + + # expected failures + 'nn.functional.interpolatenearest': [torch.float32], + 'nn.functional.upsample_nearest': [torch.float32], + 'nn.functional.conv_transpose2d': [torch.float32] + } + + ALLOWLIST_MACOS_13_3 = { + 'pow': [torch.int16, torch.int32, torch.int64, torch.uint8], + '__rpow__': [torch.uint8], + 'nn.functional.conv_transpose2d': [torch.float32], + } MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), ( FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS)) ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) # Used for accept mode only NEW_ALLOW_LIST = defaultdict(list) NEW_ALLOW_LIST_GRAD = defaultdict(list) + def get_error_message(self, key, op_name, dtype): + if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]: + return f"Running test with {op_name} fails due to precision issues (fast math) so skipping" + elif key in self.BLOCKLIST and dtype in self.BLOCKLIST[key]: + return f"Running test with {op_name} fails so skipping" + elif key in self.UNDEFINED_BEHAVIOUR and dtype in self.UNDEFINED_BEHAVIOUR[key]: + return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping" + elif key in self.EXPECTED_FAILURES and dtype in self.EXPECTED_FAILURES[key]: + return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping" + elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]: + return f"Running test with {op_name} expected to fail due to missing op implementation" + elif product_version < 13.0 and key in self.BLOCKLIST_MACOS_12 and dtype in self.BLOCKLIST_MACOS_12[key]: + return f"Running test with {op_name} expected to fail on macOS 12" + return None + + def compare_with_CUDA(self, op, mps_out, atol, rtol): + cuda_out = CUDA_RESULT[op.name] + try: + self.assertEqual(cuda_out, mps_out, atol=atol, rtol=rtol) + except Exception as e: + return False + else: + return True + @ops(op_db, allowed_dtypes=MPS_DTYPES) def test_output_match(self, device, dtype, op): self.assertEqual(device, "cpu") @@ -10111,13 +10517,15 @@ def test_output_match(self, device, dtype, op): self.skipTest("MPS is not available") key = op.name + op.variant_test_name - - if key in self.VENTURA_BLOCKLIST and torch.backends.mps.is_macos13_or_newer(): - if dtype in self.VENTURA_BLOCKLIST[key]: - self.skipTest(f"{key}_{dtype} fails on Ventura, see https://github.com/pytorch/pytorch/issues/85758") - if key in self.BLOCKLIST: - if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]: - self.skipTest(f"Running test with {op.name} hangs so skipping") + if key in self.MPS_SKIP_LIST: + msg = 
self.get_error_message(key, op.name, dtype) + if msg is not None and not (product_version >= 13.3 and + key in self.ALLOWLIST_MACOS_13_3 and dtype in self.ALLOWLIST_MACOS_13_3[key]): + self.skipTest(msg) + if product_version < 13.0 and key in self.BLOCKLIST_MACOS_12: + msg = self.get_error_message(key, op.name, dtype) + if msg is not None: + self.skipTest(msg) # Make this an expecttest manually # When this env variable is set, generate a new ALLOWLIST_OP @@ -10135,7 +10543,10 @@ def test_output_match(self, device, dtype, op): if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]: self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded") - if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]: + if (op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name] or + (op.name in self.BLOCKLIST_OP_GRAD and dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD[op.name]) or + (product_version < 13.0 and op.name in self.BLOCKLIST_OP_GRAD_MACOS_12 and + dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD_MACOS_12[op.name])): run_grad_test = False def get_samples(): @@ -10167,7 +10578,7 @@ def get_samples(): cpu_out = op(*cpu_args, **cpu_kwargs) mps_out = op(*mps_args, **mps_kwargs) - if op.name == "nn.functional.conv2d" and dtype == torch.float32: + if (op.name == "nn.functional.conv2d" or op.name == "linalg.multi_dot") and dtype == torch.float32: atol = 1e-4 rtol = 3e-5 elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16: @@ -10179,6 +10590,11 @@ def get_samples(): elif (op.name == "native_layer_norm"): atol = 1e-4 rtol = 1.3e-5 + elif op.name == "norm" and dtype == torch.float16: + atol = 7e-4 + rtol = 1.5e-3 + elif op.name == "unique" and cpu_kwargs["sorted"] is False: + continue else: atol = None rtol = None @@ -10186,16 +10602,11 @@ def get_samples(): self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol) except Exception as e: -<<<<<<< HEAD if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]): self.skipTest(f"Expected Runtime Error: {str(e)}") -======= - if any(s in str(e).lower() for s in ["int64", "macos 13"]): - self.skipTest(f"{str(e)}") - if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol): + if op.name in CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol): continue ->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252)) if not generate_new_truth: raise e @@ -10274,6 +10685,12 @@ def req_grad(t): # Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS @skipIfSlowGradcheckEnv class TestCommon(TestCase): + + UNIMPLEMENTED_OPS = { + 'aminmax': [torch.float32], + 'roll': [torch.float32], + } + exact_dtype = True # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI @@ -10304,6 +10721,10 @@ def tearDownClass(cls): # MPS only supports float32 @ops(_ref_test_ops, allowed_dtypes=(torch.float32,)) def test_numpy_ref_mps(self, device, dtype, op): + key = op.name + op.variant_test_name + if key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]: + self.skipTest(f"Running test with {op.name} expected to fail due to missing op implementation") + # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS # does not support float64 Tensors. # A few ops are currently broken on their reference inputs, but not their sample inputs.
These should From 66951a045edc5cf11dd670759a9bfe041c7dc3f1 Mon Sep 17 00:00:00 2001 From: Ramin Azarmehr Date: Tue, 14 Feb 2023 18:26:29 -0500 Subject: [PATCH 05/30] Remove torch._six from test_mps (#326) --- test/test_mps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_mps.py b/test/test_mps.py index 2085d0cebe721a..b7907e7ed19905 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -19,7 +19,7 @@ import yaml import platform from collections import defaultdict -from torch._six import inf +from torch import inf from torch.nn import Parameter from torch.testing._internal import opinfo from torch.testing._internal.common_utils import \ From 5ada241bcea2c5d732596e142cd902009e727ec4 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 14 Feb 2023 15:27:02 -0800 Subject: [PATCH 06/30] Remove unnecessary CI files (#327) * Remove unnecessary CI files * Additional files * Update lint --- .github/auto_request_review.yml | 29 -- .github/workflows/auto_request_review.yml | 22 -- .github/workflows/lint.yml | 237 -------------- .github/workflows/pull.yml | 368 ---------------------- .github/workflows/run_torchbench.yml | 103 ------ 5 files changed, 759 deletions(-) delete mode 100644 .github/auto_request_review.yml delete mode 100644 .github/workflows/auto_request_review.yml delete mode 100644 .github/workflows/pull.yml delete mode 100644 .github/workflows/run_torchbench.yml diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml deleted file mode 100644 index 765fd1715e8919..00000000000000 --- a/.github/auto_request_review.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Documented at https://github.com/necojackarc/auto-request-review -reviewers: - groups: - symbolic-shapes: - - ezyang - - Chillee - - albanD - - miladm - - bdhirsh - - voznesenskym - - jbschlosser - - per_author: - symbolic-shapes: - - symbolic-shapes - - antoniojkim - - wconstab - - SherlockNoMad - -files: - # none yet, TODO: migrate CODEOWNERS here - -options: - ignore_draft: true - ignored_keywords: - - DO NOT REVIEW - # Just manually setup a self-referential per_author rule if you - # want group assignment - enable_group_assignment: false diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml deleted file mode 100644 index 7c98c2990fba76..00000000000000 --- a/.github/workflows/auto_request_review.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Auto Request Review - -on: - pull_request: - types: [opened, ready_for_review, reopened] - -jobs: - auto-request-review: - # Don't run on forked repos - if: ${{ !github.event.pull_request.head.repo.fork }} - name: Auto Request Review - runs-on: ubuntu-latest - steps: - - name: Request review based on files changes and/or groups the author belongs to - # v0.7.0 - uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 98a941d48b8385..0b846bc5a90fa4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,244 +10,7 @@ on: # The names of steps that actually test the code should be suffixed with `(nonretryable)`. # When any other step fails, its job will be retried once by retryBot. 
jobs: - docker-image: - name: docker-image - uses: ./.github/workflows/_calculate-docker-image.yml - with: - docker-image-name: pytorch-linux-focal-linter - lintrunner: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - CACHE_DIRECTORY="/tmp/.lintbin" - # Try to recover the cached binaries - if [[ -d "${CACHE_DIRECTORY}" ]]; then - # It's ok to fail this as lintrunner init would download these binaries - # again if they do not exist - cp -r "${CACHE_DIRECTORY}" . || true - fi - - # This has already been cached in the docker image - lintrunner init 2> /dev/null - - # Do build steps necessary for linters - python3 -m tools.linter.clang_tidy.generate_build_files - python3 -m tools.generate_torch_version --is_debug=false - python3 -m tools.pyi.gen_pyi \ - --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --tags-path aten/src/ATen/native/tags.yaml \ - --deprecated-functions-path "tools/autograd/deprecated.yaml" - - RC=0 - # Run lintrunner on all files - if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then - echo "" - echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" - echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" - RC=1 - fi - - # Use jq to massage the JSON lint output into GitHub Actions workflow commands. - jq --raw-output \ - '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ - lint.json || true - - exit $RC - - quick-checks: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Ensure no non-breaking spaces - # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 - # does not support the '\u000a' syntax (which is relevant for local linters) - (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false)) - - # Ensure cross-OS compatible file names - (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false)) - - # Ensure no versionless Python shebangs - (! git --no-pager grep -In '#!.*python$' -- . 
|| (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false)) - - # Ensure ciflow tags mentioned in config - python3 .github/scripts/collect_ciflow_labels.py --validate-tags - - # C++ docs check - pushd docs/cpp/source - ./check-doxygen.sh - popd - - # CUDA kernel launch check - set -eux - python3 torch/testing/_internal/check_kernel_launches.py |& tee cuda_kernel_launch_checks.txt - - pr-sanity-checks: - name: pr-sanity-checks - runs-on: [self-hosted, linux.large] - # Only run this on pull requests. This check is simple enough to be done without a Docker image - if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - with: - submodules: false - fetch-depth: -1 - - - name: PR size check (nonretryable) - env: - BASE: ${{ github.event.pull_request.base.sha }} - HEAD: ${{ github.event.pull_request.head.sha }} - run: | - bash .github/scripts/pr-sanity-check.sh - - workflow-checks: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Regenerate workflows - .github/scripts/generate_ci_workflows.py - - RC=0 - # Assert that regenerating the workflows didn't change them - if ! .github/scripts/report_git_status.sh .github/workflows; then - echo - echo 'As shown by the above diff, the committed .github/workflows' - echo 'are not up to date according to .github/templates.' - echo 'Please run this command, commit, and push again to your PR:' - echo - echo ' .github/scripts/generate_ci_workflows.py' - echo - echo 'If running that command does nothing, you may need to rebase' - echo 'onto a more recent commit from the PyTorch master branch.' - RC=1 - fi - - # Check that jobs will be cancelled - .github/scripts/ensure_actions_will_cancel.py - - exit $RC - - toc: - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Regenerate ToCs and check that they didn't change - set -eu - - export PATH=~/.npm-global/bin:"$PATH" - for FILE in $(git grep -Il '' -- '**.md'); do - markdown-toc --bullets='-' -i "$FILE" - done - - if ! .github/scripts/report_git_status.sh .; then - echo - echo 'As shown by the above diff, the table of contents in one or' - echo 'more Markdown files is not up to date with the file contents.' 
- echo 'You can either apply that Git diff directly to correct the' - echo 'table of contents, or if you have npm installed, you can' - echo 'install the npm package markdown-toc and run the following' - # shellcheck disable=SC2016 - echo 'command (replacing $FILE with the filename for which you want' - echo 'to regenerate the table of contents):' - echo - # shellcheck disable=SC2016 - echo " markdown-toc --bullets='-' -i \"\$FILE\"" - false - fi - - test-tools: - name: Test tools - if: ${{ github.repository == 'pytorch/pytorch' }} - needs: docker-image - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.2xlarge - docker-image: ${{ needs.docker-image.outputs.docker-image }} - fetch-depth: 0 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Test tools - python3 -m unittest discover -vs tools/test -p 'test_*.py' - python3 -m unittest discover -vs .github/scripts -p 'test_*.py' - - test_collect_env: - if: ${{ github.repository == 'pytorch/pytorch' }} - name: Test collect_env - runs-on: linux.20_04.4x - strategy: - matrix: - test_type: [with_torch, without_torch, older_python_version] - steps: - # [see note: pytorch repo ref] - # deep clone (fetch-depth 0) required, to allow us to use git log - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - with: - submodules: false - fetch-depth: 1 - - name: Setup Python 3.5 - if: matrix.test_type == 'older_python_version' - uses: actions/setup-python@v4 - with: - python-version: '3.5' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/requirements.txt - - name: Setup Python 3.8 - if: matrix.test_type != 'older_python_version' - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/requirements.txt - - name: Install torch - if: matrix.test_type == 'with_torch' - run: | - pip install -r requirements.txt - # Doesn't really matter what torch version, we just need ANY torch installed - pip install 'torch==1.*' - - name: Run collect_env.py (nonretryable) - run: | - # All we need to see is that it passes - python3 torch/utils/collect_env.py - runs-on: macos-m1-12 steps: - name: Checkout PyTorch diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml deleted file mode 100644 index 2c5493639e4e75..00000000000000 --- a/.github/workflows/pull.yml +++ /dev/null @@ -1,368 +0,0 @@ -name: pull - -on: - pull_request: - push: - branches: - - master - - main - - release/* - - landchecks/* - workflow_dispatch: - schedule: - - cron: 29 8 * * * # about 1:29am PDT - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - linux-focal-py3_8-gcc7-build: - name: linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "distributed", shard: 2, num_shards: 2, runner: 
"linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_8-gcc7-test: - name: linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-gcc7-build - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }} - - linux-docs: - name: linux-docs - uses: ./.github/workflows/_docs.yml - needs: linux-focal-py3_8-gcc7-build - with: - build-environment: linux-focal-py3.8-gcc7 - docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }} - - linux-focal-py3_8-gcc7-no-ops: - name: linux-focal-py3.8-gcc7-no-ops - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-no-ops - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - - linux-focal-py3_8-gcc7-pch: - name: linux-focal-py3.8-gcc7-pch - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-pch - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - - linux-focal-py3_9-clang7-asan-build: - name: linux-focal-py3.9-clang7-asan - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.9-clang7-asan - docker-image-name: pytorch-linux-focal-py3-clang7-asan - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_9-clang7-asan-test: - name: linux-focal-py3.9-clang7-asan - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_9-clang7-asan-build - with: - build-environment: linux-focal-py3.9-clang7-asan - docker-image: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.test-matrix }} - - linux-focal-py3_8-clang10-onnx-build: - name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-clang10-onnx - docker-image-name: pytorch-linux-focal-py3-clang10-onnx - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - ]} - - linux-focal-py3_8-clang10-onnx-test: - name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-onnx-build - with: - build-environment: linux-focal-py3.8-clang10-onnx - docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }} - - linux-bionic-py3_8-clang9-build: - name: linux-bionic-py3.8-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3.8-clang9 - docker-image-name: 
pytorch-linux-bionic-py3.8-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-bionic-py3_8-clang9-test: - name: linux-bionic-py3.8-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_8-clang9-build - with: - build-environment: linux-bionic-py3.8-clang9 - docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }} - - linux-bionic-py3_11-clang9-build: - name: linux-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3.11-clang9 - docker-image-name: pytorch-linux-bionic-py3.11-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-bionic-py3_11-clang9-test: - name: linux-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_11-clang9-build - with: - build-environment: linux-bionic-py3.11-clang9 - docker-image: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.test-matrix }} - - linux-vulkan-bionic-py3_11-clang9-build: - name: linux-vulkan-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-vulkan-bionic-py3.11-clang9 - docker-image-name: pytorch-linux-bionic-py3.11-clang9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, - ]} - - linux-vulkan-bionic-py3_11-clang9-test: - name: linux-vulkan-bionic-py3.11-clang9 - uses: ./.github/workflows/_linux-test.yml - needs: linux-vulkan-bionic-py3_11-clang9-build - with: - build-environment: linux-vulkan-bionic-py3.11-clang9 - docker-image: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3_10-gcc7-build: - name: linux-bionic-cuda11.7-py3.10-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: 
"linux.4xlarge.nvidia.gpu" }, - { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-test: - name: linux-bionic-cuda11.7-py3.10-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }} - - linux-focal-py3-clang7-mobile-build: - name: linux-focal-py3-clang7-mobile-build - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3-clang7-mobile-build - docker-image-name: pytorch-linux-focal-py3-clang7-asan - build-generates-artifacts: false - - linux-jammy-cuda-11_7-cudnn8-py3_8-clang12-build: - name: linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - - linux-focal-py3-clang7-mobile-custom-build-static: - name: linux-focal-py3-clang7-mobile-custom-build-static - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3-clang7-mobile-custom-build-static - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - build-generates-artifacts: false - - linux-bionic-py3_8-clang8-xla-build: - name: linux-bionic-py3_8-clang8-xla - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-py3_8-clang8-xla - docker-image-name: xla_base - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" }, - ]} - - linux-bionic-py3_8-clang8-xla-test: - name: linux-bionic-py3_8-clang8-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-py3_8-clang8-xla-build - with: - build-environment: linux-bionic-py3_8-clang8-xla - docker-image: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.test-matrix }} - - win-vs2019-cpu-py3-build: - name: win-vs2019-cpu-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cpu-py3 - cuda-version: cpu - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cpu-py3-test: - name: win-vs2019-cpu-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cpu-py3-build - with: - build-environment: win-vs2019-cpu-py3 - cuda-version: cpu - test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }} - - win-vs2019-cuda11_7-py3-build: - if: github.event_name == 'pull_request' - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - sync-tag: win-cuda-build - test-matrix: | - { include: [ - { config: "default", 
shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-bazel-test: - name: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test - uses: ./.github/workflows/_bazel-build-test.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - - linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single: - name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single - uses: ./.github/workflows/_android-build-test.yml - with: - build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - - linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit: - name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit - uses: ./.github/workflows/_android-build-test.yml - with: - build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - - linux-focal-py3_8-gcc7-mobile-lightweight-dispatch-build: - name: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - build-generates-artifacts: false - - linux-focal-rocm5_4_2-py3_8-build: - # don't run build twice on master - if: github.event_name == 'pull_request' - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image-name: pytorch-linux-focal-rocm-n-py3 - sync-tag: rocm-build - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-sm86-build: - name: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - cuda-arch-list: 8.6 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-sm86-test: - name: 
linux-bionic-cuda11.7-py3.10-gcc7-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }} diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml deleted file mode 100644 index 8d55f6a9479ca9..00000000000000 --- a/.github/workflows/run_torchbench.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: TorchBench CI (pytorch-linux-py3.8-cu116) -on: - pull_request: - -env: - PYTHON_VERSION: "3.8" - # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19 - NUMPY_VERSION: "1.21.2" - SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh" - PR_NUM: ${{ github.event.number }} - PR_BODY: ${{ github.event.pull_request.body }} - PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} - PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} - -jobs: - run-torchbench: - # We don't accept running on non-pytorch repos because of security concerns - # Only run the job when the body contains magic word "RUN_TORCHBENCH:" - if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.body, 'RUN_TORCHBENCH:') }} - runs-on: [self-hosted, bm-runner] - # Set to 12 hours - timeout-minutes: 720 - steps: - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - path: pytorch - - name: Update self-hosted PyTorch - run: | - pushd "${HOME}"/pytorch - git remote prune origin - git fetch - popd - - name: Create conda environment and install deps - run: | - conda create -y -n pr-ci python="${PYTHON_VERSION}" - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \ - setuptools cmake=3.22.* typing-extensions boto3 \ - pillow pytest tabulate gitpython git-lfs tqdm psutil - pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html - - name: Setup TorchBench branch - run: | - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - PR_BODY_FILE=/tmp/pr-body.txt - echo "$PR_BODY" > ${PR_BODY_FILE} - python pytorch/.github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch - - name: Checkout TorchBench - uses: malfet/checkout@silent-checkout - with: - repository: pytorch/benchmark - path: benchmark - lfs: false - ref: ${{ env.TORCHBENCH_BRANCH }} - - name: GPU Info - run: | - nvidia-smi - - name: Run TorchBench - run: | - set -x - pushd "${HOME}"/pytorch - PR_MERGE_BASE=$(git merge-base "$PR_BASE_SHA" "$PR_HEAD_SHA") - popd - PR_BODY_FILE=/tmp/pr-body.txt - echo "$PR_BODY" > ${PR_BODY_FILE} - # shellcheck source=/dev/null - . "${SETUP_SCRIPT}" - conda activate pr-ci - python3 pytorch/.github/scripts/run_torchbench.py \ - --pr-body "$PR_BODY_FILE" \ - run \ - --pytorch-path "${HOME}"/pytorch \ - --torchbench-path "${PWD}"/benchmark \ - --pr-num "$PR_NUM" \ - --pr-base-sha "$PR_MERGE_BASE" \ - --pr-head-sha "$PR_HEAD_SHA" - - name: Upload result to S3 - run: | - # shellcheck source=/dev/null - . 
"${SETUP_SCRIPT}" - conda activate pr-ci - python3 pytorch/.github/scripts/run_torchbench.py \ - upload-s3 \ - --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}" - - name: Remove conda environment and cleanup - run: | - conda env remove --name pr-ci - rm /tmp/pr-body.txt - - name: Upload artifact - uses: actions/upload-artifact@v3 - with: - name: TorchBench result - path: ~/.torchbench/bisection/pr${{ github.event.number }} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true From bf8eba99a4f0435aaee9b4abd3123fbeef94d4a2 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 14 Feb 2023 15:28:17 -0800 Subject: [PATCH 07/30] Enable test modules on MPS and CI runners (#305) (#324) * Enable test modules on MPS and CI runners * Update lint.yml * Update comments * Retrigger CI * Retrigger CI #2 * Remove comment --- .github/workflows/_mac-test-mps.yml | 14 +++++ .github/workflows/lint.yml | 2 +- test/test_modules.py | 81 ++++++++++++++++++++++++++--- 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml index 1fcafb6db66ff8..f9c402a772ac73 100644 --- a/.github/workflows/_mac-test-mps.yml +++ b/.github/workflows/_mac-test-mps.yml @@ -83,6 +83,20 @@ jobs: set -ex ${CONDA_RUN} python3 test/run_test.py --mps --verbose + - name: Run MPS Test Modules + id: test_2 + env: + ENV_NAME: conda-test-env-${{ github.run_id }} + shell: arch -arch arm64 bash {0} + # During bring up of test_modules don't show this as an error. + continue-on-error: true + run: | + # shellcheck disable=SC1090 + set -ex + # TODO(https://github.com/pytorch/pytorch/issues/79293) + + ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose + - name: Print remaining test logs shell: bash if: always() diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0b846bc5a90fa4..58566ebc37465e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -70,7 +70,7 @@ jobs: # shellcheck disable=SC1090 set -ex set +e - if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then + if ! 
${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then echo "" echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" diff --git a/test/test_modules.py b/test/test_modules.py index 2ae17f5f8cf85b..9c244fb65e60b9 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -10,12 +10,23 @@ from torch.testing._internal.common_cuda import with_tf32_off from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) +from torch.testing._internal.common_dtype import get_all_dtypes from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode from torch.testing._internal.common_utils import ( TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, - gradgradcheck, skipIfMps, skipIfTorchInductor) + gradgradcheck, skipIfTorchInductor) from unittest.mock import patch, call +MPS_DTYPES = get_all_dtypes() +for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]: + del MPS_DTYPES[MPS_DTYPES.index(t)] + +def _get_mps_error_msg(device, dtype, op, mps_blocklist): + if torch.backends.mps.is_available() and device == "mps" and dtype not in MPS_DTYPES: + return f"MPS doesn't support {str(dtype)} datatype" + if op.name.startswith(tuple(mps_blocklist)): + return "MPS doesn't support op " + str(op.name) + return None class TestModule(TestCase): _do_cuda_memory_leak_check = True @@ -33,7 +44,8 @@ def _assert_module_parameters_and_buffer_are(self, module, device, dtype): def _check_module(items, name, device=device, dtype=dtype): for item_name, item in items: self.assertEqual( - item.device, device, + # workaround for the tests checking the device (mps:0 with mps) + item.device.type, device.type, f'{name} {item_name} is on device {item.device} instead of the expected device {device}') if item.dtype.is_floating_point: self.assertEqual( @@ -42,9 +54,16 @@ def _check_module(items, name, device=device, dtype=dtype): _check_module(module.named_parameters(), "Parameter") _check_module(module.named_buffers(), "Buffer") - @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db) def test_forward(self, device, dtype, module_info, training): + MPS_BLOCKLIST = [ + "nn.LSTM" # segfault + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -84,6 +103,10 @@ def test_forward(self, device, dtype, module_info, training): # They should be applied to any created parameters and buffers. @modules(module_db) def test_factory_kwargs(self, device, dtype, module_info, training): + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -198,6 +221,11 @@ def _to_device1(objs): @modules(module_db) def test_repr(self, device, dtype, module_info, training): # Test module can be represented with repr and str without errors. 
+ + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -211,10 +239,19 @@ def test_repr(self, device, dtype, module_info, training): m.__repr__() str(m) - @skipIfMps @modules(module_db) def test_pickle(self, device, dtype, module_info, training): # Test that module can be pickled and unpickled. + + MPS_BLOCKLIST = [ + "nn.LSTM" # hard crash + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) @@ -249,6 +286,15 @@ def test_pickle(self, device, dtype, module_info, training): def test_check_inplace(self, device, dtype, module_info, training): # Check if the inplace variant of the module gives the same result as the out of place # variant. + + MPS_BLOCKLIST = [ + "nn.ELU" # hard crash + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=True, training=training) @@ -326,11 +372,21 @@ def inner_zero_grad(obj): obj.grad = None self._traverse_obj(obj, inner_zero_grad) - @skipIfMps @modules(module_db) @skipIfTorchInductor("to be fixed") def test_non_contiguous_tensors(self, device, dtype, module_info, training): # Check modules work with non-contiguous tensors + MPS_BLOCKLIST = [ + # hard crashes + "nn.GRU", + "nn.LSTM", + "nn.RNN" + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, @@ -582,10 +638,18 @@ def check_backward(cpu_output, gpu_output): for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs): check_backward(cpu_output, gpu_output) - @skipIfMps @modules(module_db) @skipIfTorchInductor("to be fixed") def test_memory_format(self, device, dtype, module_info, training): + MPS_BLOCKLIST = [ + "nn.BatchNorm3d", # failed assert + "nn.LSTM", # segfault + ] + + msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST) + if msg is not None: + self.skipTest(msg) + is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6) # TODO tighten it to a specific module atol, rtol = (3e-3, 7e-3) if is_sm86 else (None, None) @@ -682,9 +746,12 @@ def inner_check_out_mem_format(output): # Test whether train and eval modes differ for each module. Use to verify # that the ModuleInfo entry flag is correct. 
- @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db, train_eval_mode=TrainEvalMode.train_only) def test_if_train_and_eval_modes_differ(self, device, dtype, module_info, training): + msg = _get_mps_error_msg(device, dtype, module_info, []) + if msg is not None: + self.skipTest(msg) + module_cls = module_info.module_cls module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype, requires_grad=False, training=training) From 2f336a4ee7bcc70a5d6495dcc5cb659339d2f4a9 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 14 Feb 2023 15:30:28 -0800 Subject: [PATCH 08/30] [CHERRY-PICK] Block uint8 data type for unary and binary ops on macOS 12. (#313) (#328) * Block uint8 data type for unary and binary ops on macOS 12. (#313) * fixes after cherry-pick --------- Co-authored-by: Ronian526 <11454459+Ronian526@users.noreply.github.com> --- aten/src/ATen/native/mps/operations/BinaryOps.mm | 2 ++ aten/src/ATen/native/mps/operations/UnaryOps.mm | 2 ++ 2 files changed, 4 insertions(+) diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index c730eccfe944e4..6569e59086fc9f 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -26,6 +26,8 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock) { + TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte), + "MPS supports binary op with uint8 natively starting from macOS 13.0"); TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) && (self.scalar_type() == ScalarType::Long || (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))), diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index a869ff3379aa86..0c6e5b06d08984 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -16,6 +16,8 @@ bool is_empty_tensor(const Tensor& self) { void unary_op(const Tensor& self, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock, is_noop_p is_noop = is_empty_tensor) { + TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte), + "MPS supports unary op with uint8 natively starting from macOS 13.0"); if (!output.is_same_size(self)) { output.resize_(self.sizes()); } From 108cdc015b11b98f735cc705626fc8ae3c9291b7 Mon Sep 17 00:00:00 2001 From: Ramin Azarmehr Date: Tue, 14 Feb 2023 20:38:59 -0500 Subject: [PATCH 09/30] Fix test_zero_grad() (#330) --- test/test_mps.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index b7907e7ed19905..a1fc4d7dd5f76a 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -6255,24 +6255,24 @@ def test_zero_grad(self): self.assertIsNotNone(module.weight.grad) self.assertGreater(module.weight.grad.data.abs().sum(), 0) module.zero_grad() - self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) + self.assertIsNone(module.weight.grad) module.bias.requires_grad = True module.zero_grad() - self.assertIsNotNone(module.weight.grad) + self.assertIsNone(module.weight.grad) self.assertIsNone(module.bias.grad)
module(i).sum().backward() self.assertIsNotNone(module.weight.grad) self.assertIsNotNone(module.bias.grad) self.assertGreater(module.weight.grad.data.abs().sum(), 0) self.assertGreater(module.bias.grad.data.abs().sum(), 0) - module.zero_grad() + module.zero_grad(set_to_none=False) # Force set to zeros. self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_()) - # Force set to None. - module.zero_grad(set_to_none=True) + module.zero_grad() self.assertIsNone(module.weight.grad) + self.assertIsNone(module.bias.grad) def test_no_grad(self): for dtype in [torch.bfloat16, torch.float, torch.double]: From 8de331505dde414f3a882b0b7feb75056b79368f Mon Sep 17 00:00:00 2001 From: Ramin Azarmehr Date: Tue, 14 Feb 2023 22:05:23 -0500 Subject: [PATCH 10/30] Convert output back to ChannelsLast if needed (#325) --- aten/src/ATen/native/mps/operations/Pooling.mm | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm index 2b9272d4675953..08727fed8265c8 100644 --- a/aten/src/ATen/native/mps/operations/Pooling.mm +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output, pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); + auto output_memory_format = output.suggest_memory_format(); // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors // by simply restriding them (instead of calling the costly Contiguous()). if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) { @@ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output, outputSizes.insert(outputSizes.begin(), nbatch); } output.resize_(outputSizes); - } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) { + } else if (output_memory_format == MemoryFormat::ChannelsLast) { output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); + output_memory_format = MemoryFormat::Contiguous; } if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) { @@ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output, } runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + + if (output_memory_format != suggested_memory_format) { + const_cast(output) = output.to(suggested_memory_format); + } } } @@ -356,6 +362,8 @@ Tensor mps_max_pool2d_backward( const Tensor& output, const Tensor& indices) { + auto indices_memory_format = indices.suggest_memory_format(); + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { MPSGraph* mpsGraph = cachedGraph.graph(); NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor @@ -366,6 +374,10 @@ Tensor mps_max_pool2d_backward( }; mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride, padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices"); + + if (indices_memory_format == MemoryFormat::ChannelsLast) { + const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } } TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)( From 051bc9c4c759d119ce76a9cd3b19b1ea5edee418 Mon Sep 17 00:00:00 2001 From: Denis Vieriu 
<104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:44:43 -0800 Subject: [PATCH 11/30] Fix bilinear backward pass (#331) * Fix bilinear backward pass * Remove comment --- aten/src/ATen/native/mps/OperationUtils.mm | 2 +- aten/src/ATen/native/mps/operations/ReduceOps.mm | 3 +++ test/test_mps.py | 1 - 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index 978162aed855ac..4e76c172fb6e91 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -265,7 +265,7 @@ void printTensorNDArray(const Tensor& t) { id<MTLBuffer> srcBuf = getMTLBufferStorage(src); bool sliceViewTensor = canSliceViewTensor(src, mpsShape); // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) - if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { + if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { Tensor emptyShell = Tensor(); // use "_tensor" from Placeholder to retain view's output during its usage in other ops _tensor = gatherViewTensor(src, emptyShell); diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index f858714fb82d5c..a79aeca766d366 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -163,6 +163,9 @@ void reduction_out_mps( if (reduction_type == MPSReductionType::PROD) { output_t.fill_(1); } + else if (reduction_type == MPSReductionType::SUM) { + output_t.zero_(); + } return; } diff --git a/test/test_mps.py b/test/test_mps.py index a1fc4d7dd5f76a..ed83acc1db08b1 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -10042,7 +10042,6 @@ class TestConsistency(TestCaseMPS): 'nn.functional.prelu': ['f32'], 'atanh': ['f32'], 'div': ['f16'], - 'nn.functional.bilinear': ['f32'], 'nn.functional.embedding': ['f16'], # Unsupported dtype From 1b09ea22a544719e562e4a5f77e081529853b139 Mon Sep 17 00:00:00 2001 From: Ronian526 <11454459+Ronian526@users.noreply.github.com> Date: Tue, 14 Feb 2023 23:02:51 -0800 Subject: [PATCH 12/30] Update macOS 12 blocklist (#323) * Update macOS 12 blocklist - move sum, masked.var, and mul to the low-precision list - unblock them from running * mark __rdiv__ failures as accumulated error exceeding atol/rtol --- test/test_mps.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index ed83acc1db08b1..51d1063b9b6d6c 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -10455,16 +10455,15 @@ class TestConsistency(TestCaseMPS): 'addr', # for macOS 12 - 'masked.normalize', 'masked.sum', + 'masked.normalize', 'masked.sum', 'masked.var', 'outer', - 'sum_to_size', + 'sum_to_size', 'sum', + 'mul', } BLOCKLIST_MACOS_12 = { + # failures because of accumulate error exceeds atol/rtol '__rdiv__': [torch.float16], - 'masked.var': [torch.float16], - 'sum': [torch.float16], - 'mul': [torch.float16], # expected failures 'nn.functional.interpolatenearest': [torch.float32], From 8c7df6f68f1fda239c278e256578e43cc3f27fc7 Mon Sep 17 00:00:00 2001 From: jhavukainen <104022140+jhavukainen@users.noreply.github.com> Date: Tue, 14 Feb 2023 23:42:45 -0800 Subject: [PATCH 13/30] [MPS] Fixes for LSTM. (#319) - The backward pass has to be given an explicit bias tensor of zeros if none is passed to the op, or the bias gradient will not be calculated.
- Fixed the bias tensor mistakenly getting overwritten to zeros - Fixed a crash when the lstm op is called with has_biases set to false. The change takes into account the changed shape of the input params TensorList depending on the bias flag. Co-authored-by: Kulin Seth --- aten/src/ATen/native/mps/operations/RnnOps.mm | 91 ++++++++++++------- test/test_mps.py | 57 ++++++++++++ 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm index d46ce356318e24..bee82fcc248036 100644 --- a/aten/src/ATen/native/mps/operations/RnnOps.mm +++ b/aten/src/ATen/native/mps/operations/RnnOps.mm @@ -30,10 +30,15 @@ std::vector biases; std::vector recurrent_biases; for (size_t i = 0; i < num_layers; i+=1) { - kernel_weights.push_back(params[i*4]); - recurrent_kernel_weights.push_back(params[i*4+1]); - biases.push_back(params[i*4+2]); - recurrent_biases.push_back(params[i*4+3]); + if (has_biases) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } else { + kernel_weights.push_back(params[i*2]); + recurrent_kernel_weights.push_back(params[i*2+1]); + } } struct CachedGraph : public MPSCachedGraph { @@ -71,8 +76,10 @@ for (size_t i = 0; i < num_layers; i += 1) { [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; - [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; - [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + if(has_biases) { + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } } MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; @@ -109,9 +116,12 @@ NSMutableArray* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; NSMutableArray* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; for(int i = 0; i < num_layers; i++) { - MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] - secondaryTensor:recurrentBiasList[i] - name:nil]; + MPSGraphTensor* biasTensor = nil; + if(has_biases) { + biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + } outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ recurrentWeight:recurrentKernelWeightsList[i] inputWeight:kernelWeightsList[i] @@ -121,7 +131,6 @@ descriptor:opDesc name:nil]; - stateTensor_ = [mpsGraph sliceTensor:stateTensor dimension:0 start:i @@ -196,12 +205,14 @@ for (size_t i = 0; i < num_layers; i+=1) { kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); - bias = Placeholder([biasList objectAtIndex:i], biases[i]); - recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); [feeds 
setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; - [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; - [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + if(has_biases) { + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } } Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); @@ -250,10 +261,15 @@ std::vector biases; std::vector recurrent_biases; for (size_t i = 0; i < num_layers; i+=1) { - kernel_weights.push_back(params[i*4]); - recurrent_kernel_weights.push_back(params[i*4+1]); - biases.push_back(params[i*4+2]); - recurrent_biases.push_back(params[i*4+3]); + if(has_biases) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } else { + kernel_weights.push_back(params[i*2]); + recurrent_kernel_weights.push_back(params[i*2+1]); + } } struct CachedGraph : public MPSCachedGraph { @@ -296,8 +312,10 @@ for (size_t i = 0; i < num_layers; i += 1) { [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; - [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; - [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + if(has_biases) { + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } } MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); @@ -349,9 +367,15 @@ cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd axis:0 name:nil]; - MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] - secondaryTensor:recurrentBiasList[i] - name:nil]; + MPSGraphTensor* biasTensor = nil; + if(has_biases) { + biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + } else { + biasTensor = [mpsGraph constantWithScalar:0.0 + dataType:inputTensor.dataType]; + } MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor dimension:0 @@ -391,7 +415,6 @@ descriptor: opDesc name: nil]; - gradientTensor_ = [outputs objectAtIndex:0]; [gradOutputArray addObject:[outputs objectAtIndex:0]]; [gradRecWeightsArray addObject:[outputs objectAtIndex:1]]; @@ -445,18 +468,20 @@ for (size_t i = 0; i < num_layers; i+=1) { kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); - bias 
= Placeholder([biasList objectAtIndex:i], biases[i]); - recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; - [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; - [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + if(has_biases) { + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } } Tensor output = at::empty_like(input); Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]); Tensor grad_weights = at::empty_like(kernel_weights[0]); - Tensor grad_bias = at::empty_like(biases[0]); + Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options()); Tensor grad_state = at::empty_like(hx[0]); Tensor grad_cell_state = at::empty_like(hx[1]); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensors_[0], output); @@ -482,13 +507,15 @@ Tensor output = at::empty_like(input); Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]); Tensor grad_weights = at::empty_like(kernel_weights[i]); - Tensor grad_bias = at::empty_like(biases[i]); + Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options()); Tensor grad_state = at::empty_like(hx[0]); Tensor grad_cell_state = at::empty_like(hx[1]); weights.push_back(grad_weights); weights.push_back(grad_rec_weights); - weights.push_back(grad_bias); - weights.push_back(grad_bias); + if(has_biases) { + weights.push_back(grad_bias); + weights.push_back(grad_bias); + } gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output); gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights); gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights); diff --git a/test/test_mps.py b/test/test_mps.py index 51d1063b9b6d6c..d63b98083f3b67 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -8588,6 +8588,63 @@ def get_results(device): self.assertEqual(cpu_input_grad, mps_input_grad) self.assertEqual(cpu_weight_grad, mps_weight_grad) + def test_RNN_cell_no_broadcasting(self): + def test(cell_module, input, hx, input_size, hidden_size): + cell = cell_module(input_size, hidden_size, device='mps') + self.assertRaises(RuntimeError, lambda: cell(input, hx)) + + def test_all(hidden_size, bad_hx, good_hx, input_size, input): + test(nn.RNNCell, input, bad_hx, input_size, hidden_size) + test(nn.GRUCell, input, bad_hx, input_size, hidden_size) + test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size) + test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size) + + hidden_size = 20 + input_size = 10 + input = torch.randn(3, input_size, device='mps') + bad_hx = torch.randn(1, hidden_size, device='mps') + good_hx = torch.randn(3, hidden_size, device='mps') + + # Test hidden/input batch size broadcasting + test_all(hidden_size, bad_hx, good_hx, input_size, input) + + # Test hx's hidden_size vs module's hidden_size broadcasting + bad_hx = torch.randn(3, 1) + test_all(hidden_size, bad_hx, good_hx, 
input_size, input) + + # Test input's input_size vs module's input_size broadcasting + bad_input = torch.randn(3, 1) + test_all(hidden_size, good_hx, good_hx, input_size, bad_input) + + def test_LSTM_cell(self): + # this is just a smoke test; these modules are implemented through + # autograd so no Jacobian test is needed + for bias in (True, False): + input = torch.randn(3, 10, device='mps') + hx = torch.randn(3, 20, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, bias=bias, device='mps') + for _ in range(6): + hx, cx = lstm(input, (hx, cx)) + + (hx + cx).sum().backward() + + def test_LSTM_cell_forward_input_size(self): + input = torch.randn(3, 11, device='mps') + hx = torch.randn(3, 20, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, device='mps') + self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) + + def test_LSTM_cell_forward_hidden_size(self): + input = torch.randn(3, 10, device='mps') + hx = torch.randn(3, 21, device='mps') + cx = torch.randn(3, 20, device='mps') + lstm = nn.LSTMCell(10, 20, device='mps') + self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) + self.assertRaises(Exception, lambda: lstm(input, (cx, hx))) + + class TestFallbackWarning(TestCase): # TODO: Remove once test_testing.py is running on MPS devices def test_no_warning_on_import(self): From d42f74f70a17d40c3df994468a78800e0e56f9db Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 14 Feb 2023 23:43:02 -0800 Subject: [PATCH 14/30] Fix nn.functional.conv_transpose2d grad (#312) (#329) - add _mps_convolution_impl that takes an optional shape - for conv_transpose2d grad, use the shape from the input directly - remove nn.functional.conv_transpose2d grad from the blocklist Co-authored-by: Ronian526 <11454459+Ronian526@users.noreply.github.com> --- .../ATen/native/mps/operations/Convolution.mm | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 3cd442099f5ca4..7c0a33d36d042d 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -56,14 +56,15 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, descriptor_.groups = groups; } -Tensor _mps_convolution( +Tensor _mps_convolution_impl( const Tensor& input_t, const Tensor& weight_t, const c10::optional<Tensor>& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::optional<IntArrayRef> input_shape) { TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS"); namespace native_mps = at::native::mps; @@ -83,6 +84,8 @@ Tensor _mps_convolution( auto memory_format = input_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); auto output_t = at::empty( + input_shape.has_value() ? 
+ input_shape.value() : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), input->scalar_type(), @@ -237,6 +240,17 @@ Tensor _mps_convolution( return *output; } +Tensor _mps_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional<Tensor>& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups) { + return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt); +} + Tensor mps_convolution_backward_input( IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { @@ -576,10 +590,10 @@ Tensor _mps_convolution_transpose( Tensor mps_convolution_transpose_backward_input( const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) + int64_t groups, IntArrayRef input_shape) { - return at::_mps_convolution( - grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups); + return _mps_convolution_impl( + grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape); } Tensor mps_convolution_transpose_backward_weight( @@ -603,7 +617,7 @@ Tensor mps_convolution_transpose_backward_weight( Tensor grad_input, grad_weight; if (output_mask[0]) { - grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups); + grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes()); } if (output_mask[1]) { grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); From 285620362f38582b09bed753f32de3e357c5e467 Mon Sep 17 00:00:00 2001 From: Ramin Azarmehr Date: Wed, 15 Feb 2023 15:07:51 -0500 Subject: [PATCH 15/30] Fix the crash in elu_backward() (#333) Fixes a crash where the inputTensor could go null: it was only created on the non-result path, but the predicate tensor always read from it.
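A minimal repro sketch for reference (illustrative only, not part of the original commit; it assumes a macOS build where torch.backends.mps.is_available() returns true, and the shapes are arbitrary). It exercises both backward paths touched below: the out-of-place variant, whose backward reads the saved input, and the in-place variant, whose backward reads the saved result:

    import torch
    import torch.nn.functional as F

    # Out-of-place ELU: autograd saves the input, so elu_backward runs with is_result == false.
    x = torch.randn(16, device="mps", requires_grad=True)
    F.elu(x, alpha=1.0).sum().backward()

    # In-place ELU on a non-leaf tensor: autograd saves the result, so elu_backward runs with is_result == true.
    y = torch.randn(16, device="mps", requires_grad=True)
    F.elu_(y * 2).sum().backward()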
--- .../ATen/native/mps/operations/Activation.mm | 45 +++++-------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm index 9e643ebf29390c..84c2f8789790b0 100644 --- a/aten/src/ATen/native/mps/operations/Activation.mm +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -1208,8 +1208,7 @@ void elu_variants_out_mps ( { CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} MPSGraphTensor *gradOutputTensor_ = nil; - MPSGraphTensor *inputTensor_ = nil; - MPSGraphTensor *resultTensor_ = nil; + MPSGraphTensor *selfOrResultTensor_ = nil; MPSGraphTensor *gradInputTensor_ = nil; }; @@ -1218,7 +1217,7 @@ void elu_variants_out_mps ( MPSStream* stream = getCurrentMPSStream(); @autoreleasepool { - string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" + to_string(alpha.to()) + ":" + to_string(scale.to()) + ":" + to_string(input_scale.to()) + ":" + @@ -1235,18 +1234,14 @@ void elu_variants_out_mps ( newCachedGraph = new CachedGraph(mpsGraph); MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - - MPSGraphTensor* inputTensor = nil; - MPSGraphTensor* resultTensor = nil; - + MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* lessThanZeroGradTensor = nil; if(is_result) { - resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor + MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor secondaryTensor:alphaTensor name:nil]; auto constMul = scale.to() * input_scale.to(); @@ -1258,11 +1253,10 @@ void elu_variants_out_mps ( name:nil]; } else { - inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor secondaryTensor:inputScaleTensor name:nil]; MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor @@ -1282,7 +1276,7 @@ void elu_variants_out_mps ( MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[@1] dataType:getMPSDataType(grad_output.scalar_type())]; - MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor secondaryTensor:zeroTensor name:nil]; MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor @@ -1294,8 +1288,7 @@ void elu_variants_out_mps ( name:nil]; newCachedGraph->gradOutputTensor_ = gradOutputTensor; - newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->resultTensor_ = resultTensor; + newCachedGraph->selfOrResultTensor_ = selfOrResultTensor; newCachedGraph->gradInputTensor_ = gradInputTensor; } return newCachedGraph; @@ -1304,28 +1297,14 @@ void elu_variants_out_mps ( } Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, 
grad_output, nil, executeGatherOp); - Placeholder selfPlaceholder = Placeholder(); - Placeholder resultPlaceholder = Placeholder(); - if(is_result) - resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp); - else - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp); + Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - - if(is_result) - feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() - }; - else - feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData() + }; NSDictionary* results = @{ gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() }; From 18797b00dedb80df3a8f41474260683bd2a63a23 Mon Sep 17 00:00:00 2001 From: Ronian526 <11454459+Ronian526@users.noreply.github.com> Date: Wed, 15 Feb 2023 13:14:39 -0800 Subject: [PATCH 16/30] Fix nn.functional.embedding grad (#335) - cast the input tensor to float32 and cast the output tensor back to float16 - unblock the test --- aten/src/ATen/native/mps/operations/Indexing.mm | 16 ++++++++++++++-- test/test_mps.py | 1 - 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 310cbb7bf9370b..036f0a242f1157 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps( MPSGraphTensor* reshapedIndicesTensor = indicesTensor; + MPSGraphTensor* castGradTensor = incomingGradTensor; + MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type()); + // issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16 + if (dataType == MPSDataTypeFloat16) { + castGradTensor = [mpsGraph castTensor: incomingGradTensor + toType: MPSDataTypeFloat32 + name: nil]; + } if (num_indices_dims != 0) { reshapedIndicesTensor = [mpsGraph expandDimsOfTensor: indicesTensor axes: @[@-1] name: nil]; } - auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor + auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor indicesTensor: reshapedIndicesTensor shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape)) batchDimensions: 0 mode: MPSGraphScatterModeAdd name: @"edb"]; - + if (dataType == MPSDataTypeFloat16) { + outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor + toType: MPSDataTypeFloat16 + name: nil]; + } newCachedGraph->incomingGradTensor_ = incomingGradTensor; newCachedGraph->indicesTensor_ = indicesTensor; newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; diff --git a/test/test_mps.py b/test/test_mps.py index d63b98083f3b67..aea4cfe199b00c 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ 
-10099,7 +10099,6 @@ class TestConsistency(TestCaseMPS):
         'nn.functional.prelu': ['f32'],
         'atanh': ['f32'],
         'div': ['f16'],
-        'nn.functional.embedding': ['f16'],

         # Unsupported dtype
         'special.ndtr': ['f32'],

From cf06ac5c9c4a8038562a2f7a454126afc19f328a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 13:17:39 -0800
Subject: [PATCH 17/30] Fix prelu backward (#334)

---
 aten/src/ATen/native/mps/operations/Activation.mm | 2 +-
 test/test_mps.py                                  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 84c2f8789790b0..440cde4140f458 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1819,7 +1819,7 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
   using namespace mps;

   Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
-  Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
+  Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous);
   if (grad_output.numel() == 0) {
     return std::tuple<Tensor, Tensor>{grad_input, weight_grad};
   }
diff --git a/test/test_mps.py b/test/test_mps.py
index aea4cfe199b00c..d6bb86b9db3e16 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10096,7 +10096,6 @@ class TestConsistency(TestCaseMPS):
         'trace': ['f32'],

         # Correctness issues
-        'nn.functional.prelu': ['f32'],
         'atanh': ['f32'],
         'div': ['f16'],

From c65b8236c47a16f97a13e2aa3590dbeef0252fdd Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 15 Feb 2023 15:56:12 -0800
Subject: [PATCH 18/30] Reduction cast f16 to f32 only on macOS 12 (#332)

- unblock rdiv float16
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 5 ++++-
 test/test_mps.py                                 | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index a79aeca766d366..d4112c99f6a87c 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -200,7 +200,10 @@ void reduction_out_mps(
       (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
     inputCastDtype = getMPSDataType(dtype.value());
   } else if (input_type != MPSDataTypeInt32 &&
-             input_type != MPSDataTypeFloat32) {
+             input_type != MPSDataTypeFloat32 &&
+             input_type != MPSDataTypeFloat16) {
+    inputCastDtype = MPSDataTypeFloat32;
+  } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) {
     inputCastDtype = MPSDataTypeFloat32;
   }

diff --git a/test/test_mps.py b/test/test_mps.py
index d6bb86b9db3e16..261b19fca9f51c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10517,9 +10517,6 @@ class TestConsistency(TestCaseMPS):
     }

     BLOCKLIST_MACOS_12 = {
-        # failures because of accumulate error exceeds atol/rtol
-        '__rdiv__': [torch.float16],
-
         # expected failures
         'nn.functional.interpolatenearest': [torch.float32],
         'nn.functional.upsample_nearest': [torch.float32],

From 73f706846a1c57e2b26230a0f045f53eef548eb8 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 20:24:37 -0800
Subject: [PATCH 19/30] Remove periodic file (running between PRs) (#336)

---
 .github/workflows/periodic.yml | 284 ---------------------------------
 1 file changed, 284 deletions(-)
 delete mode 100644
.github/workflows/periodic.yml diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml deleted file mode 100644 index 1c137084a97e9a..00000000000000 --- a/.github/workflows/periodic.yml +++ /dev/null @@ -1,284 +0,0 @@ -name: periodic - -on: - schedule: - - cron: 45 0,4,8,12,16,20 * * * - - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests - push: - tags: - - ciflow/periodic/* - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} - cancel-in-progress: true - -jobs: - parallelnative-linux-focal-py3_8-gcc7-build: - name: parallelnative-linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: parallelnative-linux-focal-py3.8-gcc7 - docker-image-name: pytorch-linux-focal-py3.8-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - ]} - - parallelnative-linux-focal-py3_8-gcc7-test: - name: parallelnative-linux-focal-py3.8-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: parallelnative-linux-focal-py3_8-gcc7-build - with: - build-environment: parallelnative-linux-focal-py3.8-gcc7 - docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build: - name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test: - name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build - with: - build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck - docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }} - timeout-minutes: 300 - - linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build: - name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "aot_eager_all", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - # These jobs run too slowly so they must be sharded, unfortunately - { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: 
"dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test: - name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - - linux-focal-rocm5_4_2-py3_8-build: - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image-name: pytorch-linux-focal-rocm-n-py3 - test-matrix: | - { include: [ - { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, - { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, - { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, - ]} - - linux-focal-rocm5_4_2-py3_8-test: - name: linux-focal-rocm5.4.2-py3.8 - uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_4_2-py3_8-build - with: - build-environment: linux-focal-rocm5.4.2-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }} - secrets: - AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} - - linux-bionic-cuda11_7-py3_9-gcc7-build: - name: linux-bionic-cuda11.7-py3.9-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.9-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - test-matrix: | - { include: [ - { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" }, - ]} - build-with-debug: false - - linux-bionic-cuda11_7-py3_9-gcc7-test: - name: linux-bionic-cuda11.7-py3.9-gcc7 - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_9-gcc7-build - with: - build-environment: linux-bionic-cuda11.7-py3.9-gcc7 - docker-image: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.test-matrix }} - - linux-bionic-cuda11_7-py3_10-gcc7-debug-build: - name: linux-bionic-cuda11.7-py3.10-gcc7-debug - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - build-with-debug: true - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_7-py3_10-gcc7-debug-test: - name: linux-bionic-cuda11.7-py3.10-gcc7-debug - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_7-py3_10-gcc7-debug-build - with: - build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug - docker-image: ${{ 
needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.test-matrix }} - - linux-bionic-cuda11_8-py3_8-gcc7-debug-build: - name: linux-bionic-cuda11.8-py3.8-gcc7-debug - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7 - build-with-debug: true - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, - ]} - - linux-bionic-cuda11_8-py3_8-gcc7-debug-test: - name: linux-bionic-cuda11.8-py3.8-gcc7-debug - uses: ./.github/workflows/_linux-test.yml - needs: linux-bionic-cuda11_8-py3_8-gcc7-debug-build - with: - build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug - docker-image: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.test-matrix }} - - libtorch-linux-bionic-cuda11_8-gcc7-build: - name: libtorch-linux-bionic-cuda11.8-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: libtorch-linux-bionic-cuda11.8-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7 - build-generates-artifacts: false - - win-vs2019-cuda11_8-py3-build: - name: win-vs2019-cuda11.8-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.8-py3 - cuda-version: "11.8" - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cuda11_8-py3-test: - name: win-vs2019-cuda11.8-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_8-py3-build - with: - build-environment: win-vs2019-cuda11.8-py3 - cuda-version: "11.8" - test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }} - - libtorch-linux-bionic-cuda11_7-gcc7-build: - name: libtorch-linux-bionic-cuda11.7-gcc7 - uses: ./.github/workflows/_linux-build.yml - with: - build-environment: libtorch-linux-bionic-cuda11.7-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - build-generates-artifacts: false - - win-vs2019-cuda11_7-py3-build: - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-build.yml - with: - build-environment: win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} - - win-vs2019-cuda11_7-py3-test: - name: win-vs2019-cuda11.7-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_7-py3-build - with: - build-environment: 
win-vs2019-cuda11.7-py3 - cuda-version: "11.7" - test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }} - - ios-12-5-1-x86-64-coreml: - name: ios-12-5-1-x86-64-coreml - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-x86-64-coreml - ios-platform: SIMULATOR - ios-arch: x86_64 - - ios-12-5-1-arm64: - name: ios-12-5-1-arm64 - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64 - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-coreml: - name: ios-12-5-1-arm64-coreml - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-coreml - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-custom-ops: - name: ios-12-5-1-arm64-custom-ops - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-custom-ops - ios-platform: OS - ios-arch: arm64 - - ios-12-5-1-arm64-metal: - name: ios-12-5-1-arm64-metal - uses: ./.github/workflows/_ios-build-test.yml - with: - build-environment: ios-12-5-1-arm64-metal - ios-platform: OS - ios-arch: arm64 - - buck-build-test: - name: buck-build-test - uses: ./.github/workflows/_buck-build-test.yml From 1c8f126f1ee183385085dbc17b4bf7ac78db35f8 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Wed, 15 Feb 2023 20:36:31 -0800 Subject: [PATCH 20/30] Fix upsample for NHWC output (#337) * Fix upsample for NHWC output * Add testcase --- aten/src/ATen/native/mps/operations/UpSample.mm | 11 ++++++++++- test/test_mps.py | 9 +++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm index 17895e19c7d76f..3b781dea08f484 100644 --- a/aten/src/ATen/native/mps/operations/UpSample.mm +++ b/aten/src/ATen/native/mps/operations/UpSample.mm @@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input, } else { native::upsample_2d_common_check(input.sizes(), output_size); } + Tensor out; + if (!output.is_contiguous()) { + out = at::empty_like(output, MemoryFormat::Contiguous); + } + bool centerResults = false; MPSGraphResizeMode resizeMode = MPSGraphResizeNearest; MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor; @@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input, MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease]; Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? 
out : output, nil, false);

     NSDictionary* feeds = @{
       inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
 }

diff --git a/test/test_mps.py b/test/test_mps.py
index 261b19fca9f51c..47ddaa210ac372 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4362,9 +4362,9 @@ def helper(shape):
         helper((50, 20, 7, 4))

     def test_upsample_nearest2d(self):
-        def helper(N, C, H, W):
+        def helper(N, C, H, W, memory_format):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(N, C, H, W)
+                                    requires_grad=True).reshape(N, C, H, W).to(memory_format=memory_format)
             inputCPU.retain_grad()
             inputMPS = inputCPU.detach().to('mps').requires_grad_()

@@ -4390,8 +4390,9 @@ def helper(N, C, H, W):

             self.assertEqual(inputCPU.grad, inputMPS.grad)

-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            helper(1, 1, 4, 4, memory_format=memory_format)
+            helper(7, 5, 3, 2, memory_format=memory_format)

     def test_upsample_bilinear2d(self):
         def helper(N, C, H, W):

From 42be72a92ba216a22812fd6403e78859fdc37a01 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 21:27:00 -0800
Subject: [PATCH 21/30] [DOWNSTREAM] Fix build failure on x86 runners (#338)

---
 aten/src/ATen/native/mps/operations/Indexing.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 036f0a242f1157..8522ac920275f1 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -892,7 +892,7 @@ Tensor embedding_dense_backward_mps(
       if (dataType == MPSDataTypeFloat16) {
         castGradTensor = [mpsGraph castTensor: incomingGradTensor
                                        toType: MPSDataTypeFloat32
-                                         name: nil];
+                                         name: @"castGradTensor"];
       }
       if (num_indices_dims != 0) {
         reshapedIndicesTensor = [mpsGraph expandDimsOfTensor: indicesTensor
@@ -909,7 +909,7 @@ Tensor embedding_dense_backward_mps(
       if (dataType == MPSDataTypeFloat16) {
         outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
                                            toType: MPSDataTypeFloat16
-                                             name: nil];
+                                             name: @"castGradTensor"];
       }
       newCachedGraph->incomingGradTensor_ = incomingGradTensor;
       newCachedGraph->indicesTensor_ = indicesTensor;

From 6ace5f94a66ca1ea31f10ae50eb87c7dcdc83496 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 16 Feb 2023 21:37:34 -0800
Subject: [PATCH 22/30] Fix trace op (#340)

- warn when converting int64 for reduction ops
- use the cast tensor for the reduction sum in trace
- unblock the trace op
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 4 +++-
 test/test_mps.py                                 | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index d4112c99f6a87c..f47dd910dc234c 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -139,6 +139,8 @@ void reduction_out_mps(
                        MPSReductionType reduction_type,
                        const std::string& func_name) {

+  // issue 103641234, reduction ops do not have int64 support
+ 
TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 reduction ops, casting it to int32"); IntArrayRef input_shape = input_t.sizes(); if (opt_dim.has_value()) { @@ -247,7 +249,7 @@ void reduction_out_mps( axes:wrappedAxes name:nil]; } else if (reduction_type == MPSReductionType::TRACE) { - MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor + MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor numLower:0 numUpper:0 name:nil]; diff --git a/test/test_mps.py b/test/test_mps.py index 47ddaa210ac372..556b44362e58fe 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -10123,7 +10123,6 @@ class TestConsistency(TestCaseMPS): # Functions with correctness issues 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], - 'trace': [torch.int64], 'normalnumber_mean': [torch.float16, torch.float32], 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'multinomial': [torch.float32], From c9b8ab7091310c29335b587c562009a1ac147855 Mon Sep 17 00:00:00 2001 From: Ronian526 <11454459+Ronian526@users.noreply.github.com> Date: Thu, 16 Feb 2023 21:42:16 -0800 Subject: [PATCH 23/30] Update random result list (#339) * - move nn.functional.feature_alpha_dropoutwith_train, normalnumber_mean, new_empty_strided to expected failures * - update new_empty_strided --------- Co-authored-by: Kulin Seth --- test/test_mps.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index 556b44362e58fe..ea762155f8d327 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -10122,9 +10122,6 @@ class TestConsistency(TestCaseMPS): 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8], # Functions with correctness issues - 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], - 'normalnumber_mean': [torch.float16, torch.float32], - 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'multinomial': [torch.float32], # cpu result off, showing random values @@ -10472,7 +10469,9 @@ class TestConsistency(TestCaseMPS): 'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'randn_like': [torch.float16, torch.float32], 'bernoulli': [torch.float32], + 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32], 'normal': [torch.float16, torch.float32, torch.float16, torch.float32], + 'normalnumber_mean': [torch.float16, torch.float32], 'nn.functional.alpha_dropout': [torch.float32], 'nn.functional.dropout': [torch.float32], 'nn.functional.dropout2d': [torch.float32], @@ -10481,6 +10480,7 @@ class TestConsistency(TestCaseMPS): 'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], + 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique 'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], # duplicate indices are used in the testcase - undefined behaviour From d3e414e2b28cb9d0799db7bb20d6917016420d77 
Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Fri, 17 Feb 2023 09:21:54 -0800 Subject: [PATCH 24/30] Fix convolution crash in backward with weights; remove unnecessary contiguous calls (#341) * Fix convolution crash; remove unnecessary contiguous calls * Fix lintrunner --- .../ATen/native/mps/operations/Convolution.mm | 32 ++----- test/test_mps.py | 95 +++++++++++++++++-- 2 files changed, 95 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 7c0a33d36d042d..4bddbba917f5e6 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -252,20 +252,17 @@ Tensor _mps_convolution( } Tensor mps_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_, + IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { namespace native_mps = at::native::mps; using namespace mps; CheckedFrom c = "mps_convolution_backward_input"; - TensorArg grad_output{ grad_output_, "grad_output", 1 }, - weight{ weight_, "weight", 2 }; + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = grad_output_.suggest_memory_format(); + auto memory_format = grad_output_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); - Tensor grad_output_t = grad_output_.contiguous(memory_format); - Tensor weight_t = weight_.contiguous(memory_format); - MPSShape* weightShape = getMPSShape(weight_); auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt); // Avoid "grad_input" when this is being used as transposed convolution @@ -341,7 +338,7 @@ Tensor mps_convolution_backward_input( } MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape); - MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { @@ -373,7 +370,7 @@ Tensor mps_convolution_backward_input( } auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape); - auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape); + auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); NSDictionary *feeds = @{ @@ -391,17 +388,14 @@ Tensor mps_convolution_backward_input( } Tensor mps_convolution_backward_weights( - IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_, + IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { namespace native_mps = at::native::mps; using namespace mps; CheckedFrom c = "mps_convolution_backward_weights"; - auto memory_format 
= input_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-  auto grad_output_t = grad_output_.to(memory_format);
-  auto input_t = input_.to(memory_format);
-
   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);

   // For uniformity with everything else, although it seems grad_weight
@@ -539,12 +533,9 @@ Tensor mps_convolution_backward_weights(
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_convolution_backward(
-    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array<bool, 3> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight, grad_bias;
   if (input.numel() == 0) {
     if (output_mask[0]) {
@@ -609,12 +600,9 @@ Tensor mps_convolution_transpose_backward_weight(

 std::tuple<Tensor, Tensor> mps_convolution_transpose_backward(
-    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    const Tensor& input, const Tensor& grad_output, const Tensor& weight,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
     int64_t groups, std::array<bool, 3> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight;
   if (output_mask[0]) {
     grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
diff --git a/test/test_mps.py b/test/test_mps.py
index ea762155f8d327..3de06f2bd276af 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7424,7 +7424,8 @@ def test_conv_transpose_1d_nn_functional(self):
     def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
-            conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
+            conv_cpu = torch.nn.Conv1d(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).requires_grad_()
             conv_mps = torch.nn.Conv1d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
             conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
@@ -7464,15 +7465,89 @@ def test_conv1d_contiguous(self):

     def test_conv2d_all_strides_paddings(self):
         # https://github.com/pytorch/pytorch/issues/83180
-        y_cpu = torch.randn(2, 2, 3, 6)
-        y_gpu = y_cpu.to(device='mps')
-        for strideX in range(1, 4):
-            for strideY in range(1, 4):
-                conv_cpu = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=(strideX, strideY))
-                conv_gpu = copy.deepcopy(conv_cpu).to(device='mps')
-                x_cpu = conv_cpu(y_cpu)
-                x_gpu = conv_gpu(y_gpu)
-                self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
+        def helper(N, C, H, W, groups, input_mem_format, weight_mem_format, permute_data):
+            x_cpu = torch.randn(N, C, H, W).to(memory_format=input_mem_format).requires_grad_()
+            x_mps = x_cpu.detach().clone().to(device='mps').requires_grad_()
+
+            if permute_data:
+                x_cpu.permute(0, 2, 3, 1)
+                x_mps.permute(0, 2, 3, 1)
+
+            for strideX in range(1, 4):
+                for strideY in range(1, 4):
+                    conv_cpu = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, 
strideY)).requires_grad_() + conv_cpu.weight.data = conv_cpu.weight.to(memory_format=weight_mem_format).requires_grad_() + + conv_mps = torch.nn.Conv2d( + in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY), device="mps") + conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_() + conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + res_cpu = conv_cpu(x_cpu) + res_mps = conv_mps(x_mps) + self.assertEqual(res_cpu, res_mps.cpu(), rtol=1e-03, atol=1e-05) + + res_cpu = res_cpu.sum().backward() + res_mps = res_mps.sum().backward() + self.assertEqual(res_cpu, res_mps, rtol=2.6e-05, atol=2e-04) + self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04) + self.assertEqual(conv_cpu.bias.grad, conv_mps.bias.grad) + self.assertEqual(x_cpu.grad, x_mps.grad) + + for mem_format_input in [torch.contiguous_format, torch.channels_last]: + for mem_format_weight in [torch.contiguous_format, torch.channels_last]: + for permute_data in [True, False]: + helper(2, 2, 3, 6, 1, mem_format_input, mem_format_weight, permute_data) + helper(10, 10, 4, 6, 2, mem_format_input, mem_format_weight, permute_data) + helper(32, 32, 4, 6, 2, mem_format_input, mem_format_weight, permute_data) + + def test_conv_transpose_2d_strided(self): + def helper(m_cpu, memory_format): + m_mps = copy.deepcopy(m_cpu).requires_grad_() + m_mps.weight.data = m_cpu.weight.data.detach().clone().to("mps").requires_grad_() + m_mps.bias.data = m_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + input_cpu = torch.randn(20, 16, 50, 100).to(memory_format=memory_format).requires_grad_() + input_mps = input_cpu.detach().clone().to("mps") + + output_cpu = m_cpu(input_cpu) + output_mps = m_mps(input_mps) + self.assertEqual(output_cpu, output_mps) + + for mem_format_input in [torch.contiguous_format, torch.channels_last]: + # With square kernels and equal stride + helper(nn.ConvTranspose2d(16, 33, 3, stride=2).requires_grad_(), mem_format_input) + + # non-square kernels and unequal stride and with padding + helper(nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)).requires_grad_(), mem_format_input) + + def test_conv_transpose_2d_specified_output(self): + input_cpu = torch.randn(1, 16, 12, 12) + input_mps = input_cpu.detach().clone().to("mps") + + downsample_cpu = nn.Conv2d(16, 16, 3, stride=2, padding=1) + downsample_mps = nn.Conv2d(16, 16, 3, stride=2, padding=1, device="mps") + downsample_mps.weight.data = downsample_cpu.weight.data.detach().clone().to("mps").requires_grad_() + downsample_mps.bias.data = downsample_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + upsample_cpu = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1) + upsample_mps = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, device="mps") + upsample_mps.weight.data = upsample_cpu.weight.data.detach().clone().to("mps").requires_grad_() + upsample_mps.bias.data = upsample_cpu.bias.data.detach().clone().to("mps").requires_grad_() + + h_cpu = downsample_cpu(input_cpu) + h_mps = downsample_mps(input_mps) + self.assertEqual(h_cpu, h_mps) + + size_cpu = h_cpu.size() + size_mps = h_mps.size() + self.assertEqual(size_cpu, size_mps) + + output_cpu = upsample_cpu(h_cpu, output_size=input_cpu.size()) + output_mps = upsample_mps(h_mps, output_size=input_mps.size()) + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) def test_conv2d_single_stride(self): y_cpu = torch.randn(2, 2, 3, 
6) From be8817bc2669b0261a77c25a397962943290860b Mon Sep 17 00:00:00 2001 From: Ramin Azarmehr Date: Fri, 17 Feb 2023 15:42:50 -0500 Subject: [PATCH 25/30] Fix copy_cast_mps() on tensors with storage offset (#343) This should fix the failure with GPT2 when use_cache=True --- aten/src/ATen/native/mps/operations/Copy.mm | 7 +++++-- test/test_mps.py | 9 +++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index e4c673145adaae..94527cfd373fcc 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -251,8 +251,11 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { bool returnGatherOutput = dst_.is_contiguous(); Tensor src; auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); + const bool sameDataType = src_.dtype() == dst_.dtype(); - if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) { + if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) || + // the copy_cast path requires storage_offset to be applied before casting + (src_.storage_offset() && !sameDataType)) { Tensor emptyShell = Tensor(); src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell); @@ -282,7 +285,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { src._set_neg(src_.is_neg()); const size_t src_size = src.nbytes(); - if (src.dtype() == dst_.dtype()) { + if (sameDataType) { MPSStream* stream = getCurrentMPSStream(); // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset); diff --git a/test/test_mps.py b/test/test_mps.py index 3de06f2bd276af..bcaaf2a6bfc4e2 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -1799,6 +1799,15 @@ def test_slice_reshape(self): x_cpu = x_cpu + 2 self.assertEqual(x, x_cpu) + def test_slice_casting(self): + # generate random binary numbers + cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8) + mps_in = cpu_in.detach().clone().to("mps") + # check copy_cast(unit8 -> bool) on tensors with storage offset + cpu_out = cpu_in[:, :, 11 : 12, :12].to(torch.bool) + mps_out = mps_in[:, :, 11 : 12, :12].to(torch.bool) + self.assertEqual(cpu_out, mps_out) + def test_slice_reshape_contg_view(self): import torch From 8e371167f84eb5acf508993f0eed5fcff2291d80 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Sat, 18 Feb 2023 10:27:48 -0800 Subject: [PATCH 26/30] Enable int8 in TestConsistency (#347) --- test/test_mps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_mps.py b/test/test_mps.py index bcaaf2a6bfc4e2..8ddaaf42f765fc 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -8875,7 +8875,7 @@ def test_serialization_map_location(self): MPS_DTYPES = get_all_dtypes() -for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]: +for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]: del MPS_DTYPES[MPS_DTYPES.index(t)] abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()} From c30946a723d707df1c1fa4989a26420662fb48b3 Mon Sep 17 00:00:00 2001 From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com> Date: Tue, 21 Feb 2023 10:12:46 -0800 Subject: [PATCH 27/30] Convolution cleanup (#346) Co-authored-by: Ramin Azarmehr 
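
A sketch of the path this cleanup routes through the NHWC->NCHW
transpose (illustrative only, not part of the change itself; it assumes
an MPS-enabled build, and the shapes are arbitrary):

    import torch
    # conv2d backward where grad_output arrives channels-last; after this
    # cleanup the graph transposes it to NCHW for every channels-last case
    x = torch.randn(1, 8, 6, 6, device="mps", requires_grad=True)
    w = torch.randn(8, 8, 3, 3, device="mps", requires_grad=True)
    y = torch.nn.functional.conv2d(x, w, padding=1)
    g = torch.ones_like(y).to(memory_format=torch.channels_last)
    y.backward(g)  # drives mps_convolution_backward_input/_weights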
--- aten/src/ATen/native/mps/operations/Convolution.mm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 4bddbba917f5e6..935d31d425577b 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -341,7 +341,7 @@ Tensor mps_convolution_backward_input( MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; - if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { + if (is_channels_last) { gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); } MPSGraphTensor* gradInputTensor; @@ -483,7 +483,7 @@ Tensor mps_convolution_backward_weights( MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor; - if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) { + if (is_channels_last) { gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose); } From b520970694b63e42e457a929d1707087baf77d9f Mon Sep 17 00:00:00 2001 From: skotapati Date: Tue, 21 Feb 2023 14:22:48 -0800 Subject: [PATCH 28/30] Dev/skotapati/copy broadcasting (#350) * Handle broadcasting by expanding src tensor in Copy.mm * Unblock linalg_matrix_power * Improved formatting --- aten/src/ATen/native/mps/operations/Copy.mm | 11 ++++++++--- test/test_mps.py | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index 94527cfd373fcc..16f5718dd29c07 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -300,22 +300,27 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { TORCH_CHECK(dst.defined(), "dst is undefined"); TORCH_CHECK(src.defined(), "src is undefined"); + bool needs_broadcasting = false; + if (src.numel() == 0 || dst.is_same(src)) { return dst; } if (dst.numel() == 0) { dst.resize_as_(src); } + if (dst.dim() > src.dim()) { + needs_broadcasting = true; + } if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { - return copy_from_mps_(dst, src, non_blocking); + return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); } if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { - return copy_to_mps_(dst, src, non_blocking); + return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking); } if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { - return copy_kernel_mps(dst, src, non_blocking); + return copy_kernel_mps(dst, needs_broadcasting ? 
src.expand_as(dst) : src, non_blocking); } TORCH_INTERNAL_ASSERT( src.device().type() == DeviceType::MPS, diff --git a/test/test_mps.py b/test/test_mps.py index 8ddaaf42f765fc..e983760d0951c2 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -10200,7 +10200,6 @@ class TestConsistency(TestCaseMPS): # All the entries in this list should be removed BLOCKLIST = { # Functions that hard crash - 'linalg.matrix_power': [torch.float32], 'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8], 'resize_as_': [torch.float16, torch.float32], 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8], From 0473fe8ab477341b35cdbd9f2a794ac3af6b6ec9 Mon Sep 17 00:00:00 2001 From: Denis Vieriu Date: Wed, 8 Feb 2023 20:44:47 -0800 Subject: [PATCH 29/30] Execute convolution in NCHW if the suggested mem format is NHWC but the actual mem layout is NCHW --- .../ATen/native/mps/operations/Convolution.mm | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 935d31d425577b..66a16b75223584 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -83,6 +83,13 @@ Tensor _mps_convolution_impl( auto memory_format = input_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); + bool gather_input_data = true; + // Perform the convolution directly in NCHW if the tensor is already contiguous in memory + if (is_channels_last && input_t.is_contiguous(memory_format)) { + is_channels_last = false; + gather_input_data = false; + memory_format = MemoryFormat::Contiguous; + } auto output_t = at::empty( input_shape.has_value() ? 
input_shape.value() :
@@ -215,7 +222,7 @@ Tensor _mps_convolution_impl(
     cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
   }

-  auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
+  auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, inputShape, gather_input_data);
   auto weightsPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_t);
   auto biasPlaceholder = native_mps::Placeholder();
   // Reshape the bias to be broadcastable with output of conv2d
@@ -263,6 +270,12 @@ Tensor mps_convolution_backward_input(
   checkAllSameGPU(c, {grad_output, weight});
   auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
+  bool gather_input_data = true;
+  if (is_channels_last && grad_output_t.is_contiguous(memory_format)) {
+    is_channels_last = false;
+    gather_input_data = false;
+    memory_format = MemoryFormat::Contiguous;
+  }
   auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);

   // Avoid "grad_input" when this is being used as transposed convolution
@@ -373,7 +382,7 @@ Tensor mps_convolution_backward_input(
     cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
   }

-  auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
+  auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data);
   auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
   auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);

@@ -393,8 +406,16 @@ Tensor mps_convolution_backward_weights(
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_weights";
-  auto memory_format = grad_output_t.suggest_memory_format();
+  auto memory_format = input_.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
+  bool gather_input_data = true;
+  if (is_channels_last && input_t.is_contiguous(memory_format)) {
+    is_channels_last = false;
+    gather_input_data = false;
+    memory_format = MemoryFormat::Contiguous;
+  }
+  auto grad_output_t = grad_output_.to(memory_format);
+  auto input_t = input_.to(memory_format);

   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);

@@ -513,8 +534,8 @@ Tensor mps_convolution_backward_weights(
     cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
   }

-  auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-  auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
+  auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape, gather_input_data);
+  auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, nil, gather_input_data);
   auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);

   NSDictionary *feeds = @{

From 9a5b002439c4b200940d3f71148c2ae7af964519 Mon Sep 17 00:00:00 2001
From: Denis Vieriu
Date: Fri, 17 Feb 2023 18:38:19 -0800
Subject: [PATCH 30/30] Fix build failure

---
 aten/src/ATen/native/mps/operations/Convolution.mm | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 66a16b75223584..66c6eac098d8f2 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ 
b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -406,7 +406,7 @@ Tensor mps_convolution_backward_weights( namespace native_mps = at::native::mps; using namespace mps; CheckedFrom c = "mps_convolution_backward_weights"; - auto memory_format = input_.suggest_memory_format(); + auto memory_format = grad_output_t.suggest_memory_format(); bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast); bool gather_input_data = true; if (is_channels_last && input_t.is_contiguous(memory_format)) { @@ -414,8 +414,6 @@ Tensor mps_convolution_backward_weights( gather_input_data = false; memory_format = MemoryFormat::Contiguous; } - auto grad_output_t = grad_output_.to(memory_format); - auto input_t = input_.to(memory_format); MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);
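
Note on verifying the series locally: the convolution memory-format
handling that the last few patches negotiate can be smoke-tested with a
short script (an illustrative sketch only, not part of any patch; it
assumes an MPS-enabled macOS build and uses arbitrary shapes and loose
tolerances):

    import torch

    # forward + backward through Conv2d on CPU and MPS for both layouts,
    # comparing outputs, input gradients, and weight gradients
    for fmt in (torch.contiguous_format, torch.channels_last):
        x_cpu = torch.randn(2, 4, 8, 8).to(memory_format=fmt).requires_grad_()
        x_mps = x_cpu.detach().clone().to("mps").requires_grad_()
        conv_cpu = torch.nn.Conv2d(4, 4, 3, padding=1)
        conv_mps = torch.nn.Conv2d(4, 4, 3, padding=1).to("mps")
        conv_mps.load_state_dict({k: v.to("mps") for k, v in conv_cpu.state_dict().items()})
        y_cpu, y_mps = conv_cpu(x_cpu), conv_mps(x_mps)
        y_cpu.sum().backward()
        y_mps.sum().backward()
        assert torch.allclose(y_cpu, y_mps.cpu(), atol=1e-5)
        assert torch.allclose(x_cpu.grad, x_mps.grad.cpu(), atol=2e-4)
        assert torch.allclose(conv_cpu.weight.grad, conv_mps.weight.grad.cpu(), atol=2e-4)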