Introduce Apple BNNS backend by apeskov · Pull Request #7299 · apache/tvm · GitHub

Introduce Apple BNNS backend #7299


Merged
merged 27 commits on Mar 12, 2021
Changes from all commits
27 commits
757c82c
Introduce Apple BNNS backend
apeskov Jan 16, 2021
3552b8a
[BNNS] Add conv2d DW test
apeskov Jan 18, 2021
12e6f3e
[BNNS] Fix clang-format issues
apeskov Jan 18, 2021
a9a9f10
[BNNS] Refactoring. Add TView abstraction
apeskov Jan 19, 2021
ca72944
[BNNS] Add several more onnx topologies into tests
apeskov Jan 19, 2021
c3bf919
[BNNS] Avoid redundant tensor allocation
apeskov Jan 25, 2021
d54576d
[BNNS] Fix conv_splitter issue
apeskov Jan 27, 2021
331d8e0
[BNNS] Fix isse with bias {1,1,1,1}
apeskov Jan 27, 2021
d31063e
[BNNS] Min. Rename file
apeskov Jan 27, 2021
d67f6b7
Fix review comments. Initial
apeskov Jan 27, 2021
7da6a26
[BNNS] test refactoring
apeskov Jan 29, 2021
6aecc42
[BNNS] Fix cpplint issues
apeskov Jan 29, 2021
4cdacf1
[BNNS] Fix clang-format issues
apeskov Jan 29, 2021
1060b98
Fix python format
apeskov Feb 1, 2021
7203493
Fix pylint issues
apeskov Feb 2, 2021
83b7be3
[BNNS] Fix pylint. Second attempt
apeskov Feb 8, 2021
7ede113
[BNNS] Add integration documentation
elvin-n Jan 25, 2021
1747896
Check onnx import before use
apeskov Feb 24, 2021
1a39265
[BNNS] Add instance normalization operator
echuraev Feb 15, 2021
b70a89b
Add fusing sigmoid activation after conv2d
echuraev Feb 20, 2021
9d5945a
min changes
apeskov Feb 24, 2021
30e2c0f
Add pooling operations to BNNS runtime
echuraev Feb 24, 2021
c7f3705
Fix lint
echuraev Feb 25, 2021
d444846
Fix lint
echuraev Feb 25, 2021
81479db
Apply comments
echuraev Mar 4, 2021
eaffeab
Fix documentation
echuraev Mar 6, 2021
0103e70
Fix comment to refer to BNNS
echuraev Mar 10, 2021
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -74,6 +74,7 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF)
tvm_option(USE_TFLITE "Build with tflite support" OFF)
tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
tvm_option(USE_COREML "Build with coreml support" OFF)
tvm_option(USE_BNNS "Build with BNNS support" OFF)
tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
@@ -348,6 +349,7 @@ include(cmake/modules/contrib/HybridDump.cmake)
include(cmake/modules/contrib/TFLite.cmake)
include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
include(cmake/modules/contrib/CoreML.cmake)
include(cmake/modules/contrib/BNNS.cmake)
include(cmake/modules/contrib/ONNX.cmake)
include(cmake/modules/contrib/ArmComputeLib.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
3 changes: 3 additions & 0 deletions cmake/config.cmake
@@ -269,3 +269,6 @@ set(USE_HEXAGON_SDK /path/to/sdk)

# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)

# Whether enable BNNS runtime
set(USE_BNNS OFF)
30 changes: 30 additions & 0 deletions cmake/modules/contrib/BNNS.cmake
@@ -0,0 +1,30 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

if(USE_BNNS STREQUAL "ON")
add_definitions(-DUSE_JSON_RUNTIME=1)
file(GLOB BNNS_RELAY_CONTRIB_SRC src/relay/backend/contrib/bnns/*.cc)
list(APPEND COMPILER_SRCS ${BNNS_RELAY_CONTRIB_SRC})
list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC})

list(APPEND TVM_RUNTIME_LINKER_LIBS "-framework Accelerate")

file(GLOB BNNS_CONTRIB_SRC src/runtime/contrib/bnns/*.cc)
list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC})
message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS})
endif()

183 changes: 183 additions & 0 deletions docs/deploy/bnns.rst
@@ -0,0 +1,183 @@
.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.

Relay BNNS Integration
======================
**Author**: `Egor Churaev <https://github.com/echuraev>`_

Introduction
------------

Apple BNNS library is a collection of functions that can be used to construct neural networks
for inference (and training). It is supported on macOS, iOS, tvOS, and watchOS. BNNS provides
primitives that execute on all CPUs supported on those platforms and are optimized for high performance
and low energy consumption. This integration offloads as many operators as possible from Relay to BNNS.

The BNNS runtime is a part of the platform API and is available on all modern Apple operating systems.
An application using BNNS does not depend on any additional external libraries.

BNNS functions use private Apple hardware capabilities that are not otherwise exposed by Apple. An example
of such a capability is the AMX Apple CPU extension.

This guide demonstrates how to build TVM with BNNS codegen and runtime enabled. It also provides example
code to compile and run models using the BNNS runtime. Finally, we document the supported operators.

Building TVM with BNNS support
------------------------------

To enable TVM BNNS codegen and the TVM BNNS runtime you only need to turn on the USE_BNNS flag:

* USE_BNNS=ON/OFF - This flag enables compiling a network with subgraphs offloaded to BNNS primitives
  and links the tvm library to the BNNS runtime module.

Enabling this flag makes the build search for the default Accelerate Framework in the current target SDK.
The minimal required SDK versions are macOS 11.0, iOS 14.0, tvOS 14.0 and watchOS 7.0.

Example setting in the config.cmake file:

.. code:: cmake

set(USE_BNNS ON)

BNNS partitioning of Relay graph
--------------------------------

Operations to be offloaded to BNNS must be annotated before the module is passed for compilation.
All ops annotated by `partition_for_bnns` will be offloaded to BNNS; the rest of the ops
will go through the regular LLVM compilation and code generation.

Important note: BNNS supports primitives only with constant weights. To satisfy this requirement we have
to map constants to the corresponding tensor abstractions in the Relay representation. To freeze tensors and operate
with them as constants, you may need to call the ONNX importer with the special flag "freeze_params=True"
or perform the binding manually. In general, Relay importers don't do that by default.
For your convenience, "partition_for_bnns" can do this for you if the params dictionary is passed as an argument.

.. code:: python

from tvm.relay.op.contrib.bnns import partition_for_bnns
model = partition_for_bnns(model, params=params)
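
For reference, a minimal sketch of importing an ONNX model with frozen parameters is shown below.
The file name "model.onnx" and the input name "data" are placeholders used only for illustration.

.. code:: python

import onnx
from tvm import relay

# Hypothetical model file and input shape, used only for illustration.
onnx_model = onnx.load("model.onnx")
shape_dict = {"data": (1, 3, 224, 224)}

# freeze_params=True binds the model weights as Relay constants,
# which the BNNS primitives require.
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)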


Input data layout for operations to be offloaded to BNNS execution
------------------------------------------------------------------

BNNS kernels support only the planar format of input data. The partitioner requires NCHW input
layout for conv2d input.

To use the BNNS integration for models with an interleaved input layout, the models should be converted before
the module is passed to `partition_for_bnns`. The layout conversion happens only for explicitly
enumerated types of ops. Depending on the topology, there might be regular data reorders
around conv2d between interleaved and planar layouts; these show up as performance penalties and affect
execution time. It is recommended to analyze the whole topology and extend the list below to convert all
intermediate tensors to the NCHW data layout.

Example of input layout change:

.. code:: python

# For models with NHWC input layout
with tvm.transform.PassContext(opt_level=3):
    mod = relay.transform.InferType()(mod)
    mod = relay.transform.ConvertLayout(
        {
            "nn.conv2d": ["NCHW", "default"],
            "nn.bias_add": ["NCHW", "default"],
            "nn.relu": ["NCHW"],
        }
    )(mod)


Example: Build and Deploy Mobilenet v2 1.0 with BNNS
----------------------------------------------------

Create a Relay graph from an MXNet Mobilenet v2 1.0 model.

.. code:: python

import tvm
from tvm import relay
import mxnet
from mxnet.gluon.model_zoo.vision import get_model

dtype = "float32"
input_shape = (1, 3, 224, 224)
block = get_model('mobilenetv2_1.0', pretrained=True)
module, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)


Mark up the parts of the graph to be offloaded to BNNS primitives. All ops supported by the BNNS
integration will be handled by BNNS invocations; the rest of the ops will go through the
regular TVM LLVM compilation and code generation.

After that you need to compile the new module with a target corresponding to the required Apple platform:

.. code:: python

from tvm.relay.op.contrib.bnns import partition_for_bnns

# target for macOS Big Sur 11.1:
target = "llvm -mtriple=x86_64-apple-darwin20.2.0"

model = partition_for_bnns(module, params=params)  # mark up operations to be offloaded to BNNS
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(model, target=target, target_host=target, params=params)
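
If you want to check how much of the model was actually offloaded, a small helper such as the
hypothetical one below can count the Relay functions marked for the BNNS codegen (external
functions produced by partitioning carry a "Compiler" attribute):

.. code:: python

# Hypothetical helper, not part of the TVM API: counts subgraphs
# annotated for the BNNS external codegen after partition_for_bnns.
def count_bnns_subgraphs(mod):
    count = 0
    for gvar in mod.get_global_vars():
        func = mod[gvar]
        if func.attrs and "Compiler" in func.attrs.keys() and func.attrs["Compiler"] == "bnns":
            count += 1
    return count

print("Subgraphs offloaded to BNNS:", count_bnns_subgraphs(model))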

Export the module.

.. code:: python

lib.export_library('compiled.dylib')


Load the module and run inference on the target machine, using a TVM runtime built with ``USE_BNNS`` enabled:

.. code:: python

import tvm
import numpy as np
from tvm.contrib import graph_runtime

ctx = tvm.cpu(0)
loaded_lib = tvm.runtime.load_module('compiled.dylib')
gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))

dtype = "float32"
input_shape = (1, 3, 224, 224)
input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
gen_module.run(data=input_data)
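
To read the result back, one possible follow-up, assuming a single-output classification model, is:

.. code:: python

# Output index 0 is assumed to be the only output of the model.
tvm_output = gen_module.get_output(0).asnumpy()
top1 = np.argmax(tvm_output[0])
print("Top-1 class id:", top1)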



Operator support
----------------

+------------------------+------------------------------------------------------------------------------+
| Relay Node | Remarks |
+========================+==============================================================================+
| nn.conv2d | |
+------------------------+------------------------------------------------------------------------------+
| nn.batch_norm | Supported by BNNS integration only in nn.conv2d-batch_norm pattern |
+------------------------+------------------------------------------------------------------------------+
| nn.dense | |
+------------------------+------------------------------------------------------------------------------+
| nn.batch_matmul | |
+------------------------+------------------------------------------------------------------------------+
| nn.bias_add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense |
| | fusion |
+------------------------+------------------------------------------------------------------------------+
| add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense fusion |
+------------------------+------------------------------------------------------------------------------+
| nn.relu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
+------------------------+------------------------------------------------------------------------------+
| nn.gelu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
+------------------------+------------------------------------------------------------------------------+
1 change: 1 addition & 0 deletions docs/deploy/index.rst
@@ -71,3 +71,4 @@ target device without relying on RPC. see the following resources on how to do s
arm_compute_lib
tensorrt
vitis_ai
bnns
5 changes: 5 additions & 0 deletions python/tvm/driver/tvmc/composite_target.py
@@ -21,6 +21,7 @@

from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
from tvm.relay.op.contrib.ethosn import partition_for_ethosn
from tvm.relay.op.contrib.bnns import partition_for_bnns

from .common import TVMCException

@@ -40,6 +41,10 @@
"config_key": "relay.ext.ethos-n.options",
"pass_pipeline": partition_for_ethosn,
},
"bnns": {
"config_key": None,
"pass_pipeline": partition_for_bnns,
},
}


1 change: 1 addition & 0 deletions python/tvm/relay/op/contrib/__init__.py
@@ -20,6 +20,7 @@

from .arm_compute_lib import *
from .dnnl import *
from .bnns import *
from .coreml import *
from .ethosn import *
from .tensorrt import *