halide · abadams · Nov 19, 2021 · Nov 19, 2021 · Nov 19, 2021
diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp
@@ -2306,7 +2306,7 @@ void CodeGen_C::visit(const Call *op) {
         }
     } else if (op->is_intrinsic(Call::lerp)) {
         internal_assert(op->args.size() == 3);
-        Expr e = lower_lerp(op->args[0], op->args[1], op->args[2]);
+        Expr e = lower_lerp(op->args[0], op->args[1], op->args[2], target);
         rhs << print_expr(e);
     } else if (op->is_intrinsic(Call::absd)) {
         internal_assert(op->args.size() == 2);

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -2697,7 +2697,8 @@ void CodeGen_LLVM::visit(const Call *op) {
         Type wt = upgrade_type_for_arithmetic(op->args[2].type());
         Expr e = lower_lerp(cast(t, op->args[0]),
                             cast(t, op->args[1]),
-                            cast(wt, op->args[2]));
+                            cast(wt, op->args[2]),
+                            target);
         e = cast(op->type, e);
         codegen(e);
     } else if (op->is_intrinsic(Call::popcount)) {

diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp
@@ -991,7 +991,7 @@ class OptimizePatterns : public IRMutator {
             // We need to lower lerps now to optimize the arithmetic
             // that they generate.
             internal_assert(op->args.size() == 3);
-            return mutate(lower_lerp(op->args[0], op->args[1], op->args[2]));
+            return mutate(lower_lerp(op->args[0], op->args[1], op->args[2], target));
         } else if ((op->is_intrinsic(Call::div_round_to_zero) ||
                     op->is_intrinsic(Call::mod_round_to_zero)) &&
                    !op->type.is_float() && op->type.is_vector()) {

diff --git a/src/Lerp.cpp b/src/Lerp.cpp
@@ -6,11 +6,12 @@
 #include "IROperator.h"
 #include "Lerp.h"
 #include "Simplify.h"
+#include "Target.h"
 
 namespace Halide {
 namespace Internal {
 
-Expr lower_lerp(Expr zero_val, Expr one_val, const Expr &weight) {
+Expr lower_lerp(Expr zero_val, Expr one_val, const Expr &weight, const Target &target) {
 
     Expr result;
 
@@ -134,13 +135,25 @@ Expr lower_lerp(Expr zero_val, Expr one_val, const Expr &weight) {
             case 8:
             case 16:
             case 32: {
-                Expr shift = Cast::make(UInt(2 * bits), bits);
-                Expr prod_sum = widening_mul(zero_val, inverse_typed_weight) + widening_mul(one_val, typed_weight);
-                // Computes x / (2 ** N - 1) as (x / 2 ** N + x) / 2 ** N.
-                // TODO: on x86 it's actually one instruction cheaper to do the division directly.
-                Expr divided = rounding_shift_right(rounding_shift_right(prod_sum, shift) + prod_sum, shift);
-
-                result = Cast::make(UInt(bits, computation_type.lanes()), divided);
+                Expr prod_sum = (widening_mul(zero_val, inverse_typed_weight) +
+                                 widening_mul(one_val, typed_weight));
+                // Now we need to do a rounding divide and narrow. For
+                // 8-bit, this rounding divide looks like (x + 127) /
+                // 255. On most platforms we can compute this as
+                // ((x + 128) / 256 + x + 128) / 256. Note that
+                // overflow is impossible here because the most our
+                // prod_sum can be is 255^2.
+                if (target.arch == Target::X86) {
+                    // On x86 we have no rounding shifts but we do
+                    // have a multiply-keep-high-half. So it's
+                    // actually one instruction cheaper to do the
+                    // division directly.
+                    Expr divisor = cast(UInt(bits), -1);
+                    result = (prod_sum + divisor / 2) / divisor;
+                } else {
+                    result = rounding_shift_right(rounding_shift_right(prod_sum, bits) + prod_sum, bits);
+                }
+                result = Cast::make(UInt(bits, computation_type.lanes()), result);
                 break;
             }
             case 64:

diff --git a/src/Lerp.h b/src/Lerp.h
@@ -8,11 +8,14 @@
 #include "Expr.h"
 
 namespace Halide {
+
+struct Target;
+
 namespace Internal {
 
 /** Build Halide IR that computes a lerp. Use by codegen targets that
  * don't have a native lerp. */
-Expr lower_lerp(Expr zero_val, Expr one_val, const Expr &weight);
+Expr lower_lerp(Expr zero_val, Expr one_val, const Expr &weight, const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide