oxcaml · TheNumbat · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025
diff --git a/backend/amd64/emit.ml b/backend/amd64/emit.ml
@@ -387,6 +387,17 @@ let res16 i n = emit_subreg reg_low_16_name WORD i.res.(n)
 
 let res32 i n = emit_subreg reg_low_32_name DWORD i.res.(n)
 
+let narrow_to_xmm : X86_ast.arg -> X86_ast.arg = function (* Retag a YMM/ZMM register operand as the XMM operand carrying the same [r]. *)
+  | Regf (YMM r | ZMM r) -> Regf (XMM r) (* wide vector register: keep [r], switch the constructor to XMM *)
+  | ( Imm _ | Sym _ | Reg8L _ | Reg8H _ | Reg16 _ | Reg32 _ | Reg64 _
+    | Regf (XMM _)
+    | Mem _ | Mem64_RIP _ ) as res ->
+    res (* every other operand form, memory operands included, is returned unchanged *)
+
+let argX i n = narrow_to_xmm (reg i.arg.(n)) (* operand for the [n]-th argument of [i], passed through [narrow_to_xmm] *)
+
+let resX i n = narrow_to_xmm (reg i.res.(n)) (* operand for the [n]-th result of [i], passed through [narrow_to_xmm] *)
+
 (* Output an addressing mode *)
 
 let addressing addr typ i n =
@@ -1379,12 +1390,23 @@ let emit_atomic instr (op : Cmm.atomic_op) (size : Cmm.atomic_bitwidth) addr =
     I.xchg src dst
 
 let emit_reinterpret_cast (cast : Cmm.reinterpret_cast) i =
+  let open Amd64_simd_instrs in (* presumably where the [vmovupd_*] names used below are defined; confirm *)
   let distinct = not (Reg.same_loc i.arg.(0) i.res.(0)) in
   match cast with
   | Int_of_value | Value_of_int -> if distinct then I.mov (arg i 0) (res i 0)
   | Float_of_float32 | Float32_of_float ->
     if distinct then I.movss (arg i 0) (res i 0)
   | V128_of_v128 -> if distinct then I.movapd (arg i 0) (res i 0)
+  | V256_of_v256 -> (* emits nothing when argument and result already share a location *)
+    (* CR-soon mslater: align vec256/512 stack slots *)
+    if distinct
+    then
+      if Reg.is_stack i.arg.(0) (* pick the instruction form by whether the argument is a stack slot *)
+      then I.simd vmovupd_Y_Ym256 [| arg i 0; res i 0 |]
+      else I.simd vmovupd_Ym256_Y [| arg i 0; res i 0 |]
+  | V512_of_v512 -> (* no code is emitted for this cast: the branch calls [Misc.fatal_error] *)
+    (* CR-soon mslater: avx512 *)
+    Misc.fatal_error "avx512 instructions not yet implemented"
   | Float_of_int64 | Int64_of_float -> I.movq (arg i 0) (res i 0)
   | Float32_of_int32 -> I.movd (arg32 i 0) (res i 0)
   | Int32_of_float32 -> I.movd (arg i 0) (res32 i 0)
@@ -1398,32 +1420,46 @@ let emit_static_cast (cast : Cmm.static_cast) i =
   | Int_of_float Float32 -> I.cvttss2si (arg i 0) (res i 0)
   | Float_of_float32 -> I.cvtss2sd (arg i 0) (res i 0)
   | Float32_of_float -> I.cvtsd2ss (arg i 0) (res i 0)
-  | V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2 ->
-    if distinct then I.movsd (arg i 0) (res i 0)
-  | Scalar_of_v128 Int64x2 | V128_of_scalar Int64x2 ->
-    I.movq (arg i 0) (res i 0)
-  | Scalar_of_v128 Int32x4 -> I.movd (arg i 0) (res32 i 0)
-  | V128_of_scalar Int32x4 -> I.movd (arg32 i 0) (res i 0)
-  | V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4 ->
-    if distinct then I.movss (arg i 0) (res i 0)
-  | Scalar_of_v128 Int16x8 ->
+  | Scalar_of_v128 Float64x2 | Scalar_of_v256 Float64x4 ->
+    if distinct then I.movsd (argX i 0) (res i 0)
+  | V128_of_scalar Float64x2 | V256_of_scalar Float64x4 ->
+    if distinct then I.movsd (arg i 0) (resX i 0)
+  | Scalar_of_v128 Int64x2 | Scalar_of_v256 Int64x4 ->
+    I.movq (argX i 0) (res i 0)
+  | V128_of_scalar Int64x2 | V256_of_scalar Int64x4 ->
+    I.movq (arg i 0) (resX i 0)
+  | Scalar_of_v128 Int32x4 | Scalar_of_v256 Int32x8 ->
+    I.movd (argX i 0) (res32 i 0)
+  | V128_of_scalar Int32x4 | V256_of_scalar Int32x8 ->
+    I.movd (arg32 i 0) (resX i 0)
+  | Scalar_of_v128 Float32x4 | Scalar_of_v256 Float32x8 ->
+    if distinct then I.movss (argX i 0) (res i 0)
+  | V128_of_scalar Float32x4 | V256_of_scalar Float32x8 ->
+    if distinct then I.movss (arg i 0) (resX i 0)
+  | Scalar_of_v128 Int16x8 | Scalar_of_v256 Int16x16 ->
     (* [movw] and [movzx] cannot operate on vector registers. We must zero
        extend as the result is an untagged positive int. CR mslater: (SIMD)
        remove zx once we have unboxed int16 *)
-    I.movd (arg i 0) (res32 i 0);
+    I.movd (argX i 0) (res32 i 0);
     I.movzx (res16 i 0) (res i 0)
-  | Scalar_of_v128 Int8x16 ->
+  | Scalar_of_v128 Int8x16 | Scalar_of_v256 Int8x32 ->
     (* [movb] and [movzx] cannot operate on vector registers. We must zero
        extend as the result is an untagged positive int. CR mslater: (SIMD)
        remove zx once we have unboxed int8 *)
-    I.movd (arg i 0) (res32 i 0);
+    I.movd (argX i 0) (res32 i 0);
     I.movzx (res8 i 0) (res i 0)
-  | V128_of_scalar Int16x8 | V128_of_scalar Int8x16 ->
+  | V128_of_scalar Int16x8
+  | V128_of_scalar Int8x16
+  | V256_of_scalar Int16x16
+  | V256_of_scalar Int8x32 ->
     (* [movw] and [movb] cannot operate on vector registers. Moving 32 bits is
        OK because the argument is an untagged positive int and these operations
        leave the top bits of the vector unspecified. CR mslater: (SIMD) don't
        load 32 bits once we have unboxed int16/int8 *)
-    I.movd (arg32 i 0) (res i 0)
+    I.movd (arg32 i 0) (resX i 0)
+  | V512_of_scalar _ | Scalar_of_v512 _ ->
+    (* CR-soon mslater: avx512 *)
+    Misc.fatal_error "avx512 instructions not yet implemented"
 
 let assert_loc (loc : Simd.loc) arg =
   (match Reg.is_reg arg with

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
@@ -709,6 +709,12 @@ let precolored_regs () =
 
 let operation_supported = function
   | Cpopcnt -> Arch.Extension.enabled POPCNT
+  | Creinterpret_cast V256_of_v256
+  | Cstatic_cast (V256_of_scalar _ | Scalar_of_v256 _) ->
+    Arch.Extension.allow_vec256 ()
+  | Creinterpret_cast V512_of_v512
+  | Cstatic_cast (V512_of_scalar _ | Scalar_of_v512 _) ->
+    Arch.Extension.allow_vec512 ()
   | Cprefetch _ | Catomic _
   | Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
   | Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
@@ -720,11 +726,27 @@ let operation_supported = function
   | Cnegf _ | Cabsf _ | Caddf _ | Csubf _ | Cmulf _ | Cdivf _ | Cpackf32
   | Ccmpf _
   | Craise _
-  | Creinterpret_cast _ | Cstatic_cast _
   | Cprobe _ | Cprobe_is_enabled _ | Copaque | Cbeginregion | Cendregion
   | Ctuple_field _
   | Cdls_get
   | Cpoll
-    -> true
+  | Creinterpret_cast (Int_of_value | Value_of_int |
+                       Int64_of_float | Float_of_int64 |
+                       Float32_of_float | Float_of_float32 |
+                       Float32_of_int32 | Int32_of_float32 |
+                       V128_of_v128)
+  | Cstatic_cast (Float_of_float32 | Float32_of_float |
+                  Int_of_float Float32 | Float_of_int Float32 |
+                  Float_of_int Float64 | Int_of_float Float64 |
+                  V128_of_scalar _ | Scalar_of_v128 _) ->
+    true
+
+let expression_supported = function (* Reports whether a Cmm expression constructor is supported on this target. *)
+  | Cconst_int _ | Cconst_natint _ | Cconst_float32 _ | Cconst_float _
+  | Cconst_vec128 _ | Cconst_symbol _  | Cvar _ | Clet _ | Cphantom_let _
+  | Ctuple _ | Cop _ | Csequence _ | Cifthenelse _ | Cswitch _ | Ccatch _
+  | Cexit _ -> true (* every constructor listed above is accepted unconditionally *)
+  | Cconst_vec256 _ -> Arch.Extension.allow_vec256 () (* 256-bit constants: answer delegated to [allow_vec256] *)
+  | Cconst_vec512 _ -> Arch.Extension.allow_vec512 () (* 512-bit constants: answer delegated to [allow_vec512] *)
 
 let trap_size_in_bytes = 16
diff --git a/backend/amd64/regalloc_stack_operands.ml b/backend/amd64/regalloc_stack_operands.ml
@@ -208,17 +208,31 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
             | SSE Div_f32 ),
             _ ))) ->
     May_still_have_spilled_registers
-  | Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
+  | Op
+      (Reinterpret_cast
+        ( Float_of_float32 | Float32_of_float | V128_of_v128 | V256_of_v256
+        | V512_of_v512 ))
   | Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
-  | Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->
+  | Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4))
+  | Op (Static_cast (V256_of_scalar Float64x4 | Scalar_of_v256 Float64x4))
+  | Op (Static_cast (V256_of_scalar Float32x8 | Scalar_of_v256 Float32x8))
+  | Op (Static_cast (V512_of_scalar Float64x8 | Scalar_of_v512 Float64x8))
+  | Op (Static_cast (V512_of_scalar Float32x16 | Scalar_of_v512 Float32x16)) ->
     unary_operation_argument_or_result_on_stack map instr
   | Op (Reinterpret_cast (Float_of_int64 | Float32_of_int32))
-  | Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16))) ->
+  | Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16)))
+  | Op (Static_cast (V256_of_scalar (Int64x4 | Int32x8 | Int16x16 | Int8x32)))
+  | Op (Static_cast (V512_of_scalar (Int64x8 | Int32x16 | Int16x32 | Int8x64)))
+    ->
     may_use_stack_operand_for_only_argument map instr ~has_result:true
   | Op (Reinterpret_cast (Int64_of_float | Int32_of_float32))
-  | Op (Static_cast (Scalar_of_v128 (Int64x2 | Int32x4))) ->
+  | Op (Static_cast (Scalar_of_v128 (Int64x2 | Int32x4)))
+  | Op (Static_cast (Scalar_of_v256 (Int64x4 | Int32x8)))
+  | Op (Static_cast (Scalar_of_v512 (Int64x8 | Int32x16))) ->
     may_use_stack_operand_for_result map instr ~num_args:1
-  | Op (Static_cast (Scalar_of_v128 (Int16x8 | Int8x16))) ->
+  | Op (Static_cast (Scalar_of_v128 (Int16x8 | Int8x16)))
+  | Op (Static_cast (Scalar_of_v256 (Int16x16 | Int8x32)))
+  | Op (Static_cast (Scalar_of_v512 (Int16x32 | Int8x64))) ->
     (* CR mslater: (SIMD) replace once we have unboxed int16/int8 *)
     May_still_have_spilled_registers
   | Op

diff --git a/backend/arm64/emit.ml b/backend/arm64/emit.ml
@@ -1267,6 +1267,8 @@ module BR = Branch_relaxation.Make (struct
           | Int32_of_float32 )) ->
       1
     | Lop (Reinterpret_cast V128_of_v128) -> 1
+    | Lop (Reinterpret_cast (V256_of_v256 | V512_of_v512)) ->
+      Misc.fatal_error "arm64: got 256/512 bit vector"
     | Lop (Static_cast (Float_of_int Float64 | Int_of_float Float64)) -> 1
     | Lop
         (Static_cast
@@ -1280,6 +1282,11 @@ module BR = Branch_relaxation.Make (struct
           (Scalar_of_v128 (Int32x4 | Int64x2 | Float32x4 | Float64x2))) ->
       1
     | Lop (Static_cast (V128_of_scalar _)) -> 1
+    | Lop
+        (Static_cast
+          ( V256_of_scalar _ | Scalar_of_v256 _ | V512_of_scalar _
+          | Scalar_of_v512 _ )) ->
+      Misc.fatal_error "arm64: got 256/512 bit vector"
     | Lop (Floatop (Float64, (Iaddf | Isubf | Imulf | Idivf))) -> 1
     | Lop (Floatop (Float32, (Iaddf | Isubf | Imulf | Idivf))) -> 1
     | Lop (Specific Inegmulf) -> 1
@@ -1565,6 +1572,8 @@ let emit_reinterpret_cast (cast : Cmm.reinterpret_cast) i =
       DSL.check_reg Vec128 src;
       DSL.check_reg Vec128 dst;
       DSL.ins I.MOV [| DSL.emit_reg_v16b dst; DSL.emit_reg_v16b src |])
+  | V256_of_v256 | V512_of_v512 ->
+    Misc.fatal_error "arm64: got 256/512 bit vector"
   | Int_of_value | Value_of_int -> move src dst
 
 let emit_static_cast (cast : Cmm.static_cast) i =
@@ -1630,6 +1639,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
       then (
         DSL.check_reg Float src;
         DSL.ins I.FMOV [| DSL.emit_reg_d dst; DSL.emit_reg src |]))
+  | V256_of_scalar _ | Scalar_of_v256 _ | V512_of_scalar _ | Scalar_of_v512 _ ->
+    Misc.fatal_error "arm64: got 256/512 bit vector"
 
 (* Output the assembly code for an instruction *)
 

diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
@@ -330,14 +330,17 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
         | Move | Spill | Reload
         | Floatop _
         | Csel _
-        | Reinterpret_cast _ | Const_int _
+        | Const_int _
         | Const_float32 _ | Const_float _
         | Const_symbol _ | Const_vec128 _
         | Stackoffset _
         | Intop_imm _ | Intop_atomic _
         | Name_for_debugger _ | Probe_is_enabled _ | Opaque
         | Begin_region | End_region | Dls_get)
   | Poptrap _ | Prologue
+  | Op (Reinterpret_cast (Int_of_value | Value_of_int | Float_of_float32 |
+                          Float32_of_float | Float_of_int64 | Int64_of_float |
+                          Float32_of_int32 | Int32_of_float32 | V128_of_v128))
     -> [||]
   | Stack_check _ -> assert false (* not supported *)
   | Op (Const_vec256 _ | Const_vec512 _)
@@ -349,6 +352,9 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
           ((Twofiftysix_aligned|Twofiftysix_unaligned|
             Fivetwelve_aligned|Fivetwelve_unaligned),
             _, _))
+  | Op (Reinterpret_cast (V256_of_v256 | V512_of_v512))
+  | Op (Static_cast (V256_of_scalar _ | Scalar_of_v256 _ |
+                     V512_of_scalar _ | Scalar_of_v512 _))
     -> Misc.fatal_error "arm64: got 256/512 bit vector"
 
 (* note: keep this function in sync with `is_destruction_point` below. *)
@@ -442,34 +448,45 @@ let assemble_file infile outfile =
                  " -o " ^ Filename.quote outfile ^ " " ^ Filename.quote infile)
 
 let operation_supported : Cmm.operation -> bool = function
-  | Cprefetch _ | Catomic _ -> false
+  | Cprefetch _ | Catomic _
+  | Creinterpret_cast (V256_of_v256 | V512_of_v512) (* 256/512-bit reinterpret casts share the [false] arm *)
+  | Cstatic_cast (V256_of_scalar _ | Scalar_of_v256 _ |
+                  V512_of_scalar _ | Scalar_of_v512 _) ->
+    false (* prefetch, atomics and every 256/512-bit vector cast are reported as unsupported *)
   | Cpopcnt
   | Cnegf Float32 | Cabsf Float32 | Caddf Float32
   | Csubf Float32 | Cmulf Float32 | Cdivf Float32
   | Cpackf32
-  | Creinterpret_cast (Float32_of_float | Float_of_float32 |
-                       Float32_of_int32 | Int32_of_float32 |
-                       V128_of_v128)
-  | Cstatic_cast (Float_of_float32 | Float32_of_float |
-                  Int_of_float Float32 | Float_of_int Float32 |
-                  V128_of_scalar _ | Scalar_of_v128 _)
   | Cclz _ | Cctz _ | Cbswap _
   | Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
   | Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
   | Cand | Cor | Cxor | Clsl | Clsr | Casr
   | Ccmpi _ | Caddv | Cadda | Ccmpa _
   | Cnegf Float64 | Cabsf Float64 | Caddf Float64
   | Csubf Float64 | Cmulf Float64 | Cdivf Float64
-  | Creinterpret_cast (Int_of_value | Value_of_int |
-                       Int64_of_float | Float_of_int64)
-  | Cstatic_cast (Float_of_int Float64 | Int_of_float Float64)
   | Ccmpf _
   | Ccsel _
   | Craise _
   | Cprobe _ | Cprobe_is_enabled _ | Copaque
   | Cbeginregion | Cendregion | Ctuple_field _
   | Cdls_get
   | Cpoll
-    -> true
+  | Creinterpret_cast (Int_of_value | Value_of_int | (* the two removed cast groups above are merged into this list *)
+                       Int64_of_float | Float_of_int64 |
+                       Float32_of_float | Float_of_float32 |
+                       Float32_of_int32 | Int32_of_float32 |
+                       V128_of_v128)
+  | Cstatic_cast (Float_of_float32 | Float32_of_float |
+                  Int_of_float Float32 | Float_of_int Float32 |
+                  Float_of_int Float64 | Int_of_float Float64 |
+                  V128_of_scalar _ | Scalar_of_v128 _) ->
+    true (* all remaining operations, including scalar and 128-bit casts, are reported as supported *)
+
+let expression_supported : Cmm.expression -> bool = function (* Reports whether a Cmm expression constructor is supported on this target. *)
+  | Cconst_int _ | Cconst_natint _ | Cconst_float32 _ | Cconst_float _
+  | Cconst_vec128 _ | Cconst_symbol _  | Cvar _ | Clet _ | Cphantom_let _
+  | Ctuple _ | Cop _ | Csequence _ | Cifthenelse _ | Cswitch _ | Ccatch _
+  | Cexit _ -> true (* every constructor listed above is accepted unconditionally *)
+  | Cconst_vec256 _ | Cconst_vec512 _ -> false (* 256/512-bit vector constants are always rejected here *)
 
 let trap_size_in_bytes = 16
diff --git a/backend/cmm.ml b/backend/cmm.ml
@@ -324,6 +324,8 @@ type reinterpret_cast =
   | Float32_of_int32
   | Int32_of_float32
   | V128_of_v128
+  | V256_of_v256
+  | V512_of_v512
 
 type static_cast =
   | Float_of_int of float_width
@@ -332,6 +334,10 @@ type static_cast =
   | Float32_of_float
   | V128_of_scalar of vec128_type
   | Scalar_of_v128 of vec128_type
+  | V256_of_scalar of vec256_type
+  | Scalar_of_v256 of vec256_type
+  | V512_of_scalar of vec512_type
+  | Scalar_of_v512 of vec512_type
 
 module Alloc_mode = struct
   type t =
@@ -776,6 +782,27 @@ let equal_vec128_type v1 v2 =
   | Float64x2, Float64x2 -> true
   | (Int8x16 | Int16x8 | Int32x4 | Int64x2 | Float32x4 | Float64x2), _ -> false
 
+let equal_vec256_type v1 v2 = (* [true] exactly when [v1] and [v2] are the same vec256 constructor *)
+  match v1, v2 with
+  | Int8x32, Int8x32 -> true
+  | Int16x16, Int16x16 -> true
+  | Int32x8, Int32x8 -> true
+  | Int64x4, Int64x4 -> true
+  | Float32x8, Float32x8 -> true
+  | Float64x4, Float64x4 -> true
+  | (Int8x32 | Int16x16 | Int32x8 | Int64x4 | Float32x8 | Float64x4), _ -> false (* mismatched pair; the left side is enumerated rather than wildcarded *)
+
+let equal_vec512_type v1 v2 = (* [true] exactly when [v1] and [v2] are the same vec512 constructor *)
+  match v1, v2 with
+  | Int8x64, Int8x64 -> true
+  | Int16x32, Int16x32 -> true
+  | Int32x16, Int32x16 -> true
+  | Int64x8, Int64x8 -> true
+  | Float32x16, Float32x16 -> true
+  | Float64x8, Float64x8 -> true
+  | (Int8x64 | Int16x32 | Int32x16 | Int64x8 | Float32x16 | Float64x8), _ ->
+    false (* mismatched pair; the left side is enumerated rather than wildcarded *)
+
 let equal_float_width left right =
   match left, right with
   | Float64, Float64 -> true
@@ -794,9 +821,11 @@ let equal_reinterpret_cast (left : reinterpret_cast) (right : reinterpret_cast)
   | Float32_of_int32, Float32_of_int32 -> true
   | Int32_of_float32, Int32_of_float32 -> true
   | V128_of_v128, V128_of_v128 -> true
+  | V256_of_v256, V256_of_v256 -> true
+  | V512_of_v512, V512_of_v512 -> true
   | ( ( Int_of_value | Value_of_int | Float_of_float32 | Float32_of_float
       | Float_of_int64 | Int64_of_float | Float32_of_int32 | Int32_of_float32
-      | V128_of_v128 ),
+      | V128_of_v128 | V256_of_v256 | V512_of_v512 ),
       _ ) ->
     false
 
@@ -808,8 +837,13 @@ let equal_static_cast (left : static_cast) (right : static_cast) =
   | Int_of_float f1, Int_of_float f2 -> equal_float_width f1 f2
   | Scalar_of_v128 v1, Scalar_of_v128 v2 -> equal_vec128_type v1 v2
   | V128_of_scalar v1, V128_of_scalar v2 -> equal_vec128_type v1 v2
+  | Scalar_of_v256 v1, Scalar_of_v256 v2 -> equal_vec256_type v1 v2
+  | V256_of_scalar v1, V256_of_scalar v2 -> equal_vec256_type v1 v2
+  | Scalar_of_v512 v1, Scalar_of_v512 v2 -> equal_vec512_type v1 v2
+  | V512_of_scalar v1, V512_of_scalar v2 -> equal_vec512_type v1 v2
   | ( ( Float32_of_float | Float_of_float32 | Float_of_int _ | Int_of_float _
-      | Scalar_of_v128 _ | V128_of_scalar _ ),
+      | Scalar_of_v128 _ | V128_of_scalar _ | Scalar_of_v256 _
+      | V256_of_scalar _ | Scalar_of_v512 _ | V512_of_scalar _ ),
       _ ) ->
     false