Vec256 casts & constants by TheNumbat · Pull Request #4201 · oxcaml/oxcaml · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Vec256 casts & constants #4201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 50 additions & 14 deletions backend/amd64/emit.ml
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,17 @@ let res16 i n = emit_subreg reg_low_16_name WORD i.res.(n)

let res32 i n = emit_subreg reg_low_32_name DWORD i.res.(n)

(* [narrow_to_xmm operand] rewrites a [YMM] or [ZMM] vector-register operand
   as the [XMM] operand carrying the same register payload. Every other kind
   of operand (immediates, symbols, integer registers, [XMM] registers and
   memory operands) is returned unchanged. *)
let narrow_to_xmm (operand : X86_ast.arg) : X86_ast.arg =
  match operand with
  | Regf (YMM idx) | Regf (ZMM idx) -> Regf (XMM idx)
  | Regf (XMM _)
  | Imm _ | Sym _
  | Reg8L _ | Reg8H _ | Reg16 _ | Reg32 _ | Reg64 _
  | Mem _ | Mem64_RIP _ ->
    operand

(* Operand for argument [n] of instruction [i], built with [reg] and then
   passed through [narrow_to_xmm]. *)
let argX i n = i.arg.(n) |> reg |> narrow_to_xmm

(* Operand for result [n] of instruction [i], built with [reg] and then
   passed through [narrow_to_xmm]. *)
let resX i n = i.res.(n) |> reg |> narrow_to_xmm

(* Output an addressing mode *)

let addressing addr typ i n =
Expand Down Expand Up @@ -1379,12 +1390,23 @@ let emit_atomic instr (op : Cmm.atomic_op) (size : Cmm.atomic_bitwidth) addr =
I.xchg src dst

let emit_reinterpret_cast (cast : Cmm.reinterpret_cast) i =
let open Amd64_simd_instrs in
let distinct = not (Reg.same_loc i.arg.(0) i.res.(0)) in
match cast with
| Int_of_value | Value_of_int -> if distinct then I.mov (arg i 0) (res i 0)
| Float_of_float32 | Float32_of_float ->
if distinct then I.movss (arg i 0) (res i 0)
| V128_of_v128 -> if distinct then I.movapd (arg i 0) (res i 0)
| V256_of_v256 ->
(* CR-soon mslater: align vec256/512 stack slots *)
if distinct
then
if Reg.is_stack i.arg.(0)
then I.simd vmovupd_Y_Ym256 [| arg i 0; res i 0 |]
else I.simd vmovupd_Ym256_Y [| arg i 0; res i 0 |]
| V512_of_v512 ->
(* CR-soon mslater: avx512 *)
Misc.fatal_error "avx512 instructions not yet implemented"
| Float_of_int64 | Int64_of_float -> I.movq (arg i 0) (res i 0)
| Float32_of_int32 -> I.movd (arg32 i 0) (res i 0)
| Int32_of_float32 -> I.movd (arg i 0) (res32 i 0)
Expand All @@ -1398,32 +1420,46 @@ let emit_static_cast (cast : Cmm.static_cast) i =
| Int_of_float Float32 -> I.cvttss2si (arg i 0) (res i 0)
| Float_of_float32 -> I.cvtss2sd (arg i 0) (res i 0)
| Float32_of_float -> I.cvtsd2ss (arg i 0) (res i 0)
| V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2 ->
if distinct then I.movsd (arg i 0) (res i 0)
| Scalar_of_v128 Int64x2 | V128_of_scalar Int64x2 ->
I.movq (arg i 0) (res i 0)
| Scalar_of_v128 Int32x4 -> I.movd (arg i 0) (res32 i 0)
| V128_of_scalar Int32x4 -> I.movd (arg32 i 0) (res i 0)
| V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4 ->
if distinct then I.movss (arg i 0) (res i 0)
| Scalar_of_v128 Int16x8 ->
| Scalar_of_v128 Float64x2 | Scalar_of_v256 Float64x4 ->
if distinct then I.movsd (argX i 0) (res i 0)
| V128_of_scalar Float64x2 | V256_of_scalar Float64x4 ->
if distinct then I.movsd (arg i 0) (resX i 0)
| Scalar_of_v128 Int64x2 | Scalar_of_v256 Int64x4 ->
I.movq (argX i 0) (res i 0)
| V128_of_scalar Int64x2 | V256_of_scalar Int64x4 ->
I.movq (arg i 0) (resX i 0)
| Scalar_of_v128 Int32x4 | Scalar_of_v256 Int32x8 ->
I.movd (argX i 0) (res32 i 0)
| V128_of_scalar Int32x4 | V256_of_scalar Int32x8 ->
I.movd (arg32 i 0) (resX i 0)
| Scalar_of_v128 Float32x4 | Scalar_of_v256 Float32x8 ->
if distinct then I.movss (argX i 0) (res i 0)
| V128_of_scalar Float32x4 | V256_of_scalar Float32x8 ->
if distinct then I.movss (arg i 0) (resX i 0)
| Scalar_of_v128 Int16x8 | Scalar_of_v256 Int16x16 ->
(* [movw] and [movzx] cannot operate on vector registers. We must zero
extend as the result is an untagged positive int. CR mslater: (SIMD)
remove zx once we have unboxed int16 *)
I.movd (arg i 0) (res32 i 0);
I.movd (argX i 0) (res32 i 0);
I.movzx (res16 i 0) (res i 0)
| Scalar_of_v128 Int8x16 ->
| Scalar_of_v128 Int8x16 | Scalar_of_v256 Int8x32 ->
(* [movb] and [movzx] cannot operate on vector registers. We must zero
extend as the result is an untagged positive int. CR mslater: (SIMD)
remove zx once we have unboxed int8 *)
I.movd (arg i 0) (res32 i 0);
I.movd (argX i 0) (res32 i 0);
I.movzx (res8 i 0) (res i 0)
| V128_of_scalar Int16x8 | V128_of_scalar Int8x16 ->
| V128_of_scalar Int16x8
| V128_of_scalar Int8x16
| V256_of_scalar Int16x16
| V256_of_scalar Int8x32 ->
(* [movw] and [movb] cannot operate on vector registers. Moving 32 bits is
OK because the argument is an untagged positive int and these operations
leave the top bits of the vector unspecified. CR mslater: (SIMD) don't
load 32 bits once we have unboxed int16/int8 *)
I.movd (arg32 i 0) (res i 0)
I.movd (arg32 i 0) (resX i 0)
| V512_of_scalar _ | Scalar_of_v512 _ ->
(* CR-soon mslater: avx512 *)
Misc.fatal_error "avx512 instructions not yet implemented"

let assert_loc (loc : Simd.loc) arg =
(match Reg.is_reg arg with
Expand Down
26 changes: 24 additions & 2 deletions backend/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,12 @@ let precolored_regs () =

let operation_supported = function
| Cpopcnt -> Arch.Extension.enabled POPCNT
| Creinterpret_cast V256_of_v256
| Cstatic_cast (V256_of_scalar _ | Scalar_of_v256 _) ->
Arch.Extension.allow_vec256 ()
| Creinterpret_cast V512_of_v512
| Cstatic_cast (V512_of_scalar _ | Scalar_of_v512 _) ->
Arch.Extension.allow_vec512 ()
| Cprefetch _ | Catomic _
| Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
| Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
Expand All @@ -720,11 +726,27 @@ let operation_supported = function
| Cnegf _ | Cabsf _ | Caddf _ | Csubf _ | Cmulf _ | Cdivf _ | Cpackf32
| Ccmpf _
| Craise _
| Creinterpret_cast _ | Cstatic_cast _
| Cprobe _ | Cprobe_is_enabled _ | Copaque | Cbeginregion | Cendregion
| Ctuple_field _
| Cdls_get
| Cpoll
-> true
| Creinterpret_cast (Int_of_value | Value_of_int |
Int64_of_float | Float_of_int64 |
Float32_of_float | Float_of_float32 |
Float32_of_int32 | Int32_of_float32 |
V128_of_v128)
| Cstatic_cast (Float_of_float32 | Float32_of_float |
Int_of_float Float32 | Float_of_int Float32 |
Float_of_int Float64 | Int_of_float Float64 |
V128_of_scalar _ | Scalar_of_v128 _) ->
true

(* Whether this backend accepts the given Cmm expression. 256-bit and 512-bit
   vector constants are accepted only when [Arch.Extension.allow_vec256] /
   [Arch.Extension.allow_vec512] return true; every other expression form is
   always accepted. The match is kept exhaustive (no wildcard) so that a new
   constructor forces this function to be revisited. *)
let expression_supported expr =
  match expr with
  | Cconst_vec256 _ -> Arch.Extension.allow_vec256 ()
  | Cconst_vec512 _ -> Arch.Extension.allow_vec512 ()
  | Cconst_int _ | Cconst_natint _
  | Cconst_float32 _ | Cconst_float _
  | Cconst_vec128 _ | Cconst_symbol _
  | Cvar _ | Clet _ | Cphantom_let _
  | Ctuple _ | Cop _ | Csequence _
  | Cifthenelse _ | Cswitch _ | Ccatch _ | Cexit _ ->
    true

let trap_size_in_bytes = 16
24 changes: 19 additions & 5 deletions backend/amd64/regalloc_stack_operands.ml
Original file line number Diff line number Diff line change
Expand Up @@ -208,17 +208,31 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
| SSE Div_f32 ),
_ ))) ->
May_still_have_spilled_registers
| Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
| Op
(Reinterpret_cast
( Float_of_float32 | Float32_of_float | V128_of_v128 | V256_of_v256
| V512_of_v512 ))
| Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
| Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->
| Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4))
| Op (Static_cast (V256_of_scalar Float64x4 | Scalar_of_v256 Float64x4))
| Op (Static_cast (V256_of_scalar Float32x8 | Scalar_of_v256 Float32x8))
| Op (Static_cast (V512_of_scalar Float64x8 | Scalar_of_v512 Float64x8))
| Op (Static_cast (V512_of_scalar Float32x16 | Scalar_of_v512 Float32x16)) ->
unary_operation_argument_or_result_on_stack map instr
| Op (Reinterpret_cast (Float_of_int64 | Float32_of_int32))
| Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16))) ->
| Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16)))
| Op (Static_cast (V256_of_scalar (Int64x4 | Int32x8 | Int16x16 | Int8x32)))
| Op (Static_cast (V512_of_scalar (Int64x8 | Int32x16 | Int16x32 | Int8x64)))
->
may_use_stack_operand_for_only_argument map instr ~has_result:true
| Op (Reinterpret_cast (Int64_of_float | Int32_of_float32))
| Op (Static_cast (Scalar_of_v128 (Int64x2 | Int32x4))) ->
| Op (Static_cast (Scalar_of_v128 (Int64x2 | Int32x4)))
| Op (Static_cast (Scalar_of_v256 (Int64x4 | Int32x8)))
| Op (Static_cast (Scalar_of_v512 (Int64x8 | Int32x16))) ->
may_use_stack_operand_for_result map instr ~num_args:1
| Op (Static_cast (Scalar_of_v128 (Int16x8 | Int8x16))) ->
| Op (Static_cast (Scalar_of_v128 (Int16x8 | Int8x16)))
| Op (Static_cast (Scalar_of_v256 (Int16x16 | Int8x32)))
| Op (Static_cast (Scalar_of_v512 (Int16x32 | Int8x64))) ->
(* CR mslater: (SIMD) replace once we have unboxed int16/int8 *)
May_still_have_spilled_registers
| Op
Expand Down
11 changes: 11 additions & 0 deletions backend/arm64/emit.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,8 @@ module BR = Branch_relaxation.Make (struct
| Int32_of_float32 )) ->
1
| Lop (Reinterpret_cast V128_of_v128) -> 1
| Lop (Reinterpret_cast (V256_of_v256 | V512_of_v512)) ->
Misc.fatal_error "arm64: got 256/512 bit vector"
| Lop (Static_cast (Float_of_int Float64 | Int_of_float Float64)) -> 1
| Lop
(Static_cast
Expand All @@ -1280,6 +1282,11 @@ module BR = Branch_relaxation.Make (struct
(Scalar_of_v128 (Int32x4 | Int64x2 | Float32x4 | Float64x2))) ->
1
| Lop (Static_cast (V128_of_scalar _)) -> 1
| Lop
(Static_cast
( V256_of_scalar _ | Scalar_of_v256 _ | V512_of_scalar _
| Scalar_of_v512 _ )) ->
Misc.fatal_error "arm64: got 256/512 bit vector"
| Lop (Floatop (Float64, (Iaddf | Isubf | Imulf | Idivf))) -> 1
| Lop (Floatop (Float32, (Iaddf | Isubf | Imulf | Idivf))) -> 1
| Lop (Specific Inegmulf) -> 1
Expand Down Expand Up @@ -1565,6 +1572,8 @@ let emit_reinterpret_cast (cast : Cmm.reinterpret_cast) i =
DSL.check_reg Vec128 src;
DSL.check_reg Vec128 dst;
DSL.ins I.MOV [| DSL.emit_reg_v16b dst; DSL.emit_reg_v16b src |])
| V256_of_v256 | V512_of_v512 ->
Misc.fatal_error "arm64: got 256/512 bit vector"
| Int_of_value | Value_of_int -> move src dst

let emit_static_cast (cast : Cmm.static_cast) i =
Expand Down Expand Up @@ -1630,6 +1639,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
then (
DSL.check_reg Float src;
DSL.ins I.FMOV [| DSL.emit_reg_d dst; DSL.emit_reg src |]))
| V256_of_scalar _ | Scalar_of_v256 _ | V512_of_scalar _ | Scalar_of_v512 _ ->
Misc.fatal_error "arm64: got 256/512 bit vector"

(* Output the assembly code for an instruction *)

Expand Down
41 changes: 29 additions & 12 deletions backend/arm64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,17 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
| Move | Spill | Reload
| Floatop _
| Csel _
| Reinterpret_cast _ | Const_int _
| Const_int _
| Const_float32 _ | Const_float _
| Const_symbol _ | Const_vec128 _
| Stackoffset _
| Intop_imm _ | Intop_atomic _
| Name_for_debugger _ | Probe_is_enabled _ | Opaque
| Begin_region | End_region | Dls_get)
| Poptrap _ | Prologue
| Op (Reinterpret_cast (Int_of_value | Value_of_int | Float_of_float32 |
Float32_of_float | Float_of_int64 | Int64_of_float |
Float32_of_int32 | Int32_of_float32 | V128_of_v128))
-> [||]
| Stack_check _ -> assert false (* not supported *)
| Op (Const_vec256 _ | Const_vec512 _)
Expand All @@ -349,6 +352,9 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
((Twofiftysix_aligned|Twofiftysix_unaligned|
Fivetwelve_aligned|Fivetwelve_unaligned),
_, _))
| Op (Reinterpret_cast (V256_of_v256 | V512_of_v512))
| Op (Static_cast (V256_of_scalar _ | Scalar_of_v256 _ |
V512_of_scalar _ | Scalar_of_v512 _))
-> Misc.fatal_error "arm64: got 256/512 bit vector"

(* note: keep this function in sync with `is_destruction_point` below. *)
Expand Down Expand Up @@ -442,34 +448,45 @@ let assemble_file infile outfile =
" -o " ^ Filename.quote outfile ^ " " ^ Filename.quote infile)

let operation_supported : Cmm.operation -> bool = function
| Cprefetch _ | Catomic _ -> false
| Cprefetch _ | Catomic _
| Creinterpret_cast (V256_of_v256 | V512_of_v512)
| Cstatic_cast (V256_of_scalar _ | Scalar_of_v256 _ |
V512_of_scalar _ | Scalar_of_v512 _) ->
false
| Cpopcnt
| Cnegf Float32 | Cabsf Float32 | Caddf Float32
| Csubf Float32 | Cmulf Float32 | Cdivf Float32
| Cpackf32
| Creinterpret_cast (Float32_of_float | Float_of_float32 |
Float32_of_int32 | Int32_of_float32 |
V128_of_v128)
| Cstatic_cast (Float_of_float32 | Float32_of_float |
Int_of_float Float32 | Float_of_int Float32 |
V128_of_scalar _ | Scalar_of_v128 _)
| Cclz _ | Cctz _ | Cbswap _
| Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
| Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
| Cand | Cor | Cxor | Clsl | Clsr | Casr
| Ccmpi _ | Caddv | Cadda | Ccmpa _
| Cnegf Float64 | Cabsf Float64 | Caddf Float64
| Csubf Float64 | Cmulf Float64 | Cdivf Float64
| Creinterpret_cast (Int_of_value | Value_of_int |
Int64_of_float | Float_of_int64)
| Cstatic_cast (Float_of_int Float64 | Int_of_float Float64)
| Ccmpf _
| Ccsel _
| Craise _
| Cprobe _ | Cprobe_is_enabled _ | Copaque
| Cbeginregion | Cendregion | Ctuple_field _
| Cdls_get
| Cpoll
-> true
| Creinterpret_cast (Int_of_value | Value_of_int |
Int64_of_float | Float_of_int64 |
Float32_of_float | Float_of_float32 |
Float32_of_int32 | Int32_of_float32 |
V128_of_v128)
| Cstatic_cast (Float_of_float32 | Float32_of_float |
Int_of_float Float32 | Float_of_int Float32 |
Float_of_int Float64 | Int_of_float Float64 |
V128_of_scalar _ | Scalar_of_v128 _) ->
true

(* Whether this backend accepts the given Cmm expression. 256-bit and 512-bit
   vector constants are always rejected here; every other expression form is
   accepted. The match is kept exhaustive (no wildcard) so that a new
   constructor forces this function to be revisited. *)
let expression_supported (expr : Cmm.expression) : bool =
  match expr with
  | Cconst_vec256 _ | Cconst_vec512 _ -> false
  | Cconst_int _ | Cconst_natint _
  | Cconst_float32 _ | Cconst_float _
  | Cconst_vec128 _ | Cconst_symbol _
  | Cvar _ | Clet _ | Cphantom_let _
  | Ctuple _ | Cop _ | Csequence _
  | Cifthenelse _ | Cswitch _ | Ccatch _ | Cexit _ ->
    true

let trap_size_in_bytes = 16
38 changes: 36 additions & 2 deletions backend/cmm.ml
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,8 @@ type reinterpret_cast =
| Float32_of_int32
| Int32_of_float32
| V128_of_v128
| V256_of_v256
| V512_of_v512

type static_cast =
| Float_of_int of float_width
Expand All @@ -332,6 +334,10 @@ type static_cast =
| Float32_of_float
| V128_of_scalar of vec128_type
| Scalar_of_v128 of vec128_type
| V256_of_scalar of vec256_type
| Scalar_of_v256 of vec256_type
| V512_of_scalar of vec512_type
| Scalar_of_v512 of vec512_type

module Alloc_mode = struct
type t =
Expand Down Expand Up @@ -776,6 +782,27 @@ let equal_vec128_type v1 v2 =
| Float64x2, Float64x2 -> true
| (Int8x16 | Int16x8 | Int32x4 | Int64x2 | Float32x4 | Float64x2), _ -> false

(* Structural equality on 256-bit vector element layouts: [true] exactly when
   both arguments are the same constructor. The final case enumerates every
   constructor on the left instead of using a wildcard, so adding a new
   layout makes the match non-exhaustive. *)
let equal_vec256_type v1 v2 =
  match v1, v2 with
  | Int8x32, Int8x32
  | Int16x16, Int16x16
  | Int32x8, Int32x8
  | Int64x4, Int64x4
  | Float32x8, Float32x8
  | Float64x4, Float64x4 ->
    true
  | (Int8x32 | Int16x16 | Int32x8 | Int64x4 | Float32x8 | Float64x4), _ ->
    false

(* Structural equality on 512-bit vector element layouts: [true] exactly when
   both arguments are the same constructor. The final case enumerates every
   constructor on the left instead of using a wildcard, so adding a new
   layout makes the match non-exhaustive. *)
let equal_vec512_type v1 v2 =
  match v1, v2 with
  | Int8x64, Int8x64
  | Int16x32, Int16x32
  | Int32x16, Int32x16
  | Int64x8, Int64x8
  | Float32x16, Float32x16
  | Float64x8, Float64x8 ->
    true
  | (Int8x64 | Int16x32 | Int32x16 | Int64x8 | Float32x16 | Float64x8), _ ->
    false

let equal_float_width left right =
match left, right with
| Float64, Float64 -> true
Expand All @@ -794,9 +821,11 @@ let equal_reinterpret_cast (left : reinterpret_cast) (right : reinterpret_cast)
| Float32_of_int32, Float32_of_int32 -> true
| Int32_of_float32, Int32_of_float32 -> true
| V128_of_v128, V128_of_v128 -> true
| V256_of_v256, V256_of_v256 -> true
| V512_of_v512, V512_of_v512 -> true
| ( ( Int_of_value | Value_of_int | Float_of_float32 | Float32_of_float
| Float_of_int64 | Int64_of_float | Float32_of_int32 | Int32_of_float32
| V128_of_v128 ),
| V128_of_v128 | V256_of_v256 | V512_of_v512 ),
_ ) ->
false

Expand All @@ -808,8 +837,13 @@ let equal_static_cast (left : static_cast) (right : static_cast) =
| Int_of_float f1, Int_of_float f2 -> equal_float_width f1 f2
| Scalar_of_v128 v1, Scalar_of_v128 v2 -> equal_vec128_type v1 v2
| V128_of_scalar v1, V128_of_scalar v2 -> equal_vec128_type v1 v2
| Scalar_of_v256 v1, Scalar_of_v256 v2 -> equal_vec256_type v1 v2
| V256_of_scalar v1, V256_of_scalar v2 -> equal_vec256_type v1 v2
| Scalar_of_v512 v1, Scalar_of_v512 v2 -> equal_vec512_type v1 v2
| V512_of_scalar v1, V512_of_scalar v2 -> equal_vec512_type v1 v2
| ( ( Float32_of_float | Float_of_float32 | Float_of_int _ | Int_of_float _
| Scalar_of_v128 _ | V128_of_scalar _ ),
| Scalar_of_v128 _ | V128_of_scalar _ | Scalar_of_v256 _
| V256_of_scalar _ | Scalar_of_v512 _ | V512_of_scalar _ ),
_ ) ->
false

Expand Down
Loading
Loading
0