; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s

; Three module-internal globals in address space 3, each of the target
; extension type target("amdgcn.named.barrier", 0) and initialised to poison.
; They are the named-barrier objects passed to the barrier intrinsics in the
; functions below: @bar is used by kernel1 and kernel2, @bar2 by func2 and
; @bar3 by func1.
@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison

; func1: a non-kernel function (default calling convention, no arguments,
; returns void) that calls signal.var, join and wait using the named barrier
; global @bar3.
;   * signal.var is passed the i32 operand 7. Both instruction selectors are
;     expected to load m0 with 0x70003, which equals (7 << 16) | 3, before
;     s_barrier_signal m0.
;   * For the join, the SelectionDAG run expects "s_mov_b32 m0, 3" followed by
;     "s_barrier_join m0", whereas the GlobalISel run expects the immediate
;     form "s_barrier_join 3".
; NOTE(review): the low value 3 is presumably the barrier ID the backend
; assigned to @bar3 -- confirm against the named-barrier lowering.
define void @func1() {
; GFX12-SDAG-LABEL: func1:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70003
; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
; GFX12-SDAG-NEXT:    s_barrier_signal m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 3
; GFX12-SDAG-NEXT:    s_barrier_join m0
; GFX12-SDAG-NEXT:    s_barrier_wait 1
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func1:
; GFX12-GISEL:       ; %bb.0:
; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70003
; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
; GFX12-GISEL-NEXT:    s_barrier_signal m0
; GFX12-GISEL-NEXT:    s_barrier_join 3
; GFX12-GISEL-NEXT:    s_barrier_wait 1
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
    ; signal.var (second operand 7), then join, both on @bar3; then wait with i16 1.
    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
    call void @llvm.amdgcn.s.barrier.wait(i16 1)
    ret void
}

; func2: same shape as func1 (signal.var, join, wait in a non-kernel
; function) but operating on the named barrier global @bar2.
;   * signal.var is passed the i32 operand 7. Both instruction selectors are
;     expected to load m0 with 0x70001, which equals (7 << 16) | 1, before
;     s_barrier_signal m0.
;   * For the join, the SelectionDAG run expects "s_mov_b32 m0, 1" followed by
;     "s_barrier_join m0", whereas the GlobalISel run expects the immediate
;     form "s_barrier_join 1".
; This function is called from both kernel1 and kernel2.
; NOTE(review): the low value 1 is presumably the barrier ID the backend
; assigned to @bar2 -- confirm against the named-barrier lowering.
define void @func2() {
; GFX12-SDAG-LABEL: func2:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70001
; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
; GFX12-SDAG-NEXT:    s_barrier_signal m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 1
; GFX12-SDAG-NEXT:    s_barrier_join m0
; GFX12-SDAG-NEXT:    s_barrier_wait 1
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: func2:
; GFX12-GISEL:       ; %bb.0:
; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70001
; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
; GFX12-GISEL-NEXT:    s_barrier_signal m0
; GFX12-GISEL-NEXT:    s_barrier_join 1
; GFX12-GISEL-NEXT:    s_barrier_wait 1
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
    ; signal.var (second operand 7), then join, both on @bar2; then wait with i16 1.
    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
    call void @llvm.amdgcn.s.barrier.wait(i16 1)
    ret void
}

; kernel1: an amdgpu_kernel that calls every barrier intrinsic declared in
; this file, using both a global named barrier (@bar) and a barrier passed
; as the addrspace(3) pointer argument %in, and then calls func1 and func2.
;
; What the expected output pins down:
;   * Operations on @bar with i32 operand 12 load m0 with 0xc0002, which
;     equals (12 << 16) | 2.
;   * Operations on %in with i32 operand 9 build m0 at run time: a 32-bit
;     value is loaded from offset 0x2c, shifted right by 4, masked with 63,
;     and ORed with 0x90000 (9 << 16). The join and the named-barrier state
;     query on %in use the masked value without the 0x90000 part.
;   * Querying the state of @bar is expected as "s_mov_b32 m0, 2" plus
;     "s_get_barrier_state s3, m0" from SelectionDAG, and as the immediate
;     form "s_get_barrier_state s0, 2" from GlobalISel.
;   * Between the state queries and the call to func1 the expected output
;     contains "s_barrier_signal -1" and "s_barrier_wait -1".
;   * func1 and func2 are each reached through an s_getpc_b64 /
;     @gotpcrel32 address computation, an s_load_b64, and s_swappc_b64.
;
; The argument %out and the results %isfirst, %state, %state2 and %state3
; are not used by any instruction in the body.
; NOTE(review): the value loaded from offset 0x2c is presumably the %in
; kernel argument, and the "-1" signal/wait pair is presumably the lowering
; of the llvm.amdgcn.s.barrier() call -- confirm against the backend.
define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
; GFX12-SDAG-LABEL: kernel1:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX12-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX12-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0xc0002
; GFX12-SDAG-NEXT:    v_mov_b32_e32 v31, v0
; GFX12-SDAG-NEXT:    s_barrier_init m0
; GFX12-SDAG-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 48
; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_lshr_b32 s2, s2, 4
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_and_b32 s2, s2, 63
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_or_b32 s3, 0x90000, s2
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_mov_b32 m0, s3
; GFX12-SDAG-NEXT:    s_barrier_init m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0xc0002
; GFX12-SDAG-NEXT:    s_barrier_signal m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, s3
; GFX12-SDAG-NEXT:    s_barrier_signal m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
; GFX12-SDAG-NEXT:    s_barrier_signal -1
; GFX12-SDAG-NEXT:    s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT:    s_barrier_join m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
; GFX12-SDAG-NEXT:    s_barrier_wait 1
; GFX12-SDAG-NEXT:    s_barrier_leave
; GFX12-SDAG-NEXT:    s_get_barrier_state s3, m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
; GFX12-SDAG-NEXT:    s_get_barrier_state s2, m0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_getpc_b64 s[2:3]
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_sext_i32_i16 s3, s3
; GFX12-SDAG-NEXT:    s_add_co_u32 s2, s2, func1@gotpcrel32@lo+12
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+24
; GFX12-SDAG-NEXT:    s_barrier_signal -1
; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-SDAG-NEXT:    s_barrier_wait -1
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX12-SDAG-NEXT:    s_getpc_b64 s[2:3]
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_sext_i32_i16 s3, s3
; GFX12-SDAG-NEXT:    s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[2:3]
; GFX12-SDAG-NEXT:    s_get_barrier_state s0, -1
; GFX12-SDAG-NEXT:    s_endpgm
;
; GFX12-GISEL-LABEL: kernel1:
; GFX12-GISEL:       ; %bb.0:
; GFX12-GISEL-NEXT:    s_mov_b64 s[12:13], s[4:5]
; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT:    s_load_b32 s0, s[12:13], 0x2c
; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
; GFX12-GISEL-NEXT:    s_barrier_init m0
; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX12-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_lshr_b32 s0, s0, 4
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, 63
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_or_b32 s1, s0, 0x90000
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
; GFX12-GISEL-NEXT:    s_barrier_init m0
; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
; GFX12-GISEL-NEXT:    s_barrier_signal m0
; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
; GFX12-GISEL-NEXT:    s_barrier_signal m0
; GFX12-GISEL-NEXT:    s_barrier_signal -1
; GFX12-GISEL-NEXT:    s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT:    s_mov_b32 m0, s0
; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT:    s_barrier_join m0
; GFX12-GISEL-NEXT:    s_barrier_wait 1
; GFX12-GISEL-NEXT:    s_barrier_leave
; GFX12-GISEL-NEXT:    s_get_barrier_state s0, 2
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_get_barrier_state s0, m0
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s13, 0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24
; GFX12-GISEL-NEXT:    s_barrier_signal -1
; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-GISEL-NEXT:    s_barrier_wait -1
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s12, 48
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s13, 0
; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT:    s_get_barrier_state s0, -1
; GFX12-GISEL-NEXT:    s_endpgm
    ; init and signal.var on the global @bar (operand 12) and on the pointer
    ; argument %in (operand 9).
    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
    ; Immediate-operand forms of signal / signal.isfirst, both with operand -1.
    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
    %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
    ; join on the run-time barrier pointer %in, then wait and leave with i16 1.
    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
    call void @llvm.amdgcn.s.barrier.wait(i16 1)
    call void @llvm.amdgcn.s.barrier.leave(i16 1)
    ; Named-barrier state queries: one on the global, one on the pointer argument.
    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
    %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
    call void @llvm.amdgcn.s.barrier()
    ; Calls into the two helper functions, which use @bar3 and @bar2 respectively.
    call void @func1()
    call void @func2()
    ; State query with the immediate operand -1 (not a named-barrier pointer).
    %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
    ret void
}

; kernel2: an amdgpu_kernel that calls signal.var, join and wait on the
; global named barrier @bar and then calls func2 (which itself uses @bar2).
;   * signal.var is passed the i32 operand 7; both instruction selectors are
;     expected to load m0 with 0x70002, which equals (7 << 16) | 2, before
;     s_barrier_signal m0. The low value 2 matches the low value of the
;     0xc0002 constant expected for @bar in kernel1.
;   * For the join, the SelectionDAG run expects "s_mov_b32 m0, 2" followed by
;     "s_barrier_join m0", whereas the GlobalISel run expects the immediate
;     form "s_barrier_join 2".
;   * The address of func2 is expected to be loaded (s_getpc_b64 /
;     @gotpcrel32 / s_load_b64) before the barrier instructions, and the
;     call itself (s_swappc_b64) after them.
; Neither argument (%out, %in) is used by any instruction in the body.
define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
; GFX12-SDAG-LABEL: kernel2:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX12-SDAG-NEXT:    s_getpc_b64 s[6:7]
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_sext_i32_i16 s7, s7
; GFX12-SDAG-NEXT:    s_add_co_u32 s6, s6, func2@gotpcrel32@lo+12
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+24
; GFX12-SDAG-NEXT:    v_mov_b32_e32 v31, v0
; GFX12-SDAG-NEXT:    s_load_b64 s[12:13], s[6:7], 0x0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70002
; GFX12-SDAG-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 48
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    s_barrier_signal m0
; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
; GFX12-SDAG-NEXT:    s_barrier_join m0
; GFX12-SDAG-NEXT:    s_barrier_wait 1
; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[12:13]
; GFX12-SDAG-NEXT:    s_endpgm
;
; GFX12-GISEL-LABEL: kernel2:
; GFX12-GISEL:       ; %bb.0:
; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s4, 48
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s5, 0
; GFX12-GISEL-NEXT:    s_getpc_b64 s[4:5]
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_sext_i32_i16 s5, s5
; GFX12-GISEL-NEXT:    s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24
; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
; GFX12-GISEL-NEXT:    s_load_b64 s[12:13], s[4:5], 0x0
; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70002
; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT:    s_barrier_signal m0
; GFX12-GISEL-NEXT:    s_barrier_join 2
; GFX12-GISEL-NEXT:    s_barrier_wait 1
; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT:    s_endpgm
    ; signal.var (second operand 7), then join, both on @bar; then wait with i16 1.
    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
    call void @llvm.amdgcn.s.barrier.wait(i16 1)

    ; The callee uses a different named barrier global (@bar2).
    call void @func2()
    ret void
}

; Declarations of the AMDGPU barrier intrinsics called by the functions above.
; All of them carry attribute group #1. The named-barrier variants
; (signal.var, init, join, get.named.barrier.state) take the barrier as a
; ptr addrspace(3); the others take an integer operand (i16 or i32) or no
; operand at all.
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1

; Attribute groups:
;   #0 - applied to the two kernels (kernel1, kernel2).
;   #1 - applied to every intrinsic declaration above.
;   #2 - NOTE(review): not referenced by any function or declaration visible
;        in this file; it looks like a leftover and could probably be
;        dropped -- confirm nothing else uses it before removing.
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
