Skip to content

Commit 24d4d05

Browse files
authored
Merge 7a7476b into 897d04e
2 parents 897d04e + 7a7476b commit 24d4d05

18 files changed

Lines changed: 1241 additions & 171 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ id_ed25519.pub
2020
*.model
2121
.cline_storage
2222
*.egg-info
23+
CLAUDE.md
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2.
5+
// Same structure as AIE2+ variant but uses LUT-based getTanhBf16.
6+
7+
#define NOCPP
8+
9+
#include "../aie_kernel_utils.h"
10+
#include "lut_based_ops.h"
11+
12+
#include <aie_api/aie.hpp>
13+
#include <stdint.h>
14+
#include <type_traits>
15+
16+
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
17+
static bfloat16 right_buf[1024] __attribute__((aligned(64)));
18+
19+
template <uint32_t r>
20+
void matvec_vectorized(uint32_t m,
21+
uint32_t k,
22+
const bfloat16 *__restrict a,
23+
const bfloat16 *__restrict b,
24+
bfloat16 *__restrict c)
25+
{
26+
::aie::set_rounding(aie::rounding_mode::conv_even);
27+
bfloat16 *c_end = c + m;
28+
const bfloat16 *b_end = b + k;
29+
for (; c < c_end; c++) {
30+
aie::accum acc = aie::zeros<accfloat, r>();
31+
AIE_LOOP_MIN_ITERATION_COUNT(2)
32+
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
33+
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
34+
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
35+
acc = aie::mac(acc, a_vec, b_vec);
36+
}
37+
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
38+
}
39+
}
40+
41+
extern "C" {
42+
43+
void dual_gemv_matvec_bf16(uint32_t m,
44+
uint32_t k,
45+
uint32_t row_offset,
46+
const bfloat16 *__restrict a_in,
47+
const bfloat16 *__restrict b_in,
48+
uint32_t phase)
49+
{
50+
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
51+
dst += row_offset;
52+
matvec_vectorized<64>(m, k, a_in, b_in, dst);
53+
}
54+
55+
// Combines the two static buffers into c_out:
//   c_out[i] = silu(left_buf[i]) * right_buf[i]
// where silu(x) = x * sigmoid(x) and sigmoid(x) = 0.5 * (1 + tanh(x / 2)).
// tanh is evaluated with getTanhBf16.
//
//   c_out    : destination; written 16 bfloat16 lanes at a time.
//   m_output : number of elements to process. The loop steps by 16 and always
//              loads/stores full 16-lane vectors, so a count that is not a
//              multiple of 16 still touches a whole final vector.
//
// The event0()/event1() calls bracket the computation (presumably trace
// markers — confirm).
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
{
    event0();

    aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);

    AIE_PREPARE_FOR_PIPELINING
    for (int base = 0; base < m_output; base += 16) {
        aie::vector<bfloat16, 16> lhs = aie::load_v<16>(left_buf + base);
        aie::vector<bfloat16, 16> rhs = aie::load_v<16>(right_buf + base);

        // sigmoid(lhs) via the tanh identity
        aie::vector<bfloat16, 16> lhs_halved = aie::mul(lhs, half);
        aie::vector<bfloat16, 16> tanh_val = getTanhBf16(lhs_halved);
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(lhs), then the elementwise product with rhs
        auto silu_acc = aie::mul(lhs, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), rhs);

        aie::store_v(c_out + base, product_acc.to_vector<bfloat16>());
    }

    event1();
}
78+
79+
} // extern "C"

aie_kernels/aie2/silu_mul.cc

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include "../aie_kernel_utils.h"
5+
#include "lut_based_ops.h"
6+
7+
#include <aie_api/aie.hpp>
8+
#include <stdint.h>
9+
10+
using namespace aie;
11+
12+
// Elementwise fused kernel: output_vector[i] = silu(silu_input[i]) * mul_input[i].
//
// silu(x) = x * sigmoid(x), with sigmoid(x) = 0.5 * (1 + tanh(x / 2));
// tanh is evaluated with getTanhBf16.
//
//   silu_input    : values passed through SiLU.
//   mul_input     : values multiplied with the SiLU result.
//   output_vector : destination for the products.
//   vector_size   : element count; the loop advances 16 lanes per iteration
//                   and always moves whole 16-lane vectors.
//
// NOTE(review): the loop carries a minimum-iteration hint of 64, which
// presumably means callers must pass vector_size >= 1024 — confirm.
void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
                               bfloat16 *restrict mul_input,
                               bfloat16 *restrict output_vector,
                               const int32_t vector_size)
{
    event0();

    auto src_silu = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(silu_input));
    auto src_mul = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(mul_input));
    auto dst = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(output_vector));

    aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);

    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(64)
    for (int done = 0; done < vector_size; done += 16) {
        // Fetch the next 16 lanes from each input stream.
        aie::vector<bfloat16, 16> x = *src_silu++;
        aie::vector<bfloat16, 16> scale = *src_mul++;

        // sigmoid(x) through the tanh identity.
        aie::vector<bfloat16, 16> x_halved = aie::mul(x, half);
        aie::vector<bfloat16, 16> tanh_val = getTanhBf16(x_halved);
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(x) followed by the fused multiply with the second input.
        auto silu_acc = aie::mul(x, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale);

        *dst++ = product_acc.to_vector<bfloat16>();
    }

    event1();
}
50+
51+
extern "C" {
52+
53+
// C-linkage entry point for the fused SiLU-multiply kernel.
// Forwards all four arguments unchanged to silu_mul_tanh_approx_bf16;
// input_size is the element count handed on as vector_size.
void silu_mul_bf16(bfloat16 *restrict silu_input, bfloat16 *restrict mul_input, bfloat16 *restrict output, int input_size)
{
    silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
}
60+
61+
} // extern "C"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2+.
5+
//
6+
// Computes: output = silu(W1 @ x) * (W2 @ x)
7+
//
8+
// Two entry points called from the NPU design's core body:
9+
// 1. dual_gemv_matvec_bf16: GEMV writing to static left_buf/right_buf (selected by phase) + row_offset
10+
// 2. dual_gemv_silu_mul_bf16: reads from static left_buf/right_buf, writes to FIFO c_out
11+
//
12+
// The static buffers are written via scalar stores (from matvec) and read
13+
// via aie::load_v in the silu_mul phase. Aligned to 64 bytes for safe vector access.
14+
15+
#define NOCPP
16+
17+
#include "../aie_kernel_utils.h"
18+
19+
#include <aie_api/aie.hpp>
20+
#include <stdint.h>
21+
#include <type_traits>
22+
23+
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
24+
static bfloat16 right_buf[1024] __attribute__((aligned(64)));
25+
26+
template <uint32_t r>
27+
void matvec_vectorized(uint32_t m,
28+
uint32_t k,
29+
const bfloat16 *__restrict a,
30+
const bfloat16 *__restrict b,
31+
bfloat16 *__restrict c)
32+
{
33+
::aie::set_rounding(aie::rounding_mode::conv_even);
34+
bfloat16 *c_end = c + m;
35+
const bfloat16 *b_end = b + k;
36+
for (; c < c_end; c++) {
37+
aie::accum acc = aie::zeros<accfloat, r>();
38+
AIE_LOOP_MIN_ITERATION_COUNT(2)
39+
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
40+
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
41+
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
42+
acc = aie::mac(acc, a_vec, b_vec);
43+
}
44+
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
45+
}
46+
}
47+
48+
extern "C" {
49+
50+
// Phase 1 & 2: GEMV writing to a static buffer (left_buf or right_buf)
51+
// phase=0 writes to left_buf, phase=1 writes to right_buf
52+
void dual_gemv_matvec_bf16(uint32_t m,
53+
uint32_t k,
54+
uint32_t row_offset,
55+
const bfloat16 *__restrict a_in,
56+
const bfloat16 *__restrict b_in,
57+
uint32_t phase)
58+
{
59+
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
60+
dst += row_offset;
61+
matvec_vectorized<64>(m, k, a_in, b_in, dst);
62+
}
63+
64+
// Phase 3: silu(left_buf) * right_buf -> c_out (FIFO buffer)
65+
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
66+
{
67+
event0();
68+
69+
aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
70+
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
71+
AIE_PREPARE_FOR_PIPELINING
72+
for (int i = 0; i < m_output; i += 16) {
73+
aie::vector<bfloat16, 16> left_val = aie::load_v<16>(left_buf + i);
74+
aie::vector<bfloat16, 16> right_val = aie::load_v<16>(right_buf + i);
75+
76+
// SiLU(x) = x * sigmoid(x) = x * 0.5 * (1 + tanh(x/2))
77+
auto half_x = aie::mul(left_val, register_0_5);
78+
auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
79+
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
80+
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
81+
auto silu_output = aie::mul(left_val, sigmoid_approx);
82+
83+
auto fused_output = aie::mul(silu_output.to_vector<bfloat16>(), right_val);
84+
aie::store_v(c_out + i, fused_output.to_vector<bfloat16>());
85+
}
86+
87+
event1();
88+
}
89+
90+
} // extern "C"

aie_kernels/aie2p/silu_mul.cc

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include "../aie_kernel_utils.h"
5+
6+
#include <aie_api/aie.hpp>
7+
#include <stdint.h>
8+
9+
using namespace aie;
10+
11+
// Elementwise fused kernel: output_vector[i] = silu(silu_input[i]) * mul_input[i].
//
// silu(x) = x * sigmoid(x), with sigmoid(x) = 0.5 * (1 + tanh(x / 2));
// tanh is evaluated with aie::tanh<bfloat16> on the float form of x / 2.
//
//   silu_input    : values passed through SiLU.
//   mul_input     : values multiplied with the SiLU result.
//   output_vector : destination for the products.
//   vector_size   : element count; the loop advances 16 lanes per iteration
//                   and always moves whole 16-lane vectors.
//
// NOTE(review): the loop carries a minimum-iteration hint of 64, which
// presumably means callers must pass vector_size >= 1024 — confirm.
void silu_mul_tanh_approx_bf16(bfloat16 *restrict silu_input,
                               bfloat16 *restrict mul_input,
                               bfloat16 *restrict output_vector,
                               const int32_t vector_size)
{
    event0();

    auto src_silu = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(silu_input));
    auto src_mul = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(mul_input));
    auto dst = aie::begin_restrict_vector<16>(static_cast<bfloat16 *>(output_vector));

    aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);

    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(64)
    for (int done = 0; done < vector_size; done += 16) {
        // Fetch the next 16 lanes from each input stream.
        aie::vector<bfloat16, 16> x = *src_silu++;
        aie::vector<bfloat16, 16> scale = *src_mul++;

        // sigmoid(x) through the tanh identity.
        auto x_halved = aie::mul(x, half);
        auto tanh_val = aie::tanh<bfloat16>(x_halved.to_vector<float>());
        auto one_plus_tanh = aie::add(tanh_val, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(one_plus_tanh, half);

        // silu(x) followed by the fused multiply with the second input.
        auto silu_acc = aie::mul(x, sigmoid);
        auto product_acc = aie::mul(silu_acc.to_vector<bfloat16>(), scale);

        *dst++ = product_acc.to_vector<bfloat16>();
    }

    event1();
}
49+
50+
extern "C" {
51+
52+
// C-linkage entry point for the fused SiLU-multiply kernel.
// Forwards all four arguments unchanged to silu_mul_tanh_approx_bf16;
// input_size is the element count handed on as vector_size.
void silu_mul_bf16(bfloat16 *restrict silu_input, bfloat16 *restrict mul_input, bfloat16 *restrict output, int input_size)
{
    silu_mul_tanh_approx_bf16(silu_input, mul_input, output, input_size);
}
59+
60+
} // extern "C"

iron/operators/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from .axpy.op import AIEAXPY
55
from .dequant.op import AIEDequant
6+
from .dual_gemv_silu_mul.op import AIEDualGEMVSiLUMul
67
from .elementwise_add.op import AIEElementwiseAdd
78
from .elementwise_mul.op import AIEElementwiseMul
89
from .gelu.op import AIEGELU
@@ -17,6 +18,7 @@
1718
from .rope.op import AIERope
1819
from .sigmoid.op import AIESigmoid
1920
from .silu.op import AIESiLU
21+
from .silu_mul.op import AIESiLUMul
2022
from .softmax.op import AIESoftmax
2123
from .swiglu_decode.op import AIESwiGLUDecode
2224
from .swiglu_prefill.op import AIESwiGLUPrefill

0 commit comments

Comments
 (0)