Skip to content

Commit

Permalink
code--
Browse files — browse the repository at this point in the history
  • Loading branch information
nihui committed Apr 24, 2024
1 parent b4e0104 commit 16ab586
Showing 1 changed file with 33 additions and 77 deletions.
110 changes: 33 additions & 77 deletions src/layer/arm/lstm_arm_asimdhp.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -1637,9 +1637,9 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt)

__fp16* bias_c_IFOG = bias_c_data_packed_dr.row<__fp16>(0);

int q = 0;
if (opt.use_fp16_arithmetic)
{
int q = 0;
for (; q + 1 < hidden_size; q += 2)
{
bias_c_IFOG[0] = (__fp16)bias_c_I[q];
Expand Down Expand Up @@ -1702,92 +1702,48 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt)
weight_hc_IFOG += 8;
}
}
for (; q < hidden_size; q++)
{
bias_c_IFOG[0] = (__fp16)bias_c_I[q];
bias_c_IFOG[1] = (__fp16)bias_c_F[q];
bias_c_IFOG[2] = (__fp16)bias_c_O[q];
bias_c_IFOG[3] = (__fp16)bias_c_G[q];

bias_c_IFOG += 4;

const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);

const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);

__fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2 + q % 2);
__fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2 + q % 2);

for (int i = 0; i < size; i++)
{
weight_xc_IFOG[0] = (__fp16)weight_xc_I[i];
weight_xc_IFOG[1] = (__fp16)weight_xc_F[i];
weight_xc_IFOG[2] = (__fp16)weight_xc_O[i];
weight_xc_IFOG[3] = (__fp16)weight_xc_G[i];

weight_xc_IFOG += 4;
}

for (int i = 0; i < num_output; i++)
{
weight_hc_IFOG[0] = (__fp16)weight_hc_I[i];
weight_hc_IFOG[1] = (__fp16)weight_hc_F[i];
weight_hc_IFOG[2] = (__fp16)weight_hc_O[i];
weight_hc_IFOG[3] = (__fp16)weight_hc_G[i];

weight_hc_IFOG += 4;
}
}
}
else
for (; q < hidden_size; q++)
{
for (int q = 0; q < hidden_size; q++)
{
bias_c_IFOG[0] = (__fp16)bias_c_I[q];
bias_c_IFOG[1] = (__fp16)bias_c_F[q];
bias_c_IFOG[2] = (__fp16)bias_c_O[q];
bias_c_IFOG[3] = (__fp16)bias_c_G[q];
bias_c_IFOG[0] = (__fp16)bias_c_I[q];
bias_c_IFOG[1] = (__fp16)bias_c_F[q];
bias_c_IFOG[2] = (__fp16)bias_c_O[q];
bias_c_IFOG[3] = (__fp16)bias_c_G[q];

bias_c_IFOG += 4;
bias_c_IFOG += 4;

const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);
const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);

const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);
const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);

__fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q);
__fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q);
const int qq = opt.use_fp16_arithmetic ? q / 2 + q % 2 : q;
__fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(qq);
__fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(qq);

for (int i = 0; i < size; i++)
{
weight_xc_IFOG[0] = (__fp16)weight_xc_I[i];
weight_xc_IFOG[1] = (__fp16)weight_xc_F[i];
weight_xc_IFOG[2] = (__fp16)weight_xc_O[i];
weight_xc_IFOG[3] = (__fp16)weight_xc_G[i];
for (int i = 0; i < size; i++)
{
weight_xc_IFOG[0] = (__fp16)weight_xc_I[i];
weight_xc_IFOG[1] = (__fp16)weight_xc_F[i];
weight_xc_IFOG[2] = (__fp16)weight_xc_O[i];
weight_xc_IFOG[3] = (__fp16)weight_xc_G[i];

weight_xc_IFOG += 4;
}
weight_xc_IFOG += 4;
}

for (int i = 0; i < num_output; i++)
{
weight_hc_IFOG[0] = (__fp16)weight_hc_I[i];
weight_hc_IFOG[1] = (__fp16)weight_hc_F[i];
weight_hc_IFOG[2] = (__fp16)weight_hc_O[i];
weight_hc_IFOG[3] = (__fp16)weight_hc_G[i];
for (int i = 0; i < num_output; i++)
{
weight_hc_IFOG[0] = (__fp16)weight_hc_I[i];
weight_hc_IFOG[1] = (__fp16)weight_hc_F[i];
weight_hc_IFOG[2] = (__fp16)weight_hc_O[i];
weight_hc_IFOG[3] = (__fp16)weight_hc_G[i];

weight_hc_IFOG += 4;
}
weight_hc_IFOG += 4;
}
}
}
Expand Down

0 comments on commit 16ab586

Please sign in to comment.