sglang_v0.5.2/pytorch_2.8.0/third_party/fbgemm/test/FBGemmFPTest.h

220 lines
5.9 KiB
C++

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <gtest/gtest.h>
#include <cmath>
#include <cstring>
#include <iostream>
#include <random>
#include <utility>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "./TestUtils.h"
#include "bench/AlignedVec.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmPackMatrixB.h"
#include "src/RefImplementations.h"
#ifdef USE_IACA
#include "iacaMarks.h"
#endif
namespace fbgemm {
/*
 * @brief Base fixture for the floating-point GEMM tests.
 *
 * The template parameter T is the element type stored in the packed B matrix
 * (PackedGemmMatrixB<T>). The gtest value parameter is the pair
 * (transpose of A, transpose of B).
 */
template <typename T>
class FBGemmFPTest : public testing::TestWithParam<
                         std::pair<fbgemm::matrix_op_t, fbgemm::matrix_op_t>> {
 protected:
  /*
   * @brief Draws ten random problem sizes.
   *
   * Each entry is {m, n, k} with m in [1, 256] and n, k in [1, 1024]. The
   * engine is seeded from std::random_device, so the shapes differ from run
   * to run; every shape is logged to stderr before it is exercised.
   */
  std::vector<std::vector<int>> GenShapes() const {
    const int kNumShapes = 10;
    std::vector<std::vector<int>> shapes;
    shapes.reserve(kNumShapes);
    std::random_device r;
    std::default_random_engine generator(r());
    std::uniform_int_distribution<int> dm(1, 256);
    std::uniform_int_distribution<int> dnk(1, 1024);
    for (int i = 0; i < kNumShapes; ++i) {
      // Draw in a fixed m, n, k order so the sequence of engine calls is
      // well defined.
      const int m = dm(generator);
      const int n = dnk(generator);
      const int k = dnk(generator);
      shapes.push_back({m, n, k});
    }
    return shapes;
  }

  /*
   * @brief Packs B, runs the FBGEMM kernel and compares the result with the
   * reference sgemm for every generated shape.
   */
  void TestRun() {
    const auto& param = GetParam();
    const matrix_op_t atrans = param.first;
    const matrix_op_t btrans = param.second;
    const float alpha = 1.f;
    const float beta = 0.f;

    const auto shapes = GenShapes();
    for (const auto& s : shapes) {
      const int m = s[0];
      const int n = s[1];
      const int k = s[2];
      LogProblem(m, n, k, atrans, btrans);

      GemmBuffers buf = MakeBuffers(m, n, k);
      ComputeReference(atrans, btrans, m, n, k, alpha, beta, buf);

      PackedGemmMatrixB<T> Bp(btrans, k, n, alpha, buf.B.data());
      RunPackedGemm(atrans, m, buf.A, Bp, beta, buf.C);
      ExpectGemmMatches(buf.C, buf.C_ref, m, n, k);
    }
  }

  /*
   * @brief Same as TestRun(), but first unpacks the packed B matrix, checks
   * the unpacked contents against the source B, and packs it again before
   * running the kernel.
   */
  void UnpackTestRun() {
    const auto& param = GetParam();
    const matrix_op_t atrans = param.first;
    const matrix_op_t btrans = param.second;
    const float alpha = 1.f;
    const float beta = 0.f;

    const auto shapes = GenShapes();
    for (const auto& s : shapes) {
      const int m = s[0];
      const int n = s[1];
      const int k = s[2];
      LogProblem(m, n, k, atrans, btrans);

      GemmBuffers buf = MakeBuffers(m, n, k);
      ComputeReference(atrans, btrans, m, n, k, alpha, beta, buf);

      PackedGemmMatrixB<T> Bp(btrans, k, n, alpha, buf.B.data());
      ExpectUnpackRoundTrip(btrans, n, k, buf.B, Bp);

      RunPackedGemm(atrans, m, buf.A, Bp, beta, buf.C);
      ExpectGemmMatches(buf.C, buf.C_ref, m, n, k);
    }
  }

 private:
  /* Operand and result buffers for a single GEMM problem. */
  struct GemmBuffers {
    aligned_vector<float> A; // m * k input elements
    aligned_vector<float> B; // k * n input elements
    aligned_vector<float> C; // m * n elements written by FBGEMM
    aligned_vector<float> C_ref; // m * n elements written by the reference
  };

  /* Prints the problem size and the active transpose flags to stderr. */
  static void LogProblem(
      int m,
      int n,
      int k,
      matrix_op_t atrans,
      matrix_op_t btrans) {
    std::cerr << "m = " << m << " n = " << n << " k = " << k;
    if (atrans == matrix_op_t::Transpose) {
      std::cerr << " A_transposed";
    }
    if (btrans == matrix_op_t::Transpose) {
      std::cerr << " B_transposed";
    }
    std::cerr << std::endl;
  }

  /*
   * Builds the buffers for an m x n x k problem. A and B are filled through
   * randFill(..., 0, 4) on integer vectors and then converted to float, so
   * the inputs are small whole numbers. Both output buffers start as NaN;
   * NaN never compares equal, so an element that is left unwritten fails the
   * later equality check.
   */
  static GemmBuffers MakeBuffers(int m, int n, int k) {
    aligned_vector<int> Aint(m * k);
    aligned_vector<int> Bint(k * n);
    randFill(Aint, 0, 4);
    randFill(Bint, 0, 4);

    GemmBuffers buf;
    buf.A.assign(Aint.begin(), Aint.end());
    buf.B.assign(Bint.begin(), Bint.end());
    buf.C.assign(static_cast<std::size_t>(m) * n, NAN);
    buf.C_ref = buf.C;
    return buf;
  }

  /*
   * Fills buf.C_ref with the reference sgemm result. The same alpha and beta
   * that are handed to FBGEMM are used here so the two computations cannot
   * silently diverge.
   */
  static void ComputeReference(
      matrix_op_t atrans,
      matrix_op_t btrans,
      int m,
      int n,
      int k,
      float alpha,
      float beta,
      GemmBuffers& buf) {
    // The reference works on private copies, keeping the buffers that are
    // later given to FBGEMM separate from the ones the reference touched.
    aligned_vector<float> A_ref(buf.A);
    aligned_vector<float> B_ref(buf.B);
    cblas_sgemm_ref(
        atrans,
        btrans,
        m,
        n,
        k,
        alpha,
        A_ref.data(),
        atrans == matrix_op_t::Transpose ? m : k,
        B_ref.data(),
        btrans == matrix_op_t::Transpose ? k : n,
        beta,
        buf.C_ref.data(),
        n);
  }

  /*
   * Runs cblas_gemm_compute on A and the packed B, writing into C. When
   * built with OpenMP the call is issued from every thread of a parallel
   * region, each passing its own thread id and the thread count.
   */
  static void RunPackedGemm(
      matrix_op_t atrans,
      int m,
      const aligned_vector<float>& A,
      const PackedGemmMatrixB<T>& Bp,
      float beta,
      aligned_vector<float>& C) {
#ifdef _OPENMP
#pragma omp parallel
#endif
    {
      const int num_threads = fbgemm_get_num_threads();
      const int tid = fbgemm_get_thread_num();
      cblas_gemm_compute(
          atrans, m, A.data(), Bp, beta, C.data(), tid, num_threads);
    }
  }

  /*
   * Expects C and C_ref to be exactly equal element by element (row stride
   * n). The failure message carries the full m, n, k shape because shapes
   * are random and all three values are needed to reproduce a failure.
   */
  static void ExpectGemmMatches(
      const aligned_vector<float>& C,
      const aligned_vector<float>& C_ref,
      int m,
      int n,
      int k) {
    for (int i = 0; i < m; ++i) {
      for (int j = 0; j < n; ++j) {
        const float expected = C_ref[i * n + j];
        const float actual = C[i * n + j];
        EXPECT_EQ(actual, expected)
            << "GEMM results differ at (" << i << ", " << j << "). ref "
            << expected << " FBGemm " << actual << " m: " << m << " n: " << n
            << " k: " << k;
      }
    }
  }

  /*
   * Unpacks Bp, expects the unpacked storage to equal the source B in its
   * first k * n elements, then packs Bp again. On return Bp is expected to
   * be in the packed state, ready for cblas_gemm_compute.
   */
  static void ExpectUnpackRoundTrip(
      matrix_op_t btrans,
      int n,
      int k,
      const aligned_vector<float>& B,
      PackedGemmMatrixB<T>& Bp) {
    EXPECT_TRUE(Bp.packed());

    // Snapshot the packed storage and hand it to unpackFromSrc as the source.
    aligned_vector<T> tmp(Bp.matSize());
    std::memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(T));
    Bp.unpackFromSrc(btrans, tmp.data());
    EXPECT_FALSE(Bp.packed());

    // Snapshot the now-unpacked storage and compare it with B by flat index.
    std::memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(T));
    const int num_elements = k * n;
    for (int idx = 0; idx < num_elements; ++idx) {
      // When T has the size of float16 the stored value is converted to
      // float before the comparison; otherwise it is compared as is.
      EXPECT_EQ(
          sizeof(T) == sizeof(float16) ? cpu_half2float(tmp[idx]) : tmp[idx],
          B[idx]);
    }

    // Pack it back from the unpacked snapshot.
    Bp.packFromSrc(btrans, tmp.data());
    EXPECT_TRUE(Bp.packed());
  }
};
} // namespace fbgemm