/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <random>
#include <string>
#include <typeinfo>
#include <vector>

#if defined(__x86_64__) || defined(__i386__) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
#include <immintrin.h>
#endif

#ifdef USE_BLAS
#if __APPLE__
// Not sure whether we need to differentiate TARGET_OS_MAC, TARGET_OS_IPHONE,
// etc.
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef USE_MKL
#include <mkl.h>
#endif

#include "./AlignedVec.h"
#include "fbgemm/FbgemmBuild.h"
#include "fbgemm/FbgemmPackMatrixB.h"
#include "src/RefImplementations.h"

namespace fbgemm {

template <typename T>
void randFill(aligned_vector<T>& vec, T low, T high);

void llc_flush(std::vector<char>& llc);

#ifdef __aarch64__
void aarch64_flush_cache_line(void* addr);
#endif // __aarch64__

// Same as omp_get_max_threads() when OpenMP is available, otherwise 1
int fbgemm_get_max_threads();
// Same as omp_get_num_threads() when OpenMP is available, otherwise 1
int fbgemm_get_num_threads();
// Same as omp_get_thread_num() when OpenMP is available, otherwise 0
int fbgemm_get_thread_num();

// Flush the cache lines backing `vec` from all cache levels and return a
// dummy value that depends on the data so the reads are not optimized away.
template <typename T>
NOINLINE float cache_evict(const T& vec) {
  auto const size = vec.size();
  auto const elemSize = sizeof(typename T::value_type);
  auto const dataSize = size * elemSize;

  const char* data = reinterpret_cast<const char*>(vec.data());
  constexpr int CACHE_LINE_SIZE = 64;
  // Without this dummy computation, the computation that follows slows down
  // significantly.
  float dummy = 0.0f;
  for (std::size_t i = 0; i < dataSize; i += CACHE_LINE_SIZE) {
    dummy += data[i] * 1.0f;
#ifndef __aarch64__
    _mm_mfence();
#endif
#ifndef _MSC_VER
    asm volatile("" ::: "memory");
#endif
#if defined(__x86_64__) || defined(__i386__) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
    _mm_clflush(&data[i]);
#elif defined(__aarch64__)
    aarch64_flush_cache_line((char*)&data[i]);
#else
    __builtin___clear_cache(
        (char*)&data[i], (char*)(&data[i] + CACHE_LINE_SIZE));
#endif
  }

  return dummy;
}
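
// Illustrative sketch (not part of the original API): cache_evict is meant to
// be called on benchmark operands between timed iterations, e.g. from the
// eviction functor passed to measureWithWarmup below. The vector name is
// hypothetical.
//
//   aligned_vector<float> A(1024 * 1024);
//   cache_evict(A); // A's backing storage is now flushed from the caches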

/**
 * Parse application command line arguments
 */
int parseArgumentInt(
    int argc,
    const char* argv[],
    const char* arg,
    int non_exist_val,
    int def_val);

bool parseArgumentBool(
    int argc,
    const char* argv[],
    const char* arg,
    bool def_val);
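
// Illustrative sketch of how these parsers are typically used (the flag
// names, the "=" suffix convention, and the chosen default values are
// assumptions for the example, not part of this header):
//
//   int repetitions = parseArgumentInt(argc, argv, "--repit=", 1, 1);
//   bool no_flush = parseArgumentBool(argc, argv, "--no-flush", false);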

namespace {
struct empty_flush {
  void operator()() const {}
};
} // namespace

/**
 * @param Fn functor to execute
 * @param Fe data eviction functor
 * @return average time per measured iteration, in seconds
 */
template <class Fn, class Fe = std::function<void()>>
double measureWithWarmup(
    Fn&& fn,
    int warmupIterations,
    int measuredIterations,
    const Fe& fe = empty_flush(),
    bool useOpenMP = false) {
  for (int i = 0; i < warmupIterations; ++i) {
    // Evict data first
    fe();
    fn();
  }

  double ttot = 0.0;

#ifdef _OPENMP
#pragma omp parallel if (useOpenMP)
  {
#endif
    for (int i = 0; i < measuredIterations; ++i) {
      std::chrono::time_point<std::chrono::high_resolution_clock> start, end;

      const auto thread_id = useOpenMP ? fbgemm_get_thread_num() : 0;

      if (thread_id == 0) {
        fe();
      }

#ifdef _OPENMP
      if (useOpenMP) {
#pragma omp barrier
      }
#endif
      start = std::chrono::high_resolution_clock::now();

      fn();

#ifdef _OPENMP
      if (useOpenMP) {
#pragma omp barrier
      }
#endif

      end = std::chrono::high_resolution_clock::now();
      auto dur =
          std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

      if (thread_id == 0) {
        // TODO: measure load imbalance
        ttot += dur.count();
      }
    }

#ifdef _OPENMP
  }
#endif
  return ttot / 1e9 / measuredIterations;
}
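
// Illustrative sketch of a typical call (the kernel lambda body and the
// buffer size are assumptions for the example, not part of this header):
//
//   std::vector<char> llc(128 * 1024 * 1024);
//   double avgSecs = measureWithWarmup(
//       [&]() { /* run the kernel being benchmarked */ },
//       /*warmupIterations=*/3,
//       /*measuredIterations=*/10,
//       [&]() { llc_flush(llc); });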

/*
 * @brief Out-of-place transposition for an M*N matrix ref.
 * @param M number of rows in input
 * @param N number of columns in input
 * @param ld_src leading dimension of src
 * @param ld_dst leading dimension of dst
 */
template <typename T>
void transpose_matrix(
    int M,
    int N,
    const T* src,
    int ld_src,
    T* dst,
    int ld_dst) {
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) {
      dst[i * ld_dst + j] = src[i + j * ld_src];
    }
  } // for each output row
}
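
// Worked example (illustrative): transposing a 2x3 row-major matrix into a
// 3x2 destination with tight leading dimensions.
//
//   float src[2 * 3] = {1, 2, 3,
//                       4, 5, 6};
//   float dst[3 * 2];
//   transpose_matrix(2, 3, src, /*ld_src=*/3, dst, /*ld_dst=*/2);
//   // dst now holds {1, 4, 2, 5, 3, 6}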

/*
 * @brief In-place transposition for an n*k matrix ref.
 * @param n number of rows in input (number of columns in output)
 * @param k number of columns in input (number of rows in output)
 */
template <typename T>
void transpose_matrix(T* ref, int n, int k) {
  std::vector<T> local(n * k);
  transpose_matrix(n, k, ref, k, local.data(), n);
  memcpy(ref, local.data(), n * k * sizeof(T));
}
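
// Usage note (illustrative): for a 3x4 row-major buffer `buf`,
// transpose_matrix(buf, 3, 4) rewrites it in place as the 4x3 transpose.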

#ifdef USE_MKL
void test_xerbla(char* srname, const int* info, int);
#endif

#define dataset 1

template <typename btype>
void performance_test(
    int num_instances,
    bool flush,
    int repetitions,
    bool is_mkl) {
#ifdef USE_MKL
  mkl_set_xerbla((XerblaEntry)test_xerbla);
#endif
  (void)is_mkl; // Suppress unused variable warning

  float alpha = 1.f, beta = 1.f;
  matrix_op_t btran = matrix_op_t::Transpose;

#if dataset == 1
  const int NITER = (flush) ? 10 : 100;
  std::vector<std::vector<int>> shapes;
  for (auto m = 1; m < 120; m++) {
    // shapes.push_back({m, 128, 512});
    shapes.push_back({m, 512, 512});
  }

#elif dataset == 2
  const int NITER = (flush) ? 10 : 100;
#include "shapes_dataset.h"

#else
  flush = false;
  constexpr int NITER = 1;
  std::vector<std::vector<int>> shapes;
  std::random_device r;
  std::default_random_engine generator(r());
  std::uniform_int_distribution<int> dm(1, 100);
  std::uniform_int_distribution<int> dnk(1, 1024);
  for (int i = 0; i < 1000; i++) {
    int m = dm(generator);
    int n = dnk(generator);
    int k = dnk(generator);
    shapes.push_back({m, n, k});
  }
#endif

  std::string type;
  double gflops, gbs, ttot;
  for (auto s : shapes) {
    int m = s[0];
    int n = s[1];
    int k = s[2];

    // initialize with small numbers
    aligned_vector<int> Aint(m * k);
    randFill(Aint, 0, 4);
    std::vector<aligned_vector<float>> A;
    for (int i = 0; i < num_instances; ++i) {
      A.push_back(aligned_vector<float>(Aint.begin(), Aint.end()));
    }

    aligned_vector<int> Bint(k * n);
    randFill(Bint, 0, 4);
    aligned_vector<float> B(Bint.begin(), Bint.end());
    std::vector<std::unique_ptr<PackedGemmMatrixB<btype>>> Bp;
    for (int i = 0; i < num_instances; ++i) {
      Bp.emplace_back(std::unique_ptr<PackedGemmMatrixB<btype>>(
          new PackedGemmMatrixB<btype>(btran, k, n, alpha, B.data())));
    }
    auto kAligned = ((k * sizeof(float) + 64) & ~63) / sizeof(float);
    auto nAligned = ((n * sizeof(float) + 64) & ~63) / sizeof(float);
    std::vector<aligned_vector<float>> Bt(num_instances);
    auto& Bt_ref = Bt[0];

    if (btran == matrix_op_t::Transpose) {
      Bt_ref.resize(k * nAligned);
      for (auto row = 0; row < k; ++row) {
        for (auto col = 0; col < n; ++col) {
          Bt_ref[row * nAligned + col] = alpha * B[col * k + row];
        }
      }
    } else {
      Bt_ref.resize(kAligned * n);
      for (auto row = 0; row < k; ++row) {
        for (auto col = 0; col < n; ++col) {
          Bt_ref[col * kAligned + row] = alpha * B[col * k + row];
        }
      }
    }

    for (auto i = 1; i < num_instances; ++i) {
      Bt[i] = Bt_ref;
    }

    std::vector<aligned_vector<float>> C_ref;
    std::vector<aligned_vector<float>> C_fb;
    if (beta != 0.0f) {
      aligned_vector<int> Cint(m * n);
      randFill(Cint, 0, 4);
      for (int i = 0; i < num_instances; ++i) {
        C_ref.push_back(aligned_vector<float>(Cint.begin(), Cint.end()));
        C_fb.push_back(aligned_vector<float>(Cint.begin(), Cint.end()));
      }
    } else {
      for (int i = 0; i < num_instances; ++i) {
        C_ref.push_back(aligned_vector<float>(m * n, 1.f));
        C_fb.push_back(aligned_vector<float>(m * n, NAN));
      }
    }

    double nflops = 2.0 * m * n * k;
    double nbytes = 4.0 * m * k + sizeof(btype) * 1.0 * k * n + 4.0 * m * n;

    // Warm up MKL and fbgemm; check correctness at the same time.
    for (auto w = 0; w < 3; w++) {
#if defined(USE_MKL) || defined(USE_BLAS)
      cblas_sgemm(
          CblasRowMajor,
          CblasNoTrans,
          CblasNoTrans, // B is pretransposed, if required by operation
          m,
          n,
          k,
          1.0, // Multiplication by alpha is done during transpose of B
          A[0].data(),
          k,
          Bt[0].data(),
          btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
          beta,
          C_ref[0].data(),
          n);
#else
      cblas_sgemm_ref(
          matrix_op_t::NoTranspose,
          matrix_op_t::NoTranspose,
          m,
          n,
          k,
          1.0,
          A[0].data(),
          k,
          Bt[0].data(),
          (btran == matrix_op_t::NoTranspose) ? kAligned : nAligned,
          beta,
          C_ref[0].data(),
          n);
#endif
#ifdef _OPENMP
#pragma omp parallel if (num_instances == 1)
#endif
      {
        int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1;
        int tid = num_instances == 1 ? fbgemm_get_thread_num() : 0;
        cblas_gemm_compute(
            matrix_op_t::NoTranspose,
            m,
            A[0].data(),
            *Bp[0],
            beta,
            C_fb[0].data(),
            tid,
            num_threads);
      }

#if defined(USE_MKL) || defined(USE_BLAS)
      // Compare results
      for (size_t i = 0; i < C_ref[0].size(); i++) {
        if (std::abs(C_ref[0][i] - C_fb[0][i]) > 1e-3) {
          fprintf(
              stderr,
              "Error: too large a diff between fp32 ref %f and fp16 %f at %zu\n",
              C_ref[0][i],
              C_fb[0][i],
              i);
          return;
        }
      }
#endif
    }

#ifdef USE_MKL
    if (is_mkl) {
      // Gold via MKL sgemm
      type = "MKL_FP32";
#elif defined(USE_BLAS)
    type = "BLAS_FP32";
#else
    type = "REF_FP32";
#endif

    ttot = measureWithWarmup(
        [&]() {
          int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
          for (int i = 0; i < repetitions; ++i) {
#if defined(USE_MKL) || defined(USE_BLAS)
            cblas_sgemm(
                CblasRowMajor,
                CblasNoTrans,
                CblasNoTrans,
                m,
                n,
                k,
                1.0,
                A[copy].data(),
                k,
                Bt[copy].data(),
                btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
                beta,
                C_ref[copy].data(),
                n);
#else
            cblas_sgemm_ref(
                matrix_op_t::NoTranspose,
                matrix_op_t::NoTranspose,
                m,
                n,
                k,
                1.0,
                A[copy].data(),
                k,
                Bt[copy].data(),
                (btran == matrix_op_t::NoTranspose) ? kAligned : nAligned,
                beta,
                C_ref[copy].data(),
                n);
#endif
          }
        },
        3,
        NITER,
        [&]() {
          if (flush) {
            int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
            cache_evict(A[copy]);
            cache_evict(Bt[copy]);
            cache_evict(C_ref[copy]);
          }
        },
        // Use OpenMP if num instances > 1
        num_instances > 1);

    gflops = nflops / ttot / 1e9;
    gbs = nbytes / ttot / 1e9;
    printf(
        "\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
        type.c_str(),
        m,
        n,
        k,
        gflops * repetitions,
        gbs * repetitions);
#ifdef USE_MKL
    }
#endif
    type = "FBP_" + std::string(typeid(btype).name());

    ttot = measureWithWarmup(
        [&]() {
          // When executing in data-decomposition (single-instance) mode,
          // different threads access different regions of the same matrices,
          // so the copy to use is always 0 and the number of threads is the
          // number of threads in the parallel region.
          // When running in functional-decomposition (multi-instance) mode,
          // each instance works on its own matrices: the copy to use is
          // selected by the thread id, and the number of threads performing
          // the compute of one instance is 1.
          int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
          int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1;
          int tid = num_instances == 1 ? fbgemm_get_thread_num() : 0;

          for (int i = 0; i < repetitions; ++i) {
            cblas_gemm_compute(
                matrix_op_t::NoTranspose,
                m,
                A[copy].data(),
                *Bp[copy],
                beta,
                C_fb[copy].data(),
                tid,
                num_threads);
          }
        },
        3,
        NITER,
        [&]() {
          if (flush) {
            int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
            cache_evict(A[copy]);
            cache_evict(*Bp[copy]);
            cache_evict(C_fb[copy]);
          }
        },
        true /*useOpenMP*/);

    gflops = nflops / ttot / 1e9;
    gbs = nbytes / ttot / 1e9;
    printf(
        "%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
        type.c_str(),
        m,
        n,
        k,
        gflops * repetitions,
        gbs * repetitions);
  }
}
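
// Illustrative sketch of how a benchmark driver might call this (the float16
// type parameter and the argument values are assumptions for the example,
// not mandated by this header):
//
//   // one instance, flush caches between runs, one repetition per timing,
//   // compare against MKL when available
//   performance_test<float16>(
//       /*num_instances=*/1, /*flush=*/true, /*repetitions=*/1,
//       /*is_mkl=*/true);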

aligned_vector<float> getRandomSparseVector(
    unsigned size,
    float fractionNonZeros = 1.0);

template <typename T>
aligned_vector<T> getRandomBlockSparseMatrix(
    int Rows,
    int Cols,
    float fractionNonZerosBlocks = 1.0,
    int RowBlockSize = 4,
    int ColBlockSize = 1,
    T low = 0,
    T high = 9);

} // namespace fbgemm