112 lines
3.0 KiB
Plaintext
112 lines
3.0 KiB
Plaintext
/*
|
|
* Copyright 2014 Maxim Milakov
|
|
*
|
|
* The code is based on the Chapter 10 of Hacker's Delight book by Henry S. Warren, Jr.
|
|
* The struct is adapted from https://github.com/milakov/int_fastdiv/blob/master/int_fastdiv.h
|
|
* by Maxim Milakov, the difference is that here we use uint32_t instead of int32_t.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
#ifndef FLASHINFER_FASTDIV_CUH_
|
|
#define FLASHINFER_FASTDIV_CUH_
|
|
#include <cstdint>
|
|
|
|
namespace flashinfer {
|
|
|
|
struct uint_fastdiv {
|
|
uint32_t d;
|
|
uint32_t m;
|
|
uint32_t s;
|
|
uint32_t a;
|
|
|
|
__host__ __device__ uint_fastdiv() : d(0), m(0), s(0), a(0) {}
|
|
|
|
__host__ uint_fastdiv(uint32_t d) : d(d) {
|
|
unsigned int p, nc, delta, q1, r1, q2, r2;
|
|
a = 0;
|
|
nc = unsigned(-1) - unsigned(-d) % d;
|
|
p = 31;
|
|
q1 = 0x80000000 / nc;
|
|
r1 = 0x80000000 - q1 * nc;
|
|
q2 = 0x7FFFFFFF / d;
|
|
r2 = 0x7FFFFFFF - q2 * d;
|
|
do {
|
|
p++;
|
|
if (r1 >= nc - r1) {
|
|
q1 = 2 * q1 + 1;
|
|
r1 = 2 * r1 - nc;
|
|
} else {
|
|
q1 = 2 * q1;
|
|
r1 = 2 * r1;
|
|
}
|
|
if (r2 + 1 >= d - r2) {
|
|
if (q2 >= 0x7FFFFFFF) a = 1;
|
|
q2 = 2 * q2 + 1;
|
|
r2 = 2 * r2 + 1 - d;
|
|
} else {
|
|
if (q2 >= 0x80000000) a = 1;
|
|
q2 = 2 * q2;
|
|
r2 = 2 * r2 + 1;
|
|
}
|
|
delta = d - 1 - r2;
|
|
} while (p < 64 && (q1 < delta || (q1 == delta && r1 == 0)));
|
|
m = q2 + 1;
|
|
s = p - 32;
|
|
}
|
|
|
|
__host__ __device__ __forceinline__ operator unsigned int() const { return d; }
|
|
|
|
__host__ __device__ __forceinline__ void divmod(uint32_t n, uint32_t& q, uint32_t& r) const {
|
|
if (d == 1) {
|
|
q = n;
|
|
} else {
|
|
#ifdef __CUDA_ARCH__
|
|
q = __umulhi(m, n);
|
|
#else
|
|
q = (((unsigned long long)((long long)m * (long long)n)) >> 32);
|
|
#endif
|
|
q += a * n;
|
|
q >>= s;
|
|
}
|
|
r = n - q * d;
|
|
}
|
|
};
|
|
|
|
__host__ __device__ __forceinline__ uint32_t operator/(const uint32_t n,
|
|
const uint_fastdiv& divisor) {
|
|
uint32_t q;
|
|
if (divisor.d == 1) {
|
|
q = n;
|
|
} else {
|
|
#ifdef __CUDA_ARCH__
|
|
q = __umulhi(divisor.m, n);
|
|
#else
|
|
q = (((unsigned long long)((long long)divisor.m * (long long)n)) >> 32);
|
|
#endif
|
|
q += divisor.a * n;
|
|
q >>= divisor.s;
|
|
}
|
|
return q;
|
|
}
|
|
|
|
__host__ __device__ __forceinline__ uint32_t operator%(const uint32_t n,
|
|
const uint_fastdiv& divisor) {
|
|
uint32_t quotient = n / divisor;
|
|
uint32_t remainder = n - quotient * divisor;
|
|
return remainder;
|
|
}
|
|
|
|
} // namespace flashinfer
|
|
|
|
#endif // FLASHINFER_FASTDIV_CUH_
|