#ifndef AL_NUMERIC_H
#define AL_NUMERIC_H

#include <cstddef>
#include <cstdint>
#ifdef HAVE_INTRIN_H
#include <intrin.h>
#endif
#ifdef HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
#endif

#include "opthelpers.h"


inline constexpr int64_t operator "" _i64(unsigned long long int n) noexcept { return static_cast<int64_t>(n); }
inline constexpr uint64_t operator "" _u64(unsigned long long int n) noexcept { return static_cast<uint64_t>(n); }
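
/* Note (illustrative): these suffixes pin literals to the fixed-width 64-bit
 * types, e.g. 0x0f0f0f0f0f0f0f0f_u64 in the fallback popcnt below, where a
 * plain literal's type would vary by platform.
 */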


constexpr inline float minf(float a, float b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline float maxf(float a, float b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline float clampf(float val, float min, float max) noexcept
{ return minf(max, maxf(min, val)); }

constexpr inline double mind(double a, double b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline double maxd(double a, double b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline double clampd(double val, double min, double max) noexcept
{ return mind(max, maxd(min, val)); }

constexpr inline unsigned int minu(unsigned int a, unsigned int b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline unsigned int maxu(unsigned int a, unsigned int b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline unsigned int clampu(unsigned int val, unsigned int min, unsigned int max) noexcept
{ return minu(max, maxu(min, val)); }

constexpr inline int mini(int a, int b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline int maxi(int a, int b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline int clampi(int val, int min, int max) noexcept
{ return mini(max, maxi(min, val)); }

constexpr inline int64_t mini64(int64_t a, int64_t b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline int64_t maxi64(int64_t a, int64_t b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline int64_t clampi64(int64_t val, int64_t min, int64_t max) noexcept
{ return mini64(max, maxi64(min, val)); }

constexpr inline uint64_t minu64(uint64_t a, uint64_t b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline uint64_t maxu64(uint64_t a, uint64_t b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline uint64_t clampu64(uint64_t val, uint64_t min, uint64_t max) noexcept
{ return minu64(max, maxu64(min, val)); }

constexpr inline size_t minz(size_t a, size_t b) noexcept
{ return ((a > b) ? b : a); }
constexpr inline size_t maxz(size_t a, size_t b) noexcept
{ return ((a > b) ? a : b); }
constexpr inline size_t clampz(size_t val, size_t min, size_t max) noexcept
{ return minz(max, maxz(min, val)); }
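
/* Illustrative compile-time checks (not part of the upstream header); the
 * min/max/clamp helpers above are all constexpr, so static_assert can
 * exercise them.
 */
static_assert(clampf(1.5f, 0.0f, 1.0f) == 1.0f, "clampf limits to the max bound");
static_assert(clampi(-5, 0, 10) == 0, "clampi limits to the min bound");
static_assert(maxu64(3_u64, 7_u64) == 7_u64, "maxu64 picks the larger value");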


/** Find the smallest power of two not less than value (powers of two map to
 * themselves; 0 maps to 1).
 */
inline uint32_t NextPowerOf2(uint32_t value) noexcept
{
    if(value > 0)
    {
        value--;
        value |= value>>1;
        value |= value>>2;
        value |= value>>4;
        value |= value>>8;
        value |= value>>16;
    }
    return value+1;
}
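
/* Worked examples: NextPowerOf2(17) == 32, NextPowerOf2(32) == 32, and
 * NextPowerOf2(0) == 1. The OR cascade smears the highest set bit into every
 * lower position, leaving 2^n - 1 for the final increment.
 */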

/** Round up a value to the next multiple of r (r must be non-zero). */
inline size_t RoundUp(size_t value, size_t r) noexcept
{
    value += r-1;
    return value - (value%r);
}
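
/* Worked examples: RoundUp(10, 4) == 12 and RoundUp(12, 4) == 12; exact
 * multiples of r are returned unchanged.
 */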


/* Define CTZ macros (count trailing zeros), and POPCNT macros (population
 * count/count 1 bits), for 32- and 64-bit integers. The CTZ macros' results
 * are *UNDEFINED* if the value is 0.
 */
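/* Usage examples: CTZ32(0x28) == 3 (0x28 == 0b101000); POPCNT32(0xf0) == 4. */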
#ifdef __GNUC__

#define POPCNT32 __builtin_popcount
#define CTZ32 __builtin_ctz
#if SIZEOF_LONG == 8
#define POPCNT64 __builtin_popcountl
#define CTZ64 __builtin_ctzl
#else
#define POPCNT64 __builtin_popcountll
#define CTZ64 __builtin_ctzll
#endif

#elif defined(HAVE_BITSCANFORWARD64_INTRINSIC)

inline int msvc64_popcnt32(uint32_t v)
{ return (int)__popcnt(v); }
#define POPCNT32 msvc64_popcnt32
inline int msvc64_ctz32(uint32_t v)
{
    unsigned long idx = 32;
    _BitScanForward(&idx, v);
    return (int)idx;
}
#define CTZ32 msvc64_ctz32

inline int msvc64_popcnt64(uint64_t v)
{ return (int)__popcnt64(v); }
#define POPCNT64 msvc64_popcnt64
inline int msvc64_ctz64(uint64_t v)
{
    unsigned long idx = 64;
    _BitScanForward64(&idx, v);
    return (int)idx;
}
#define CTZ64 msvc64_ctz64

#elif defined(HAVE_BITSCANFORWARD_INTRINSIC)

inline int msvc_popcnt32(uint32_t v)
{ return (int)__popcnt(v); }
#define POPCNT32 msvc_popcnt32
inline int msvc_ctz32(uint32_t v)
{
    unsigned long idx = 32;
    _BitScanForward(&idx, v);
    return (int)idx;
}
#define CTZ32 msvc_ctz32

inline int msvc_popcnt64(uint64_t v)
{ return (int)(__popcnt((uint32_t)v) + __popcnt((uint32_t)(v>>32))); }
#define POPCNT64 msvc_popcnt64
inline int msvc_ctz64(uint64_t v)
{
    unsigned long idx = 64;
    if(!_BitScanForward(&idx, (uint32_t)(v&0xffffffff)))
    {
        if(_BitScanForward(&idx, (uint32_t)(v>>32)))
            idx += 32;
    }
    return (int)idx;
}
#define CTZ64 msvc_ctz64

#else

/* There be black magics here. The popcnt method is derived from
 * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
 * while the popcnt-based ctz algorithm is the ntz2 variant shown at
 * http://www.hackersdelight.org/hdcodetxt/ntz.c.txt
 * These likely aren't the most efficient methods, but they're good enough if
 * the GCC or MSVC intrinsics aren't available.
 */
inline int fallback_popcnt32(uint32_t v)
{
    v = v - ((v >> 1) & 0x55555555u);                 /* per-2-bit sums */
    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); /* per-4-bit sums */
    v = (v + (v >> 4)) & 0x0f0f0f0fu;                 /* per-byte sums */
    return (int)((v * 0x01010101u) >> 24);            /* total in the top byte */
}
#define POPCNT32 fallback_popcnt32
inline int fallback_ctz32(uint32_t value)
{ return fallback_popcnt32(~value & (value - 1)); }
#define CTZ32 fallback_ctz32
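
/* The ~value & (value - 1) trick turns the trailing zeros into ones and
 * clears everything else: for value = 0b10100, value - 1 = 0b10011 and
 * ~value = ...01011, so the AND leaves 0b00011, whose popcount (2) is the
 * number of trailing zeros.
 */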

/* Same SWAR reduction as the 32-bit version, with the masks widened to 64
 * bits.
 */
inline int fallback_popcnt64(uint64_t v)
{
    v = v - ((v >> 1) & 0x5555555555555555_u64);
    v = (v & 0x3333333333333333_u64) + ((v >> 2) & 0x3333333333333333_u64);
    v = (v + (v >> 4)) & 0x0f0f0f0f0f0f0f0f_u64;
    return (int)((v * 0x0101010101010101_u64) >> 56);
}
#define POPCNT64 fallback_popcnt64
inline int fallback_ctz64(uint64_t value)
{ return fallback_popcnt64(~value & (value - 1)); }
#define CTZ64 fallback_ctz64
#endif


/**
 * Fast float-to-int conversion. No particular rounding mode is assumed; the
 * IEEE-754 default is round-to-nearest with ties-to-even, though an app could
 * change it on its own threads. On some systems, a truncating conversion may
 * always be the fastest method.
 */
inline int fastf2i(float f) noexcept
{
#if defined(HAVE_SSE_INTRINSICS)
    return _mm_cvt_ss2si(_mm_set_ss(f));

#elif defined(_MSC_VER) && defined(_M_IX86_FP)

    int i;
    __asm fld f
    __asm fistp i
    return i;

#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))

    int i;
#ifdef __SSE_MATH__
    __asm__("cvtss2si %1, %0" : "=r"(i) : "x"(f));
#else
    __asm__ __volatile__("fistpl %0" : "=m"(i) : "t"(f) : "st");
#endif
    return i;

#else

    return static_cast<int>(f);
#endif
}
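
/* Note (illustrative): under the IEEE-754 default round-to-nearest-even mode,
 * fastf2i(2.5f) is typically 2 and fastf2i(3.5f) is typically 4, while the
 * truncating fallback gives 2 and 3.
 */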

/** Converts a float to an int using standard behavior (truncation toward zero). */
inline int float2int(float f) noexcept
{
#if defined(HAVE_SSE_INTRINSICS)
    return _mm_cvtt_ss2si(_mm_set_ss(f));

#elif ((defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) && \
       !defined(__SSE_MATH__)) || (defined(_MSC_VER) && defined(_M_IX86_FP) && _M_IX86_FP == 0)
    int sign, shift, mant;
    union {
        float f;
        int i;
    } conv;

    conv.f = f;
    sign = (conv.i>>31) | 1;
    shift = ((conv.i>>23)&0xff) - (127+23);

    /* Over/underflow */
    if(UNLIKELY(shift >= 31 || shift < -23))
        return 0;

    mant = (conv.i&0x7fffff) | 0x800000;
    if(LIKELY(shift < 0))
        return (mant >> -shift) * sign;
    return (mant << shift) * sign;

#else

    return static_cast<int>(f);
#endif
}
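
/* Examples: float2int(-1.7f) == -1 and float2int(2.9f) == 2, matching a plain
 * cast's truncation toward zero regardless of the current rounding mode.
 */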

/**
 * Rounds a float to the nearest integral value, according to the current
 * rounding mode. This is essentially an inlined version of rintf, although it
 * makes fewer promises (e.g. -0 or -0.25 rounded to 0 may result in +0).
 */
inline float fast_roundf(float f) noexcept
{
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) && \
    !defined(__SSE_MATH__)

    float out;
    __asm__ __volatile__("frndint" : "=t"(out) : "0"(f));
    return out;

#else

    /* Integral limit, where sub-integral precision is not available for
     * floats.
     */
    static constexpr float ilim[2] = {
         8388608.0f /*  0x1.0p+23 */,
        -8388608.0f /* -0x1.0p+23 */
    };
    unsigned int sign, expo;
    union {
        float f;
        unsigned int i;
    } conv;

    conv.f = f;
    sign = (conv.i>>31)&0x01;
    expo = (conv.i>>23)&0xff;

    if(UNLIKELY(expo >= 150/*+23*/))
    {
        /* An exponent (base-2) of 23 or higher is incapable of sub-integral
         * precision, so it's already an integral value. We don't need to worry
         * about infinity or NaN here.
         */
        return f;
    }
    /* Adding the integral limit to the value (with a matching sign) forces a
     * result that has no sub-integral precision, and is consequently forced to
     * round to an integral value. Removing the integral limit then restores
     * the initial value rounded to the integral. The compiler should not
     * optimize this out because of non-associative rules on floating-point
     * math (as long as you don't use -fassociative-math,
     * -funsafe-math-optimizations, -ffast-math, or -Ofast, in which case this
     * may break).
     */
    f += ilim[sign];
    return f - ilim[sign];
#endif
}
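
/* Example (assuming the default rounding mode): fast_roundf(2.5f) == 2.0f and
 * fast_roundf(3.5f) == 4.0f, matching rintf's ties-to-even behavior.
 */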

#endif /* AL_NUMERIC_H */