4 files changed, 47 insertions, 79 deletions
diff --git a/Alc/ALu.c b/Alc/ALu.c
index 0c930a29..81914850 100644
--- a/Alc/ALu.c
+++ b/Alc/ALu.c
@@ -1092,7 +1092,7 @@ static void CalcNonAttnSourceParams(ALvoice *voice, const struct ALvoiceProps *p
     if(Pitch > (ALfloat)MAX_PITCH)
         voice->Step = MAX_PITCH<<FRACTIONBITS;
     else
-        voice->Step = maxi(fastf2i(Pitch*FRACTIONONE + 0.5f), 1);
+        voice->Step = maxi(fastf2i(Pitch * FRACTIONONE), 1);
     if(props->Resampler == BSinc24Resampler)
         BsincPrepare(voice->Step, &voice->ResampleState.bsinc, &bsinc24);
     else if(props->Resampler == BSinc12Resampler)
@@ -1453,7 +1453,7 @@ static void CalcAttnSourceParams(ALvoice *voice, const struct ALvoiceProps *prop
     if(Pitch > (ALfloat)MAX_PITCH)
         voice->Step = MAX_PITCH<<FRACTIONBITS;
     else
-        voice->Step = maxi(fastf2i(Pitch*FRACTIONONE + 0.5f), 1);
+        voice->Step = maxi(fastf2i(Pitch * FRACTIONONE), 1);
     if(props->Resampler == BSinc24Resampler)
         BsincPrepare(voice->Step, &voice->ResampleState.bsinc, &bsinc24);
     else if(props->Resampler == BSinc12Resampler)
@@ -1663,7 +1663,7 @@ static void ApplyDither(ALfloat (*restrict Samples)[BUFFERSIZE], ALuint *dither_
             ALuint rng0 = dither_rng(&seed);
             ALuint rng1 = dither_rng(&seed);
             val += (ALfloat)(rng0*(1.0/UINT_MAX) - rng1*(1.0/UINT_MAX));
-            samples[i] = roundf(val) * invscale;
+            samples[i] = fastf2i(val) * invscale;
         }
     }
     *dither_seed = seed;
diff --git a/Alc/fpu_modes.h b/Alc/fpu_modes.h
index 750252fc..eb305967 100644
--- a/Alc/fpu_modes.h
+++ b/Alc/fpu_modes.h
@@ -7,16 +7,13 @@
 
 
 typedef struct FPUCtl {
-#ifdef HAVE_FENV_H
-    fenv_t flt_env;
-#ifdef _WIN32
-    int round_mode;
-#endif
-#else
-    int state;
-#endif
-#ifdef HAVE_SSE
-    int sse_state;
+#if defined(__GNUC__) && defined(HAVE_SSE)
+    unsigned int sse_state;
+#elif defined(HAVE___CONTROL87_2)
+    unsigned int state;
+    unsigned int sse_state;
+#elif defined(HAVE__CONTROLFP)
+    unsigned int state;
 #endif
 } FPUCtl;
 void SetMixerFPUMode(FPUCtl *ctl);
diff --git a/Alc/helpers.c b/Alc/helpers.c
index c311ea2e..7bcb3f4a 100644
--- a/Alc/helpers.c
+++ b/Alc/helpers.c
@@ -269,81 +269,44 @@ void FillCPUCaps(int capfilter)
 
 void SetMixerFPUMode(FPUCtl *ctl)
 {
-#ifdef HAVE_FENV_H
-    fegetenv(&ctl->flt_env);
-#ifdef _WIN32
-    /* HACK: A nasty bug in MinGW-W64 causes fegetenv and fesetenv to not save
-     * and restore the FPU rounding mode, so we have to do it manually. Don't
-     * know if this also applies to MSVC.
-     */
-    ctl->round_mode = fegetround();
-#endif
-#if defined(__GNUC__) && defined(HAVE_SSE)
-    /* FIXME: Some fegetenv implementations can get the SSE environment too?
-     * How to tell when it does? */
-    if((CPUCapFlags&CPU_CAP_SSE))
-        __asm__ __volatile__("stmxcsr %0" : "=m" (*&ctl->sse_state));
-#endif
-
-#ifdef FE_TOWARDZERO
-    fesetround(FE_TOWARDZERO);
-#endif
 #if defined(__GNUC__) && defined(HAVE_SSE)
     if((CPUCapFlags&CPU_CAP_SSE))
     {
-        int sseState = ctl->sse_state;
-        sseState |= 0x6000; /* set round-to-zero */
+        __asm__ __volatile__("stmxcsr %0" : "=m" (*&ctl->sse_state));
+        unsigned int sseState = ctl->sse_state;
         sseState |= 0x8000; /* set flush-to-zero */
         if((CPUCapFlags&CPU_CAP_SSE2))
             sseState |= 0x0040; /* set denormals-are-zero */
         __asm__ __volatile__("ldmxcsr %0" : : "m" (*&sseState));
     }
-#endif
 
 #elif defined(HAVE___CONTROL87_2)
 
-    int mode;
-    __control87_2(0, 0, &ctl->state, NULL);
-    __control87_2(_RC_CHOP, _MCW_RC, &mode, NULL);
-#ifdef HAVE_SSE
-    if((CPUCapFlags&CPU_CAP_SSE))
-    {
-        __control87_2(0, 0, NULL, &ctl->sse_state);
-        __control87_2(_RC_CHOP|_DN_FLUSH, _MCW_RC|_MCW_DN, NULL, &mode);
-    }
-#endif
+    __control87_2(0, 0, &ctl->state, &ctl->sse_state);
+    _control87(_DN_FLUSH, _MCW_DN);
 
 #elif defined(HAVE__CONTROLFP)
 
     ctl->state = _controlfp(0, 0);
-    (void)_controlfp(_RC_CHOP, _MCW_RC);
+    _controlfp(_DN_FLUSH, _MCW_DN);
 #endif
 }
 
 void RestoreFPUMode(const FPUCtl *ctl)
 {
-#ifdef HAVE_FENV_H
-    fesetenv(&ctl->flt_env);
-#ifdef _WIN32
-    fesetround(ctl->round_mode);
-#endif
 #if defined(__GNUC__) && defined(HAVE_SSE)
     if((CPUCapFlags&CPU_CAP_SSE))
         __asm__ __volatile__("ldmxcsr %0" : : "m" (*&ctl->sse_state));
-#endif
 
 #elif defined(HAVE___CONTROL87_2)
 
     int mode;
-    __control87_2(ctl->state, _MCW_RC, &mode, NULL);
-#ifdef HAVE_SSE
-    if((CPUCapFlags&CPU_CAP_SSE))
-        __control87_2(ctl->sse_state, _MCW_RC|_MCW_DN, NULL, &mode);
-#endif
+    __control87_2(ctl->state, _MCW_DN, &mode, NULL);
+    __control87_2(ctl->sse_state, _MCW_DN, NULL, &mode);
 
 #elif defined(HAVE__CONTROLFP)
 
-    _controlfp(ctl->state, _MCW_RC);
+    _controlfp(ctl->state, _MCW_DN);
 #endif
 }
 
diff --git a/OpenAL32/Include/alMain.h b/OpenAL32/Include/alMain.h
index 1cf1e5e2..0cab5a17 100644
--- a/OpenAL32/Include/alMain.h
+++ b/OpenAL32/Include/alMain.h
@@ -226,36 +226,44 @@ inline size_t RoundUp(size_t value, size_t r)
     return value - (value%r);
 }
 
-/* Fast float-to-int conversion. Assumes the FPU is already in round-to-zero
- * mode. */
+/* Fast float-to-int conversion. No particular rounding mode is assumed; the
+ * IEEE-754 default is round-to-nearest with ties-to-even, though an app could
+ * change it on its own threads. The inline-asm and lrintf paths round using
+ * the current mode, while the plain-cast fallback always truncates.
+ */
 inline ALint fastf2i(ALfloat f)
 {
-#if (defined(__i386__) && !defined(__SSE_MATH__)) || (defined(_M_IX86_FP) && (_M_IX86_FP == 0))
-/* If using the x87 instruction set, try to use more efficient float-to-int
- * operations. The fistp instruction converts to integer efficiently enough,
- * but it isn't IEEE-754-compliant because it uses the current rounding mode
- * instead of always truncating -- the compiler will generate costly control
- * word changes with it to get correct behavior. If supported, lrintf converts
- * to integer using the current rounding mode, i.e. using fistp without control
- * word changes (if nothing even better is available). As long as the rounding
- * mode is set to round-to-zero ahead of time, and the call gets inlined, this
- * works fine.
- *
- * Other instruction sets, like SSE and ARM, have opcodes that inherently do
- * the right thing, and don't suffer from the same excessive performance
- * degredation from float-to-int conversions.
- */
-#ifdef HAVE_LRINTF
-    return lrintf(f);
-#elif defined(_MSC_VER) && defined(_M_IX86)
+#if defined(_MSC_VER) && defined(_M_IX86_FP)
     ALint i;
+#if _M_IX86_FP > 0
+    __asm cvtss2si eax, f __asm mov i, eax
+#else
     __asm fld f
     __asm fistp i
+#endif
     return i;
+
+#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
+
+    ALint i;
+#ifdef __SSE_MATH__
+    __asm__("cvtss2si %1, %0" : "=r"(i) : "x"(f));
 #else
-    return (ALint)f;
+    __asm__("flds %1\n fistpl %0" : "=m"(i) : "m"(f));
 #endif
+    return i;
+
+    /* On GCC when compiling with -fno-math-errno, lrintf can be inlined to
+     * some simple instructions. Clang does not inline it, always generating a
+     * libc call, while MSVC's implementation is horribly slow, so always fall
+     * back to a normal integer conversion for them.
+     */
+#elif defined(HAVE_LRINTF) && !defined(_MSC_VER) && !defined(__clang__)
+
+    return lrintf(f);
+
 #else
+
     return (ALint)f;
 #endif
 }