summaryrefslogtreecommitdiffstats
path: root/src/jogl/classes/com
diff options
context:
space:
mode:
authorSven Gothel <[email protected]>2023-04-05 23:36:25 +0200
committerSven Gothel <[email protected]>2023-04-05 23:36:25 +0200
commit10b60e10ece3cbc3e0b8a68ac73229371530e0ba (patch)
treedb89ceda1867ca3a42b45c18ab8f42895877c452 /src/jogl/classes/com
parent24113f8e3452df8c8bb9e6136fa12bfed3bcc312 (diff)
Matrix4f Perf: Enhance invert(), Drop (test) load on Matrix4f.mul(Matrix4f) for fair and realistic numbers - Both mul() ops faster than FloatUtil
Enhanced invert() of Matrix4f* and FloatUtil: Use 1f/det factor for burst scale. Enhanced Matrix4f.invert(..): Use factored-out mulScale() to deliver the scale, giving a good 10% advantage on aarch64 and amd64. Brings Matrix4f.invert(..) on par w/ FloatUtil, on aarch64 even a 14% advantage. +++ TestMatrix4f02MulNOUI added an additional Matrix4f.load() to the mul(Matrix4f) loop test, which surely is an extra burden and not realistic as the mul(Matrix4f, Matrix4f) and FloatUtil counterparts also don't count loading a value. Matrix4f.mul(Matrix4f) shall be used to utilize an already stored value anyway. Matrix4f.mul(Matrix4f) didn't really exist in FloatUtil. Same is true for Matrix4f.invert(), re-grouped order, i.e. pushing the non-arg variant last. +++ Revised performance numbers from commit 15e60161787224e85172685f74dc0ac195969b51 AMD64 + OpenJDK17 - FloatUtil.multMatrix(a, a_off, b, b_off, dest) is considerably slower than all - Matrix4f.mul(a, b) roughly ~10% faster than FloatUtil.multMatrix(a, b, dest) - Matrix4f.mul(b) roughly ~18% faster than FloatUtil.multMatrix(a, b, dest) (*) - Matrix4f.invert(a) roughly ~ 2% faster than FloatUtil.invertMatrix(..) - Matrix4f.invert() roughly ~ 4% slower than FloatUtil.invertMatrix(..) (*) - Launched: nice -19 scripts/tests-x64.sh RaspberryPi 4b aarch64 + OpenJDK17 - FloatUtil.multMatrix(a, a_off, b, b_off, dest) is considerably slower than all - Matrix4f.mul(a, b) roughly ~ 9% faster than FloatUtil.multMatrix(a, b, dest) - Matrix4f.mul(b) roughly ~14% faster than FloatUtil.multMatrix(a, b, dest) (*) - Matrix4f.invert(a) roughly ~14% faster than FloatUtil.invertMatrix(..) - Matrix4f.invert() roughly ~12% faster than FloatUtil.invertMatrix(..) (*) - Launched: nice -19 scripts/tests-linux-aarch64.sh (*) not a true comparison in feature, as operating on 'this' matrix values for one argument, unavailable to FloatUtil. Conclusion - Matrix4f.mul(..) is considerably faster! - Matrix4f.invert(..) 
faster, esp. on aarch64. Additional Matrix4fb tests using float[16], similar to FloatUtil, also demonstrate lower performance compared to Matrix4f using dedicated float fields.
Diffstat (limited to 'src/jogl/classes/com')
-rw-r--r--src/jogl/classes/com/jogamp/opengl/math/FloatUtil.java72
-rw-r--r--src/jogl/classes/com/jogamp/opengl/math/Matrix4f.java194
2 files changed, 177 insertions, 89 deletions
diff --git a/src/jogl/classes/com/jogamp/opengl/math/FloatUtil.java b/src/jogl/classes/com/jogamp/opengl/math/FloatUtil.java
index f793629d6..9ffa3bba6 100644
--- a/src/jogl/classes/com/jogamp/opengl/math/FloatUtil.java
+++ b/src/jogl/classes/com/jogamp/opengl/math/FloatUtil.java
@@ -919,27 +919,27 @@ public final class FloatUtil {
final float m44 = + a11*(a22*a33 - a23*a32) - a12*(a21*a33 - a23*a31) + a13*(a21*a32 - a22*a31);
final float det = (a11*m11 + a12*m12 + a13*m13 + a14*m14)/scale;
-
if( 0 == det ) {
return null;
}
-
- mres[0+4*0+mres_offset] = m11 / det;
- mres[1+4*0+mres_offset] = m12 / det;
- mres[2+4*0+mres_offset] = m13 / det;
- mres[3+4*0+mres_offset] = m14 / det;
- mres[0+4*1+mres_offset] = m21 / det;
- mres[1+4*1+mres_offset] = m22 / det;
- mres[2+4*1+mres_offset] = m23 / det;
- mres[3+4*1+mres_offset] = m24 / det;
- mres[0+4*2+mres_offset] = m31 / det;
- mres[1+4*2+mres_offset] = m32 / det;
- mres[2+4*2+mres_offset] = m33 / det;
- mres[3+4*2+mres_offset] = m34 / det;
- mres[0+4*3+mres_offset] = m41 / det;
- mres[1+4*3+mres_offset] = m42 / det;
- mres[2+4*3+mres_offset] = m43 / det;
- mres[3+4*3+mres_offset] = m44 / det;
+ final float invdet = 1.0f / det;
+
+ mres[0+4*0+mres_offset] = m11 * invdet;
+ mres[1+4*0+mres_offset] = m12 * invdet;
+ mres[2+4*0+mres_offset] = m13 * invdet;
+ mres[3+4*0+mres_offset] = m14 * invdet;
+ mres[0+4*1+mres_offset] = m21 * invdet;
+ mres[1+4*1+mres_offset] = m22 * invdet;
+ mres[2+4*1+mres_offset] = m23 * invdet;
+ mres[3+4*1+mres_offset] = m24 * invdet;
+ mres[0+4*2+mres_offset] = m31 * invdet;
+ mres[1+4*2+mres_offset] = m32 * invdet;
+ mres[2+4*2+mres_offset] = m33 * invdet;
+ mres[3+4*2+mres_offset] = m34 * invdet;
+ mres[0+4*3+mres_offset] = m41 * invdet;
+ mres[1+4*3+mres_offset] = m42 * invdet;
+ mres[2+4*3+mres_offset] = m43 * invdet;
+ mres[3+4*3+mres_offset] = m44 * invdet;
return mres;
}
@@ -1004,27 +1004,27 @@ public final class FloatUtil {
final float m44 = + a11*(a22*a33 - a23*a32) - a12*(a21*a33 - a23*a31) + a13*(a21*a32 - a22*a31);
final float det = (a11*m11 + a12*m12 + a13*m13 + a14*m14)/scale;
-
if( 0 == det ) {
return null;
}
-
- mres[0+4*0] = m11 / det;
- mres[1+4*0] = m12 / det;
- mres[2+4*0] = m13 / det;
- mres[3+4*0] = m14 / det;
- mres[0+4*1] = m21 / det;
- mres[1+4*1] = m22 / det;
- mres[2+4*1] = m23 / det;
- mres[3+4*1] = m24 / det;
- mres[0+4*2] = m31 / det;
- mres[1+4*2] = m32 / det;
- mres[2+4*2] = m33 / det;
- mres[3+4*2] = m34 / det;
- mres[0+4*3] = m41 / det;
- mres[1+4*3] = m42 / det;
- mres[2+4*3] = m43 / det;
- mres[3+4*3] = m44 / det;
+ final float invdet = 1.0f / det;
+
+ mres[0+4*0] = m11 * invdet;
+ mres[1+4*0] = m12 * invdet;
+ mres[2+4*0] = m13 * invdet;
+ mres[3+4*0] = m14 * invdet;
+ mres[0+4*1] = m21 * invdet;
+ mres[1+4*1] = m22 * invdet;
+ mres[2+4*1] = m23 * invdet;
+ mres[3+4*1] = m24 * invdet;
+ mres[0+4*2] = m31 * invdet;
+ mres[1+4*2] = m32 * invdet;
+ mres[2+4*2] = m33 * invdet;
+ mres[3+4*2] = m34 * invdet;
+ mres[0+4*3] = m41 * invdet;
+ mres[1+4*3] = m42 * invdet;
+ mres[2+4*3] = m43 * invdet;
+ mres[3+4*3] = m44 * invdet;
return mres;
}
diff --git a/src/jogl/classes/com/jogamp/opengl/math/Matrix4f.java b/src/jogl/classes/com/jogamp/opengl/math/Matrix4f.java
index 5951c7d98..6f4b2f38d 100644
--- a/src/jogl/classes/com/jogamp/opengl/math/Matrix4f.java
+++ b/src/jogl/classes/com/jogamp/opengl/math/Matrix4f.java
@@ -328,8 +328,9 @@ public class Matrix4f {
*
* @param dst float[16] array storage in column major order
* @param dst_off offset
+ * @return {@code dst} for chaining
*/
- public void get(final float[] dst, final int dst_off) {
+ public float[] get(final float[] dst, final int dst_off) {
dst[dst_off+0+0*4] = m00;
dst[dst_off+1+0*4] = m10;
dst[dst_off+2+0*4] = m20;
@@ -346,14 +347,16 @@ public class Matrix4f {
dst[dst_off+1+3*4] = m13;
dst[dst_off+2+3*4] = m23;
dst[dst_off+3+3*4] = m33;
+ return dst;
}
/**
* Get this matrix into the given float[16] array in column major order.
*
* @param dst float[16] array storage in column major order
+ * @return {@code dst} for chaining
*/
- public void get(final float[] dst) {
+ public float[] get(final float[] dst) {
dst[0+0*4] = m00;
dst[1+0*4] = m10;
dst[2+0*4] = m20;
@@ -370,6 +373,7 @@ public class Matrix4f {
dst[1+3*4] = m13;
dst[2+3*4] = m23;
dst[3+3*4] = m33;
+ return dst;
}
/**
@@ -380,8 +384,9 @@ public class Matrix4f {
* </p>
*
* @param dst {@link FloatBuffer} array storage in column major order
+ * @return {@code dst} for chaining
*/
- public void get(final FloatBuffer dst) {
+ public FloatBuffer get(final FloatBuffer dst) {
dst.put( m00 );
dst.put( m10 );
dst.put( m20 );
@@ -398,6 +403,7 @@ public class Matrix4f {
dst.put( m13 );
dst.put( m23 );
dst.put( m33 );
+ return dst;
}
//
@@ -490,35 +496,11 @@ public class Matrix4f {
*/
public boolean invert() {
final float scale;
- {
- float a = Math.abs(m00);
- float max = a;
-
- a = Math.abs(m01); if( a > max ) max = a;
- a = Math.abs(m02); if( a > max ) max = a;
- a = Math.abs(m03); if( a > max ) max = a;
-
- a = Math.abs(m10); if( a > max ) max = a;
- a = Math.abs(m11); if( a > max ) max = a;
- a = Math.abs(m12); if( a > max ) max = a;
- a = Math.abs(m13); if( a > max ) max = a;
-
- a = Math.abs(m20); if( a > max ) max = a;
- a = Math.abs(m21); if( a > max ) max = a;
- a = Math.abs(m22); if( a > max ) max = a;
- a = Math.abs(m23); if( a > max ) max = a;
-
- a = Math.abs(m30); if( a > max ) max = a;
- a = Math.abs(m31); if( a > max ) max = a;
- a = Math.abs(m32); if( a > max ) max = a;
- a = Math.abs(m33); if( a > max ) max = a;
-
- if( 0 == max ) {
- return false;
- }
- scale = 1.0f/max;
+ try {
+ scale = mulScale();
+ } catch(final ArithmeticException aex) {
+ return false; // max was 0
}
-
final float a00 = m00*scale;
final float a10 = m10*scale;
final float a20 = m20*scale;
@@ -560,30 +542,30 @@ public class Matrix4f {
final float b33 = + a00*(a11*a22 - a12*a21) - a01*(a10*a22 - a12*a20) + a02*(a10*a21 - a11*a20);
final float det = (a00*b00 + a01*b01 + a02*b02 + a03*b03) / scale;
-
if( 0 == det ) {
return false;
}
-
- m00 = b00 / det;
- m10 = b01 / det;
- m20 = b02 / det;
- m30 = b03 / det;
-
- m01 = b10 / det;
- m11 = b11 / det;
- m21 = b12 / det;
- m31 = b13 / det;
-
- m02 = b20 / det;
- m12 = b21 / det;
- m22 = b22 / det;
- m32 = b23 / det;
-
- m03 = b30 / det;
- m13 = b31 / det;
- m23 = b32 / det;
- m33 = b33 / det;
+ final float invdet = 1.0f / det;
+
+ m00 = b00 * invdet;
+ m10 = b01 * invdet;
+ m20 = b02 * invdet;
+ m30 = b03 * invdet;
+
+ m01 = b10 * invdet;
+ m11 = b11 * invdet;
+ m21 = b12 * invdet;
+ m31 = b13 * invdet;
+
+ m02 = b20 * invdet;
+ m12 = b21 * invdet;
+ m22 = b22 * invdet;
+ m32 = b23 * invdet;
+
+ m03 = b30 * invdet;
+ m13 = b31 * invdet;
+ m23 = b32 * invdet;
+ m33 = b33 * invdet;
return true;
}
@@ -593,7 +575,113 @@ public class Matrix4f {
* @return false if {@code src} matrix is singular and inversion not possible, otherwise true
*/
public boolean invert(final Matrix4f src) {
- return load(src).invert();
+ final float scale; // reciprocal of src's largest absolute element, as returned by mulScale()
+ try {
+ scale = src.mulScale();
+ } catch(final ArithmeticException aex) {
+ return false; // max was 0; NOTE(review): depends on mulScale() throwing ArithmeticException - plain float division by zero would return Infinity instead, confirm
+ }
+ final float a00 = src.m00*scale; // a**: elements of src pre-scaled by 'scale'; 'this' is not read, only written at the end
+ final float a10 = src.m10*scale;
+ final float a20 = src.m20*scale;
+ final float a30 = src.m30*scale;
+
+ final float a01 = src.m01*scale;
+ final float a11 = src.m11*scale;
+ final float a21 = src.m21*scale;
+ final float a31 = src.m31*scale;
+
+ final float a02 = src.m02*scale;
+ final float a12 = src.m12*scale;
+ final float a22 = src.m22*scale;
+ final float a32 = src.m32*scale;
+
+ final float a03 = src.m03*scale;
+ final float a13 = src.m13*scale;
+ final float a23 = src.m23*scale;
+ final float a33 = src.m33*scale;
+
+ final float b00 = + a11*(a22*a33 - a23*a32) - a12*(a21*a33 - a23*a31) + a13*(a21*a32 - a22*a31); // b**: signed 3x3 cofactor of the scaled element a**
+ final float b01 = -( + a10*(a22*a33 - a23*a32) - a12*(a20*a33 - a23*a30) + a13*(a20*a32 - a22*a30));
+ final float b02 = + a10*(a21*a33 - a23*a31) - a11*(a20*a33 - a23*a30) + a13*(a20*a31 - a21*a30);
+ final float b03 = -( + a10*(a21*a32 - a22*a31) - a11*(a20*a32 - a22*a30) + a12*(a20*a31 - a21*a30));
+
+ final float b10 = -( + a01*(a22*a33 - a23*a32) - a02*(a21*a33 - a23*a31) + a03*(a21*a32 - a22*a31));
+ final float b11 = + a00*(a22*a33 - a23*a32) - a02*(a20*a33 - a23*a30) + a03*(a20*a32 - a22*a30);
+ final float b12 = -( + a00*(a21*a33 - a23*a31) - a01*(a20*a33 - a23*a30) + a03*(a20*a31 - a21*a30));
+ final float b13 = + a00*(a21*a32 - a22*a31) - a01*(a20*a32 - a22*a30) + a02*(a20*a31 - a21*a30);
+
+ final float b20 = + a01*(a12*a33 - a13*a32) - a02*(a11*a33 - a13*a31) + a03*(a11*a32 - a12*a31);
+ final float b21 = -( + a00*(a12*a33 - a13*a32) - a02*(a10*a33 - a13*a30) + a03*(a10*a32 - a12*a30));
+ final float b22 = + a00*(a11*a33 - a13*a31) - a01*(a10*a33 - a13*a30) + a03*(a10*a31 - a11*a30);
+ final float b23 = -( + a00*(a11*a32 - a12*a31) - a01*(a10*a32 - a12*a30) + a02*(a10*a31 - a11*a30));
+
+ final float b30 = -( + a01*(a12*a23 - a13*a22) - a02*(a11*a23 - a13*a21) + a03*(a11*a22 - a12*a21));
+ final float b31 = + a00*(a12*a23 - a13*a22) - a02*(a10*a23 - a13*a20) + a03*(a10*a22 - a12*a20);
+ final float b32 = -( + a00*(a11*a23 - a13*a21) - a01*(a10*a23 - a13*a20) + a03*(a10*a21 - a11*a20));
+ final float b33 = + a00*(a11*a22 - a12*a21) - a01*(a10*a22 - a12*a20) + a02*(a10*a21 - a11*a20);
+
+ final float det = (a00*b00 + a01*b01 + a02*b02 + a03*b03) / scale; // expansion over a0*/b0* of the scaled matrix, divided by 'scale' so that b** * invdet is the inverse of the unscaled src
+
+ if( 0 == det ) {
+ return false; // singular: 'this' is left unmodified
+ }
+ final float invdet = 1.0f / det; // single division, reused for all 16 stores below
+
+ m00 = b00 * invdet; // stores are transposed relative to the cofactors: b01 -> m10, b10 -> m01, ...
+ m10 = b01 * invdet;
+ m20 = b02 * invdet;
+ m30 = b03 * invdet;
+
+ m01 = b10 * invdet;
+ m11 = b11 * invdet;
+ m21 = b12 * invdet;
+ m31 = b13 * invdet;
+
+ m02 = b20 * invdet;
+ m12 = b21 * invdet;
+ m22 = b22 * invdet;
+ m32 = b23 * invdet;
+
+ m03 = b30 * invdet;
+ m13 = b31 * invdet;
+ m23 = b32 * invdet;
+ m33 = b33 * invdet;
+ return true;
+ }
+
+ /**
+ * Returns the reciprocal of the largest absolute element of this matrix,
+ * which {@code invert(..)} multiplies onto every element before computing cofactors.
+ * @return {@code 1.0f/max}, where {@code max} is the largest {@code Math.abs(mXY)} of all 16 elements
+ * @throws ArithmeticException if {@code max} is zero; float division alone would silently return Infinity
+ */
+ private final float mulScale() {
+ // Explicit compares instead of Math.max(): per the original author, Math.max() has no Hotspot intrinsic and measured slower.
+ float a = Math.abs(m00);
+ float max = a;
+ a = Math.abs(m01); if( a > max ) max = a;
+ a = Math.abs(m02); if( a > max ) max = a;
+ a = Math.abs(m03); if( a > max ) max = a;
+
+ a = Math.abs(m10); if( a > max ) max = a;
+ a = Math.abs(m11); if( a > max ) max = a;
+ a = Math.abs(m12); if( a > max ) max = a;
+ a = Math.abs(m13); if( a > max ) max = a;
+
+ a = Math.abs(m20); if( a > max ) max = a;
+ a = Math.abs(m21); if( a > max ) max = a;
+ a = Math.abs(m22); if( a > max ) max = a;
+ a = Math.abs(m23); if( a > max ) max = a;
+
+ a = Math.abs(m30); if( a > max ) max = a;
+ a = Math.abs(m31); if( a > max ) max = a;
+ a = Math.abs(m32); if( a > max ) max = a;
+ a = Math.abs(m33); if( a > max ) max = a;
+ if( 0 == max ) {
+ throw new ArithmeticException("Matrix4f.mulScale: all elements are zero, no scale available");
+ }
+ return 1.0f/max;
+ }
/**