ref: b28908860d2001f1c66627e0ec024a01e5e9af7c
parent: a5f33eeb8ab9973dd2a53215d507f4ace4c3147a
author: David Turner <[email protected]>
date: Tue Jul 16 08:52:18 EDT 2013
Optimize FT_MulFix for x86_64 GCC builds. This patch provides an optimized `FT_MulFix' implementation for x86_64 machines when FreeType is built with GCC, or compatible compilers like Clang. Example: bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf Before: Load 4.863 us/op Load_Advances (Normal) 4.816 us/op Load_Advances (Fast) 0.028 us/op Render 2.753 us/op Get_Glyph 0.463 us/op Get_CBox 0.077 us/op Get_Char_Index 0.023 us/op Iterate CMap 13.898 us/op New_Face 12.368 us/op Embolden 0.028 us/op Get_BBox 0.302 us/op After: Load 4.617 us/op Load_Advances (Normal) 4.645 us/op Load_Advances (Fast) 0.027 us/op Render 2.789 us/op Get_Glyph 0.460 us/op Get_CBox 0.077 us/op Get_Char_Index 0.024 us/op Iterate CMap 13.403 us/op New_Face 12.278 us/op Embolden 0.028 us/op Get_BBox 0.301 us/op * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h (FT_MulFix_x86_64): New function.
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,47 @@
2013-07-16 David Turner <[email protected]>
+ Optimize FT_MulFix for x86_64 GCC builds.
+
+ This patch provides an optimized `FT_MulFix' implementation for
+ x86_64 machines when FreeType is built with GCC, or compatible
+ compilers like Clang.
+
+ Example:
+ bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
+
+ Before:
+
+ Load 4.863 us/op
+ Load_Advances (Normal) 4.816 us/op
+ Load_Advances (Fast) 0.028 us/op
+ Render 2.753 us/op
+ Get_Glyph 0.463 us/op
+ Get_CBox 0.077 us/op
+ Get_Char_Index 0.023 us/op
+ Iterate CMap 13.898 us/op
+ New_Face 12.368 us/op
+ Embolden 0.028 us/op
+ Get_BBox 0.302 us/op
+
+ After:
+
+ Load 4.617 us/op
+ Load_Advances (Normal) 4.645 us/op
+ Load_Advances (Fast) 0.027 us/op
+ Render 2.789 us/op
+ Get_Glyph 0.460 us/op
+ Get_CBox 0.077 us/op
+ Get_Char_Index 0.024 us/op
+ Iterate CMap 13.403 us/op
+ New_Face 12.278 us/op
+ Embolden 0.028 us/op
+ Get_BBox 0.301 us/op
+
+ * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
+ (FT_MulFix_x86_64): New function.
+
+2013-07-16 David Turner <[email protected]>
+
Speed up ARMv7 support.
When building for ARMv7 with thumb2 instructions, the optimized
--- a/builds/unix/ftconfig.in
+++ b/builds/unix/ftconfig.in
@@ -366,6 +366,7 @@
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -428,7 +429,9 @@
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
+
#if defined( __i386__ )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@@ -496,6 +499,62 @@
#endif /* _M_IX86 */
#endif /* _MSC_VER */
+
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
+
+ static __inline__ FT_Int32
+ FT_MulFix_x86_64( FT_Int32 a,
+ FT_Int32 b )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+ /* Technically not an assembly fragment, but GCC does a really good */
+ /* job at inlining it and generating good machine code for it. */
+ long long ret, tmp;
+
+
+ ret = (long long)a * b;
+ tmp = ret >> 63;
+ ret += 0x8000 + tmp;
+
+ return (FT_Int32)( ret >> 16 );
+#else
+
+ /* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
+ /* code from the lines below. The main issue is that `wide_a' is not */
+ /* properly initialized by sign-extending `a'. Instead, the generated */
+ /* machine code assumes that the register that contains `a' on input */
+ /* can be used directly as a 64-bit value, which is wrong most of the */
+ /* time. */
+ long long wide_a = (long long)a;
+ long long wide_b = (long long)b;
+ long long result;
+
+
+ __asm__ __volatile__ (
+ "imul %2, %1\n"
+ "mov %1, %0\n"
+ "sar $63, %0\n"
+ "lea 0x8000(%1, %0), %0\n"
+ "sar $16, %0\n"
+ : "=&r"(result), "=&r"(wide_a)
+ : "r"(wide_b)
+ : "cc" );
+
+ return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+ }
+
+#endif /* __GNUC__ && __x86_64__ */
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
--- a/include/freetype/config/ftconfig.h
+++ b/include/freetype/config/ftconfig.h
@@ -338,6 +338,7 @@
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -370,6 +371,7 @@
#if defined( __arm__ ) && \
( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
!( defined( __CC_ARM ) || defined( __ARMCC__ ) )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -399,7 +401,9 @@
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
+
#if defined( __i386__ )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@@ -467,6 +471,62 @@
#endif /* _M_IX86 */
#endif /* _MSC_VER */
+
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
+
+ static __inline__ FT_Int32
+ FT_MulFix_x86_64( FT_Int32 a,
+ FT_Int32 b )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+ /* Technically not an assembly fragment, but GCC does a really good */
+ /* job at inlining it and generating good machine code for it. */
+ long long ret, tmp;
+
+
+ ret = (long long)a * b;
+ tmp = ret >> 63;
+ ret += 0x8000 + tmp;
+
+ return (FT_Int32)( ret >> 16 );
+#else
+
+ /* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
+ /* code from the lines below. The main issue is that `wide_a' is not */
+ /* properly initialized by sign-extending `a'. Instead, the generated */
+ /* machine code assumes that the register that contains `a' on input */
+ /* can be used directly as a 64-bit value, which is wrong most of the */
+ /* time. */
+ long long wide_a = (long long)a;
+ long long wide_b = (long long)b;
+ long long result;
+
+
+ __asm__ __volatile__ (
+ "imul %2, %1\n"
+ "mov %1, %0\n"
+ "sar $63, %0\n"
+ "lea 0x8000(%1, %0), %0\n"
+ "sar $16, %0\n"
+ : "=&r"(result), "=&r"(wide_a)
+ : "r"(wide_b)
+ : "cc" );
+
+ return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+ }
+
+#endif /* __GNUC__ && __x86_64__ */
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */