shithub: freetype+ttf2subf

Download patch

ref: b28908860d2001f1c66627e0ec024a01e5e9af7c
parent: a5f33eeb8ab9973dd2a53215d507f4ace4c3147a
author: David Turner <[email protected]>
date: Tue Jul 16 08:52:18 EDT 2013

Optimize FT_MulFix for x86_64 GCC builds.

This patch provides an optimized `FT_MulFix' implementation for
x86_64 machines when FreeType is built with GCC, or compatible
compilers like Clang.

Example:
  bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf

Before:

  Load                       4.863 us/op
  Load_Advances (Normal)     4.816 us/op
  Load_Advances (Fast)       0.028 us/op
  Render                     2.753 us/op
  Get_Glyph                  0.463 us/op
  Get_CBox                   0.077 us/op
  Get_Char_Index             0.023 us/op
  Iterate CMap              13.898 us/op
  New_Face                  12.368 us/op
  Embolden                   0.028 us/op
  Get_BBox                   0.302 us/op

After:

  Load                       4.617 us/op
  Load_Advances (Normal)     4.645 us/op
  Load_Advances (Fast)       0.027 us/op
  Render                     2.789 us/op
  Get_Glyph                  0.460 us/op
  Get_CBox                   0.077 us/op
  Get_Char_Index             0.024 us/op
  Iterate CMap              13.403 us/op
  New_Face                  12.278 us/op
  Embolden                   0.028 us/op
  Get_BBox                   0.301 us/op

* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
(FT_MulFix_x86_64): New function.

git/fs: mount .git/fs: mount/attach disallowed
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,47 @@
 2013-07-16  David Turner  <[email protected]>
 
+	Optimize FT_MulFix for x86_64 GCC builds.
+
+	This patch provides an optimized `FT_MulFix' implementation for
+	x86_64 machines when FreeType is built with GCC, or compatible
+	compilers like Clang.
+
+	Example:
+	  bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
+
+	Before:
+
+	  Load                       4.863 us/op
+	  Load_Advances (Normal)     4.816 us/op
+	  Load_Advances (Fast)       0.028 us/op
+	  Render                     2.753 us/op
+	  Get_Glyph                  0.463 us/op
+	  Get_CBox                   0.077 us/op
+	  Get_Char_Index             0.023 us/op
+	  Iterate CMap              13.898 us/op
+	  New_Face                  12.368 us/op
+	  Embolden                   0.028 us/op
+	  Get_BBox                   0.302 us/op
+
+	After:
+
+	  Load                       4.617 us/op
+	  Load_Advances (Normal)     4.645 us/op
+	  Load_Advances (Fast)       0.027 us/op
+	  Render                     2.789 us/op
+	  Get_Glyph                  0.460 us/op
+	  Get_CBox                   0.077 us/op
+	  Get_Char_Index             0.024 us/op
+	  Iterate CMap              13.403 us/op
+	  New_Face                  12.278 us/op
+	  Embolden                   0.028 us/op
+	  Get_BBox                   0.301 us/op
+
+	* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
+	(FT_MulFix_x86_64): New function.
+
+2013-07-16  David Turner  <[email protected]>
+
 	Speed up ARMv7 support.
 
 	When building for ARMv7 with thumb2 instructions, the optimized
--- a/builds/unix/ftconfig.in
+++ b/builds/unix/ftconfig.in
@@ -366,6 +366,7 @@
   /* These must be defined `static __inline__' with GCC.             */
 
 #if defined( __CC_ARM ) || defined( __ARMCC__ )  /* RVCT */
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -428,7 +429,9 @@
        /* ( __thumb2__ || !__thumb__ ) && */
        /* !( __CC_ARM || __ARMCC__ )      */
 
+
 #if defined( __i386__ )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_i386
 
   /* documentation is in freetype.h */
@@ -496,6 +499,62 @@
 #endif /* _M_IX86 */
 
 #endif /* _MSC_VER */
+
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER  FT_MulFix_x86_64
+
+  static __inline__ FT_Int32
+  FT_MulFix_x86_64( FT_Int32  a,
+                    FT_Int32  b )
+  {
+    /* Return `a*b/0x10000' rounded to nearest, ties away from zero. */
+    /* `long long' is not C90: silence `-Wlong-long' for this body.  */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+    /* Technically not an assembly fragment, but GCC does a really good */
+    /* job at inlining it and generating good machine code for it.      */
+    long long  ret, tmp;
+
+
+    ret  = (long long)a * b;
+    tmp  = ret >> 63;             /* -1 if `ret' is negative, else 0 */
+    ret += 0x8000 + tmp;
+
+    return (FT_Int32)( ret >> 16 );
+#else
+
+    /* Hand-written variant, disabled in favour of the C code above.       */
+    /* `wide_a' is both read and written by the fragment, so it must be    */
+    /* declared `+r'.  The former write-only `=&r' constraint let GCC 4.6  */
+    /* (Ubuntu 12.04) drop the sign extension of `a' and use the incoming  */
+    /* register directly as a 64-bit value, which is wrong most of the     */
+    /* time.                                                               */
+    long long  wide_a = (long long)a;
+    long long  wide_b = (long long)b;
+    long long  result;
+
+
+    __asm__ __volatile__ (
+      "imul %2, %1\n"
+      "mov %1, %0\n"
+      "sar $63, %0\n"
+      "lea 0x8000(%1, %0), %0\n"
+      "sar $16, %0\n"
+      : "=&r"(result), "+r"(wide_a)
+      : "r"(wide_b)
+      : "cc" );
+
+    return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+  }
+
+#endif /* __GNUC__ && __x86_64__ */
 
 #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
 
--- a/include/freetype/config/ftconfig.h
+++ b/include/freetype/config/ftconfig.h
@@ -338,6 +338,7 @@
   /* These must be defined `static __inline__' with GCC.             */
 
 #if defined( __CC_ARM ) || defined( __ARMCC__ )  /* RVCT */
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -370,6 +371,7 @@
 #if defined( __arm__ )                                 && \
     ( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
     !( defined( __CC_ARM ) || defined( __ARMCC__ ) )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -399,7 +401,9 @@
        /* ( __thumb2__ || !__thumb__ ) && */
        /* !( __CC_ARM || __ARMCC__ )      */
 
+
 #if defined( __i386__ )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_i386
 
   /* documentation is in freetype.h */
@@ -467,6 +471,62 @@
 #endif /* _M_IX86 */
 
 #endif /* _MSC_VER */
+
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER  FT_MulFix_x86_64
+
+  static __inline__ FT_Int32
+  FT_MulFix_x86_64( FT_Int32  a,
+                    FT_Int32  b )
+  {
+    /* Return `a*b/0x10000' rounded to nearest, ties away from zero. */
+    /* `long long' is not C90: silence `-Wlong-long' for this body.  */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+    /* Technically not an assembly fragment, but GCC does a really good */
+    /* job at inlining it and generating good machine code for it.      */
+    long long  ret, tmp;
+
+
+    ret  = (long long)a * b;
+    tmp  = ret >> 63;             /* -1 if `ret' is negative, else 0 */
+    ret += 0x8000 + tmp;
+
+    return (FT_Int32)( ret >> 16 );
+#else
+
+    /* Hand-written variant, disabled in favour of the C code above.       */
+    /* `wide_a' is both read and written by the fragment, so it must be    */
+    /* declared `+r'.  The former write-only `=&r' constraint let GCC 4.6  */
+    /* (Ubuntu 12.04) drop the sign extension of `a' and use the incoming  */
+    /* register directly as a 64-bit value, which is wrong most of the     */
+    /* time.                                                               */
+    long long  wide_a = (long long)a;
+    long long  wide_b = (long long)b;
+    long long  result;
+
+
+    __asm__ __volatile__ (
+      "imul %2, %1\n"
+      "mov %1, %0\n"
+      "sar $63, %0\n"
+      "lea 0x8000(%1, %0), %0\n"
+      "sar $16, %0\n"
+      : "=&r"(result), "+r"(wide_a)
+      : "r"(wide_b)
+      : "cc" );
+
+    return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+  }
+
+#endif /* __GNUC__ && __x86_64__ */
 
 #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */