Wednesday, December 28, 2011

Simple vector3 class with SSE support

In this post we show how to write a simple class which represents a 3D vector which uses SSE operations for fast calculations. The class stores three float values (x, y and z) and implements all basic vector operators such as add, subtract, multiply, divide, cross product, dot product and length calculations. It uses aligned 128-bit memory which allows to use SSE intrinsics directly. In addition, it overloads the new and delete operators for arrays which allows to create multiple instances of vector3.
The following code can be used in a stand-alone header file (i.e. vector3.h):
#include <smmintrin.h>

// Simple vector class
_MM_ALIGN16 class vector3
{
 public:
  // constructors
  inline vector3() : mmvalue(_mm_setzero_ps()) {}
  inline vector3(float x, float y, float z) : mmvalue(_mm_set_ps(0, z, y, x)) {}
  inline vector3(__m128 m) : mmvalue(m) {}

  // arithmetic operators with vector3
  inline vector3 operator+(const vector3& b) const { return _mm_add_ps(mmvalue, b.mmvalue); }
  inline vector3 operator-(const vector3& b) const { return _mm_sub_ps(mmvalue, b.mmvalue); }
  inline vector3 operator*(const vector3& b) const { return _mm_mul_ps(mmvalue, b.mmvalue); }
  inline vector3 operator/(const vector3& b) const { return _mm_div_ps(mmvalue, b.mmvalue); }

  // op= operators
  inline vector3& operator+=(const vector3& b) { mmvalue = _mm_add_ps(mmvalue, b.mmvalue); return *this; }
  inline vector3& operator-=(const vector3& b) { mmvalue = _mm_sub_ps(mmvalue, b.mmvalue); return *this; }
  inline vector3& operator*=(const vector3& b) { mmvalue = _mm_mul_ps(mmvalue, b.mmvalue); return *this; }
  inline vector3& operator/=(const vector3& b) { mmvalue = _mm_div_ps(mmvalue, b.mmvalue); return *this; }

  // arithmetic operators with float
  inline vector3 operator+(float b) const { return _mm_add_ps(mmvalue, _mm_set1_ps(b)); }
  inline vector3 operator-(float b) const { return _mm_sub_ps(mmvalue, _mm_set1_ps(b)); }
  inline vector3 operator*(float b) const { return _mm_mul_ps(mmvalue, _mm_set1_ps(b)); }
  inline vector3 operator/(float b) const { return _mm_div_ps(mmvalue, _mm_set1_ps(b)); }

  // op= operators with float
  inline vector3& operator+=(float b) { mmvalue = _mm_add_ps(mmvalue, _mm_set1_ps(b)); return *this; }
  inline vector3& operator-=(float b) { mmvalue = _mm_sub_ps(mmvalue, _mm_set1_ps(b)); return *this; }
  inline vector3& operator*=(float b) { mmvalue = _mm_mul_ps(mmvalue, _mm_set1_ps(b)); return *this; }
  inline vector3& operator/=(float b) { mmvalue = _mm_div_ps(mmvalue, _mm_set1_ps(b)); return *this; }

  // cross product
  inline vector3 cross(const vector3& b) const 
  {
   return _mm_sub_ps(
    _mm_mul_ps(_mm_shuffle_ps(mmvalue, mmvalue, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.mmvalue, b.mmvalue, _MM_SHUFFLE(3, 1, 0, 2))), 
    _mm_mul_ps(_mm_shuffle_ps(mmvalue, mmvalue, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.mmvalue, b.mmvalue, _MM_SHUFFLE(3, 0, 2, 1)))
   );
  }

  // dot product with another vector
  inline float dot(const vector3& b) const { return _mm_cvtss_f32(_mm_dp_ps(mmvalue, b.mmvalue, 0x71)); }
  // length of the vector
  inline float length() const { return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mmvalue, mmvalue, 0x71))); }
  // 1/length() of the vector
  inline float rlength() const { return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_dp_ps(mmvalue, mmvalue, 0x71))); }
  // returns the vector scaled to unit length
  inline vector3 normalize() const { return _mm_mul_ps(mmvalue, _mm_rsqrt_ps(_mm_dp_ps(mmvalue, mmvalue, 0x7F))); }

  // overloaded operators that ensure alignment
  inline void* operator new[](size_t x) { return _aligned_malloc(x, 16); }
  inline void operator delete[](void* x) { if (x) _aligned_free(x); }

  // Member variables
  union
  {
   struct { float x, y, z; };    
   __m128 mmvalue;
  };
};

inline vector3 operator+(float a, const vector3& b) { return b + a; }
inline vector3 operator-(float a, const vector3& b) { return vector3(_mm_set1_ps(a)) - b; }
inline vector3 operator*(float a, const vector3& b) { return b * a; }
inline vector3 operator/(float a, const vector3& b) { return vector3(_mm_set1_ps(a)) / b; }
This class can be used as follows:
vector3 a(1, 2, 3);
vector3 b = 2 * a + 4;
vector3 c = b.cross(a.normalize());
When compiling such code, the compiler will remove the inlined function calls and store vector3 objects directly in SSE registers, if possible. This generates optimal code with no overhead.

6 comments:

  1. Great post. This was very helpful for me.

    ReplyDelete
  2. Thanks, this is awesome and just what I am looking for to start implementing sse in the performance critical bit of my code. I do have a question though - is there any magic required to make this perform better with AVX, ie by operating on 2 sets of 3 floats simultaneously, or is it better to use ILP within the outer loop, by calling two instructions for each (double) iteration?

    Richard.

    ReplyDelete
  3. Horrible GNU license, so code is unusable.

    ReplyDelete
  4. The code is free to use, but comes with no warranty.

    ReplyDelete
  5. Thanks for this! How would I access the individual elements of the vector? (ex. how would I access the X value to use elsewhere?)

    ReplyDelete
  6. since vector3 uses a union structure, it should be as easy as vector3 v; v.x = 123;

    ReplyDelete