Clean up the functions in loadimage.h/cpp.

* Capitalize the function names to fit the style guide.
* Use explicit sizes in the function names to avoid any confusion about
  input or output sizes.
* Use explicitly sized types in the functions to avoid potential issues
  on new platforms, since a lot of bit-twiddling is used.
* Use size_t for all sizes.
* Use uint8_t for all binary input and output data so that pointer
  arithmetic is much easier.
* Move templated function definitions into an .inl file so that
  loadimage.h looks as clean as possible.

BUG=angle:665

Change-Id: Id7173ed66d9e1b7ee3261eea11e77d838cbd2951
Reviewed-on: https://chromium-review.googlesource.com/202590
Reviewed-by: Brandon Jones <bajones@chromium.org>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Tested-by: Geoff Lang <geofflang@chromium.org>
diff --git a/BUILD.gn b/BUILD.gn
index 9ffa2d4..d03a83d 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -390,6 +390,7 @@
       "src/libGLESv2/renderer/imageformats.h",
       "src/libGLESv2/renderer/loadimage.cpp",
       "src/libGLESv2/renderer/loadimage.h",
+      "src/libGLESv2/renderer/loadimage.inl",
       "src/libGLESv2/renderer/loadimageSSE2.cpp",
       "src/libGLESv2/renderer/vertexconversion.h",
       "src/libGLESv2/resource.h",
diff --git a/projects/src/libGLESv2.vcxproj b/projects/src/libGLESv2.vcxproj
index c531e6b..fecec4e 100644
--- a/projects/src/libGLESv2.vcxproj
+++ b/projects/src/libGLESv2.vcxproj
@@ -113,6 +113,7 @@
   <ItemGroup>
     <None Include="..\..\src\angle.gyp"/>
     <None Include="..\..\src\libGLESv2\libGLESv2.def"/>
+    <None Include="..\..\src\libGLESv2\renderer\loadimage.inl"/>
     <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Swizzle11.hlsl"/>
     <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\generate_shaders.bat"/>
     <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Passthrough3D11.hlsl"/>
diff --git a/projects/src/libGLESv2.vcxproj.filters b/projects/src/libGLESv2.vcxproj.filters
index 6fad0c2..db97930 100644
--- a/projects/src/libGLESv2.vcxproj.filters
+++ b/projects/src/libGLESv2.vcxproj.filters
@@ -264,6 +264,9 @@
     <ClInclude Include="..\..\src\libGLESv2\renderer\TextureStorage.h">
       <Filter>src\libGLESv2\renderer</Filter>
     </ClInclude>
+    <None Include="..\..\src\libGLESv2\renderer\loadimage.inl">
+      <Filter>src\libGLESv2\renderer</Filter>
+    </None>
     <ClInclude Include="..\..\src\libGLESv2\renderer\ShaderExecutable.h">
       <Filter>src\libGLESv2\renderer</Filter>
     </ClInclude>
diff --git a/src/libGLESv2.gypi b/src/libGLESv2.gypi
index 0c535ca..9c8a420 100644
--- a/src/libGLESv2.gypi
+++ b/src/libGLESv2.gypi
@@ -37,7 +37,7 @@
                     [
                         '<!@(python <(angle_path)/enumerate_files.py \
                              -dirs common libGLESv2 third_party/murmurhash ../include third_party/systeminfo \
-                             -types *.cpp *.h *.hlsl *.vs *.ps *.bat *.def *.rc \
+                             -types *.cpp *.h *.inl *.hlsl *.vs *.ps *.bat *.def *.rc \
                              -excludes */d3d/*)',
                     ],
                     'defines':
@@ -54,7 +54,7 @@
                             [
                                 '<!@(python <(angle_path)/enumerate_files.py \
                                      -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d9 \
-                                     -types *.cpp *.h *.vs *.ps *.bat)',
+                                     -types *.cpp *.h *.inl *.vs *.ps *.bat)',
                             ],
                             'defines':
                             [
@@ -77,7 +77,7 @@
                             [
                                 '<!@(python <(angle_path)/enumerate_files.py \
                                      -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d/d3d11 \
-                                     -types *.cpp *.h *.hlsl *.bat)',
+                                     -types *.cpp *.h *.inl *.hlsl *.bat)',
                             ],
                             'defines':
                             [
diff --git a/src/libGLESv2/formatutils.h b/src/libGLESv2/formatutils.h
index 692ab88..71d212c 100644
--- a/src/libGLESv2/formatutils.h
+++ b/src/libGLESv2/formatutils.h
@@ -14,16 +14,18 @@
 #include "libGLESv2/Caps.h"
 #include "libGLESv2/angletypes.h"
 
+#include <cstddef>
+
 typedef void (*MipGenerationFunction)(unsigned int sourceWidth, unsigned int sourceHeight, unsigned int sourceDepth,
                                       const unsigned char *sourceData, int sourceRowPitch, int sourceDepthPitch,
                                       unsigned char *destData, int destRowPitch, int destDepthPitch);
 
-typedef void (*LoadImageFunction)(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
+typedef void (*LoadImageFunction)(size_t width, size_t height, size_t depth,
+                                  const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                                  uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
 
-typedef void (*InitializeTextureDataFunction)(int width, int height, int depth,
-                                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
+typedef void (*InitializeTextureDataFunction)(size_t width, size_t height, size_t depth,
+                                              uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
 
 typedef void (*ColorReadFunction)(const void *source, void *dest);
 typedef void (*ColorWriteFunction)(const void *source, void *dest);
diff --git a/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp b/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp
index 4cc5ba7..d64587a 100644
--- a/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp
@@ -168,8 +168,10 @@
         return;
     }
 
-    void* offsetMappedData = (void*)((BYTE *)mappedImage.pData + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch));
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch, offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
+    uint8_t* offsetMappedData = (reinterpret_cast<uint8_t*>(mappedImage.pData) + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch));
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
+                 offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
 
     unmap();
 }
@@ -198,11 +200,12 @@
         return;
     }
 
-    void* offsetMappedData = (void*)((BYTE*)mappedImage.pData + ((yoffset / outputBlockHeight) * mappedImage.RowPitch +
-                                                                 (xoffset / outputBlockWidth) * outputPixelSize +
-                                                                 zoffset * mappedImage.DepthPitch));
+    uint8_t* offsetMappedData = reinterpret_cast<uint8_t*>(mappedImage.pData) + ((yoffset / outputBlockHeight) * mappedImage.RowPitch +
+                                                                           (xoffset / outputBlockWidth) * outputPixelSize +
+                                                                           zoffset * mappedImage.DepthPitch);
 
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch,
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
                  offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
 
     unmap();
diff --git a/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp b/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp
index e40ab09..a300c2e 100644
--- a/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp
@@ -183,16 +183,16 @@
 typedef std::pair<InternalFormatTypePair, LoadImageFunction> D3D11LoadFunctionPair;
 typedef std::map<InternalFormatTypePair, LoadImageFunction> D3D11LoadFunctionMap;
 
-static void UnimplementedLoadFunction(int width, int height, int depth,
-                                      const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                      void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+static void UnimplementedLoadFunction(size_t width, size_t height, size_t depth,
+                                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
     UNIMPLEMENTED();
 }
 
-static void UnreachableLoadFunction(int width, int height, int depth,
-                                    const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                    void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+static void UnreachableLoadFunction(size_t width, size_t height, size_t depth,
+                                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
     UNREACHABLE();
 }
@@ -209,79 +209,79 @@
     D3D11LoadFunctionMap map;
 
     //                      | Internal format      | Type                             | Load function                       |
-    insertLoadFunction(&map, GL_RGBA8,              GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>             );
-    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>             );
-    insertLoadFunction(&map, GL_RGBA4,              GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>             );
-    insertLoadFunction(&map, GL_SRGB8_ALPHA8,       GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>             );
-    insertLoadFunction(&map, GL_RGBA8_SNORM,        GL_BYTE,                           loadToNative<GLbyte, 4>              );
-    insertLoadFunction(&map, GL_RGBA4,              GL_UNSIGNED_SHORT_4_4_4_4,         loadRGBA4444DataToRGBA               );
-    insertLoadFunction(&map, GL_RGB10_A2,           GL_UNSIGNED_INT_2_10_10_10_REV,    loadToNative<GLuint, 1>              );
-    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_SHORT_5_5_5_1,         loadRGBA5551DataToRGBA               );
-    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_INT_2_10_10_10_REV,    loadRGBA2101010ToRGBA                );
-    insertLoadFunction(&map, GL_RGBA16F,            GL_HALF_FLOAT,                     loadToNative<GLhalf, 4>              );
-    insertLoadFunction(&map, GL_RGBA16F,            GL_HALF_FLOAT_OES,                 loadToNative<GLhalf, 4>              );
-    insertLoadFunction(&map, GL_RGBA32F,            GL_FLOAT,                          loadToNative<GLfloat, 4>             );
-    insertLoadFunction(&map, GL_RGBA16F,            GL_FLOAT,                          loadFloatDataToHalfFloat<4>          );
-    insertLoadFunction(&map, GL_RGBA8UI,            GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>             );
-    insertLoadFunction(&map, GL_RGBA8I,             GL_BYTE,                           loadToNative<GLbyte, 4>              );
-    insertLoadFunction(&map, GL_RGBA16UI,           GL_UNSIGNED_SHORT,                 loadToNative<GLushort, 4>            );
-    insertLoadFunction(&map, GL_RGBA16I,            GL_SHORT,                          loadToNative<GLshort, 4>             );
-    insertLoadFunction(&map, GL_RGBA32UI,           GL_UNSIGNED_INT,                   loadToNative<GLuint, 4>              );
-    insertLoadFunction(&map, GL_RGBA32I,            GL_INT,                            loadToNative<GLint, 4>               );
-    insertLoadFunction(&map, GL_RGB10_A2UI,         GL_UNSIGNED_INT_2_10_10_10_REV,    loadToNative<GLuint, 1>              );
-    insertLoadFunction(&map, GL_RGB8,               GL_UNSIGNED_BYTE,                  loadRGBUByteDataToRGBA               );
-    insertLoadFunction(&map, GL_RGB565,             GL_UNSIGNED_BYTE,                  loadToNative3To4<GLubyte, 0xFF>      );
-    insertLoadFunction(&map, GL_SRGB8,              GL_UNSIGNED_BYTE,                  loadToNative3To4<GLubyte, 0xFF>      );
-    insertLoadFunction(&map, GL_RGB8_SNORM,         GL_BYTE,                           loadRGBSByteDataToRGBA               );
-    insertLoadFunction(&map, GL_RGB565,             GL_UNSIGNED_SHORT_5_6_5,           loadRGB565DataToRGBA                 );
-    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_UNSIGNED_INT_10F_11F_11F_REV,   loadToNative<GLuint, 1>              );
-    insertLoadFunction(&map, GL_RGB9_E5,            GL_UNSIGNED_INT_5_9_9_9_REV,       loadToNative<GLuint, 1>              );
-    insertLoadFunction(&map, GL_RGB16F,             GL_HALF_FLOAT,                     loadToNative3To4<GLhalf, gl::Float16One>);
-    insertLoadFunction(&map, GL_RGB16F,             GL_HALF_FLOAT_OES,                 loadToNative3To4<GLhalf, gl::Float16One>);
-    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_HALF_FLOAT,                     loadRGBHalfFloatDataTo111110Float    );
-    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_HALF_FLOAT_OES,                 loadRGBHalfFloatDataTo111110Float    );
-    insertLoadFunction(&map, GL_RGB9_E5,            GL_HALF_FLOAT,                     loadRGBHalfFloatDataTo999E5          );
-    insertLoadFunction(&map, GL_RGB9_E5,            GL_HALF_FLOAT_OES,                 loadRGBHalfFloatDataTo999E5          );
-    insertLoadFunction(&map, GL_RGB32F,             GL_FLOAT,                          loadToNative3To4<GLfloat, gl::Float32One>);
-    insertLoadFunction(&map, GL_RGB16F,             GL_FLOAT,                          loadFloatRGBDataToHalfFloatRGBA      );
-    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_FLOAT,                          loadRGBFloatDataTo111110Float        );
-    insertLoadFunction(&map, GL_RGB9_E5,            GL_FLOAT,                          loadRGBFloatDataTo999E5              );
-    insertLoadFunction(&map, GL_RGB8UI,             GL_UNSIGNED_BYTE,                  loadToNative3To4<GLubyte, 0x01>      );
-    insertLoadFunction(&map, GL_RGB8I,              GL_BYTE,                           loadToNative3To4<GLbyte, 0x01>       );
-    insertLoadFunction(&map, GL_RGB16UI,            GL_UNSIGNED_SHORT,                 loadToNative3To4<GLushort, 0x0001>   );
-    insertLoadFunction(&map, GL_RGB16I,             GL_SHORT,                          loadToNative3To4<GLshort, 0x0001>    );
-    insertLoadFunction(&map, GL_RGB32UI,            GL_UNSIGNED_INT,                   loadToNative3To4<GLuint, 0x00000001> );
-    insertLoadFunction(&map, GL_RGB32I,             GL_INT,                            loadToNative3To4<GLint, 0x00000001>  );
-    insertLoadFunction(&map, GL_RG8,                GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 2>             );
-    insertLoadFunction(&map, GL_RG8_SNORM,          GL_BYTE,                           loadToNative<GLbyte, 2>              );
-    insertLoadFunction(&map, GL_RG16F,              GL_HALF_FLOAT,                     loadToNative<GLhalf, 2>              );
-    insertLoadFunction(&map, GL_RG16F,              GL_HALF_FLOAT_OES,                 loadToNative<GLhalf, 2>              );
-    insertLoadFunction(&map, GL_RG32F,              GL_FLOAT,                          loadToNative<GLfloat, 2>             );
-    insertLoadFunction(&map, GL_RG16F,              GL_FLOAT,                          loadFloatDataToHalfFloat<2>          );
-    insertLoadFunction(&map, GL_RG8UI,              GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 2>             );
-    insertLoadFunction(&map, GL_RG8I,               GL_BYTE,                           loadToNative<GLbyte, 2>              );
-    insertLoadFunction(&map, GL_RG16UI,             GL_UNSIGNED_SHORT,                 loadToNative<GLushort, 2>            );
-    insertLoadFunction(&map, GL_RG16I,              GL_SHORT,                          loadToNative<GLshort, 2>             );
-    insertLoadFunction(&map, GL_RG32UI,             GL_UNSIGNED_INT,                   loadToNative<GLuint, 2>              );
-    insertLoadFunction(&map, GL_RG32I,              GL_INT,                            loadToNative<GLint, 2>               );
-    insertLoadFunction(&map, GL_R8,                 GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 1>             );
-    insertLoadFunction(&map, GL_R8_SNORM,           GL_BYTE,                           loadToNative<GLbyte, 1>              );
-    insertLoadFunction(&map, GL_R16F,               GL_HALF_FLOAT,                     loadToNative<GLhalf, 1>              );
-    insertLoadFunction(&map, GL_R16F,               GL_HALF_FLOAT_OES,                 loadToNative<GLhalf, 1>              );
-    insertLoadFunction(&map, GL_R32F,               GL_FLOAT,                          loadToNative<GLfloat, 1>             );
-    insertLoadFunction(&map, GL_R16F,               GL_FLOAT,                          loadFloatDataToHalfFloat<1>          );
-    insertLoadFunction(&map, GL_R8UI,               GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 1>             );
-    insertLoadFunction(&map, GL_R8I,                GL_BYTE,                           loadToNative<GLbyte, 1>              );
-    insertLoadFunction(&map, GL_R16UI,              GL_UNSIGNED_SHORT,                 loadToNative<GLushort, 1>            );
-    insertLoadFunction(&map, GL_R16I,               GL_SHORT,                          loadToNative<GLshort, 1>             );
-    insertLoadFunction(&map, GL_R32UI,              GL_UNSIGNED_INT,                   loadToNative<GLuint, 1>              );
-    insertLoadFunction(&map, GL_R32I,               GL_INT,                            loadToNative<GLint, 1>               );
-    insertLoadFunction(&map, GL_DEPTH_COMPONENT16,  GL_UNSIGNED_SHORT,                 loadToNative<GLushort, 1>            );
-    insertLoadFunction(&map, GL_DEPTH_COMPONENT24,  GL_UNSIGNED_INT,                   loadG8R24DataToR24G8                 );
-    insertLoadFunction(&map, GL_DEPTH_COMPONENT16,  GL_UNSIGNED_INT,                   loadUintDataToUshort                 );
-    insertLoadFunction(&map, GL_DEPTH_COMPONENT32F, GL_FLOAT,                          loadToNative<GLfloat, 1>             );
-    insertLoadFunction(&map, GL_DEPTH24_STENCIL8,   GL_UNSIGNED_INT_24_8,              loadG8R24DataToR24G8                 );
-    insertLoadFunction(&map, GL_DEPTH32F_STENCIL8,  GL_FLOAT_32_UNSIGNED_INT_24_8_REV, loadToNative<GLuint, 2>              );
+    insertLoadFunction(&map, GL_RGBA8,              GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>             );
+    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>             );
+    insertLoadFunction(&map, GL_RGBA4,              GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>             );
+    insertLoadFunction(&map, GL_SRGB8_ALPHA8,       GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>             );
+    insertLoadFunction(&map, GL_RGBA8_SNORM,        GL_BYTE,                           LoadToNative<GLbyte, 4>              );
+    insertLoadFunction(&map, GL_RGBA4,              GL_UNSIGNED_SHORT_4_4_4_4,         LoadRGBA4ToRGBA8                     );
+    insertLoadFunction(&map, GL_RGB10_A2,           GL_UNSIGNED_INT_2_10_10_10_REV,    LoadToNative<GLuint, 1>              );
+    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_SHORT_5_5_5_1,         LoadRGB5A1ToRGBA8                    );
+    insertLoadFunction(&map, GL_RGB5_A1,            GL_UNSIGNED_INT_2_10_10_10_REV,    LoadRGB10A2ToRGBA8                   );
+    insertLoadFunction(&map, GL_RGBA16F,            GL_HALF_FLOAT,                     LoadToNative<GLhalf, 4>              );
+    insertLoadFunction(&map, GL_RGBA16F,            GL_HALF_FLOAT_OES,                 LoadToNative<GLhalf, 4>              );
+    insertLoadFunction(&map, GL_RGBA32F,            GL_FLOAT,                          LoadToNative<GLfloat, 4>             );
+    insertLoadFunction(&map, GL_RGBA16F,            GL_FLOAT,                          Load32FTo16F<4>                      );
+    insertLoadFunction(&map, GL_RGBA8UI,            GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>             );
+    insertLoadFunction(&map, GL_RGBA8I,             GL_BYTE,                           LoadToNative<GLbyte, 4>              );
+    insertLoadFunction(&map, GL_RGBA16UI,           GL_UNSIGNED_SHORT,                 LoadToNative<GLushort, 4>            );
+    insertLoadFunction(&map, GL_RGBA16I,            GL_SHORT,                          LoadToNative<GLshort, 4>             );
+    insertLoadFunction(&map, GL_RGBA32UI,           GL_UNSIGNED_INT,                   LoadToNative<GLuint, 4>              );
+    insertLoadFunction(&map, GL_RGBA32I,            GL_INT,                            LoadToNative<GLint, 4>               );
+    insertLoadFunction(&map, GL_RGB10_A2UI,         GL_UNSIGNED_INT_2_10_10_10_REV,    LoadToNative<GLuint, 1>              );
+    insertLoadFunction(&map, GL_RGB8,               GL_UNSIGNED_BYTE,                  LoadToNative3To4<GLubyte, 0xFF>      );
+    insertLoadFunction(&map, GL_RGB565,             GL_UNSIGNED_BYTE,                  LoadToNative3To4<GLubyte, 0xFF>      );
+    insertLoadFunction(&map, GL_SRGB8,              GL_UNSIGNED_BYTE,                  LoadToNative3To4<GLubyte, 0xFF>      );
+    insertLoadFunction(&map, GL_RGB8_SNORM,         GL_BYTE,                           LoadToNative3To4<GLbyte, 0x7F>       );
+    insertLoadFunction(&map, GL_RGB565,             GL_UNSIGNED_SHORT_5_6_5,           LoadR5G6B5ToRGBA8                    );
+    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_UNSIGNED_INT_10F_11F_11F_REV,   LoadToNative<GLuint, 1>              );
+    insertLoadFunction(&map, GL_RGB9_E5,            GL_UNSIGNED_INT_5_9_9_9_REV,       LoadToNative<GLuint, 1>              );
+    insertLoadFunction(&map, GL_RGB16F,             GL_HALF_FLOAT,                     LoadToNative3To4<GLhalf, gl::Float16One>);
+    insertLoadFunction(&map, GL_RGB16F,             GL_HALF_FLOAT_OES,                 LoadToNative3To4<GLhalf, gl::Float16One>);
+    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_HALF_FLOAT,                     LoadRGB16FToRG11B10F                 );
+    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_HALF_FLOAT_OES,                 LoadRGB16FToRG11B10F                 );
+    insertLoadFunction(&map, GL_RGB9_E5,            GL_HALF_FLOAT,                     LoadRGB16FToRGB9E5                   );
+    insertLoadFunction(&map, GL_RGB9_E5,            GL_HALF_FLOAT_OES,                 LoadRGB16FToRGB9E5                   );
+    insertLoadFunction(&map, GL_RGB32F,             GL_FLOAT,                          LoadToNative3To4<GLfloat, gl::Float32One>);
+    insertLoadFunction(&map, GL_RGB16F,             GL_FLOAT,                          LoadRGB32FToRGBA16F                  );
+    insertLoadFunction(&map, GL_R11F_G11F_B10F,     GL_FLOAT,                          LoadRGB32FToRG11B10F                 );
+    insertLoadFunction(&map, GL_RGB9_E5,            GL_FLOAT,                          LoadRGB32FToRGB9E5                   );
+    insertLoadFunction(&map, GL_RGB8UI,             GL_UNSIGNED_BYTE,                  LoadToNative3To4<GLubyte, 0x01>      );
+    insertLoadFunction(&map, GL_RGB8I,              GL_BYTE,                           LoadToNative3To4<GLbyte, 0x01>       );
+    insertLoadFunction(&map, GL_RGB16UI,            GL_UNSIGNED_SHORT,                 LoadToNative3To4<GLushort, 0x0001>   );
+    insertLoadFunction(&map, GL_RGB16I,             GL_SHORT,                          LoadToNative3To4<GLshort, 0x0001>    );
+    insertLoadFunction(&map, GL_RGB32UI,            GL_UNSIGNED_INT,                   LoadToNative3To4<GLuint, 0x00000001> );
+    insertLoadFunction(&map, GL_RGB32I,             GL_INT,                            LoadToNative3To4<GLint, 0x00000001>  );
+    insertLoadFunction(&map, GL_RG8,                GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 2>             );
+    insertLoadFunction(&map, GL_RG8_SNORM,          GL_BYTE,                           LoadToNative<GLbyte, 2>              );
+    insertLoadFunction(&map, GL_RG16F,              GL_HALF_FLOAT,                     LoadToNative<GLhalf, 2>              );
+    insertLoadFunction(&map, GL_RG16F,              GL_HALF_FLOAT_OES,                 LoadToNative<GLhalf, 2>              );
+    insertLoadFunction(&map, GL_RG32F,              GL_FLOAT,                          LoadToNative<GLfloat, 2>             );
+    insertLoadFunction(&map, GL_RG16F,              GL_FLOAT,                          Load32FTo16F<2>                      );
+    insertLoadFunction(&map, GL_RG8UI,              GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 2>             );
+    insertLoadFunction(&map, GL_RG8I,               GL_BYTE,                           LoadToNative<GLbyte, 2>              );
+    insertLoadFunction(&map, GL_RG16UI,             GL_UNSIGNED_SHORT,                 LoadToNative<GLushort, 2>            );
+    insertLoadFunction(&map, GL_RG16I,              GL_SHORT,                          LoadToNative<GLshort, 2>             );
+    insertLoadFunction(&map, GL_RG32UI,             GL_UNSIGNED_INT,                   LoadToNative<GLuint, 2>              );
+    insertLoadFunction(&map, GL_RG32I,              GL_INT,                            LoadToNative<GLint, 2>               );
+    insertLoadFunction(&map, GL_R8,                 GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 1>             );
+    insertLoadFunction(&map, GL_R8_SNORM,           GL_BYTE,                           LoadToNative<GLbyte, 1>              );
+    insertLoadFunction(&map, GL_R16F,               GL_HALF_FLOAT,                     LoadToNative<GLhalf, 1>              );
+    insertLoadFunction(&map, GL_R16F,               GL_HALF_FLOAT_OES,                 LoadToNative<GLhalf, 1>              );
+    insertLoadFunction(&map, GL_R32F,               GL_FLOAT,                          LoadToNative<GLfloat, 1>             );
+    insertLoadFunction(&map, GL_R16F,               GL_FLOAT,                          Load32FTo16F<1>                      );
+    insertLoadFunction(&map, GL_R8UI,               GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 1>             );
+    insertLoadFunction(&map, GL_R8I,                GL_BYTE,                           LoadToNative<GLbyte, 1>              );
+    insertLoadFunction(&map, GL_R16UI,              GL_UNSIGNED_SHORT,                 LoadToNative<GLushort, 1>            );
+    insertLoadFunction(&map, GL_R16I,               GL_SHORT,                          LoadToNative<GLshort, 1>             );
+    insertLoadFunction(&map, GL_R32UI,              GL_UNSIGNED_INT,                   LoadToNative<GLuint, 1>              );
+    insertLoadFunction(&map, GL_R32I,               GL_INT,                            LoadToNative<GLint, 1>               );
+    insertLoadFunction(&map, GL_DEPTH_COMPONENT16,  GL_UNSIGNED_SHORT,                 LoadToNative<GLushort, 1>            );
+    insertLoadFunction(&map, GL_DEPTH_COMPONENT24,  GL_UNSIGNED_INT,                   LoadR32ToR24G8                       );
+    insertLoadFunction(&map, GL_DEPTH_COMPONENT16,  GL_UNSIGNED_INT,                   LoadR32ToR16                         );
+    insertLoadFunction(&map, GL_DEPTH_COMPONENT32F, GL_FLOAT,                          LoadToNative<GLfloat, 1>             );
+    insertLoadFunction(&map, GL_DEPTH24_STENCIL8,   GL_UNSIGNED_INT_24_8,              LoadR32ToR24G8                       );
+    insertLoadFunction(&map, GL_DEPTH32F_STENCIL8,  GL_FLOAT_32_UNSIGNED_INT_24_8_REV, LoadToNative<GLuint, 2>              );
 
     // Unsized formats
     // Load functions are unreachable because they are converted to sized internal formats based on
@@ -296,41 +296,41 @@
     insertLoadFunction(&map, GL_ALPHA,              GL_UNSIGNED_BYTE,                  UnreachableLoadFunction              );
 
     // From GL_OES_texture_float
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_FLOAT,                          loadLuminanceAlphaFloatDataToRGBA    );
-    insertLoadFunction(&map, GL_LUMINANCE,          GL_FLOAT,                          loadLuminanceFloatDataToRGBA         );
-    insertLoadFunction(&map, GL_ALPHA,              GL_FLOAT,                          loadAlphaFloatDataToRGBA             );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_FLOAT,                          LoadLA32FToRGBA32F                   );
+    insertLoadFunction(&map, GL_LUMINANCE,          GL_FLOAT,                          LoadL32FToRGBA32F                    );
+    insertLoadFunction(&map, GL_ALPHA,              GL_FLOAT,                          LoadA32FToRGBA32F                    );
 
     // From GL_OES_texture_half_float
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_HALF_FLOAT,                     loadLuminanceAlphaHalfFloatDataToRGBA);
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_HALF_FLOAT_OES,                 loadLuminanceAlphaHalfFloatDataToRGBA);
-    insertLoadFunction(&map, GL_LUMINANCE,          GL_HALF_FLOAT,                     loadLuminanceHalfFloatDataToRGBA     );
-    insertLoadFunction(&map, GL_LUMINANCE,          GL_HALF_FLOAT_OES,                 loadLuminanceHalfFloatDataToRGBA     );
-    insertLoadFunction(&map, GL_ALPHA,              GL_HALF_FLOAT,                     loadAlphaHalfFloatDataToRGBA         );
-    insertLoadFunction(&map, GL_ALPHA,              GL_HALF_FLOAT_OES,                 loadAlphaHalfFloatDataToRGBA         );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_HALF_FLOAT,                     LoadLA16FToRGBA16F                   );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA,    GL_HALF_FLOAT_OES,                 LoadLA16FToRGBA16F                   );
+    insertLoadFunction(&map, GL_LUMINANCE,          GL_HALF_FLOAT,                     LoadL16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_LUMINANCE,          GL_HALF_FLOAT_OES,                 LoadL16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_ALPHA,              GL_HALF_FLOAT,                     LoadA16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_ALPHA,              GL_HALF_FLOAT_OES,                 LoadA16FToRGBA16F                    );
 
     // From GL_EXT_texture_storage
-    insertLoadFunction(&map, GL_ALPHA8_EXT,             GL_UNSIGNED_BYTE,              loadToNative<GLubyte, 1>             );
-    insertLoadFunction(&map, GL_LUMINANCE8_EXT,         GL_UNSIGNED_BYTE,              loadLuminanceDataToBGRA              );
-    insertLoadFunction(&map, GL_LUMINANCE8_ALPHA8_EXT,  GL_UNSIGNED_BYTE,              loadLuminanceAlphaDataToBGRA         );
-    insertLoadFunction(&map, GL_ALPHA32F_EXT,           GL_FLOAT,                      loadAlphaFloatDataToRGBA             );
-    insertLoadFunction(&map, GL_LUMINANCE32F_EXT,       GL_FLOAT,                      loadLuminanceFloatDataToRGBA         );
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA32F_EXT, GL_FLOAT,                      loadLuminanceAlphaFloatDataToRGBA    );
-    insertLoadFunction(&map, GL_ALPHA16F_EXT,           GL_HALF_FLOAT,                 loadAlphaHalfFloatDataToRGBA         );
-    insertLoadFunction(&map, GL_ALPHA16F_EXT,           GL_HALF_FLOAT_OES,             loadAlphaHalfFloatDataToRGBA         );
-    insertLoadFunction(&map, GL_LUMINANCE16F_EXT,       GL_HALF_FLOAT,                 loadLuminanceHalfFloatDataToRGBA     );
-    insertLoadFunction(&map, GL_LUMINANCE16F_EXT,       GL_HALF_FLOAT_OES,             loadLuminanceHalfFloatDataToRGBA     );
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA16F_EXT, GL_HALF_FLOAT,                 loadLuminanceAlphaHalfFloatDataToRGBA);
-    insertLoadFunction(&map, GL_LUMINANCE_ALPHA16F_EXT, GL_HALF_FLOAT_OES,             loadLuminanceAlphaHalfFloatDataToRGBA);
+    insertLoadFunction(&map, GL_ALPHA8_EXT,             GL_UNSIGNED_BYTE,              LoadToNative<GLubyte, 1>             );
+    insertLoadFunction(&map, GL_LUMINANCE8_EXT,         GL_UNSIGNED_BYTE,              LoadL8ToRGBA8                        );
+    insertLoadFunction(&map, GL_LUMINANCE8_ALPHA8_EXT,  GL_UNSIGNED_BYTE,              LoadLA8ToRGBA8                       );
+    insertLoadFunction(&map, GL_ALPHA32F_EXT,           GL_FLOAT,                      LoadA32FToRGBA32F                    );
+    insertLoadFunction(&map, GL_LUMINANCE32F_EXT,       GL_FLOAT,                      LoadL32FToRGBA32F                    );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA32F_EXT, GL_FLOAT,                      LoadLA32FToRGBA32F                   );
+    insertLoadFunction(&map, GL_ALPHA16F_EXT,           GL_HALF_FLOAT,                 LoadA16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_ALPHA16F_EXT,           GL_HALF_FLOAT_OES,             LoadA16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_LUMINANCE16F_EXT,       GL_HALF_FLOAT,                 LoadL16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_LUMINANCE16F_EXT,       GL_HALF_FLOAT_OES,             LoadL16FToRGBA16F                    );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA16F_EXT, GL_HALF_FLOAT,                 LoadLA16FToRGBA16F                   );
+    insertLoadFunction(&map, GL_LUMINANCE_ALPHA16F_EXT, GL_HALF_FLOAT_OES,             LoadLA16FToRGBA16F                   );
 
     // From GL_ANGLE_depth_texture
-    insertLoadFunction(&map, GL_DEPTH_COMPONENT32_OES,  GL_UNSIGNED_INT,               loadUintDataToUint24X8               );
+    insertLoadFunction(&map, GL_DEPTH_COMPONENT32_OES,  GL_UNSIGNED_INT,               LoadR32ToR24G8                       );
 
     // From GL_EXT_texture_format_BGRA8888
-    insertLoadFunction(&map, GL_BGRA8_EXT,              GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>         );
-    insertLoadFunction(&map, GL_BGRA4_ANGLEX,           GL_UNSIGNED_SHORT_4_4_4_4_REV_EXT, loadRGBA4444DataToRGBA           );
-    insertLoadFunction(&map, GL_BGRA4_ANGLEX,           GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>         );
-    insertLoadFunction(&map, GL_BGR5_A1_ANGLEX,         GL_UNSIGNED_SHORT_1_5_5_5_REV_EXT, loadRGBA5551DataToRGBA           );
-    insertLoadFunction(&map, GL_BGR5_A1_ANGLEX,         GL_UNSIGNED_BYTE,                  loadToNative<GLubyte, 4>         );
+    insertLoadFunction(&map, GL_BGRA8_EXT,              GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>         );
+    insertLoadFunction(&map, GL_BGRA4_ANGLEX,           GL_UNSIGNED_SHORT_4_4_4_4_REV_EXT, LoadRGBA4ToRGBA8                 );
+    insertLoadFunction(&map, GL_BGRA4_ANGLEX,           GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>         );
+    insertLoadFunction(&map, GL_BGR5_A1_ANGLEX,         GL_UNSIGNED_SHORT_1_5_5_5_REV_EXT, LoadRGB5A1ToRGBA8                );
+    insertLoadFunction(&map, GL_BGR5_A1_ANGLEX,         GL_UNSIGNED_BYTE,                  LoadToNative<GLubyte, 4>         );
 
     // Compressed formats
     // From ES 3.0.1 spec, table 3.16
@@ -348,14 +348,14 @@
     insertLoadFunction(&map, GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC,          GL_UNSIGNED_BYTE, UnimplementedLoadFunction          );
 
     // From GL_EXT_texture_compression_dxt1
-    insertLoadFunction(&map, GL_COMPRESSED_RGB_S3TC_DXT1_EXT,              GL_UNSIGNED_BYTE, loadCompressedBlockDataToNative<4, 4,  8>);
-    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,             GL_UNSIGNED_BYTE, loadCompressedBlockDataToNative<4, 4,  8>);
+    insertLoadFunction(&map, GL_COMPRESSED_RGB_S3TC_DXT1_EXT,              GL_UNSIGNED_BYTE, LoadCompressedToNative<4, 4,  8>);
+    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,             GL_UNSIGNED_BYTE, LoadCompressedToNative<4, 4,  8>);
 
     // From GL_ANGLE_texture_compression_dxt3
-    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE,           GL_UNSIGNED_BYTE, loadCompressedBlockDataToNative<4, 4, 16>);
+    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE,           GL_UNSIGNED_BYTE, LoadCompressedToNative<4, 4, 16>);
 
     // From GL_ANGLE_texture_compression_dxt5
-    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE,           GL_UNSIGNED_BYTE, loadCompressedBlockDataToNative<4, 4, 16>);
+    insertLoadFunction(&map, GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE,           GL_UNSIGNED_BYTE, LoadCompressedToNative<4, 4, 16>);
 
     return map;
 }
@@ -759,17 +759,17 @@
 {
     InternalFormatInitializerMap map;
 
-    map.insert(InternalFormatInitializerPair(GL_RGB8,    initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
-    map.insert(InternalFormatInitializerPair(GL_RGB565,  initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
-    map.insert(InternalFormatInitializerPair(GL_SRGB8,   initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
-    map.insert(InternalFormatInitializerPair(GL_RGB16F,  initialize4ComponentData<GLhalf,   0x0000,     0x0000,     0x0000,     gl::Float16One>));
-    map.insert(InternalFormatInitializerPair(GL_RGB32F,  initialize4ComponentData<GLfloat,  0x00000000, 0x00000000, 0x00000000, gl::Float32One>));
-    map.insert(InternalFormatInitializerPair(GL_RGB8UI,  initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0x01>          ));
-    map.insert(InternalFormatInitializerPair(GL_RGB8I,   initialize4ComponentData<GLbyte,   0x00,       0x00,       0x00,       0x01>          ));
-    map.insert(InternalFormatInitializerPair(GL_RGB16UI, initialize4ComponentData<GLushort, 0x0000,     0x0000,     0x0000,     0x0001>        ));
-    map.insert(InternalFormatInitializerPair(GL_RGB16I,  initialize4ComponentData<GLshort,  0x0000,     0x0000,     0x0000,     0x0001>        ));
-    map.insert(InternalFormatInitializerPair(GL_RGB32UI, initialize4ComponentData<GLuint,   0x00000000, 0x00000000, 0x00000000, 0x00000001>    ));
-    map.insert(InternalFormatInitializerPair(GL_RGB32I,  initialize4ComponentData<GLint,    0x00000000, 0x00000000, 0x00000000, 0x00000001>    ));
+    map.insert(InternalFormatInitializerPair(GL_RGB8,    Initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
+    map.insert(InternalFormatInitializerPair(GL_RGB565,  Initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
+    map.insert(InternalFormatInitializerPair(GL_SRGB8,   Initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0xFF>          ));
+    map.insert(InternalFormatInitializerPair(GL_RGB16F,  Initialize4ComponentData<GLhalf,   0x0000,     0x0000,     0x0000,     gl::Float16One>));
+    map.insert(InternalFormatInitializerPair(GL_RGB32F,  Initialize4ComponentData<GLfloat,  0x00000000, 0x00000000, 0x00000000, gl::Float32One>));
+    map.insert(InternalFormatInitializerPair(GL_RGB8UI,  Initialize4ComponentData<GLubyte,  0x00,       0x00,       0x00,       0x01>          ));
+    map.insert(InternalFormatInitializerPair(GL_RGB8I,   Initialize4ComponentData<GLbyte,   0x00,       0x00,       0x00,       0x01>          ));
+    map.insert(InternalFormatInitializerPair(GL_RGB16UI, Initialize4ComponentData<GLushort, 0x0000,     0x0000,     0x0000,     0x0001>        ));
+    map.insert(InternalFormatInitializerPair(GL_RGB16I,  Initialize4ComponentData<GLshort,  0x0000,     0x0000,     0x0000,     0x0001>        ));
+    map.insert(InternalFormatInitializerPair(GL_RGB32UI, Initialize4ComponentData<GLuint,   0x00000000, 0x00000000, 0x00000000, 0x00000001>    ));
+    map.insert(InternalFormatInitializerPair(GL_RGB32I,  Initialize4ComponentData<GLint,    0x00000000, 0x00000000, 0x00000000, 0x00000001>    ));
 
     return map;
 }
diff --git a/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp b/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp
index 29b5ef5..8f5c626 100644
--- a/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp
@@ -208,7 +208,7 @@
             result = newSurface->LockRect(&lockedRect, &entireRect, 0);
             ASSERT(SUCCEEDED(result));
 
-            initializeFunc(mWidth, mHeight, 1, lockedRect.pBits, lockedRect.Pitch, 0);
+            initializeFunc(mWidth, mHeight, 1, reinterpret_cast<uint8_t*>(lockedRect.pBits), lockedRect.Pitch, 0);
 
             result = newSurface->UnlockRect();
             ASSERT(SUCCEEDED(result));
@@ -405,7 +405,9 @@
         return;
     }
 
-    loadFunction(width, height, depth, input, inputRowPitch, 0, locked.pBits, locked.Pitch, 0);
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, 0,
+                 reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
 
     unlock();
 }
@@ -438,8 +440,9 @@
         return;
     }
 
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch,
-                 locked.pBits, locked.Pitch, 0);
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
+                 reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
 
     unlock();
 }
diff --git a/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp b/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp
index 24c75d7..f5d1da6 100644
--- a/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp
@@ -26,9 +26,9 @@
 typedef bool(*FallbackPredicateFunction)();
 
 template <FallbackPredicateFunction pred, LoadImageFunction prefered, LoadImageFunction fallback>
-static void FallbackLoad(int width, int height, int depth,
-                         const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                         void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+static void FallbackLoad(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
     if (pred())
     {
@@ -40,9 +40,9 @@
     }
 }
 
-static void UnreachableLoad(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+static void UnreachableLoad(size_t width, size_t height, size_t depth,
+                            const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                            uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch) // No argument is read; the body only invokes UNREACHABLE().
 {
     UNREACHABLE();
 }
@@ -80,45 +80,45 @@
     map.insert(D3D9FormatPair(GL_DEPTH24_STENCIL8_OES,             D3D9FormatInfo(D3DFMT_INTZ,          D3DFMT_D24S8,          UnreachableLoad                          )));
     map.insert(D3D9FormatPair(GL_STENCIL_INDEX8,                   D3D9FormatInfo(D3DFMT_UNKNOWN,       D3DFMT_D24S8,          UnreachableLoad                          ))); // TODO: What's the texture format?
 
-    map.insert(D3D9FormatPair(GL_RGBA32F_EXT,                      D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_A32B32G32R32F,  loadToNative<GLfloat, 4>                 )));
-    map.insert(D3D9FormatPair(GL_RGB32F_EXT,                       D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_A32B32G32R32F,  loadToNative3To4<GLfloat, gl::Float32One>)));
-    map.insert(D3D9FormatPair(GL_RG32F_EXT,                        D3D9FormatInfo(D3DFMT_G32R32F,       D3DFMT_G32R32F,        loadToNative<GLfloat, 2>                 )));
-    map.insert(D3D9FormatPair(GL_R32F_EXT,                         D3D9FormatInfo(D3DFMT_R32F,          D3DFMT_R32F,           loadToNative<GLfloat, 1>                 )));
-    map.insert(D3D9FormatPair(GL_ALPHA32F_EXT,                     D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        loadAlphaFloatDataToRGBA                 )));
-    map.insert(D3D9FormatPair(GL_LUMINANCE32F_EXT,                 D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        loadLuminanceFloatDataToRGBA             )));
-    map.insert(D3D9FormatPair(GL_LUMINANCE_ALPHA32F_EXT,           D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        loadLuminanceAlphaFloatDataToRGBA        )));
+    map.insert(D3D9FormatPair(GL_RGBA32F_EXT,                      D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_A32B32G32R32F,  LoadToNative<GLfloat, 4>                 )));
+    map.insert(D3D9FormatPair(GL_RGB32F_EXT,                       D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_A32B32G32R32F,  LoadToNative3To4<GLfloat, gl::Float32One>)));
+    map.insert(D3D9FormatPair(GL_RG32F_EXT,                        D3D9FormatInfo(D3DFMT_G32R32F,       D3DFMT_G32R32F,        LoadToNative<GLfloat, 2>                 )));
+    map.insert(D3D9FormatPair(GL_R32F_EXT,                         D3D9FormatInfo(D3DFMT_R32F,          D3DFMT_R32F,           LoadToNative<GLfloat, 1>                 )));
+    map.insert(D3D9FormatPair(GL_ALPHA32F_EXT,                     D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        LoadA32FToRGBA32F                        )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE32F_EXT,                 D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        LoadL32FToRGBA32F                        )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE_ALPHA32F_EXT,           D3D9FormatInfo(D3DFMT_A32B32G32R32F, D3DFMT_UNKNOWN,        LoadLA32FToRGBA32F                       )));
 
-    map.insert(D3D9FormatPair(GL_RGBA16F_EXT,                      D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_A16B16G16R16F,  loadToNative<GLhalf, 4>                  )));
-    map.insert(D3D9FormatPair(GL_RGB16F_EXT,                       D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_A16B16G16R16F,  loadToNative3To4<GLhalf, gl::Float16One> )));
-    map.insert(D3D9FormatPair(GL_RG16F_EXT,                        D3D9FormatInfo(D3DFMT_G16R16F,       D3DFMT_G16R16F,        loadToNative<GLhalf, 2>                  )));
-    map.insert(D3D9FormatPair(GL_R16F_EXT,                         D3D9FormatInfo(D3DFMT_R16F,          D3DFMT_R16F,           loadToNative<GLhalf, 1>                  )));
-    map.insert(D3D9FormatPair(GL_ALPHA16F_EXT,                     D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        loadAlphaHalfFloatDataToRGBA             )));
-    map.insert(D3D9FormatPair(GL_LUMINANCE16F_EXT,                 D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        loadLuminanceHalfFloatDataToRGBA         )));
-    map.insert(D3D9FormatPair(GL_LUMINANCE_ALPHA16F_EXT,           D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        loadLuminanceAlphaHalfFloatDataToRGBA    )));
+    map.insert(D3D9FormatPair(GL_RGBA16F_EXT,                      D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_A16B16G16R16F,  LoadToNative<GLhalf, 4>                  )));
+    map.insert(D3D9FormatPair(GL_RGB16F_EXT,                       D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_A16B16G16R16F,  LoadToNative3To4<GLhalf, gl::Float16One> )));
+    map.insert(D3D9FormatPair(GL_RG16F_EXT,                        D3D9FormatInfo(D3DFMT_G16R16F,       D3DFMT_G16R16F,        LoadToNative<GLhalf, 2>                  )));
+    map.insert(D3D9FormatPair(GL_R16F_EXT,                         D3D9FormatInfo(D3DFMT_R16F,          D3DFMT_R16F,           LoadToNative<GLhalf, 1>                  )));
+    map.insert(D3D9FormatPair(GL_ALPHA16F_EXT,                     D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        LoadA16FToRGBA16F                        )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE16F_EXT,                 D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        LoadL16FToRGBA16F                        )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE_ALPHA16F_EXT,           D3D9FormatInfo(D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN,        LoadLA16FToRGBA16F                       )));
 
-    map.insert(D3D9FormatPair(GL_ALPHA8_EXT,                       D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       FallbackLoad<gl::supportsSSE2, loadAlphaDataToBGRASSE2, loadAlphaDataToBGRA>)));
+    map.insert(D3D9FormatPair(GL_ALPHA8_EXT,                       D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       FallbackLoad<gl::supportsSSE2, LoadA8ToBGRA8_SSE2, LoadA8ToBGRA8>)));
 
-    map.insert(D3D9FormatPair(GL_RGB8_OES,                         D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       loadRGBUByteDataToBGRX                    )));
-    map.insert(D3D9FormatPair(GL_RGB565,                           D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       loadRGB565DataToBGRA                      )));
-    map.insert(D3D9FormatPair(GL_RGBA8_OES,                        D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       FallbackLoad<gl::supportsSSE2, loadRGBAUByteDataToBGRASSE2, loadRGBAUByteDataToBGRA>)));
-    map.insert(D3D9FormatPair(GL_RGBA4,                            D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       loadRGBA4444DataToBGRA                    )));
-    map.insert(D3D9FormatPair(GL_RGB5_A1,                          D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       loadRGBA5551DataToBGRA                    )));
-    map.insert(D3D9FormatPair(GL_R8_EXT,                           D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       loadRUByteDataToBGRX                      )));
-    map.insert(D3D9FormatPair(GL_RG8_EXT,                          D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       loadRGUByteDataToBGRX                     )));
+    map.insert(D3D9FormatPair(GL_RGB8_OES,                         D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       LoadRGB8ToBGRX8                          )));
+    map.insert(D3D9FormatPair(GL_RGB565,                           D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       LoadR5G6B5ToBGRA8                        )));
+    map.insert(D3D9FormatPair(GL_RGBA8_OES,                        D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       FallbackLoad<gl::supportsSSE2, LoadRGBA8ToBGRA8_SSE2, LoadRGBA8ToBGRA8>)));
+    map.insert(D3D9FormatPair(GL_RGBA4,                            D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       LoadRGBA4ToBGRA8                         )));
+    map.insert(D3D9FormatPair(GL_RGB5_A1,                          D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       LoadRGB5A1ToBGRA8                        )));
+    map.insert(D3D9FormatPair(GL_R8_EXT,                           D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       LoadR8ToBGRX8                            )));
+    map.insert(D3D9FormatPair(GL_RG8_EXT,                          D3D9FormatInfo(D3DFMT_X8R8G8B8,      D3DFMT_X8R8G8B8,       LoadRG8ToBGRX8                           )));
 
-    map.insert(D3D9FormatPair(GL_BGRA8_EXT,                        D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       loadToNative<GLubyte, 4>                  )));
-    map.insert(D3D9FormatPair(GL_BGRA4_ANGLEX,                     D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       loadRGBA4444DataToRGBA                    )));
-    map.insert(D3D9FormatPair(GL_BGR5_A1_ANGLEX,                   D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       loadRGBA5551DataToRGBA                    )));
+    map.insert(D3D9FormatPair(GL_BGRA8_EXT,                        D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       LoadToNative<GLubyte, 4>                 )));
+    map.insert(D3D9FormatPair(GL_BGRA4_ANGLEX,                     D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       LoadBGRA4ToBGRA8                         )));
+    map.insert(D3D9FormatPair(GL_BGR5_A1_ANGLEX,                   D3D9FormatInfo(D3DFMT_A8R8G8B8,      D3DFMT_A8R8G8B8,       LoadBGR5A1ToBGRA8                        )));
 
-    map.insert(D3D9FormatPair(GL_COMPRESSED_RGB_S3TC_DXT1_EXT,     D3D9FormatInfo(D3DFMT_DXT1,          D3DFMT_UNKNOWN,        loadCompressedBlockDataToNative<4, 4,  8> )));
-    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,    D3D9FormatInfo(D3DFMT_DXT1,          D3DFMT_UNKNOWN,        loadCompressedBlockDataToNative<4, 4,  8> )));
-    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE,  D3D9FormatInfo(D3DFMT_DXT3,          D3DFMT_UNKNOWN,        loadCompressedBlockDataToNative<4, 4, 16> )));
-    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE,  D3D9FormatInfo(D3DFMT_DXT5,          D3DFMT_UNKNOWN,        loadCompressedBlockDataToNative<4, 4, 16> )));
+    map.insert(D3D9FormatPair(GL_COMPRESSED_RGB_S3TC_DXT1_EXT,     D3D9FormatInfo(D3DFMT_DXT1,          D3DFMT_UNKNOWN,        LoadCompressedToNative<4, 4,  8>         )));
+    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,    D3D9FormatInfo(D3DFMT_DXT1,          D3DFMT_UNKNOWN,        LoadCompressedToNative<4, 4,  8>         )));
+    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT3_ANGLE,  D3D9FormatInfo(D3DFMT_DXT3,          D3DFMT_UNKNOWN,        LoadCompressedToNative<4, 4, 16>         )));
+    map.insert(D3D9FormatPair(GL_COMPRESSED_RGBA_S3TC_DXT5_ANGLE,  D3D9FormatInfo(D3DFMT_DXT5,          D3DFMT_UNKNOWN,        LoadCompressedToNative<4, 4, 16>         )));
 
     // These formats require checking if the renderer supports D3DFMT_L8 or D3DFMT_A8L8 and
     // then changing the format and loading function appropriately.
-    map.insert(D3D9FormatPair(GL_LUMINANCE8_EXT,                   D3D9FormatInfo(D3DFMT_L8,            D3DFMT_UNKNOWN,        loadToNative<GLubyte, 1>                  )));
-    map.insert(D3D9FormatPair(GL_LUMINANCE8_ALPHA8_EXT,            D3D9FormatInfo(D3DFMT_A8L8,          D3DFMT_UNKNOWN,        loadToNative<GLubyte, 2>                  )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE8_EXT,                   D3D9FormatInfo(D3DFMT_L8,            D3DFMT_UNKNOWN,        LoadToNative<GLubyte, 1>                 )));
+    map.insert(D3D9FormatPair(GL_LUMINANCE8_ALPHA8_EXT,            D3D9FormatInfo(D3DFMT_A8L8,          D3DFMT_UNKNOWN,        LoadToNative<GLubyte, 2>                 )));
 
     return map;
 }
@@ -275,8 +275,8 @@
 {
     InternalFormatInitialzerMap map;
 
-    map.insert(InternalFormatInitialzerPair(GL_RGB16F,  initialize4ComponentData<GLhalf,   0x0000,     0x0000,     0x0000,     gl::Float16One>));
-    map.insert(InternalFormatInitialzerPair(GL_RGB32F,  initialize4ComponentData<GLfloat,  0x00000000, 0x00000000, 0x00000000, gl::Float32One>));
+    map.insert(InternalFormatInitialzerPair(GL_RGB16F,  Initialize4ComponentData<GLhalf,   0x0000,     0x0000,     0x0000,     gl::Float16One>));
+    map.insert(InternalFormatInitialzerPair(GL_RGB32F,  Initialize4ComponentData<GLfloat,  0x00000000, 0x00000000, 0x00000000, gl::Float32One>));
 
     return map;
 }
diff --git a/src/libGLESv2/renderer/loadimage.cpp b/src/libGLESv2/renderer/loadimage.cpp
index 62870c5..4a29460 100644
--- a/src/libGLESv2/renderer/loadimage.cpp
+++ b/src/libGLESv2/renderer/loadimage.cpp
@@ -1,6 +1,6 @@
 #include "precompiled.h"
 //
-// Copyright (c) 0013 The ANGLE Project Authors. All rights reserved.
+// Copyright (c) 2013-2014 The ANGLE Project Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
@@ -12,20 +12,64 @@
 namespace rx
 {
 
-void loadAlphaDataToBGRA(int width, int height, int depth,
-                         const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                         void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadA8ToRGBA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch); // one input byte per pixel
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch); // one 32-bit word per output pixel
+            for (size_t x = 0; x < width; x++)
+            {
+                dest[x] = static_cast<uint32_t>(source[x]) << 24; // input byte lands in bits 24-31; the low 24 bits are written as zero
+            }
+        }
+    }
+}
+
+void LoadA8ToBGRA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    // Same as loading to RGBA: LoadA8ToRGBA8 only fills the top byte of each 32-bit pixel and zeroes the other three, so their order does not matter.
+    LoadA8ToRGBA8(width, height, depth, input, inputRowPitch, inputDepthPitch, output, outputRowPitch, outputDepthPitch);
+}
+
+void LoadA32FToRGBA32F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch); // one float per input pixel
+            float *dest = OffsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch); // four floats per output pixel
+            for (size_t x = 0; x < width; x++)
+            {
+                dest[4 * x + 0] = 0.0f; // first three components of each output pixel are cleared
+                dest[4 * x + 1] = 0.0f;
+                dest[4 * x + 2] = 0.0f;
+                dest[4 * x + 3] = source[x]; // fourth component carries the input value unchanged
+            }
+        }
+    }
+}
+
+void LoadA16FToRGBA16F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = 0;
                 dest[4 * x + 1] = 0;
@@ -36,104 +80,17 @@
     }
 }
 
-void loadAlphaDataToNative(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadL8ToRGBA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width);
-        }
-    }
-}
-
-void loadAlphaFloatDataToRGBA(int width, int height, int depth,
-                              const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const float *source = NULL;
-    float *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                dest[4 * x + 0] = 0;
-                dest[4 * x + 1] = 0;
-                dest[4 * x + 2] = 0;
-                dest[4 * x + 3] = source[x];
-            }
-        }
-    }
-}
-
-void loadAlphaHalfFloatDataToRGBA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                dest[4 * x + 0] = 0;
-                dest[4 * x + 1] = 0;
-                dest[4 * x + 2] = 0;
-                dest[4 * x + 3] = source[x];
-            }
-        }
-    }
-}
-
-void loadLuminanceDataToNative(int width, int height, int depth,
-                               const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                               void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width);
-        }
-    }
-}
-
-void loadLuminanceDataToBGRA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = source[x];
                 dest[4 * x + 1] = source[x];
@@ -144,20 +101,25 @@
     }
 }
 
-void loadLuminanceFloatDataToRGBA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadL8ToBGRA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    float *dest = NULL;
+    // Same as loading to RGBA
+    LoadL8ToRGBA8(width, height, depth, input, inputRowPitch, inputDepthPitch, output, outputRowPitch, outputDepthPitch);
+}
 
-    for (int z = 0; z < depth; z++)
+void LoadL32FToRGBA32F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            float *dest = OffsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = source[x];
                 dest[4 * x + 1] = source[x];
@@ -168,43 +130,17 @@
     }
 }
 
-void loadLuminanceFloatDataToRGB(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadL16FToRGBA16F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    float *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                dest[3 * x + 0] = source[x];
-                dest[3 * x + 1] = source[x];
-                dest[3 * x + 2] = source[x];
-            }
-        }
-    }
-}
-
-void loadLuminanceHalfFloatDataToRGBA(int width, int height, int depth,
-                                      const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                      void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = source[x];
                 dest[4 * x + 1] = source[x];
@@ -215,112 +151,88 @@
     }
 }
 
-void loadLuminanceAlphaDataToNative(int width, int height, int depth,
-                                    const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                    void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadLA8ToRGBA8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            memcpy(dest, source, width * 2);
-        }
-    }
-}
-
-void loadLuminanceAlphaDataToBGRA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                dest[4 * x + 0] = source[2*x+0];
-                dest[4 * x + 1] = source[2*x+0];
-                dest[4 * x + 2] = source[2*x+0];
-                dest[4 * x + 3] = source[2*x+1];
+                dest[4 * x + 0] = source[2 * x + 0];
+                dest[4 * x + 1] = source[2 * x + 0];
+                dest[4 * x + 2] = source[2 * x + 0];
+                dest[4 * x + 3] = source[2 * x + 1];
             }
         }
     }
 }
 
-void loadLuminanceAlphaFloatDataToRGBA(int width, int height, int depth,
-                                       const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                       void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadLA8ToBGRA8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    float *dest = NULL;
+    // Same as loading to RGBA
+    LoadLA8ToRGBA8(width, height, depth, input, inputRowPitch, inputDepthPitch, output, outputRowPitch, outputDepthPitch);
+}
 
-    for (int z = 0; z < depth; z++)
+void LoadLA32FToRGBA32F(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            float *dest = OffsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                dest[4 * x + 0] = source[2*x+0];
-                dest[4 * x + 1] = source[2*x+0];
-                dest[4 * x + 2] = source[2*x+0];
-                dest[4 * x + 3] = source[2*x+1];
+                dest[4 * x + 0] = source[2 * x + 0];
+                dest[4 * x + 1] = source[2 * x + 0];
+                dest[4 * x + 2] = source[2 * x + 0];
+                dest[4 * x + 3] = source[2 * x + 1];
             }
         }
     }
 }
 
-void loadLuminanceAlphaHalfFloatDataToRGBA(int width, int height, int depth,
-                                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadLA16FToRGBA16F(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                dest[4 * x + 0] = source[2*x+0];
-                dest[4 * x + 1] = source[2*x+0];
-                dest[4 * x + 2] = source[2*x+0];
-                dest[4 * x + 3] = source[2*x+1];
+                dest[4 * x + 0] = source[2 * x + 0];
+                dest[4 * x + 1] = source[2 * x + 0];
+                dest[4 * x + 2] = source[2 * x + 0];
+                dest[4 * x + 3] = source[2 * x + 1];
             }
         }
     }
 }
 
-void loadRGBUByteDataToBGRX(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB8ToBGRX8(size_t width, size_t height, size_t depth,
+                     const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = source[x * 3 + 2];
                 dest[4 * x + 1] = source[x * 3 + 1];
@@ -331,20 +243,17 @@
     }
 }
 
-void loadRGUByteDataToBGRX(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRG8ToBGRX8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = 0x00;
                 dest[4 * x + 1] = source[x * 2 + 1];
@@ -355,20 +264,17 @@
     }
 }
 
-void loadRUByteDataToBGRX(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadR8ToBGRX8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[4 * x + 0] = 0x00;
                 dest[4 * x + 1] = 0x00;
@@ -379,228 +285,82 @@
     }
 }
 
-void loadRGBUByteDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadR5G6B5ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                dest[4 * x + 0] = source[x * 3 + 0];
-                dest[4 * x + 1] = source[x * 3 + 1];
-                dest[4 * x + 2] = source[x * 3 + 2];
+                uint16_t rgb = source[x];
+                dest[4 * x + 0] = ((rgb & 0x001F) << 3) | ((rgb & 0x001F) >> 2);
+                dest[4 * x + 1] = ((rgb & 0x07E0) >> 3) | ((rgb & 0x07E0) >> 9);
+                dest[4 * x + 2] = ((rgb & 0xF800) >> 8) | ((rgb & 0xF800) >> 13);
                 dest[4 * x + 3] = 0xFF;
             }
         }
     }
 }
 
-void loadRGBSByteDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadR5G6B5ToRGBA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const char *source = NULL;
-    char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                dest[4 * x + 0] = source[x * 3 + 0];
-                dest[4 * x + 1] = source[x * 3 + 1];
-                dest[4 * x + 2] = source[x * 3 + 2];
-                dest[4 * x + 3] = 0x7F;
-            }
-        }
-    }
-}
-
-void loadRGB565DataToBGRA(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                unsigned short rgba = source[x];
-                dest[4 * x + 0] = ((rgba & 0x001F) << 3) | ((rgba & 0x001F) >> 2);
-                dest[4 * x + 1] = ((rgba & 0x07E0) >> 3) | ((rgba & 0x07E0) >> 9);
-                dest[4 * x + 2] = ((rgba & 0xF800) >> 8) | ((rgba & 0xF800) >> 13);
+                uint16_t rgb = source[x];
+                dest[4 * x + 0] = ((rgb & 0xF800) >> 8) | ((rgb & 0xF800) >> 13);
+                dest[4 * x + 1] = ((rgb & 0x07E0) >> 3) | ((rgb & 0x07E0) >> 9);
+                dest[4 * x + 2] = ((rgb & 0x001F) << 3) | ((rgb & 0x001F) >> 2);
                 dest[4 * x + 3] = 0xFF;
             }
         }
     }
 }
 
-void loadRGB565DataToRGBA(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGBA8ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned short rgba = source[x];
-                dest[4 * x + 0] = ((rgba & 0xF800) >> 8) | ((rgba & 0xF800) >> 13);
-                dest[4 * x + 1] = ((rgba & 0x07E0) >> 3) | ((rgba & 0x07E0) >> 9);
-                dest[4 * x + 2] = ((rgba & 0x001F) << 3) | ((rgba & 0x001F) >> 2);
-                dest[4 * x + 3] = 0xFF;
-            }
-        }
-    }
-}
-
-void loadRGBFloatDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const float *source = NULL;
-    float *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                dest[4 * x + 0] = source[x * 3 + 0];
-                dest[4 * x + 1] = source[x * 3 + 1];
-                dest[4 * x + 2] = source[x * 3 + 2];
-                dest[4 * x + 3] = 1.0f;
-            }
-        }
-    }
-}
-
-void loadRGBFloatDataToNative(int width, int height, int depth,
-                              const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const float *source = NULL;
-    float *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width * 12);
-        }
-    }
-}
-
-void loadRGBHalfFloatDataToRGBA(int width, int height, int depth,
-                                const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
-            {
-                dest[4 * x + 0] = source[x * 3 + 0];
-                dest[4 * x + 1] = source[x * 3 + 1];
-                dest[4 * x + 2] = source[x * 3 + 2];
-                dest[4 * x + 3] = 0x3C00; // SEEEEEMMMMMMMMMM, S = 0, E = 15, M = 0: 16bit flpt representation of 1
-            }
-        }
-    }
-}
-
-void loadRGBAUByteDataToBGRA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned int *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
-            {
-                unsigned int rgba = source[x];
+                uint32_t rgba = source[x];
                 dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
             }
         }
     }
 }
 
-void loadRGBAUByteDataToNative(int width, int height, int depth,
-                               const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                               void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGBA4ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned int *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            memcpy(dest, source, width * 4);
-        }
-    }
-}
-
-void loadRGBA4444DataToBGRA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned short rgba = source[x];
+                uint16_t rgba = source[x];
                 dest[4 * x + 0] = ((rgba & 0x00F0) << 0) | ((rgba & 0x00F0) >> 4);
                 dest[4 * x + 1] = ((rgba & 0x0F00) >> 4) | ((rgba & 0x0F00) >> 8);
                 dest[4 * x + 2] = ((rgba & 0xF000) >> 8) | ((rgba & 0xF000) >> 12);
@@ -610,22 +370,19 @@
     }
 }
 
-void loadRGBA4444DataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGBA4ToRGBA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned short rgba = source[x];
+                uint16_t rgba = source[x];
                 dest[4 * x + 0] = ((rgba & 0xF000) >> 8) | ((rgba & 0xF000) >> 12);
                 dest[4 * x + 1] = ((rgba & 0x0F00) >> 4) | ((rgba & 0x0F00) >> 8);
                 dest[4 * x + 2] = ((rgba & 0x00F0) << 0) | ((rgba & 0x00F0) >> 4);
@@ -635,22 +392,41 @@
     }
 }
 
-void loadRGBA5551DataToBGRA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadBGRA4ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch); // row (y, z) read as one 16-bit word per texel
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch); // row (y, z) written as four bytes per texel
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned short rgba = source[x];
+                uint16_t bgra = source[x]; // four packed 4-bit fields
+                dest[4 * x + 0] = ((bgra & 0xF000) >> 8) | ((bgra & 0xF000) >> 12); // bits 15-12, nibble copied into both halves of the byte
+                dest[4 * x + 1] = ((bgra & 0x0F00) >> 4) | ((bgra & 0x0F00) >> 8); // bits 11-8, same nibble replication
+                dest[4 * x + 2] = ((bgra & 0x00F0) << 0) | ((bgra & 0x00F0) >> 4); // bits 7-4, same nibble replication
+                dest[4 * x + 3] = ((bgra & 0x000F) << 4) | ((bgra & 0x000F) >> 0); // bits 3-0, same nibble replication
+            }
+        }
+    }
+}
+
+void LoadRGB5A1ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                uint16_t rgba = source[x];
                 dest[4 * x + 0] = ((rgba & 0x003E) << 2) | ((rgba & 0x003E) >> 3);
                 dest[4 * x + 1] = ((rgba & 0x07C0) >> 3) | ((rgba & 0x07C0) >> 8);
                 dest[4 * x + 2] = ((rgba & 0xF800) >> 8) | ((rgba & 0xF800) >> 13);
@@ -659,22 +435,20 @@
         }
     }
 }
-void loadRGBA5551DataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned short *source = NULL;
-    unsigned char *dest = NULL;
 
-    for (int z = 0; z < depth; z++)
+void LoadRGB5A1ToRGBA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned short rgba = source[x];
+                uint16_t rgba = source[x];
                 dest[4 * x + 0] = ((rgba & 0xF800) >> 8) | ((rgba & 0xF800) >> 13);
                 dest[4 * x + 1] = ((rgba & 0x07C0) >> 3) | ((rgba & 0x07C0) >> 8);
                 dest[4 * x + 2] = ((rgba & 0x003E) << 2) | ((rgba & 0x003E) >> 3);
@@ -684,95 +458,42 @@
     }
 }
 
-void loadRGBAFloatDataToRGBA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+
+void LoadBGR5A1ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    float *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<float>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width * 16);
-        }
-    }
-}
-
-void loadRGBAHalfFloatDataToRGBA(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width * 8);
-        }
-    }
-}
-
-void loadBGRADataToBGRA(int width, int height, int depth,
-                        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned char *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned char>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width*4);
-        }
-    }
-}
-
-void loadRGBA2101010ToNative(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned int *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-            memcpy(dest, source, width * sizeof(unsigned int));
-        }
-    }
-}
-
-void loadRGBA2101010ToRGBA(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned int *source = NULL;
-    unsigned char *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned char>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch); // row (y, z) read as one 16-bit word per texel
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch); // row (y, z) written as four bytes per texel
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned int rgba = source[x];
+                uint16_t bgra = source[x]; // three packed 5-bit fields plus a 1-bit flag in bit 0
+                dest[4 * x + 0] = ((bgra & 0xF800) >> 8) | ((bgra & 0xF800) >> 13); // bits 15-11 widened to 8 bits; the field's top 3 bits fill the low bits
+                dest[4 * x + 1] = ((bgra & 0x07C0) >> 3) | ((bgra & 0x07C0) >> 8); // bits 10-6, same 5-to-8 bit expansion
+                dest[4 * x + 2] = ((bgra & 0x003E) << 2) | ((bgra & 0x003E) >> 3); // bits 5-1, same 5-to-8 bit expansion
+                dest[4 * x + 3] = (bgra & 0x0001) ? 0xFF : 0; // bit 0 expands to all-ones or zero
+            }
+        }
+    }
+}
+
+void LoadRGB10A2ToRGBA8(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                uint32_t rgba = source[x];
                 dest[4 * x + 0] = (rgba & 0x000003FF) >>  2;
                 dest[4 * x + 1] = (rgba & 0x000FFC00) >> 12;
                 dest[4 * x + 2] = (rgba & 0x3FF00000) >> 22;
@@ -782,21 +503,17 @@
     }
 }
 
-void loadRGBHalfFloatDataTo999E5(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB16FToRGB9E5(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = gl::convertRGBFloatsTo999E5(gl::float16ToFloat32(source[x * 3 + 0]),
                                                       gl::float16ToFloat32(source[x * 3 + 1]),
@@ -806,21 +523,17 @@
     }
 }
 
-void loadRGBFloatDataTo999E5(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB32FToRGB9E5(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = gl::convertRGBFloatsTo999E5(source[x * 3 + 0], source[x * 3 + 1], source[x * 3 + 2]);
             }
@@ -828,21 +541,17 @@
     }
 }
 
-void loadRGBHalfFloatDataTo111110Float(int width, int height, int depth,
-                                       const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                       void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB16FToRG11B10F(size_t width, size_t height, size_t depth,
+                          const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                          uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned short *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned short>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint16_t *source = OffsetDataPointer<uint16_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = (gl::float32ToFloat11(gl::float16ToFloat32(source[x * 3 + 0])) <<  0) |
                           (gl::float32ToFloat11(gl::float16ToFloat32(source[x * 3 + 1])) << 11) |
@@ -852,21 +561,17 @@
     }
 }
 
-void loadRGBFloatDataTo111110Float(int width, int height, int depth,
-                                   const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                   void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB32FToRG11B10F(size_t width, size_t height, size_t depth,
+                          const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                          uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = (gl::float32ToFloat11(source[x * 3 + 0]) <<  0) |
                           (gl::float32ToFloat11(source[x * 3 + 1]) << 11) |
@@ -876,46 +581,37 @@
     }
 }
 
-
-void loadG8R24DataToR24G8(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadG8R24ToR24G8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned int *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<const unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch); // row (y, z) read as one 32-bit word per texel
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch); // row (y, z) written as one 32-bit word per texel
+            for (size_t x = 0; x < width; x++)
             {
-                unsigned int d = source[x] >> 8;
-                unsigned int s = source[x] & 0xFF;
+                uint32_t d = source[x] >> 8;   // upper 24 bits of the source word, moved down to bits 23-0
+                uint32_t s = source[x] & 0xFF; // low 8 bits; must stay 32-bit unsigned so (s << 24) below is not a shift on a promoted signed int
                 dest[x] = d | (s << 24);
             }
         }
     }
 }
 
-void loadFloatRGBDataToHalfFloatRGBA(int width, int height, int depth,
-                                     const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                     void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGB32FToRGBA16F(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const float *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x * 4 + 0] = gl::float32ToFloat16(source[x * 3 + 0]);
                 dest[x * 4 + 1] = gl::float32ToFloat16(source[x * 3 + 1]);
@@ -926,21 +622,17 @@
     }
 }
 
-void loadUintDataToUshort(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadR32ToR16(size_t width, size_t height, size_t depth,
+                  const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                  uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned int *source = NULL;
-    unsigned short *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = source[x] >> 16;
             }
@@ -948,21 +640,18 @@
     }
 }
 
-void loadUintDataToUint24X8(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadR32ToR24G8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
 {
-    const unsigned int *source = NULL;
-    unsigned int *dest = NULL;
-
-    for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
     {
-        for (int y = 0; y < height; y++)
+        for (size_t y = 0; y < height; y++)
         {
-            source = offsetDataPointer<const unsigned int>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned int>(output, y, z, outputRowPitch, outputDepthPitch);
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
 
-            for (int x = 0; x < width; x++)
+            for (size_t x = 0; x < width; x++)
             {
                 dest[x] = source[x] >> 8;
             }
diff --git a/src/libGLESv2/renderer/loadimage.h b/src/libGLESv2/renderer/loadimage.h
index 8b1066a..bcdff24 100644
--- a/src/libGLESv2/renderer/loadimage.h
+++ b/src/libGLESv2/renderer/loadimage.h
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2013 The ANGLE Project Authors. All rights reserved.
+// Copyright (c) 2013-2014 The ANGLE Project Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
@@ -9,339 +9,185 @@
 #ifndef LIBGLESV2_RENDERER_LOADIMAGE_H_
 #define LIBGLESV2_RENDERER_LOADIMAGE_H_
 
-#include "common/mathutil.h"
+#include "libGLESv2/angletypes.h"
+
+#include <cstdint>
 
 namespace rx
 {
 
-template <typename T>
-inline static T *offsetDataPointer(void *data, int y, int z, int rowPitch, int depthPitch)
-{
-    return reinterpret_cast<T*>(reinterpret_cast<unsigned char*>(data) + (y * rowPitch) + (z * depthPitch));
-}
+void LoadA8ToRGBA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadA8ToBGRA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadA32FToRGBA32F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadA16FToRGBA16F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadL8ToRGBA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadL8ToBGRA8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadL32FToRGBA32F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadL16FToRGBA16F(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadLA8ToRGBA8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadLA8ToBGRA8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadLA32FToRGBA32F(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadLA16FToRGBA16F(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB8ToBGRX8(size_t width, size_t height, size_t depth,
+                     const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRG8ToBGRX8(size_t width, size_t height, size_t depth,
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadR8ToBGRX8(size_t width, size_t height, size_t depth,
+                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadR5G6B5ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadR5G6B5ToRGBA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
+                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGBA8ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGBA4ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGBA4ToRGBA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadBGRA4ToBGRA8(size_t width, size_t height, size_t depth,
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB5A1ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB5A1ToRGBA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadBGR5A1ToBGRA8(size_t width, size_t height, size_t depth,
+                       const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                       uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB10A2ToRGBA8(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB16FToRGB9E5(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB32FToRGB9E5(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB16FToRG11B10F(size_t width, size_t height, size_t depth,
+                          const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                          uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+void LoadRGB32FToRG11B10F(size_t width, size_t height, size_t depth, // NOTE(review): appears to replace loadRGBFloatDataTo111110Float - confirm.
+                          const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // presumably 32-bit float RGB source - confirm
+                          uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // presumably packed 11-11-10 float destination - confirm
+
+void LoadG8R24ToR24G8(size_t width, size_t height, size_t depth, // NOTE(review): appears to replace loadG8R24DataToR24G8 - confirm.
+                      const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // presumably an 8-bit field packed with a 24-bit field - confirm
+                      uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // presumably the same two fields in swapped order - confirm
+
+template <typename type, size_t componentCount> // Defined in loadimage.inl: plain memcpy of the pixel data, no per-pixel conversion.
+inline void LoadToNative(size_t width, size_t height, size_t depth, // each row holds width * componentCount elements of `type`
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // pitches are byte offsets between rows / depth slices
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
+
+template <typename type, uint32_t fourthComponentBits> // Defined in loadimage.inl: widens 3-component pixels to 4 components.
+inline void LoadToNative3To4(size_t width, size_t height, size_t depth, // the added component is gl::bitCast<type>(fourthComponentBits)
+                             const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // read as 3 elements of `type` per pixel
+                             uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // written as 4 elements of `type` per pixel
+
+template <size_t componentCount> // Defined in loadimage.inl: converts every component with gl::float32ToFloat16.
+inline void Load32FTo16F(size_t width, size_t height, size_t depth, // width * componentCount values are converted per row
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // rows are read as float
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // rows are written as uint16_t
+
+void LoadRGB32FToRGBA16F(size_t width, size_t height, size_t depth, // NOTE(review): appears to replace loadFloatRGBDataToHalfFloatRGBA - confirm.
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // presumably 3 x 32-bit float per pixel - confirm
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // presumably 4 x 16-bit float per pixel - confirm
+
+template <size_t blockWidth, size_t blockHeight, size_t blockSize> // Defined in loadimage.inl: copies block rows verbatim with memcpy.
+inline void LoadCompressedToNative(size_t width, size_t height, size_t depth, // width/height are rounded up to whole blocks
+                                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // row pitch is applied per block row
+                                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // blockSize is the byte count copied per block
+
+void LoadR32ToR16(size_t width, size_t height, size_t depth, // NOTE(review): appears to replace loadUintDataToUshort - confirm.
+                  const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // presumably one 32-bit unsigned value per pixel - confirm
+                  uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // presumably one 16-bit unsigned value per pixel - confirm
+
+template <typename type, uint32_t firstBits, uint32_t secondBits, uint32_t thirdBits, uint32_t fourthBits> // Defined in loadimage.inl.
+inline void Initialize4ComponentData(size_t width, size_t height, size_t depth, // fills every pixel with the four gl::bitCast<type>(...Bits) constants
+                                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // output only: there is no source image
+
+void LoadR32ToR24G8(size_t width, size_t height, size_t depth, // NOTE(review): appears to replace loadUintDataToUint24X8 - confirm.
+                    const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, // presumably one 32-bit unsigned value per pixel - confirm
+                    uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch); // presumably a 24-bit field plus an 8-bit field per pixel - confirm
 
 template <typename T>
-inline static const T *offsetDataPointer(const void *data, int y, int z, int rowPitch, int depthPitch)
-{
-    return reinterpret_cast<const T*>(reinterpret_cast<const unsigned char*>(data) + (y * rowPitch) + (z * depthPitch));
-}
+inline T *OffsetDataPointer(uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch); // Mutable overload, defined in loadimage.inl: data + y * rowPitch + z * depthPitch, cast to T*.
 
-void loadAlphaDataToBGRA(int width, int height, int depth,
-                         const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                         void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadAlphaDataToNative(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadAlphaDataToBGRASSE2(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadAlphaFloatDataToRGBA(int width, int height, int depth,
-                              const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadAlphaHalfFloatDataToRGBA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceDataToNative(int width, int height, int depth,
-                               const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                               void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceDataToBGRA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceFloatDataToRGBA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceFloatDataToRGB(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceHalfFloatDataToRGBA(int width, int height, int depth,
-                                      const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                      void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceAlphaDataToNative(int width, int height, int depth,
-                                    const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                    void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceAlphaDataToBGRA(int width, int height, int depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceAlphaFloatDataToRGBA(int width, int height, int depth,
-                                       const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                       void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadLuminanceAlphaHalfFloatDataToRGBA(int width, int height, int depth,
-                                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBUByteDataToBGRX(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGUByteDataToBGRX(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRUByteDataToBGRX(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBUByteDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBSByteDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGB565DataToBGRA(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGB565DataToRGBA(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBFloatDataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBFloatDataToNative(int width, int height, int depth,
-                              const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBHalfFloatDataToRGBA(int width, int height, int depth,
-                                const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBAUByteDataToBGRASSE2(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBAUByteDataToBGRA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBAUByteDataToNative(int width, int height, int depth,
-                               const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                               void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA4444DataToBGRA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA4444DataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA5551DataToBGRA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA5551DataToRGBA(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBAFloatDataToRGBA(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBAHalfFloatDataToRGBA(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadBGRADataToBGRA(int width, int height, int depth,
-                        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA2101010ToNative(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBA2101010ToRGBA(int width, int height, int depth,
-                           const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                           void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBHalfFloatDataTo999E5(int width, int height, int depth,
-                                 const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                 void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBFloatDataTo999E5(int width, int height, int depth,
-                             const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                             void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBHalfFloatDataTo111110Float(int width, int height, int depth,
-                                       const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                       void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadRGBFloatDataTo111110Float(int width, int height, int depth,
-                                   const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                   void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-void loadG8R24DataToR24G8(int width, int height, int depth,
-                        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-template <typename type, unsigned int componentCount>
-void loadToNative(int width, int height, int depth,
-                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const unsigned int rowSize = width * sizeof(type) * componentCount;
-    const unsigned int layerSize = rowSize * height;
-    const unsigned int imageSize = layerSize * depth;
-
-    if (layerSize == inputDepthPitch && layerSize == outputDepthPitch)
-    {
-        ASSERT(rowSize == inputRowPitch && rowSize == outputRowPitch);
-        memcpy(output, input, imageSize);
-    }
-    else if (rowSize == inputRowPitch && rowSize == outputRowPitch)
-    {
-        for (int z = 0; z < depth; z++)
-        {
-            const type *source = offsetDataPointer<type>(input, 0, z, inputRowPitch, inputDepthPitch);
-            type *dest = offsetDataPointer<type>(output, 0, z, outputRowPitch, outputDepthPitch);
-
-            memcpy(dest, source, layerSize);
-        }
-    }
-    else
-    {
-        for (int z = 0; z < depth; z++)
-        {
-            for (int y = 0; y < height; y++)
-            {
-                const type *source = offsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
-                type *dest = offsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
-                memcpy(dest, source, width * sizeof(type) * componentCount);
-            }
-        }
-    }
-}
-
-template <typename type, unsigned int fourthComponentBits>
-void loadToNative3To4(int width, int height, int depth,
-                      const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                      void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const type *source = NULL;
-    type *dest = NULL;
-
-    const unsigned int fourthBits = fourthComponentBits;
-    const type fourthValue = *reinterpret_cast<const type*>(&fourthBits);
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
-            {
-                dest[x * 4 + 0] = source[x * 3 + 0];
-                dest[x * 4 + 1] = source[x * 3 + 1];
-                dest[x * 4 + 2] = source[x * 3 + 2];
-                dest[x * 4 + 3] = fourthValue;
-            }
-        }
-    }
-}
-
-template <unsigned int componentCount>
-void loadFloatDataToHalfFloat(int width, int height, int depth,
-                              const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    const float *source = NULL;
-    unsigned short *dest = NULL;
-
-    const int elementWidth = componentCount * width;
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            source = offsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
-            dest = offsetDataPointer<unsigned short>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < elementWidth; x++)
-            {
-                dest[x] = gl::float32ToFloat16(source[x]);
-            }
-        }
-    }
-}
-
-void loadFloatRGBDataToHalfFloatRGBA(int width, int height, int depth,
-                                     const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                     void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-template <unsigned int blockWidth, unsigned int blockHeight, unsigned int blockSize>
-void loadCompressedBlockDataToNative(int width, int height, int depth,
-                                     const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                                     void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    int columns = (width + (blockWidth - 1)) / blockWidth;
-    int rows = (height + (blockHeight - 1)) / blockHeight;
-
-    for (int z = 0; z < depth; ++z)
-    {
-        for (int y = 0; y < rows; ++y)
-        {
-            void *source = (void*)((char*)input + y * inputRowPitch + z * inputDepthPitch);
-            void *dest = (void*)((char*)output + y * outputRowPitch + z * outputDepthPitch);
-
-            memcpy(dest, source, columns * blockSize);
-        }
-    }
-}
-
-void loadUintDataToUshort(int width, int height, int depth,
-                          const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                          void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
-
-template <typename type, unsigned int firstBits, unsigned int secondBits, unsigned int thirdBits, unsigned int fourthBits>
-void initialize4ComponentData(int width, int height, int depth,
-                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
-{
-    unsigned int writeBits[4] = { firstBits, secondBits, thirdBits, fourthBits };
-    type writeValues[4] = { *reinterpret_cast<const type*>(&writeBits[0]),
-                            *reinterpret_cast<const type*>(&writeBits[1]),
-                            *reinterpret_cast<const type*>(&writeBits[2]),
-                            *reinterpret_cast<const type*>(&writeBits[3]) };
-
-    for (int z = 0; z < depth; z++)
-    {
-        for (int y = 0; y < height; y++)
-        {
-            type* destRow = offsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
-
-            for (int x = 0; x < width; x++)
-            {
-                type* destPixel = destRow + x * 4;
-
-                // This could potentially be optimized by generating an entire row of initialization
-                // data and copying row by row instead of pixel by pixel.
-                memcpy(destPixel, writeValues, sizeof(type) * 4);
-            }
-        }
-    }
-}
-
-void loadUintDataToUint24X8(int width, int height, int depth,
-                            const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-                            void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
+template <typename T> // Const overload, defined in loadimage.inl; used for read-only source images.
+inline const T *OffsetDataPointer(const uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch); // data + y * rowPitch + z * depthPitch, cast to const T*
 
 }
 
+#include "loadimage.inl"
+
 #endif // LIBGLESV2_RENDERER_LOADIMAGE_H_
diff --git a/src/libGLESv2/renderer/loadimage.inl b/src/libGLESv2/renderer/loadimage.inl
new file mode 100644
index 0000000..abd0a36
--- /dev/null
+++ b/src/libGLESv2/renderer/loadimage.inl
@@ -0,0 +1,156 @@
+//
+// Copyright (c) 2014 The ANGLE Project Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+
+#include "common/mathutil.h"
+
+namespace rx
+{
+
+template <typename T> // Mutable overload: locates the data at row y of depth slice z and types it as T*.
+inline T *OffsetDataPointer(uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
+{ // rowPitch and depthPitch are byte offsets because the arithmetic is done on a uint8_t pointer.
+    return reinterpret_cast<T*>(data + (y * rowPitch) + (z * depthPitch)); // no bounds or alignment check is made here
+}
+
+template <typename T> // Const overload: same address computation as the mutable one, for read-only data.
+inline const T *OffsetDataPointer(const uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
+{ // rowPitch and depthPitch are byte offsets because the arithmetic is done on a uint8_t pointer.
+    return reinterpret_cast<const T*>(data + (y * rowPitch) + (z * depthPitch)); // no bounds or alignment check is made here
+}
+
+template <typename type, size_t componentCount> // Copies an image whose pixel layout already matches the destination; no conversion.
+inline void LoadToNative(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t rowSize = width * sizeof(type) * componentCount; // bytes of pixel data in one row
+    const size_t layerSize = rowSize * height;                    // bytes in one tightly packed depth slice
+    const size_t imageSize = layerSize * depth;                   // bytes in the whole tightly packed image
+
+    if (layerSize == inputDepthPitch && layerSize == outputDepthPitch) // slices are contiguous on both sides: one copy
+    {
+        ASSERT(rowSize == inputRowPitch && rowSize == outputRowPitch); // contiguous slices are expected to imply contiguous rows
+        memcpy(output, input, imageSize);
+    }
+    else if (rowSize == inputRowPitch && rowSize == outputRowPitch) // rows are contiguous but slices are not: one copy per slice
+    {
+        for (size_t z = 0; z < depth; z++)
+        {
+            const type *source = OffsetDataPointer<type>(input, 0, z, inputRowPitch, inputDepthPitch);
+            type *dest = OffsetDataPointer<type>(output, 0, z, outputRowPitch, outputDepthPitch);
+
+            memcpy(dest, source, layerSize);
+        }
+    }
+    else // at least one side has padded rows: copy row by row
+    {
+        for (size_t z = 0; z < depth; z++)
+        {
+            for (size_t y = 0; y < height; y++)
+            {
+                const type *source = OffsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
+                type *dest = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+                memcpy(dest, source, rowSize); // only the pixel bytes; row padding is neither read nor written
+            }
+        }
+    }
+}
+
+template <typename type, uint32_t fourthComponentBits> // Widens 3-component pixels to 4 components, appending a constant.
+inline void LoadToNative3To4(size_t width, size_t height, size_t depth,
+                             const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                             uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const type fourthValue = gl::bitCast<type>(fourthComponentBits); // constant written as the fourth component of every pixel
+
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const type *source = OffsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
+            type *dest = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                dest[x * 4 + 0] = source[x * 3 + 0]; // the first three components are copied unchanged
+                dest[x * 4 + 1] = source[x * 3 + 1];
+                dest[x * 4 + 2] = source[x * 3 + 2];
+                dest[x * 4 + 3] = fourthValue;
+            }
+        }
+    }
+}
+
+template <size_t componentCount> // Converts an image of float components to 16-bit values, one component at a time.
+inline void Load32FTo16F(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t elementWidth = componentCount * width; // number of scalar components in one row
+
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+
+            for (size_t x = 0; x < elementWidth; x++) // x counts components, not pixels
+            {
+                dest[x] = gl::float32ToFloat16(source[x]);
+            }
+        }
+    }
+}
+
+template <size_t blockWidth, size_t blockHeight, size_t blockSize> // Copies block-compressed data one block row at a time, unmodified.
+inline void LoadCompressedToNative(size_t width, size_t height, size_t depth,
+                                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t columns = (width + (blockWidth - 1)) / blockWidth; // blocks per row, rounded up
+    const size_t rows = (height + (blockHeight - 1)) / blockHeight; // block rows per slice, rounded up
+
+    for (size_t z = 0; z < depth; ++z)
+    {
+        for (size_t y = 0; y < rows; ++y) // y indexes block rows, so the row pitches are applied per block row
+        {
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            memcpy(dest, source, columns * blockSize); // blockSize bytes are copied for each block in the row
+        }
+    }
+}
+
+template <typename type, uint32_t firstBits, uint32_t secondBits, uint32_t thirdBits, uint32_t fourthBits> // Fills an image with one constant 4-component pixel.
+inline void Initialize4ComponentData(size_t width, size_t height, size_t depth,
+                                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const type writeValues[4] = // the pixel written everywhere; never modified after initialization
+    {
+        gl::bitCast<type>(firstBits),
+        gl::bitCast<type>(secondBits),
+        gl::bitCast<type>(thirdBits),
+        gl::bitCast<type>(fourthBits),
+    };
+
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            type *destRow = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                type *destPixel = destRow + x * 4; // each pixel occupies 4 consecutive elements of `type`
+
+                // This could potentially be optimized by generating an entire row of initialization
+                // data and copying row by row instead of pixel by pixel.
+                memcpy(destPixel, writeValues, sizeof(type) * 4);
+            }
+        }
+    }
+}
+
+}
diff --git a/src/libGLESv2/renderer/loadimageSSE2.cpp b/src/libGLESv2/renderer/loadimageSSE2.cpp
index 129e39b..9c83894 100644
--- a/src/libGLESv2/renderer/loadimageSSE2.cpp
+++ b/src/libGLESv2/renderer/loadimageSSE2.cpp
@@ -1,11 +1,11 @@
 #include "precompiled.h"
 //
-// Copyright (c) 2002-2012 The ANGLE Project Authors. All rights reserved.
+// Copyright (c) 2002-2014 The ANGLE Project Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
 
-// loadimage.cpp: Defines image loading functions. It's
+// loadimageSSE2.cpp: Defines image loading functions. It's
 // in a separated file for GCC, which can enable SSE usage only per-file,
 // not for code blocks that use SSE2 explicitly.
 
@@ -14,94 +14,92 @@
 namespace rx
 {
 
-    void loadAlphaDataToBGRASSE2(int width, int height, int depth,
-        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    __m128i zeroWide = _mm_setzero_si128();
+
+    for (size_t z = 0; z < depth; z++)
     {
-        const unsigned char *source = NULL;
-        unsigned int *dest = NULL;
-        __m128i zeroWide = _mm_setzero_si128();
-
-        for (int z = 0; z < depth; z++)
+        for (size_t y = 0; y < height; y++)
         {
-            for (int y = 0; y < height; y++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+
+            size_t x = 0;
+
+            // Write pixels one at a time until the destination is 16-byte aligned
+            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
             {
-                source = static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch;
-                dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch);
+                dest[x] = static_cast<uint32_t>(source[x]) << 24;
+            }
 
-                int x;
-                // Make output writes aligned
-                for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
-                {
-                    dest[x] = static_cast<unsigned int>(source[x]) << 24;
-                }
+            for (; x + 7 < width; x += 8)
+            {
+                __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
+                // Widen each byte to 16 bits, leaving the lower byte zero
+                sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
+                // Widen each 16-bit value to 32 bits, leaving the lower 16 bits zero
+                __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
+                __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
 
-                for (; x + 7 < width; x += 8)
-                {
-                    __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
-                    // Interleave each byte to 16bit, make the lower byte to zero
-                    sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
-                    // Interleave each 16bit to 32bit, make the lower 16bit to zero
-                    __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
-                    __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
+            }
 
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
-                }
-
-                // Handle the remainder
-                for (; x < width; x++)
-                {
-                    dest[x] = static_cast<unsigned int>(source[x]) << 24;
-                }
+            // Handle the remainder
+            for (; x < width; x++)
+            {
+                dest[x] = static_cast<uint32_t>(source[x]) << 24;
             }
         }
     }
+}
 
-    void loadRGBAUByteDataToBGRASSE2(int width, int height, int depth,
-        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
-        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
+                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    __m128i brMask = _mm_set1_epi32(0x00ff00ff);
+
+    for (size_t z = 0; z < depth; z++)
     {
-        const unsigned int *source = NULL;
-        unsigned int *dest = NULL;
-        __m128i brMask = _mm_set1_epi32(0x00ff00ff);
-
-        for (int z = 0; z < depth; z++)
+        for (size_t y = 0; y < height; y++)
         {
-            for (int y = 0; y < height; y++)
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+
+            size_t x = 0;
+
+            // Swizzle pixels one at a time until the destination is 16-byte aligned
+            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
             {
-                source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch);
-                dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch);
-                int x = 0;
+                uint32_t rgba = source[x];
+                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
+            }
 
-                // Make output writes aligned
-                for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
-                {
-                    unsigned int rgba = source[x];
-                    dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
-                }
+            for (; x + 3 < width; x += 4)
+            {
+                __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
+                // Keep only g and a, which don't change
+                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
+                // Keep only b and r
+                __m128i brComponents = _mm_and_si128(sourceData, brMask);
+                // Swap b and r
+                __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+                __m128i result = _mm_or_si128(gaComponents, brSwapped);
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
+            }
 
-                for (; x + 3 < width; x += 4)
-                {
-                    __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
-                    // Mask out g and a, which don't change
-                    __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
-                    // Mask out b and r
-                    __m128i brComponents = _mm_and_si128(sourceData, brMask);
-                    // Swap b and r
-                    __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
-                    __m128i result = _mm_or_si128(gaComponents, brSwapped);
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
-                }
-
-                // Perform leftover writes
-                for (; x < width; x++)
-                {
-                    unsigned int rgba = source[x];
-                    dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
-                }
+            // Perform leftover writes
+            for (; x < width; x++)
+            {
+                uint32_t rgba = source[x];
+                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
             }
         }
     }
+}
 
 }