Merge "Revert "Roll SwiftShader from 647d3d24c935 to 3b2177fcfcba (5 revisions)""
diff --git a/build_overrides/wayland.gni b/build_overrides/wayland.gni
index e50b0fd..21e575c 100644
--- a/build_overrides/wayland.gni
+++ b/build_overrides/wayland.gni
@@ -13,6 +13,6 @@
 # limitations under the License.
 
 ozone_platform_wayland = true
-
+use_system_libwayland = true
 # SwiftShader has no wayland third-party dir
 wayland_gn_dir = ""
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index b79c740..12e4027 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -262,6 +262,7 @@
 	draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
 	draw->topology = vertexInputInterfaceState.getTopology();
 	draw->provokingVertexMode = preRasterizationState.getProvokingVertexMode();
+	draw->indexType = pipeline->getIndexBuffer().getIndexType();
 	draw->lineRasterizationMode = preRasterizationState.getLineRasterizationMode();
 	draw->descriptorSetObjects = inputs.getDescriptorSetObjects();
 	draw->preRasterizationPipelineLayout = preRasterizationState.getPipelineLayout();
@@ -286,11 +287,6 @@
 	data->instanceID = instanceID;
 	data->baseVertex = baseVertex;
 
-	if(indexBuffer)
-	{
-		draw->indexType = pipeline->getIndexBuffer().getIndexType();
-	}
-
 	draw->vertexRoutine = vertexRoutine;
 
 	vk::DescriptorSet::PrepareForSampling(draw->descriptorSetObjects, draw->preRasterizationPipelineLayout, device);
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index 6535272..dda1f39 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -275,6 +275,12 @@
 		sRGBtoLinearFF_FF00[i] = (unsigned short)(sRGBtoLinear((float)i / 0xFF) * 0xFF00 + 0.5f);
 	}
 
+	for(int i = 0; i < 0x1000; i++)
+	{
+		linearToSRGB12_16[i] = (unsigned short)(clamp(linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+		sRGBtoLinear12_16[i] = (unsigned short)(clamp(sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+	}
+
 	for(int q = 0; q < 4; q++)
 	{
 		for(int c = 0; c < 16; c++)
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index ab410f6..a857747 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -108,6 +108,9 @@
 
 	unsigned short sRGBtoLinearFF_FF00[256];
 
+	unsigned short linearToSRGB12_16[4096];
+	unsigned short sRGBtoLinear12_16[4096];
+
 	// Centroid parameters
 	float4 sampleX[4][16];
 	float4 sampleY[4][16];
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 87a8dab..c73bb42 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -266,18 +266,91 @@
 			continue;
 		}
 
-		for(unsigned int q : samples)
+		auto format = state.colorFormat[index];
+		switch(format)
 		{
-			Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		case VK_FORMAT_B8G8R8A8_UNORM:
+		case VK_FORMAT_B8G8R8A8_SRGB:
+		case VK_FORMAT_R8G8B8A8_UNORM:
+		case VK_FORMAT_R8G8B8A8_SRGB:
+		case VK_FORMAT_R8G8_UNORM:
+		case VK_FORMAT_R8_UNORM:
+		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+			for(unsigned int q : samples)
+			{
+				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
 
-			SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
-			ASSERT(SIMD::Width == 4);
-			Vector4f color;
-			color.x = Extract128(C.x, 0);
-			color.y = Extract128(C.y, 0);
-			color.z = Extract128(C.z, 0);
-			color.w = Extract128(C.w, 0);
-			writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				SIMD::Float4 colorf = alphaBlend(index, buffer, c[index], x);
+
+				ASSERT(SIMD::Width == 4);
+				Vector4s color;
+				color.x = UShort4(Extract128(colorf.x, 0) * 0xFFFF, true);  // Saturating
+				color.y = UShort4(Extract128(colorf.y, 0) * 0xFFFF, true);  // Saturating
+				color.z = UShort4(Extract128(colorf.z, 0) * 0xFFFF, true);  // Saturating
+				color.w = UShort4(Extract128(colorf.w, 0) * 0xFFFF, true);  // Saturating
+				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+			}
+			break;
+		case VK_FORMAT_R16_SFLOAT:
+		case VK_FORMAT_R16G16_SFLOAT:
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_R32G32_SFLOAT:
+		case VK_FORMAT_R32G32B32A32_SFLOAT:
+		case VK_FORMAT_R32_SINT:
+		case VK_FORMAT_R32G32_SINT:
+		case VK_FORMAT_R32G32B32A32_SINT:
+		case VK_FORMAT_R32_UINT:
+		case VK_FORMAT_R32G32_UINT:
+		case VK_FORMAT_R32G32B32A32_UINT:
+		case VK_FORMAT_R16_UNORM:
+		case VK_FORMAT_R16G16_UNORM:
+		case VK_FORMAT_R16G16B16A16_UNORM:
+		case VK_FORMAT_R16_SINT:
+		case VK_FORMAT_R16G16_SINT:
+		case VK_FORMAT_R16G16B16A16_SINT:
+		case VK_FORMAT_R16_UINT:
+		case VK_FORMAT_R16G16_UINT:
+		case VK_FORMAT_R16G16B16A16_UINT:
+		case VK_FORMAT_R8_SINT:
+		case VK_FORMAT_R8G8_SINT:
+		case VK_FORMAT_R8G8B8A8_SINT:
+		case VK_FORMAT_R8_UINT:
+		case VK_FORMAT_R8G8_UINT:
+		case VK_FORMAT_R8G8B8A8_UINT:
+		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+			for(unsigned int q : samples)
+			{
+				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+
+				SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
+				ASSERT(SIMD::Width == 4);
+				Vector4f color;
+				color.x = Extract128(C.x, 0);
+				color.y = Extract128(C.y, 0);
+				color.z = Extract128(C.z, 0);
+				color.w = Extract128(C.w, 0);
+				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+			}
+			break;
+		default:
+			UNSUPPORTED("VkFormat: %d", int(format));
 		}
 	}
 }
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 409479c..1d46605 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1172,6 +1172,576 @@
 	default:
 		UNSUPPORTED("VkFormat %d", int(format));
 	}
+
+	if(isSRGB(index))
+	{
+		sRGBtoLinear16_12_16(pixel);
+	}
+}
+
+void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)  // Packs the 16-bit-per-channel color in 'current' into state.colorFormat[index] and merges it into cBuffer at column x
+{  // 'current' is used as scratch and is modified in place
+	if(isSRGB(index))
+	{
+		linearToSRGB16_12_16(current);  // Converts x/y/z only; w is left as is
+	}
+
+	vk::Format format = state.colorFormat[index];
+	switch(format)  // Step 1: add a per-format bias ahead of the truncating shifts in the packing switch below
+	{
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);  // c - (c >> 8) + 0x80; presumably makes the later >> 8 round instead of truncate
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);  // Alpha keeps only 2 bits in these formats
+		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);  // Alpha keeps only 1 bit in these formats
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);  // Green keeps 6 bits
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
+		break;
+	default:
+		break;
+	}
+
+	int writeMask = state.colorWriteActive(index);  // Per-channel write-enable bits for this color buffer
+	if(format.isBGRformat())
+	{
+		// For BGR formats, flip R and B channels in the channels mask
+		writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
+	}
+
+	switch(format)  // Step 2: keep each channel's top bits and combine them into the format's bit layout
+	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;  // Packed 16-bit pixels end up in current.x
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		{
+			current.w = current.w & Short4(0x8000u);
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		{
+			current.x = current.x & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.z = As<UShort4>(current.z) >> 11;
+
+			current.x = current.x | current.y | current.z;
+		}
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:  // NOTE(review): the store switch at the end of this function has no case for this format, so it reaches UNSUPPORTED there - confirm intended
+		{
+			current.z = current.z & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.x = As<UShort4>(current.x) >> 11;
+
+			current.x = current.x | current.y | current.z;
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		if(writeMask == 0x7)  // Alpha write disabled: current.w is not read, green is packed into its slot instead
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		else
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));  // Blue first, then red
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));  // Result is left in current.z and current.y (read as c01 / c23 below)
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		if(writeMask == 0x7)  // Alpha write disabled: current.w is not read, green is packed into its slot instead
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		else
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));  // Red first, then blue (opposite of the BGRA case)
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		current.y = As<Short4>(PackUnsigned(current.y, current.y));
+		current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));  // Interleave red and green bytes
+		break;
+	case VK_FORMAT_R8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		{
+			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
+			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
+			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
+			auto a = (Int4(current.w) >> 14) & Int4(0x3);
+			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
+			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
+			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
+			current.x = UnpackLow(c02, c13);
+			current.y = UnpackHigh(c02, c13);
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+		{
+			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
+			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
+			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
+			auto a = (Int4(current.w) >> 14) & Int4(0x3);
+			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;  // Same as above with r and b swapped
+			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
+			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
+			current.x = UnpackLow(c02, c13);
+			current.y = UnpackHigh(c02, c13);
+		}
+		break;
+	default:
+		UNSUPPORTED("VkFormat: %d", int(format));
+	}
+
+	Short4 c01 = current.z;  // Stored at 'buffer' by the 8-bit RGBA/BGRA cases below
+	Short4 c23 = current.y;  // Stored at 'buffer + pitchB' by the 8-bit RGBA/BGRA cases below
+
+	Int xMask;  // Combination of all masks
+
+	if(state.depthTestActive)
+	{
+		xMask = zMask;
+	}
+	else
+	{
+		xMask = cMask;
+	}
+
+	if(state.stencilActive)
+	{
+		xMask &= sMask;
+	}
+
+	Pointer<Byte> buffer = cBuffer;
+	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));  // Byte offset between the two locations written below (presumably the row pitch)
+
+	switch(format)  // Step 3: read-modify-write the destination under xMask and the channel write mask
+	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);  // Existing destination contents (32 bits = two 16-bit pixels)
+
+			Int channelMask;
+			switch(format)
+			{
+			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
+				break;
+			case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+			case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
+				break;
+			default:
+				UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
+			}
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);  // New bits where the mask is set, old bits elsewhere
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));  // Only the three color bits of writeMask select the table entry
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if((writeMask & 0x00000007) != 0x00000007)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if((writeMask & 0x00000007) != 0x00000007)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		{
+			buffer += x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[writeMask]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		{
+			buffer += x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[writeMask]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		if((writeMask & 0x00000003) != 0x0)  // Nothing to do when neither of the two low write-mask bits is set
+		{
+			buffer += 2 * x;
+			Int2 value;
+			value = Insert(value, *Pointer<Int>(buffer), 0);
+			value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
+
+			Int2 packedCol = As<Int2>(current.x);
+
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+			if((writeMask & 0x3) != 0x3)
+			{
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));
+				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+
+			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+			*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
+		}
+		break;
+	case VK_FORMAT_R8_UNORM:
+		if(writeMask & 0x00000001)  // Only written when bit 0 of the write mask is set
+		{
+			buffer += 1 * x;
+			Short4 value;
+			value = Insert(value, *Pointer<Short>(buffer), 0);
+			value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
+
+			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+			current.x |= value;
+
+			*Pointer<Short>(buffer) = Extract(current.x, 0);
+			*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		{
+			buffer += 4 * x;
+
+			Int2 value = *Pointer<Int2>(buffer, 16);
+			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
+
+			buffer += pitchB;
+
+			value = *Pointer<Int2>(buffer, 16);
+			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
+		}
+		break;
+	default:
+		UNSUPPORTED("VkFormat: %d", int(format));
+	}
 }
 
 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
@@ -1792,13 +2362,6 @@
 			texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
 			texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
 			texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
-
-			if(isSRGB(index))
-			{
-				texelColor.x = sRGBtoLinear(texelColor.x);
-				texelColor.y = sRGBtoLinear(texelColor.y);
-				texelColor.z = sRGBtoLinear(texelColor.z);
-			}
 		}
 		break;
 	}
@@ -2909,4 +3472,65 @@
 	}
 }
 
+void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)  // In-place table conversion of c.x, c.y and c.z through Constants::sRGBtoLinear12_16; c.w is not touched
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);  // Entries are 16 bits wide, hence the 2 * index byte offsets below
+
+	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;  // Add 7 (AddSat: presumably saturating), then >> 4 reduces each 16-bit lane to a 12-bit table index
+	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
+	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);  // Replace each of the four lanes with its table entry, one lane at a time
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)  // In-place conversion of c.x, c.y and c.z: 16-bit input, 12-bit table index, table lookup; c.w is not touched
+{
+	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;  // Add 7 (AddSat: presumably saturating), then >> 4 reduces each 16-bit lane to a 12-bit index
+	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
+	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
+
+	linearToSRGB12_16(c);  // Replaces the indices with the corresponding table entries
+}
+
+void PixelRoutine::linearToSRGB12_16(Vector4s &c)  // Replaces every lane of c.x, c.y and c.z with Constants::linearToSRGB12_16[lane]; c.w is not touched
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);  // Entries are 16 bits wide, hence the 2 * index byte offsets below
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);  // Lane values are used directly as table indices; no range check is done here
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)  // Approximates x^2.2
+{
+	Float4 linear = x * x;
+	linear = linear * 0.73f + linear * x * 0.27f;  // 0.73 * x^2 + 0.27 * x^3
+
+	return Min(Max(linear, 0.0f), 1.0f);  // Result is clamped to [0, 1]
+}
+
 }  // namespace sw
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index e1c4eb7..3b5224a 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -56,8 +56,10 @@
 
 	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask);
 	SIMD::Float4 alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x);
+	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask);
 
 	bool isSRGB(int index) const;
+	void linearToSRGB12_16(Vector4s &c);
 
 private:
 	bool hasStencilReplaceRef() const;
@@ -99,6 +101,10 @@
 	void writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples);
 	void occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples);
 
+	void sRGBtoLinear16_12_16(Vector4s &c);
+	void linearToSRGB16_12_16(Vector4s &c);
+	Float4 sRGBtoLinear(const Float4 &x);
+
 	SIMD::Float readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const;
 	SIMD::Float readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const;
 
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index 5649c12..3bfe933 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -932,11 +932,8 @@
 		inputs.setVertexInputBinding(executionState.vertexInputBindings);
 		inputs.bindVertexInputs(firstInstance, hasDynamicVertexStride);
 
-		if(indexed)
-		{
-			vk::IndexBuffer &indexBuffer = pipeline->getIndexBuffer();
-			indexBuffer.setIndexBufferBinding(executionState.indexBufferBinding, executionState.indexType);
-		}
+		vk::IndexBuffer &indexBuffer = pipeline->getIndexBuffer();
+		indexBuffer.setIndexBufferBinding(executionState.indexBufferBinding, executionState.indexType);
 
 		std::vector<std::pair<uint32_t, void *>> indexBuffers;
 		pipeline->getIndexBuffers(executionState.dynamicState, count, first, indexed, &indexBuffers);
diff --git a/src/WSI/BUILD.gn b/src/WSI/BUILD.gn
index b86b819..c53a1c9 100644
--- a/src/WSI/BUILD.gn
+++ b/src/WSI/BUILD.gn
@@ -87,8 +87,7 @@
     "../Vulkan:swiftshader_libvulkan_headers",
   ]
 
-  # Do not try to depend on Wayland if the |wayland_gn_dir| is not set.
-  if (is_linux && ozone_platform_wayland && wayland_gn_dir != "") {
+  if (is_linux && ozone_platform_wayland && !use_system_libwayland) {
     # Use third-party targets
     deps += [ "$wayland_gn_dir:wayland_client" ]
   }
diff --git a/tests/regres/testlists/vk-master-CRASH.txt b/tests/regres/testlists/vk-master-CRASH.txt
index 9dd1aab..6444428 100644
--- a/tests/regres/testlists/vk-master-CRASH.txt
+++ b/tests/regres/testlists/vk-master-CRASH.txt
@@ -1,2 +1,2 @@
-dEQP-VK.robustness.buffer_access.fragment.scalar_copy.r32_uint.oob_storage_write.range_4_bytes
-dEQP-VK.robustness.buffer_access.vertex.vec4_copy.r32_sfloat.oob_storage_write.range_1_byte
+dEQP-VK.robustness.buffer_access.fragment.vec4_copy.r32_sfloat.oob_storage_write.range_1_byte
+dEQP-VK.robustness.buffer_access.vertex.vec4_copy.out_of_alloc.oob_storage_write
diff --git a/tests/regres/testlists/vk-master-NOT_SUPPORTED.txt b/tests/regres/testlists/vk-master-NOT_SUPPORTED.txt
index 14fed09..310cb29 100644
--- a/tests/regres/testlists/vk-master-NOT_SUPPORTED.txt
+++ b/tests/regres/testlists/vk-master-NOT_SUPPORTED.txt
Binary files differ
diff --git a/tests/regres/testlists/vk-master-PASS.txt b/tests/regres/testlists/vk-master-PASS.txt
index 813b112..35537b9 100644
--- a/tests/regres/testlists/vk-master-PASS.txt
+++ b/tests/regres/testlists/vk-master-PASS.txt
Binary files differ